[x265] [PATCH] asm: avx2 version cvt16to32_shr[]
Min Chen
chenm003 at 163.com
Wed Aug 27 21:16:26 CEST 2014
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1409166976 25200
# Node ID 6cdcf1a7fa9803898e8f04818865cc150db250ea
# Parent 77fe0cc583e8ec10275bc1b3c4bb116d5ceb51ac
asm: avx2 version cvt16to32_shr[]
4x4 135c -> 105c
8x8 375c -> 233c (unroll 228c)
16x16 1333c -> 816c
32x32 5278c -> 2690c
diff -r 77fe0cc583e8 -r 6cdcf1a7fa98 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Aug 27 14:25:17 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Aug 27 12:16:16 2014 -0700
@@ -1714,6 +1714,10 @@
p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2;
p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
+ p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_avx2;
+ p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_avx2;
+ p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_avx2;
+ p.cvt16to32_shr[BLOCK_32x32] = x265_cvt16to32_shr_32_avx2;
p.denoiseDct = x265_denoise_dct_avx2;
}
#endif // if HIGH_BIT_DEPTH
diff -r 77fe0cc583e8 -r 6cdcf1a7fa98 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Wed Aug 27 14:25:17 2014 +0530
+++ b/source/common/x86/blockcopy8.asm Wed Aug 27 12:16:16 2014 -0700
@@ -3437,6 +3437,38 @@
RET
+;--------------------------------------------------------------------------------------
+; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
+;--------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal cvt16to32_shr_4, 3,3,4
+ add r2d, r2d ; src is int16_t, so convert element stride to byte stride
+ movd xm0, r3m
+ vpbroadcastd m1, r4m
+
+ ; register alloc
+ ; r0 - dst
+ ; r1 - src
+ ; r2 - stride
+ ; m0 - shift
+ ; m1 - dword [offset]
+
+ ; Row 0-1
+ pmovsxwd xm2, [r1] ; sign-extend 4 x int16 -> 4 x int32
+ pmovsxwd xm3, [r1 + r2]
+ vinserti128 m2, m2, xm3, 1 ; pack rows 0 and 1 into one ymm
+ paddd m2, m1 ; dst = (src + offset) >> shift
+ psrad m2, xm0
+ movu [r0 + 0 * mmsize], m2
+
+ ; Row 2-3
+ lea r1, [r1 + r2 * 2]
+ pmovsxwd xm2, [r1]
+ pmovsxwd xm3, [r1 + r2]
+ vinserti128 m2, m2, xm3, 1
+ paddd m2, m1
+ psrad m2, xm0
+ movu [r0 + 1 * mmsize], m2
+ RET
+
+
;--------------------------------------------------------------------------------------
; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
;--------------------------------------------------------------------------------------
@@ -3506,6 +3538,55 @@
RET
+;--------------------------------------------------------------------------------------
+; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
+;--------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal cvt16to32_shr_8, 3,5,3
+ add r2d, r2d ; src is int16_t, so convert element stride to byte stride
+ movd xm0, r3m
+ vpbroadcastd m1, r4m
+ mov r3d, 8/4 ; 4 rows per iteration
+ lea r4, [r2 * 3]
+
+ ; register alloc
+ ; r0 - dst
+ ; r1 - src
+ ; r2 - stride
+ ; r3 - loop counter
+ ; r4 - stride * 3
+ ; m0 - shift
+ ; m1 - dword [offset]
+
+.loop:
+ ; Row 0
+ pmovsxwd m2, [r1] ; sign-extend 8 x int16 -> 8 x int32
+ paddd m2, m1 ; dst = (src + offset) >> shift
+ psrad m2, xm0
+ movu [r0 + 0 * mmsize], m2
+
+ ; Row 1
+ pmovsxwd m2, [r1 + r2]
+ paddd m2, m1
+ psrad m2, xm0
+ movu [r0 + 1 * mmsize], m2
+
+ ; Row 2
+ pmovsxwd m2, [r1 + r2 * 2]
+ paddd m2, m1
+ psrad m2, xm0
+ movu [r0 + 2 * mmsize], m2
+
+ ; Row 3
+ pmovsxwd m2, [r1 + r4]
+ paddd m2, m1
+ psrad m2, xm0
+ movu [r0 + 3 * mmsize], m2
+
+ add r0, 4 * mmsize
+ lea r1, [r1 + r2 * 4]
+ dec r3d
+ jnz .loop
+ RET
+
+
;--------------------------------------------------------------------------------------
; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
;--------------------------------------------------------------------------------------
@@ -3569,6 +3650,72 @@
RET
+;--------------------------------------------------------------------------------------
+; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
+;--------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal cvt16to32_shr_16, 3,5,4
+ add r2d, r2d ; src is int16_t, so convert element stride to byte stride
+ movd xm0, r3m
+ vpbroadcastd m1, r4m
+ mov r3d, 16/4 ; 4 rows per iteration
+ lea r4, [r2 * 3]
+
+ ; register alloc
+ ; r0 - dst
+ ; r1 - src
+ ; r2 - stride
+ ; r3 - loop counter
+ ; r4 - stride * 3
+ ; m0 - shift
+ ; m1 - dword [offset]
+
+.loop:
+ ; Row 0 (16 coeffs = two 8-lane halves per row)
+ pmovsxwd m2, [r1 + 0 * mmsize/2]
+ pmovsxwd m3, [r1 + 1 * mmsize/2]
+ paddd m2, m1 ; dst = (src + offset) >> shift
+ paddd m3, m1
+ psrad m2, xm0
+ psrad m3, xm0
+ movu [r0 + 0 * mmsize], m2
+ movu [r0 + 1 * mmsize], m3
+
+ ; Row 1
+ pmovsxwd m2, [r1 + r2 + 0 * mmsize/2]
+ pmovsxwd m3, [r1 + r2 + 1 * mmsize/2]
+ paddd m2, m1
+ paddd m3, m1
+ psrad m2, xm0
+ psrad m3, xm0
+ movu [r0 + 2 * mmsize], m2
+ movu [r0 + 3 * mmsize], m3
+
+ add r0, 4 * mmsize
+
+ ; Row 2
+ pmovsxwd m2, [r1 + r2 * 2 + 0 * mmsize/2]
+ pmovsxwd m3, [r1 + r2 * 2 + 1 * mmsize/2]
+ paddd m2, m1
+ paddd m3, m1
+ psrad m2, xm0
+ psrad m3, xm0
+ movu [r0 + 0 * mmsize], m2
+ movu [r0 + 1 * mmsize], m3
+
+ ; Row 3
+ pmovsxwd m2, [r1 + r4 + 0 * mmsize/2]
+ pmovsxwd m3, [r1 + r4 + 1 * mmsize/2]
+ paddd m2, m1
+ paddd m3, m1
+ psrad m2, xm0
+ psrad m3, xm0
+ movu [r0 + 2 * mmsize], m2
+ movu [r0 + 3 * mmsize], m3
+
+ add r0, 4 * mmsize
+ lea r1, [r1 + r2 * 4]
+ dec r3d
+ jnz .loop
+ RET
+
+
;--------------------------------------------------------------------------------------
; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
;--------------------------------------------------------------------------------------
@@ -3631,6 +3778,66 @@
RET
+;--------------------------------------------------------------------------------------
+; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
+;--------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal cvt16to32_shr_32, 3,4,6
+ add r2d, r2d ; src is int16_t, so convert element stride to byte stride
+ movd xm0, r3m
+ vpbroadcastd m1, r4m
+ mov r3d, 32/2 ; 2 rows per iteration
+
+ ; register alloc
+ ; r0 - dst
+ ; r1 - src
+ ; r2 - stride
+ ; r3 - loop counter
+ ; m0 - shift
+ ; m1 - dword [offset]
+
+.loop:
+ ; Row 0 (32 coeffs = four 8-lane quarters per row)
+ pmovsxwd m2, [r1 + 0 * mmsize/2]
+ pmovsxwd m3, [r1 + 1 * mmsize/2]
+ pmovsxwd m4, [r1 + 2 * mmsize/2]
+ pmovsxwd m5, [r1 + 3 * mmsize/2]
+ paddd m2, m1 ; dst = (src + offset) >> shift
+ paddd m3, m1
+ paddd m4, m1
+ paddd m5, m1
+ psrad m2, xm0
+ psrad m3, xm0
+ psrad m4, xm0
+ psrad m5, xm0
+ movu [r0 + 0 * mmsize], m2
+ movu [r0 + 1 * mmsize], m3
+ movu [r0 + 2 * mmsize], m4
+ movu [r0 + 3 * mmsize], m5
+ add r0, 4 * mmsize
+
+ ; Row 1
+ pmovsxwd m2, [r1 + r2 + 0 * mmsize/2]
+ pmovsxwd m3, [r1 + r2 + 1 * mmsize/2]
+ pmovsxwd m4, [r1 + r2 + 2 * mmsize/2]
+ pmovsxwd m5, [r1 + r2 + 3 * mmsize/2]
+ paddd m2, m1
+ paddd m3, m1
+ paddd m4, m1
+ paddd m5, m1
+ psrad m2, xm0
+ psrad m3, xm0
+ psrad m4, xm0
+ psrad m5, xm0
+ movu [r0 + 0 * mmsize], m2
+ movu [r0 + 1 * mmsize], m3
+ movu [r0 + 2 * mmsize], m4
+ movu [r0 + 3 * mmsize], m5
+ add r0, 4 * mmsize
+
+ lea r1, [r1 + r2 * 2]
+ dec r3d
+ jnz .loop
+ RET
+
+
;--------------------------------------------------------------------------------------
; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
;--------------------------------------------------------------------------------------
diff -r 77fe0cc583e8 -r 6cdcf1a7fa98 source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Wed Aug 27 14:25:17 2014 +0530
+++ b/source/common/x86/blockcopy8.h Wed Aug 27 12:16:16 2014 -0700
@@ -38,6 +38,10 @@
void x265_cvt16to32_shr_8_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
void x265_cvt16to32_shr_16_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
void x265_cvt16to32_shr_32_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
+void x265_cvt16to32_shr_4_avx2(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
+void x265_cvt16to32_shr_8_avx2(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
+void x265_cvt16to32_shr_16_avx2(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
+void x265_cvt16to32_shr_32_avx2(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
uint32_t x265_cvt16to32_cnt_4_sse4(int32_t * dst, int16_t * src, intptr_t);
uint32_t x265_cvt16to32_cnt_8_sse4(int32_t * dst, int16_t * src, intptr_t);
uint32_t x265_cvt16to32_cnt_16_sse4(int32_t * dst, int16_t * src, intptr_t);
More information about the x265-devel
mailing list