[x265] [PATCH] asm: avx2 10bit code for sub_ps[16x16], [32x32], [64x64]
rajesh at multicorewareinc.com
Thu Apr 23 15:52:21 CEST 2015
# HG changeset patch
# User Rajesh Paulraj <rajesh at multicorewareinc.com>
# Date 1429796254 -19800
# Thu Apr 23 19:07:34 2015 +0530
# Node ID cd0c8df9e9bb9e8ceebfa84532e6dca8d50916ee
# Parent cec68d3e37ef15c571cfa7f2784a12e944a2e2a7
asm: avx2 10bit code for sub_ps[16x16], [32x32], [64x64]
Speedup over the C primitives: sub_ps[16x16] 13.23x, sub_ps[32x32] 16.41x, sub_ps[64x64] 16.96x
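These kernels compute the same 16-bit residual as the C reference in
source/common/pixel.cpp: dest = src0 - src1, row by row. A minimal C
sketch of that behaviour (function and parameter names here are
illustrative, not the x265 source; at HIGH_BIT_DEPTH a pixel is a
16-bit word):

    #include <stdint.h>

    typedef uint16_t pixel;  /* HIGH_BIT_DEPTH: 10-bit samples in 16-bit words */

    /* illustrative scalar version of pixel_sub_ps for a bx x by block;
     * strides are in pixels, as in the primitive's signature */
    static void pixel_sub_ps_ref(int16_t *dest, intptr_t deststride,
                                 const pixel *src0, const pixel *src1,
                                 intptr_t srcstride0, intptr_t srcstride1,
                                 int bx, int by)
    {
        for (int y = 0; y < by; y++)
        {
            for (int x = 0; x < bx; x++)
                dest[x] = (int16_t)(src0[x] - src1[x]);

            dest += deststride;
            src0 += srcstride0;
            src1 += srcstride1;
        }
    }

The AVX2 code below performs the same subtraction with psubw on 32-byte
vectors, unrolled four rows per iteration; strides are doubled up front
because they are passed in pixel units.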
diff -r cec68d3e37ef -r cd0c8df9e9bb source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Apr 22 21:35:55 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Thu Apr 23 19:07:34 2015 +0530
@@ -1223,6 +1223,10 @@
ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, avx2);
ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, avx2);
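+ // sub_ps: 16-bit residual of two pixel blocks (dest = src0 - src1)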
+ p.cu[BLOCK_16x16].sub_ps = x265_pixel_sub_ps_16x16_avx2;
+ p.cu[BLOCK_32x32].sub_ps = x265_pixel_sub_ps_32x32_avx2;
+ p.cu[BLOCK_64x64].sub_ps = x265_pixel_sub_ps_64x64_avx2;
+
p.pu[LUMA_16x4].convert_p2s = x265_filterPixelToShort_16x4_avx2;
p.pu[LUMA_16x8].convert_p2s = x265_filterPixelToShort_16x8_avx2;
p.pu[LUMA_16x12].convert_p2s = x265_filterPixelToShort_16x12_avx2;
diff -r cec68d3e37ef -r cd0c8df9e9bb source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Wed Apr 22 21:35:55 2015 -0500
+++ b/source/common/x86/pixel-util8.asm Thu Apr 23 19:07:34 2015 +0530
@@ -4560,6 +4560,54 @@
;-----------------------------------------------------------------------------
; void pixel_sub_ps_16x16(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
+%if HIGH_BIT_DEPTH
+%macro PIXELSUB_PS_W16_H4_avx2 1
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal pixel_sub_ps_16x%1, 6, 9, 4, dest, deststride, src0, src1, srcstride0, srcstride1
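+ ; strides arrive in pixel units; the adds below convert them to byte
+ ; strides for 16-bit samples, and r6/r7/r8 hold 3*stride so that four
+ ; rows can be addressed per unrolled iteration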
+ add r1d, r1d
+ add r4d, r4d
+ add r5d, r5d
+ lea r6, [r1 * 3]
+ lea r7, [r4 * 3]
+ lea r8, [r5 * 3]
+
+%rep %1/4
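+ ; 4 rows per iteration; one 16-pixel row of 16-bit samples is a single
+ ; 32-byte ymm load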
+ movu m0, [r2]
+ movu m1, [r3]
+ movu m2, [r2 + r4]
+ movu m3, [r3 + r5]
+
+ psubw m0, m1
+ psubw m2, m3
+
+ movu [r0], m0
+ movu [r0 + r1], m2
+
+ movu m0, [r2 + r4 * 2]
+ movu m1, [r3 + r5 * 2]
+ movu m2, [r2 + r7]
+ movu m3, [r3 + r8]
+
+ psubw m0, m1
+ psubw m2, m3
+
+ movu [r0 + r1 * 2], m0
+ movu [r0 + r6], m2
+
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+%endrep
+ RET
+%endif
+%endmacro
+PIXELSUB_PS_W16_H4_avx2 16
+PIXELSUB_PS_W16_H4_avx2 32
+%else
+;-----------------------------------------------------------------------------
+; void pixel_sub_ps_16x16(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
+;-----------------------------------------------------------------------------
%macro PIXELSUB_PS_W16_H8_avx2 2
%if ARCH_X86_64
INIT_YMM avx2
@@ -4632,6 +4680,7 @@
PIXELSUB_PS_W16_H8_avx2 16, 16
PIXELSUB_PS_W16_H8_avx2 16, 32
+%endif
;-----------------------------------------------------------------------------
; void pixel_sub_ps_32x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
@@ -4770,6 +4819,74 @@
;-----------------------------------------------------------------------------
; void pixel_sub_ps_32x32(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
+%if HIGH_BIT_DEPTH
+%macro PIXELSUB_PS_W32_H4_avx2 1
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal pixel_sub_ps_32x%1, 6, 10, 4, dest, deststride, src0, src1, srcstride0, srcstride1
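+ ; pixel strides to byte strides; r9d counts groups of 4 rows, and each
+ ; 32-pixel row is 64 bytes, i.e. two ymm loads per source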
+ add r1d, r1d
+ add r4d, r4d
+ add r5d, r5d
+ mov r9d, %1/4
+ lea r6, [r1 * 3]
+ lea r7, [r4 * 3]
+ lea r8, [r5 * 3]
+
+.loop:
+ movu m0, [r2]
+ movu m1, [r2 + 32]
+ movu m2, [r3]
+ movu m3, [r3 + 32]
+ psubw m0, m2
+ psubw m1, m3
+
+ movu [r0], m0
+ movu [r0 + 32], m1
+
+ movu m0, [r2 + r4]
+ movu m1, [r2 + r4 + 32]
+ movu m2, [r3 + r5]
+ movu m3, [r3 + r5 + 32]
+ psubw m0, m2
+ psubw m1, m3
+
+ movu [r0 + r1], m0
+ movu [r0 + r1 + 32], m1
+
+ movu m0, [r2 + r4 * 2]
+ movu m1, [r2 + r4 * 2 + 32]
+ movu m2, [r3 + r5 * 2]
+ movu m3, [r3 + r5 * 2 + 32]
+ psubw m0, m2
+ psubw m1, m3
+
+ movu [r0 + r1 * 2], m0
+ movu [r0 + r1 * 2 + 32], m1
+
+ movu m0, [r2 + r7]
+ movu m1, [r2 + r7 + 32]
+ movu m2, [r3 + r8]
+ movu m3, [r3 + r8 + 32]
+ psubw m0, m2
+ psubw m1, m3
+
+ movu [r0 + r6], m0
+ movu [r0 + r6 + 32], m1
+
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ dec r9d
+ jnz .loop
+ RET
+%endif
+%endmacro
+PIXELSUB_PS_W32_H4_avx2 32
+PIXELSUB_PS_W32_H4_avx2 64
+%else
+;-----------------------------------------------------------------------------
+; void pixel_sub_ps_32x32(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
+;-----------------------------------------------------------------------------
%macro PIXELSUB_PS_W32_H8_avx2 2
%if ARCH_X86_64
INIT_YMM avx2
@@ -4886,6 +5003,7 @@
PIXELSUB_PS_W32_H8_avx2 32, 32
PIXELSUB_PS_W32_H8_avx2 32, 64
+%endif
;-----------------------------------------------------------------------------
; void pixel_sub_ps_64x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
@@ -5109,6 +5227,102 @@
;-----------------------------------------------------------------------------
; void pixel_sub_ps_64x64(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
+%if HIGH_BIT_DEPTH
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal pixel_sub_ps_64x64, 6, 10, 8, dest, deststride, src0, src1, srcstride0, srcstride1
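+ ; 16 iterations of 4 rows = 64 rows; each 64-pixel row is 128 bytes,
+ ; handled as four 32-byte ymm loads per source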
+ add r1d, r1d
+ add r4d, r4d
+ add r5d, r5d
+ mov r9d, 16
+ lea r6, [r1 * 3]
+ lea r7, [r4 * 3]
+ lea r8, [r5 * 3]
+
+.loop:
+ movu m0, [r2]
+ movu m1, [r2 + 32]
+ movu m2, [r2 + 64]
+ movu m3, [r2 + 96]
+ movu m4, [r3]
+ movu m5, [r3 + 32]
+ movu m6, [r3 + 64]
+ movu m7, [r3 + 96]
+ psubw m0, m4
+ psubw m1, m5
+ psubw m2, m6
+ psubw m3, m7
+
+ movu [r0], m0
+ movu [r0 + 32], m1
+ movu [r0 + 64], m2
+ movu [r0 + 96], m3
+
+ movu m0, [r2 + r4]
+ movu m1, [r2 + r4 + 32]
+ movu m2, [r2 + r4 + 64]
+ movu m3, [r2 + r4 + 96]
+ movu m4, [r3 + r5]
+ movu m5, [r3 + r5 + 32]
+ movu m6, [r3 + r5 + 64]
+ movu m7, [r3 + r5 + 96]
+ psubw m0, m4
+ psubw m1, m5
+ psubw m2, m6
+ psubw m3, m7
+
+ movu [r0 + r1], m0
+ movu [r0 + r1 + 32], m1
+ movu [r0 + r1 + 64], m2
+ movu [r0 + r1 + 96], m3
+
+ movu m0, [r2 + r4 * 2]
+ movu m1, [r2 + r4 * 2 + 32]
+ movu m2, [r2 + r4 * 2 + 64]
+ movu m3, [r2 + r4 * 2 + 96]
+ movu m4, [r3 + r5 * 2]
+ movu m5, [r3 + r5 * 2 + 32]
+ movu m6, [r3 + r5 * 2 + 64]
+ movu m7, [r3 + r5 * 2 + 96]
+ psubw m0, m4
+ psubw m1, m5
+ psubw m2, m6
+ psubw m3, m7
+
+ movu [r0 + r1 * 2], m0
+ movu [r0 + r1 * 2 + 32], m1
+ movu [r0 + r1 * 2 + 64], m2
+ movu [r0 + r1 * 2 + 96], m3
+
+ movu m0, [r2 + r7]
+ movu m1, [r2 + r7 + 32]
+ movu m2, [r2 + r7 + 64]
+ movu m3, [r2 + r7 + 96]
+ movu m4, [r3 + r8]
+ movu m5, [r3 + r8 + 32]
+ movu m6, [r3 + r8 + 64]
+ movu m7, [r3 + r8 + 96]
+ psubw m0, m4
+ psubw m1, m5
+ psubw m2, m6
+ psubw m3, m7
+
+ movu [r0 + r6], m0
+ movu [r0 + r6 + 32], m1
+ movu [r0 + r6 + 64], m2
+ movu [r0 + r6 + 96], m3
+
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ dec r9d
+ jnz .loop
+ RET
+%endif
+%else
+;-----------------------------------------------------------------------------
+; void pixel_sub_ps_64x64(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
+;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal pixel_sub_ps_64x64, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
mov r6d, 16
@@ -5214,6 +5428,7 @@
dec r6d
jnz .loop
RET
+%endif
;=============================================================================
; variance
;=============================================================================