[x265] [PATCH] asm: 10bpp code of pixel_sub for 16xN, 24x32, 32xN, 48x64 and 64xN

murugan at multicorewareinc.com
Fri Dec 6 13:53:54 CET 2013


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1386334392 -19800
#      Fri Dec 06 18:23:12 2013 +0530
# Node ID 9a9eaf4dadd7e191038e56341fac23aad60a10db
# Parent  6ed44381cb6c2fab0bc550170e5bfacc68aaaf4e
asm: 10bpp code of pixel_sub for 16xN, 24x32, 32xN, 48x64 and 64xN
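
For reference, a minimal C sketch of the operation each pixel_sub_ps_WxH kernel
implements (the function name and the explicit width/height parameters below are
illustrative only; the real primitives fix WxH per function). In HIGH_BIT_DEPTH
builds 'pixel' is uint16_t, which is why the 10bpp paths double the source
strides and subtract with psubw directly instead of widening bytes with
pmovzxbw/punpckhbw:

    #include <stdint.h>

    typedef uint16_t pixel;  /* HIGH_BIT_DEPTH (10bpp) build; uint8_t at 8bpp */

    /* Illustrative reference: subtract src1 from src0 and store the 16-bit
     * residual. The asm kernels in this patch are fixed-size WxH instances
     * of this loop. Strides are in element units, as in the C primitives. */
    static void pixel_sub_ps_ref(int16_t *dest, intptr_t destStride,
                                 const pixel *src0, const pixel *src1,
                                 intptr_t srcStride0, intptr_t srcStride1,
                                 int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dest[x] = (int16_t)(src0[x] - src1[x]);

            dest += destStride;
            src0 += srcStride0;
            src1 += srcStride1;
        }
    }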

diff -r 6ed44381cb6c -r 9a9eaf4dadd7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Dec 05 16:05:53 2013 -0600
+++ b/source/common/x86/asm-primitives.cpp	Fri Dec 06 18:23:12 2013 +0530
@@ -300,9 +300,11 @@
     p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \
     p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu; \
     p.luma_copy_ps[LUMA_ ## W ## x ## H] = x265_blockcopy_ps_ ## W ## x ## H ## cpu; \
-    p.luma_sub_ps[LUMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
     p.luma_add_ps[LUMA_ ## W ## x ## H] = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
 
+#define SETUP_LUMA_SUB_FUNC_DEF(W, H, cpu) \
+    p.luma_sub_ps[LUMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu;
+
 #define SETUP_LUMA_SP_FUNC_DEF(W, H, cpu) \
     p.luma_vsp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu;
 
@@ -398,6 +400,33 @@
     SETUP_LUMA_FUNC_DEF(64, 16, cpu); \
     SETUP_LUMA_FUNC_DEF(16, 64, cpu);
 
+#define LUMA_PIXELSUB(cpu) \
+    SETUP_LUMA_SUB_FUNC_DEF(4,   4, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(8,   8, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(8,   4, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(4,   8, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(16, 16, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(16,  8, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(8,  16, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(16, 12, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(12, 16, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(16,  4, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(4,  16, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(32, 32, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(32, 16, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(16, 32, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(32, 24, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(24, 32, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(32,  8, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(8,  32, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(64, 64, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(64, 32, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(32, 64, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(64, 48, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(48, 64, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(64, 16, cpu); \
+    SETUP_LUMA_SUB_FUNC_DEF(16, 64, cpu);
+
 #define LUMA_SP_FILTERS(cpu) \
     SETUP_LUMA_SP_FUNC_DEF(4,   4, cpu); \
     SETUP_LUMA_SP_FUNC_DEF(8,   8, cpu); \
@@ -632,20 +661,8 @@
         p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
         p.cvt16to32_shl = x265_cvt16to32_shl_sse2;
 
-        p.chroma[X265_CSP_I420].sub_ps[LUMA_4x8] = x265_pixel_sub_ps_2x4_sse2;
-        p.chroma[X265_CSP_I420].sub_ps[LUMA_4x16] = x265_pixel_sub_ps_2x8_sse2;
-        p.chroma[X265_CSP_I420].sub_ps[LUMA_8x4] = x265_pixel_sub_ps_4x2_sse2;
-        p.chroma[X265_CSP_I420].sub_ps[LUMA_8x8] = x265_pixel_sub_ps_4x4_sse2;
-        p.chroma[X265_CSP_I420].sub_ps[LUMA_8x16] = x265_pixel_sub_ps_4x8_sse2;
-        p.chroma[X265_CSP_I420].sub_ps[LUMA_8x32] = x265_pixel_sub_ps_4x16_sse2;
-        p.chroma[X265_CSP_I420].sub_ps[LUMA_12x16] = x265_pixel_sub_ps_6x8_sse2;
-        p.chroma[X265_CSP_I420].sub_ps[LUMA_16x4] = x265_pixel_sub_ps_8x2_sse2;
-        p.chroma[X265_CSP_I420].sub_ps[LUMA_16x8] = x265_pixel_sub_ps_8x4_sse2;
-        p.chroma[X265_CSP_I420].sub_ps[LUMA_16x12] = x265_pixel_sub_ps_8x6_sse2;
-        p.chroma[X265_CSP_I420].sub_ps[LUMA_16x16] = x265_pixel_sub_ps_8x8_sse2;
-        p.chroma[X265_CSP_I420].sub_ps[LUMA_16x32] = x265_pixel_sub_ps_8x16_sse2;
-        p.chroma[X265_CSP_I420].sub_ps[LUMA_16x64] = x265_pixel_sub_ps_8x32_sse2;
-        p.chroma[X265_CSP_I420].sub_ps[LUMA_24x32] = x265_pixel_sub_ps_12x16_sse2;
+        CHROMA_PIXELSUB_PS(_sse2);
+        LUMA_PIXELSUB(_sse2);
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
@@ -843,6 +860,7 @@
         LUMA_SSE_SP(_sse4);
 
         CHROMA_PIXELSUB_PS(_sse4);
+        LUMA_PIXELSUB(_sse4);
 
         CHROMA_FILTERS(_sse4);
         LUMA_FILTERS(_sse4);
diff -r 6ed44381cb6c -r 9a9eaf4dadd7 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Thu Dec 05 16:05:53 2013 -0600
+++ b/source/common/x86/pixel-util.h	Fri Dec 06 18:23:12 2013 +0530
@@ -122,6 +122,7 @@
 CHROMA_PIXELSUB_DEF(_sse4);
 LUMA_PIXELSUB_DEF(_sse4);
 CHROMA_PIXELSUB_DEF(_sse2);
+LUMA_PIXELSUB_DEF(_sse2);
 
 #define SETUP_LUMA_PIXELVAR_FUNC(W, H, cpu) \
     uint64_t x265_pixel_var_ ## W ## x ## H ## cpu(pixel * pix, intptr_t pixstride);
diff -r 6ed44381cb6c -r 9a9eaf4dadd7 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Thu Dec 05 16:05:53 2013 -0600
+++ b/source/common/x86/pixel-util8.asm	Fri Dec 06 18:23:12 2013 +0530
@@ -2937,22 +2937,63 @@
 ; void pixel_sub_ps_c_%1x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
 ;-----------------------------------------------------------------------------
 %macro PIXELSUB_PS_W16_H4 2
-INIT_XMM sse4
+%if HIGH_BIT_DEPTH
+cglobal pixel_sub_ps_%1x%2, 6, 7, 6, dest, deststride, src0, src1, srcstride0, srcstride1
+    add    r1,     r1
+    mov    r6d,    %2/4
+    add     r4,    r4
+    add     r5,    r5
+.loop
+    movu     m0,    [r2]
+    movu     m1,    [r3]
+    movu     m2,    [r2 + 16]
+    movu     m3,    [r3 + 16]
+    psubw    m0,    m1
+    psubw    m2,    m3
+    movu     m4,    [r2 + r4]
+    movu     m5,    [r3 + r5]
+    movu     m1,    [r2 + r4 + 16]
+    movu     m3,    [r3 + r5 + 16]
+    psubw    m4,    m5
+    psubw    m1,    m3
+    lea      r2,    [r2 + 2 * r4]
+    lea      r3,    [r3 + 2 * r5]
+
+    movu    [r0],              m0
+    movu    [r0 + 16],         m2
+    movu    [r0 + r1],         m4
+    movu    [r0 + r1 + 16],    m1
+
+    movu     m0,    [r2]
+    movu     m1,    [r3]
+    movu     m2,    [r2 + 16]
+    movu     m3,    [r3 + 16]
+    psubw    m0,    m1
+    psubw    m2,    m3
+    movu     m4,    [r2 + r4]
+    movu     m5,    [r3 + r5]
+    movu     m1,    [r2 + r4 + 16]
+    movu     m3,    [r3 + r5 + 16]
+    psubw    m4,    m5
+    psubw    m1,    m3
+    lea     r0,     [r0 + 2 * r1]
+
+    movu    [r0],              m0
+    movu    [r0 + 16],         m2
+    movu    [r0 + r1],         m4
+    movu    [r0 + r1 + 16],    m1
+%else
 cglobal pixel_sub_ps_%1x%2, 6, 7, 7, dest, deststride, src0, src1, srcstride0, srcstride1
-
-add    r1,     r1
-mov    r6d,    %2/4
-pxor   m6,     m6
-
+    add    r1,     r1
+    mov    r6d,    %2/4
+    pxor   m6,     m6
 .loop
-
     movu         m1,    [r2]
     pmovzxbw     m0,    m1
     punpckhbw    m1,    m6
     movu         m3,    [r3]
     pmovzxbw     m2,    m3
     punpckhbw    m3,    m6
-
     psubw        m0,    m2
     psubw        m1,    m3
 
@@ -2962,7 +3003,6 @@
     movu         m3,    [r3 + r5]
     pmovzxbw     m2,    m3
     punpckhbw    m3,    m6
-
     psubw        m4,    m2
     psubw        m5,    m3
 
@@ -2977,62 +3017,96 @@
     movu         m3,    [r3 + 2 * r5]
     pmovzxbw     m2,    m3
     punpckhbw    m3,    m6
-
     lea          r2,    [r2 + 2 * r4]
     lea          r3,    [r3 + 2 * r5]
-
     psubw        m0,    m2
     psubw        m1,    m3
-
     movu         m5,    [r2 + r4]
     pmovzxbw     m4,    m5
     punpckhbw    m5,    m6
     movu         m3,    [r3 + r5]
     pmovzxbw     m2,    m3
     punpckhbw    m3,    m6
-
     psubw        m4,    m2
     psubw        m5,    m3
 
     movu    [r0 + 2 * r1],         m0
     movu    [r0 + 2 * r1 + 16],    m1
-
     lea     r0,                    [r0 + 2 * r1]
-
     movu    [r0 + r1],             m4
     movu    [r0 + r1 + 16],        m5
-
-    lea     r2,                    [r2 + 2 * r4]
-    lea     r3,                    [r3 + 2 * r5]
-    lea     r0,                    [r0 + 2 * r1]
-
+%endif
     dec    r6d
-
-jnz    .loop
+    lea    r2,    [r2 + 2 * r4]
+    lea    r3,    [r3 + 2 * r5]
+    lea    r0,    [r0 + 2 * r1]
+    jnz    .loop
 
 RET
 %endmacro
 
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
 PIXELSUB_PS_W16_H4 16, 4
 PIXELSUB_PS_W16_H4 16, 8
 PIXELSUB_PS_W16_H4 16, 12
 PIXELSUB_PS_W16_H4 16, 16
 PIXELSUB_PS_W16_H4 16, 32
 PIXELSUB_PS_W16_H4 16, 64
+%else
+INIT_XMM sse4
+PIXELSUB_PS_W16_H4 16, 4
+PIXELSUB_PS_W16_H4 16, 8
+PIXELSUB_PS_W16_H4 16, 12
+PIXELSUB_PS_W16_H4 16, 16
+PIXELSUB_PS_W16_H4 16, 32
+PIXELSUB_PS_W16_H4 16, 64
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void pixel_sub_ps_c_%1x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
 ;-----------------------------------------------------------------------------
 %macro PIXELSUB_PS_W24_H2 2
-INIT_XMM sse4
+%if HIGH_BIT_DEPTH
+cglobal pixel_sub_ps_%1x%2, 6, 7, 6, dest, deststride, src0, src1, srcstride0, srcstride1
+    add     r1,     r1
+    mov     r6d,    %2/2
+    add     r4,     r4
+    add     r5,     r5
+.loop
+    movu     m0,    [r2]
+    movu     m1,    [r3]
+    movu     m2,    [r2 + 16]
+    movu     m3,    [r3 + 16]
+    movu     m4,    [r2 + 32]
+    movu     m5,    [r3 + 32]
+    psubw    m0,    m1
+    psubw    m2,    m3
+    psubw    m4,    m5
+
+    movu    [r0],              m0
+    movu    [r0 + 16],         m2
+    movu    [r0 + 32],         m4
+
+    movu     m0,    [r2 + r4]
+    movu     m1,    [r3 + r5]
+    movu     m2,    [r2 + r4 + 16]
+    movu     m3,    [r3 + r5 + 16]
+    movu     m4,    [r2 + r4 + 32]
+    movu     m5,    [r3 + r5 + 32]
+    psubw    m0,    m1
+    psubw    m2,    m3
+    psubw    m4,    m5
+
+    movu    [r0 + r1],         m0
+    movu    [r0 + r1 + 16],    m2
+    movu    [r0 + r1 + 32],    m4
+%else
 cglobal pixel_sub_ps_%1x%2, 6, 7, 7, dest, deststride, src0, src1, srcstride0, srcstride1
-
-add    r1,     r1
-mov    r6d,    %2/2
-pxor   m6,     m6
-
+    add    r1,     r1
+    mov    r6d,    %2/2
+    pxor   m6,     m6
 .loop
-
     movu         m1,    [r2]
     pmovzxbw     m0,    m1
     punpckhbw    m1,    m6
@@ -3043,7 +3117,6 @@
     punpckhbw    m4,    m6
     movh         m5,    [r3 + 16]
     pmovzxbw     m5,    m5
-
     psubw        m0,    m3
     psubw        m1,    m4
     psubw        m2,    m5
@@ -3062,7 +3135,6 @@
     punpckhbw    m4,    m6
     movh         m5,    [r3 + r5 + 16]
     pmovzxbw     m5,    m5
-
     psubw        m0,    m3
     psubw        m1,    m4
     psubw        m2,    m5
@@ -3070,54 +3142,93 @@
     movu    [r0 + r1],         m0
     movu    [r0 + r1 + 16],    m1
     movu    [r0 + r1 + 32],    m2
-
+%endif
+
+    dec    r6d
     lea    r2,    [r2 + 2 * r4]
     lea    r3,    [r3 + 2 * r5]
     lea    r0,    [r0 + 2 * r1]
-
-    dec    r6d
-
-jnz    .loop
+    jnz    .loop
 
 RET
 %endmacro
 
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
 PIXELSUB_PS_W24_H2 24, 32
+%else
+INIT_XMM sse4
+PIXELSUB_PS_W24_H2 24, 32
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void pixel_sub_ps_c_%1x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
 ;-----------------------------------------------------------------------------
 %macro PIXELSUB_PS_W32_H2 2
-INIT_XMM sse4
-cglobal pixel_sub_ps_%1x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
-
-add    r1,     r1
-mov    r6d,    %2/2
-
+cglobal pixel_sub_ps_%1x%2, 6, 7, 6, dest, deststride, src0, src1, srcstride0, srcstride1
+    add     r1,     r1
+    mov     r6d,    %2/2
+%if HIGH_BIT_DEPTH
+    add     r4,     r4
+    add     r5,     r5
 .loop
-
+    movu     m0,    [r2]
+    movu     m1,    [r3]
+    movu     m2,    [r2 + 16]
+    movu     m3,    [r3 + 16]
+    movu     m4,    [r2 + 32]
+    movu     m5,    [r3 + 32]
+    psubw    m0,    m1
+    psubw    m2,    m3
+    psubw    m4,    m5
+    movu     m3,    [r2 + 48]
+    movu     m5,    [r3 + 48]
+    psubw    m3,    m5
+
+    movu    [r0],         m0
+    movu    [r0 + 16],    m2
+    movu    [r0 + 32],    m4
+    movu    [r0 + 48],    m3
+
+    movu     m0,    [r2 + r4]
+    movu     m1,    [r3 + r5]
+    movu     m2,    [r2 + r4 + 16]
+    movu     m3,    [r3 + r5 + 16]
+    movu     m4,    [r2 + r4 + 32]
+    movu     m5,    [r3 + r5 + 32]
+    psubw    m0,    m1
+    psubw    m2,    m3
+    psubw    m4,    m5
+    movu     m3,    [r2 + r4 + 48]
+    movu     m5,    [r3 + r5 + 48]
+    psubw    m3,    m5
+
+    movu    [r0 + r1],         m0
+    movu    [r0 + r1 + 16],    m2
+    movu    [r0 + r1 + 32],    m4
+    movu    [r0 + r1 + 48],    m3
+%else
+.loop
     movh        m0,    [r2]
     movh        m1,    [r2 + 8]
     movh        m2,    [r2 + 16]
-    movh        m3,    [r2 + 24]
-    movh        m4,    [r3]
-    movh        m5,    [r3 + 8]
-    movh        m6,    [r3 + 16]
-    movh        m7,    [r3 + 24]
-
+    movh        m3,    [r3]
+    movh        m4,    [r3 + 8]
+    movh        m5,    [r3 + 16]
     pmovzxbw    m0,    m0
     pmovzxbw    m1,    m1
     pmovzxbw    m2,    m2
     pmovzxbw    m3,    m3
     pmovzxbw    m4,    m4
     pmovzxbw    m5,    m5
-    pmovzxbw    m6,    m6
-    pmovzxbw    m7,    m7
-
-    psubw       m0,    m4
-    psubw       m1,    m5
-    psubw       m2,    m6
-    psubw       m3,    m7
+    psubw       m0,    m3
+    psubw       m1,    m4
+    psubw       m2,    m5
+    movh        m3,    [r2 + 24]
+    movh        m4,    [r3 + 24]
+    pmovzxbw    m4,    m4
+    pmovzxbw    m3,    m3
+    psubw       m3,    m4
 
     movu    [r0],         m0
     movu    [r0 + 16],    m1
@@ -3127,61 +3238,126 @@
     movh        m0,    [r2 + r4]
     movh        m1,    [r2 + r4 + 8]
     movh        m2,    [r2 + r4 + 16]
-    movh        m3,    [r2 + r4 + 24]
-    movh        m4,    [r3 + r5]
-    movh        m5,    [r3 + r5 + 8]
-    movh        m6,    [r3 + r5 + 16]
-    movh        m7,    [r3 + r5 + 24]
-
+    movh        m3,    [r3 + r5]
+    movh        m4,    [r3 + r5 + 8]
+    movh        m5,    [r3 + r5 + 16]
     pmovzxbw    m0,    m0
     pmovzxbw    m1,    m1
     pmovzxbw    m2,    m2
     pmovzxbw    m3,    m3
     pmovzxbw    m4,    m4
     pmovzxbw    m5,    m5
-    pmovzxbw    m6,    m6
-    pmovzxbw    m7,    m7
-
-    psubw       m0,    m4
-    psubw       m1,    m5
-    psubw       m2,    m6
-    psubw       m3,    m7
+    psubw       m0,    m3
+    psubw       m1,    m4
+    psubw       m2,    m5
+    movh        m3,    [r2 + r4 + 24]
+    movh        m4,    [r3 + r5 + 24]
+    pmovzxbw    m3,    m3
+    pmovzxbw    m4,    m4
+    psubw       m3,    m4
 
     movu    [r0 + r1],         m0
     movu    [r0 + r1 + 16],    m1
     movu    [r0 + r1 + 32],    m2
     movu    [r0 + r1 + 48],    m3
-
+%endif
+    dec    r6d
     lea    r2,    [r2 + 2 * r4]
     lea    r3,    [r3 + 2 * r5]
     lea    r0,    [r0 + 2 * r1]
-
-    dec    r6d
-
-jnz    .loop
-
-RET
+    jnz    .loop
+    RET
 %endmacro
 
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
 PIXELSUB_PS_W32_H2 32, 8
 PIXELSUB_PS_W32_H2 32, 16
 PIXELSUB_PS_W32_H2 32, 24
 PIXELSUB_PS_W32_H2 32, 32
 PIXELSUB_PS_W32_H2 32, 64
+%else
+INIT_XMM sse4
+PIXELSUB_PS_W32_H2 32, 8
+PIXELSUB_PS_W32_H2 32, 16
+PIXELSUB_PS_W32_H2 32, 24
+PIXELSUB_PS_W32_H2 32, 32
+PIXELSUB_PS_W32_H2 32, 64
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void pixel_sub_ps_c_%1x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
 ;-----------------------------------------------------------------------------
 %macro PIXELSUB_PS_W48_H2 2
-INIT_XMM sse4
+%if HIGH_BIT_DEPTH
+cglobal pixel_sub_ps_%1x%2, 6, 7, 6, dest, deststride, src0, src1, srcstride0, srcstride1
+    add     r1,     r1
+    mov     r6d,    %2/2
+    add     r4,     r4
+    add     r5,     r5
+.loop
+    movu     m0,    [r2]
+    movu     m1,    [r3]
+    movu     m2,    [r2 + 16]
+    movu     m3,    [r3 + 16]
+    movu     m4,    [r2 + 32]
+    movu     m5,    [r3 + 32]
+    psubw    m0,    m1
+    psubw    m2,    m3
+    psubw    m4,    m5
+
+    movu    [r0],         m0
+    movu    [r0 + 16],    m2
+    movu    [r0 + 32],    m4
+
+    movu     m0,    [r2 + 48]
+    movu     m1,    [r3 + 48]
+    movu     m2,    [r2 + 64]
+    movu     m3,    [r3 + 64]
+    movu     m4,    [r2 + 80]
+    movu     m5,    [r3 + 80]
+    psubw    m0,    m1
+    psubw    m2,    m3
+    psubw    m4,    m5
+
+    movu    [r0 + 48],    m0
+    movu    [r0 + 64],    m2
+    movu    [r0 + 80],    m4
+
+    movu     m0,    [r2 + r4]
+    movu     m1,    [r3 + r5]
+    movu     m2,    [r2 + r4 + 16]
+    movu     m3,    [r3 + r5 + 16]
+    movu     m4,    [r2 + r4 + 32]
+    movu     m5,    [r3 + r5 + 32]
+    psubw    m0,    m1
+    psubw    m2,    m3
+    psubw    m4,    m5
+
+    movu    [r0 + r1],         m0
+    movu    [r0 + r1 + 16],    m2
+    movu    [r0 + r1 + 32],    m4
+
+    movu     m0,    [r2 + r4 + 48]
+    movu     m1,    [r3 + r5 + 48]
+    movu     m2,    [r2 + r4 + 64]
+    movu     m3,    [r3 + r5 + 64]
+    movu     m4,    [r2 + r4 + 80]
+    movu     m5,    [r3 + r5 + 80]
+    psubw    m0,    m1
+    psubw    m2,    m3
+    psubw    m4,    m5
+
+    movu    [r0 + r1 + 48],    m0
+    movu    [r0 + r1 + 64],    m2
+    movu    [r0 + r1 + 80],    m4
+%else
+
 cglobal pixel_sub_ps_%1x%2, 6, 7, 7, dest, deststride, src0, src1, srcstride0, srcstride1
-
-add    r1,     r1
-mov    r6d,    %2/2
-pxor   m6,    m6
-
+    add     r1,     r1
+    mov     r6d,    %2/2
+    pxor    m6,     m6
 .loop
-
     movu         m1,    [r2]
     pmovzxbw     m0,    m1
     punpckhbw    m1,    m6
@@ -3191,7 +3367,6 @@
     movu         m5,    [r2 + 16]
     pmovzxbw     m4,    m5
     punpckhbw    m5,    m6
-
     psubw        m0,    m2
     psubw        m1,    m3
 
@@ -3201,7 +3376,6 @@
     movu         m3,    [r3 + 16]
     pmovzxbw     m2,    m3
     punpckhbw    m3,    m6
-
     psubw        m4,    m2
     psubw        m5,    m3
 
@@ -3214,7 +3388,6 @@
     movu         m3,    [r3 + 32]
     pmovzxbw     m2,    m3
     punpckhbw    m3,    m6
-
     psubw        m0,    m2
     psubw        m1,    m3
 
@@ -3230,7 +3403,6 @@
     movu         m5,    [r2 + r5 + 16]
     pmovzxbw     m4,    m5
     punpckhbw    m5,    m6
-
     psubw        m0,    m2
     psubw        m1,    m3
 
@@ -3240,7 +3412,6 @@
     movu         m3,    [r3 + r4 + 16]
     pmovzxbw     m2,    m3
     punpckhbw    m3,    m6
-
     psubw        m4,    m2
     psubw        m5,    m3
 
@@ -3253,39 +3424,119 @@
     movu         m3,    [r3 + r5 + 32]
     pmovzxbw     m2,    m3
     punpckhbw    m3,    m6
-
     psubw        m0,    m2
     psubw        m1,    m3
 
     movu    [r0 + r1 + 64],    m0
     movu    [r0 + r1 + 80],    m1
-
-    lea     r2,                [r2 + 2 * r4]
-    lea     r3,                [r3 + 2 * r5]
-    lea     r0,                [r0 + 2 * r1]
-
+%endif
     dec    r6d
-
-jnz    .loop
+    lea    r2,    [r2 + 2 * r4]
+    lea    r3,    [r3 + 2 * r5]
+    lea    r0,    [r0 + 2 * r1]
+    jnz    .loop
 
 RET
 %endmacro
 
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
 PIXELSUB_PS_W48_H2 48, 64
+%else
+INIT_XMM sse4
+PIXELSUB_PS_W48_H2 48, 64
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void pixel_sub_ps_c_%1x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
 ;-----------------------------------------------------------------------------
 %macro PIXELSUB_PS_W64_H2 2
-INIT_XMM sse4
+%if HIGH_BIT_DEPTH
+cglobal pixel_sub_ps_%1x%2, 6, 7, 6, dest, deststride, src0, src1, srcstride0, srcstride1
+    add     r1,     r1
+    mov     r6d,    %2/2
+    add     r4,     r4
+    add     r5,     r5
+.loop
+    movu     m0,    [r2]
+    movu     m1,    [r3]
+    movu     m2,    [r2 + 16]
+    movu     m3,    [r3 + 16]
+    movu     m4,    [r2 + 32]
+    movu     m5,    [r3 + 32]
+    psubw    m0,    m1
+    psubw    m2,    m3
+    psubw    m4,    m5
+    movu     m3,    [r2 + 48]
+    movu     m5,    [r3 + 48]
+    psubw    m3,    m5
+
+    movu    [r0],         m0
+    movu    [r0 + 16],    m2
+    movu    [r0 + 32],    m4
+    movu    [r0 + 48],    m3
+
+    movu     m0,    [r2 + 64]
+    movu     m1,    [r3 + 64]
+    movu     m2,    [r2 + 80]
+    movu     m3,    [r3 + 80]
+    movu     m4,    [r2 + 96]
+    movu     m5,    [r3 + 96]
+    psubw    m0,    m1
+    psubw    m2,    m3
+    psubw    m4,    m5
+    movu     m3,    [r2 + 112]
+    movu     m5,    [r3 + 112]
+    psubw    m3,    m5
+
+    movu    [r0 + 64],     m0
+    movu    [r0 + 80],     m2
+    movu    [r0 + 96],     m4
+    movu    [r0 + 112],    m3
+
+    movu     m0,    [r2 + r4]
+    movu     m1,    [r3 + r5]
+    movu     m2,    [r2 + r4 + 16]
+    movu     m3,    [r3 + r5 + 16]
+    movu     m4,    [r2 + r4 + 32]
+    movu     m5,    [r3 + r5 + 32]
+    psubw    m0,    m1
+    psubw    m2,    m3
+    psubw    m4,    m5
+    movu     m3,    [r2 + r4 + 48]
+    movu     m5,    [r3 + r5 + 48]
+    psubw    m3,    m5
+
+    movu    [r0 + r1],         m0
+    movu    [r0 + r1 + 16],    m2
+    movu    [r0 + r1 + 32],    m4
+    movu    [r0 + r1 + 48],    m3
+
+    movu     m0,    [r2 + r4 + 64]
+    movu     m1,    [r3 + r5 + 64]
+    movu     m2,    [r2 + r4 + 80]
+    movu     m3,    [r3 + r5 + 80]
+    movu     m4,    [r2 + r4 + 96]
+    movu     m5,    [r3 + r5 + 96]
+    psubw    m0,    m1
+    psubw    m2,    m3
+    psubw    m4,    m5
+    movu     m3,    [r2 + r4 + 112]
+    movu     m5,    [r3 + r5 + 112]
+    psubw    m3,    m5
+
+    movu    [r0 + r1 + 64],     m0
+    movu    [r0 + r1 + 80],     m2
+    movu    [r0 + r1 + 96],     m4
+    movu    [r0 + r1 + 112],    m3
+
+%else
+
 cglobal pixel_sub_ps_%1x%2, 6, 7, 7, dest, deststride, src0, src1, srcstride0, srcstride1
-
-add    r1,     r1
-mov    r6d,    %2/2
-pxor   m6,    m6
-
+    add     r1,     r1
+    mov     r6d,    %2/2
+    pxor    m6,     m6
 .loop
-
     movu         m1,    [r2]
     pmovzxbw     m0,    m1
     punpckhbw    m1,    m6
@@ -3295,7 +3546,6 @@
     movu         m5,    [r2 + 16]
     pmovzxbw     m4,    m5
     punpckhbw    m5,    m6
-
     psubw        m0,    m2
     psubw        m1,    m3
 
@@ -3308,7 +3558,6 @@
     movu         m3,    [r2 + 32]
     pmovzxbw     m2,    m3
     punpckhbw    m3,    m6
-
     psubw        m4,    m0
     psubw        m5,    m1
 
@@ -3321,7 +3570,6 @@
     movu         m1,    [r2 + 48]
     pmovzxbw     m0,    m1
     punpckhbw    m1,    m6
-
     psubw        m2,    m4
     psubw        m3,    m5
 
@@ -3334,7 +3582,6 @@
     movu         m5,    [r2 + r4]
     pmovzxbw     m4,    m5
     punpckhbw    m5,    m6
-
     psubw        m0,    m2
     psubw        m1,    m3
 
@@ -3347,7 +3594,6 @@
     movu         m3,    [r2 + r4 + 16]
     pmovzxbw     m2,    m3
     punpckhbw    m3,    m6
-
     psubw        m4,    m0
     psubw        m5,    m1
 
@@ -3360,7 +3606,6 @@
     movu         m1,    [r2 + r4 + 32]
     pmovzxbw     m0,    m1
     punpckhbw    m1,    m6
-
     psubw        m2,    m4
     psubw        m3,    m5
 
@@ -3373,7 +3618,6 @@
     movu         m5,    [r2 + r4 + 48]
     pmovzxbw     m4,    m5
     punpckhbw    m5,    m6
-
     psubw        m0,    m2
     psubw        m1,    m3
 
@@ -3383,28 +3627,33 @@
     movu         m1,    [r3 + r5 + 48]
     pmovzxbw     m0,    m1
     punpckhbw    m1,    m6
-
     psubw        m4,    m0
     psubw        m5,    m1
 
     movu    [r0 + r1 + 96],     m4
     movu    [r0 + r1 + 112],    m5
-
-    lea     r2,                 [r2 + 2 * r4]
-    lea     r3,                 [r3 + 2 * r5]
-    lea     r0,                 [r0 + 2 * r1]
-
+%endif
     dec    r6d
-
-jnz    .loop
-
-RET
+    lea    r2,    [r2 + 2 * r4]
+    lea    r3,    [r3 + 2 * r5]
+    lea    r0,    [r0 + 2 * r1]
+    jnz    .loop
+    RET
 %endmacro
 
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
 PIXELSUB_PS_W64_H2 64, 16
 PIXELSUB_PS_W64_H2 64, 32
 PIXELSUB_PS_W64_H2 64, 48
 PIXELSUB_PS_W64_H2 64, 64
+%else
+INIT_XMM sse4
+PIXELSUB_PS_W64_H2 64, 16
+PIXELSUB_PS_W64_H2 64, 32
+PIXELSUB_PS_W64_H2 64, 48
+PIXELSUB_PS_W64_H2 64, 64
+%endif
 
 ;=============================================================================
 ; variance

