[x265] [PATCH] asm: code for addAvg luma and chroma all sizes

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Sat Jan 18 08:39:24 CET 2014


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1390030530 -19800
#      Sat Jan 18 13:05:30 2014 +0530
# Node ID 806eb643b88cfef7752f45ee02c9d7f3d9bf2b27
# Parent  2e4e32a50cd6d80dbcf96338c934e5e5e1f05be5
asm: code for addAvg luma and chroma all sizes
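
For reviewers, a scalar model of what these kernels compute. This is a
sketch, not part of the patch: it assumes an 8-bit build (pixel == uint8_t),
x265's biased 14-bit intermediates (IF_INTERNAL_OFFS == 8192), and strides
given in element units; the explicit clamp models the saturating packuswb.

    #include <stdint.h>

    /* dst = clip((src0 + src1 + 2*IF_INTERNAL_OFFS + 64) >> 7);
     * 0x4040 == 2*8192 + 64 is the OFFSET constant the asm adds. */
    static void addAvg_ref(uint8_t *dst, intptr_t dstStride,
                           const int16_t *src0, intptr_t src0Stride,
                           const int16_t *src1, intptr_t src1Stride,
                           int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                int v = (src0[x] + src1[x] + 0x4040) >> 7;
                dst[x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
            }
            dst  += dstStride;
            src0 += src0Stride;
            src1 += src1Stride;
        }
    }

Only the 2xN and 6x8 variants need SSE4.1, and only for the pextrw
store-to-memory form; the arithmetic itself is plain SSE2.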

diff -r 2e4e32a50cd6 -r 806eb643b88c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Sat Jan 18 12:01:44 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp	Sat Jan 18 13:05:30 2014 +0530
@@ -547,6 +547,62 @@
     SETUP_PIXEL_SSE_SP_DEF(64, 16, cpu); \
     SETUP_PIXEL_SSE_SP_DEF(16, 64, cpu);
 
+#define SETUP_LUMA_ADDAVG_FUNC_DEF(W, H, cpu) \
+    p.luma_addAvg[LUMA_## W ## x ## H] = x265_addAvg_## W ## x ## H ## cpu;
+
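+// For example, SETUP_LUMA_ADDAVG_FUNC_DEF(4, 4, _sse2) expands to:
+//     p.luma_addAvg[LUMA_4x4] = x265_addAvg_4x4_sse2;
+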
+#define LUMA_ADDAVG(cpu) \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(4,  4,  cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(4,  8,  cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(4,  16, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(8,  4,  cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(8,  8,  cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(8,  16, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(8,  32, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(12, 16, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 4,  cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 8,  cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 12, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 16, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 32, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 64, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(24, 32, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 8,  cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 16, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 24, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 32, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 64, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(48, 64, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(64, 16, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(64, 32, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(64, 48, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(64, 64, cpu);
+
+#define SETUP_CHROMA_ADDAVG_FUNC_DEF(W, H, cpu) \
+    p.chroma_addAvg[CHROMA_## W ## x ## H] = x265_addAvg_## W ## x ## H ## cpu;
+
+#define CHROMA_ADDAVG(cpu) \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(4,  2,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(4,  4,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(4,  8,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(4,  16, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  2,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  4,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  6,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  8,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  16, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  32, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(12, 16, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 4,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 8,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 12, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 16, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 32, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(24, 32, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 8,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 16, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 24, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 32, cpu);
+
 #define SETUP_INTRA_ANG4(mode, fno, cpu) \
     p.intra_pred[BLOCK_4x4][mode] = x265_intra_pred_ang4_ ## fno ## _ ## cpu;
 #define SETUP_INTRA_ANG8(mode, fno, cpu) \
@@ -883,6 +939,9 @@
         p.dct[DCT_4x4] = x265_dct4_sse2;
         p.idct[IDCT_4x4] = x265_idct4_sse2;
         p.idct[IDST_4x4] = x265_idst4_sse2;
+
+        LUMA_ADDAVG(_sse2);
+        CHROMA_ADDAVG(_sse2);
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
@@ -1012,6 +1071,10 @@
         SETUP_INTRA_ANG4(33, 3, sse4);
 
         p.dct[DCT_8x8] = x265_dct8_sse4;
+
+        p.chroma_addAvg[CHROMA_2x4]  = x265_addAvg_2x4_sse4;
+        p.chroma_addAvg[CHROMA_2x8]  = x265_addAvg_2x8_sse4;
+        p.chroma_addAvg[CHROMA_6x8]  = x265_addAvg_6x8_sse4;
     }
     if (cpuMask & X265_CPU_AVX)
     {
diff -r 2e4e32a50cd6 -r 806eb643b88c source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Sat Jan 18 12:01:44 2014 +0530
+++ b/source/common/x86/mc-a.asm	Sat Jan 18 13:05:30 2014 +0530
@@ -39,6 +39,9 @@
              times 8 db 2
              times 8 db 4
              times 8 db 6
+
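+; addAvg rounding/bias constant: 4040h = 2 * 8192 + 64, i.e. the two
+; -IF_INTERNAL_OFFS biases carried by the int16 inputs plus the rounding
+; term for the final arithmetic shift right by 7 (8-bit build)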
+OFFSET:      times 8 dw 4040h
+
 sq_1: times 1 dq 1
 
 SECTION .text
@@ -56,6 +59,1008 @@
 cextern pd_32
 cextern deinterleave_shufd
 
+;=================================================================================
+;void addAvg (pixel *dst, intptr_t dstStride, int16_t* src0, intptr_t src0Stride,
+;                                             int16_t* src1, intptr_t src1Stride)
+;=================================================================================
+; r0 = dst,  r1 = dstStride
+; r2 = src0, r3 = src0Stride
+; r4 = src1, r5 = src1Stride
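+;
+; Note: strides are element counts. The sources are int16_t, so one source
+; row is 2*stride bytes (hence the "2 * r3" / "2 * r5" addressing below),
+; while dst holds byte pixels and r1 is used directly.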
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_2x4, 6,6,7, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+    mova          m0,          [OFFSET]
+
+    movd          m1,          [r2]
+    movd          m2,          [r2 + 2 * r3]
+    movd          m3,          [r4]
+    movd          m4,          [r4 + 2 * r5]
+
+    punpckldq     m1,          m2
+    punpckldq     m3,          m4
+
+    lea           r2,          [r2 + 4 * r3]
+    lea           r4,          [r4 + 4 * r5]
+
+    movd          m2,          [r2]
+    movd          m4,          [r2 + 2 * r3]
+    movd          m5,          [r4]
+    movd          m6,          [r4 + 2 * r5]
+
+    punpckldq     m2,          m4
+    punpckldq     m5,          m6
+    punpcklqdq    m1,          m2
+    punpcklqdq    m3,          m5
+
+    paddw         m1,          m3
+    paddw         m1,          m0
+    psraw         m1,          7
+    packuswb      m1,          m1
+
+    pextrw        [r0],        m1, 0
+    pextrw        [r0 + r1],   m1, 1
+    lea           r0,          [r0 + 2 * r1]
+    pextrw        [r0],        m1, 2
+    pextrw        [r0 + r1],   m1, 3
+
+    RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_2x8, 6,6,7, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+    mova          m0,          [OFFSET]
+
+    movd          m1,          [r2]
+    movd          m2,          [r2 + 2 * r3]
+    movd          m3,          [r4]
+    movd          m4,          [r4 + 2 * r5]
+
+    punpckldq     m1,          m2
+    punpckldq     m3,          m4
+
+    lea           r2,          [r2 + 4 * r3]
+    lea           r4,          [r4 + 4 * r5]
+
+    movd          m2,          [r2]
+    movd          m4,          [r2 + 2 * r3]
+    movd          m5,          [r4]
+    movd          m6,          [r4 + 2 * r5]
+
+    punpckldq     m2,          m4
+    punpckldq     m5,          m6
+    punpcklqdq    m1,          m2
+    punpcklqdq    m3,          m5
+
+    paddw         m1,          m3
+    paddw         m1,          m0
+    psraw         m1,          7
+    packuswb      m1,          m1
+
+    pextrw        [r0],        m1, 0
+    pextrw        [r0 + r1],   m1, 1
+    lea           r0,          [r0 + 2 * r1]
+    pextrw        [r0],        m1, 2
+    pextrw        [r0 + r1],   m1, 3
+
+    lea            r0,          [r0 + 2 * r1]
+    lea            r2,          [r2 + 4 * r3]
+    lea            r4,          [r4 + 4 * r5]
+
+    movd          m1,          [r2]
+    movd          m2,          [r2 + 2 * r3]
+    movd          m3,          [r4]
+    movd          m4,          [r4 + 2 * r5]
+
+    punpckldq     m1,          m2
+    punpckldq     m3,          m4
+
+    lea           r2,          [r2 + 4 * r3]
+    lea           r4,          [r4 + 4 * r5]
+
+    movd          m2,          [r2]
+    movd          m4,          [r2 + 2 * r3]
+    movd          m5,          [r4]
+    movd          m6,          [r4 + 2 * r5]
+
+    punpckldq     m2,          m4
+    punpckldq     m5,          m6
+    punpcklqdq    m1,          m2
+    punpcklqdq    m3,          m5
+
+    paddw         m1,          m3
+    paddw         m1,          m0
+    psraw         m1,          7
+    packuswb      m1,          m1
+
+    pextrw        [r0],        m1, 0
+    pextrw        [r0 + r1],   m1, 1
+    lea           r0,          [r0 + 2 * r1]
+    pextrw        [r0],        m1, 2
+    pextrw        [r0 + r1],   m1, 3
+
+    RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal addAvg_4x2, 6,6,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+    mova           m4,          [OFFSET]
+
+    movh           m0,          [r2]
+    movh           m1,          [r2 + 2 * r3]
+    movh           m2,          [r4]
+    movh           m3,          [r4 + 2 * r5]
+
+    punpcklqdq     m0,          m1
+    punpcklqdq     m2,          m3
+
+    paddw          m0,          m2
+    paddw          m0,          m4
+    psraw          m0,          7
+
+    packuswb       m0,          m0
+    movd           [r0],        m0
+    pshufd         m0,          m0, 1
+    movd           [r0 + r1],   m0
+
+    RET
+;-----------------------------------------------------------------------------
+
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W4_H4 2
+INIT_XMM sse2
+cglobal addAvg_%1x%2, 6,7,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+    mova           m4,          [OFFSET]
+
+    mov            r6d,         %2/4
+
+.loop:
+    movh           m0,          [r2]
+    movh           m1,          [r2 + 2 * r3]
+    movh           m2,          [r4]
+    movh           m3,          [r4 + 2 * r5]
+
+    punpcklqdq     m0,          m1
+    punpcklqdq     m2,          m3
+
+    paddw          m0,          m2
+    paddw          m0,          m4
+    psraw          m0,          7
+
+    packuswb       m0,          m0
+    movd           [r0],        m0
+    pshufd         m0,          m0, 1
+    movd           [r0 + r1],   m0
+
+    lea            r0,          [r0 + 2 * r1]
+    lea            r2,          [r2 + 4 * r3]
+    lea            r4,          [r4 + 4 * r5]
+
+    movh           m0,          [r2]
+    movh           m1,          [r2 + 2 * r3]
+    movh           m2,          [r4]
+    movh           m3,          [r4 + 2 * r5]
+
+    punpcklqdq     m0,          m1
+    punpcklqdq     m2,          m3
+
+    paddw          m0,          m2
+    paddw          m0,          m4
+    psraw          m0,          7
+
+    packuswb       m0,          m0
+    movd           [r0],        m0
+    pshufd         m0,          m0, 1
+    movd           [r0 + r1],   m0
+
+    lea            r0,          [r0 + 2 * r1]
+    lea            r2,          [r2 + 4 * r3]
+    lea            r4,          [r4 + 4 * r5]
+
+    dec         r6d
+    jnz         .loop
+
+    RET
+
+%endmacro
+
+ADDAVG_W4_H4 4, 4
+ADDAVG_W4_H4 4, 8
+ADDAVG_W4_H4 4, 16
+
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_6x8, 6,7,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+    mova        m4,          [OFFSET]
+
+    movu        m0,             [r2]
+    movu        m2,             [r4]
+    paddw       m0,             m2
+    paddw       m0,             m4
+    psraw       m0,             7
+    packuswb    m0,             m0
+    movd        [r0],           m0
+    pextrw      [r0 + 4],       m0, 2
+
+    movu        m1,             [r2 + 2 * r3]
+    movu        m3,             [r4 + 2 * r5]
+    paddw       m1,             m3
+    paddw       m1,             m4
+    psraw       m1,             7
+    packuswb    m1,             m1
+    movd        [r0 + r1],      m1
+    pextrw      [r0 + r1 + 4],  m1, 2
+
+    lea         r0,             [r0 + 2 * r1]
+    lea         r2,             [r2 + 4 * r3]
+    lea         r4,             [r4 + 4 * r5]
+
+    movu        m0,             [r2]
+    movu        m2,             [r4]
+    paddw       m0,             m2
+    paddw       m0,             m4
+    psraw       m0,             7
+    packuswb    m0,             m0
+    movd        [r0],           m0
+    pextrw      [r0 + 4],       m0, 2
+
+    movu        m1,             [r2 + 2 * r3]
+    movu        m3,             [r4 + 2 * r5]
+    paddw       m1,             m3
+    paddw       m1,             m4
+    psraw       m1,             7
+    packuswb    m1,             m1
+    movd        [r0 + r1],      m1
+    pextrw      [r0 + r1 + 4],  m1, 2
+
+    lea         r0,             [r0 + 2 * r1]
+    lea         r2,             [r2 + 4 * r3]
+    lea         r4,             [r4 + 4 * r5]
+
+    movu        m0,             [r2]
+    movu        m2,             [r4]
+    paddw       m0,             m2
+    paddw       m0,             m4
+    psraw       m0,             7
+    packuswb    m0,             m0
+    movd        [r0],           m0
+    pextrw      [r0 + 4],       m0, 2
+
+    movu        m1,             [r2 + 2 * r3]
+    movu        m3,             [r4 + 2 * r5]
+    paddw       m1,             m3
+    paddw       m1,             m4
+    psraw       m1,             7
+    packuswb    m1,             m1
+    movd        [r0 + r1],      m1
+    pextrw      [r0 + r1 + 4],  m1, 2
+
+    lea         r0,             [r0 + 2 * r1]
+    lea         r2,             [r2 + 4 * r3]
+    lea         r4,             [r4 + 4 * r5]
+
+    movu        m0,             [r2]
+    movu        m2,             [r4]
+    paddw       m0,             m2
+    paddw       m0,             m4
+    psraw       m0,             7
+    packuswb    m0,             m0
+    movd        [r0],           m0
+    pextrw      [r0 + 4],       m0, 2
+
+    movu        m1,             [r2 + 2 * r3]
+    movu        m3,             [r4 + 2 * r5]
+    paddw       m1,             m3
+    paddw       m1,             m4
+    psraw       m1,             7
+    packuswb    m1,             m1
+    movd        [r0 + r1],      m1
+    pextrw      [r0 + r1 + 4],  m1, 2
+
+    RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal addAvg_8x2, 6,6,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+    mova        m4,          [OFFSET]
+
+    movu        m0,          [r2]
+    movu        m2,          [r4]
+    paddw       m0,          m2
+    paddw       m0,          m4
+    psraw       m0,          7
+    packuswb    m0,          m0
+    movh        [r0],        m0
+
+    movu        m1,          [r2 + 2 * r3]
+    movu        m3,          [r4 + 2 * r5]
+    paddw       m1,          m3
+    paddw       m1,          m4
+    psraw       m1,          7
+    packuswb    m1,          m1
+    movh        [r0 + r1],   m1
+
+    RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal addAvg_8x6, 6,6,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+    mova        m4,          [OFFSET]
+
+    movu        m0,          [r2]
+    movu        m2,          [r4]
+    paddw       m0,          m2
+    paddw       m0,          m4
+    psraw       m0,          7
+    packuswb    m0,          m0
+    movh        [r0],        m0
+
+    movu        m1,          [r2 + 2 * r3]
+    movu        m3,          [r4 + 2 * r5]
+    paddw       m1,          m3
+    paddw       m1,          m4
+    psraw       m1,          7
+    packuswb    m1,          m1
+    movh        [r0 + r1],   m1
+
+    lea         r0,          [r0 + 2 * r1]
+    lea         r2,          [r2 + 4 * r3]
+    lea         r4,          [r4 + 4 * r5]
+
+    movu        m0,          [r2]
+    movu        m2,          [r4]
+    paddw       m0,          m2
+    paddw       m0,          m4
+    psraw       m0,          7
+    packuswb    m0,          m0
+    movh        [r0],        m0
+
+    movu        m1,          [r2 + 2 * r3]
+    movu        m3,          [r4 + 2 * r5]
+    paddw       m1,          m3
+    paddw       m1,          m4
+    psraw       m1,          7
+    packuswb    m1,          m1
+    movh        [r0 + r1],   m1
+
+    lea         r0,          [r0 + 2 * r1]
+    lea         r2,          [r2 + 4 * r3]
+    lea         r4,          [r4 + 4 * r5]
+
+    movu        m0,          [r2]
+    movu        m2,          [r4]
+    paddw       m0,          m2
+    paddw       m0,          m4
+    psraw       m0,          7
+    packuswb    m0,          m0
+    movh        [r0],        m0
+
+    movu        m1,          [r2 + 2 * r3]
+    movu        m3,          [r4 + 2 * r5]
+    paddw       m1,          m3
+    paddw       m1,          m4
+    psraw       m1,          7
+    packuswb    m1,          m1
+    movh        [r0 + r1],   m1
+
+    RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+
+%macro ADDAVG_W8_H4 2
+INIT_XMM sse2
+cglobal addAvg_%1x%2, 6,7,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+    mova        m4,          [OFFSET]
+
+    mov         r6d,         %2/4
+
+.loop:
+    movu        m0,          [r2]
+    movu        m2,          [r4]
+    paddw       m0,          m2
+    paddw       m0,          m4
+    psraw       m0,          7
+    packuswb    m0,          m0
+    movh        [r0],        m0
+
+    movu        m1,          [r2 + 2 * r3]
+    movu        m3,          [r4 + 2 * r5]
+    paddw       m1,          m3
+    paddw       m1,          m4
+    psraw       m1,          7
+    packuswb    m1,          m1
+    movh        [r0 + r1],   m1
+
+    lea         r0,          [r0 + 2 * r1]
+    lea         r2,          [r2 + 4 * r3]
+    lea         r4,          [r4 + 4 * r5]
+
+    movu        m0,          [r2]
+    movu        m2,          [r4]
+    paddw       m0,          m2
+    paddw       m0,          m4
+    psraw       m0,          7
+    packuswb    m0,          m0
+    movh        [r0],        m0
+
+    movu        m1,          [r2 + 2 * r3]
+    movu        m3,          [r4 + 2 * r5]
+    paddw       m1,          m3
+    paddw       m1,          m4
+    psraw       m1,          7
+    packuswb    m1,          m1
+    movh        [r0 + r1],   m1
+
+    lea         r0,          [r0 + 2 * r1]
+    lea         r2,          [r2 + 4 * r3]
+    lea         r4,          [r4 + 4 * r5]
+
+    dec         r6d
+    jnz         .loop
+
+    RET
+
+%endmacro
+
+ADDAVG_W8_H4 8, 4
+ADDAVG_W8_H4 8, 8
+ADDAVG_W8_H4 8, 16
+ADDAVG_W8_H4 8, 32
+
+;-----------------------------------------------------------------------------
+
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W12_H4 2
+INIT_XMM sse2
+cglobal addAvg_%1x%2, 6,7,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+    mova           m4,             [OFFSET]
+
+    mov            r6d,            %2/4
+
+.loop:
+    movu           m0,             [r2]
+    movu           m2,             [r4]
+    paddw          m0,             m2
+    paddw          m0,             m4
+    psraw          m0,             7
+    packuswb       m0,             m0
+    movh           [r0],           m0
+
+    movh           m0,             [r2 + 16]
+    movh           m1,             [r2 + 16 + 2 * r3]
+    movh           m2,             [r4 + 16]
+    movh           m3,             [r4 + 16 + 2 * r5]
+
+    punpcklqdq     m0,             m1
+    punpcklqdq     m2,             m3
+
+    paddw          m0,             m2
+    paddw          m0,             m4
+    psraw          m0,             7
+
+    packuswb       m0,             m0
+    movd           [r0 + 8],       m0
+    pshufd         m0,             m0, 1
+    movd           [r0 + 8 + r1],  m0
+
+    movu           m1,             [r2 + 2 * r3]
+    movu           m3,             [r4 + 2 * r5]
+    paddw          m1,             m3
+    paddw          m1,             m4
+    psraw          m1,             7
+    packuswb       m1,             m1
+    movh           [r0 + r1],      m1
+
+    lea            r0,             [r0 + 2 * r1]
+    lea            r2,             [r2 + 4 * r3]
+    lea            r4,             [r4 + 4 * r5]
+
+    movu           m0,             [r2]
+    movu           m2,             [r4]
+    paddw          m0,             m2
+    paddw          m0,             m4
+    psraw          m0,             7
+    packuswb       m0,             m0
+    movh           [r0],           m0
+
+    movh           m0,             [r2 + 16]
+    movh           m1,             [r2 + 16 + 2 * r3]
+    movh           m2,             [r4 + 16]
+    movh           m3,             [r4 + 16 + 2 * r5]
+
+    punpcklqdq     m0,             m1
+    punpcklqdq     m2,             m3
+
+    paddw          m0,             m2
+    paddw          m0,             m4
+    psraw          m0,             7
+
+    packuswb       m0,             m0
+    movd           [r0 + 8],       m0
+    pshufd         m0,             m0,  1
+    movd           [r0 + 8 + r1],  m0
+
+    movu           m1,             [r2 + 2 * r3]
+    movu           m3,             [r4 + 2 * r5]
+    paddw          m1,             m3
+    paddw          m1,             m4
+    psraw          m1,             7
+    packuswb       m1,             m1
+    movh           [r0 + r1],      m1
+
+    lea            r0,             [r0 + 2 * r1]
+    lea            r2,             [r2 + 4 * r3]
+    lea            r4,             [r4 + 4 * r5]
+
+    dec            r6d
+    jnz            .loop
+
+    RET
+
+%endmacro
+
+ADDAVG_W12_H4 12, 16
+
+;-----------------------------------------------------------------------------
+
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W16_H4 2
+INIT_XMM sse2
+cglobal addAvg_%1x%2, 6,7,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+    mova        m4,          [OFFSET]
+
+    mov         r6d,         %2/4
+
+.loop:
+    movu        m0,              [r2]
+    movu        m2,              [r4]
+    paddw       m0,              m2
+    paddw       m0,              m4
+    psraw       m0,              7
+
+    movu        m1,              [r2 + 16]
+    movu        m2,              [r4 + 16]
+    paddw       m1,              m2
+    paddw       m1,              m4
+    psraw       m1,              7
+    packuswb    m0,              m1
+    movu        [r0],            m0
+
+    movu        m1,              [r2 + 2 * r3]
+    movu        m3,              [r4 + 2 * r5]
+    paddw       m1,              m3
+    paddw       m1,              m4
+    psraw       m1,              7
+
+    movu        m2,              [r2 + 16 + 2 * r3]
+    movu        m3,              [r4 + 16 + 2 * r5]
+    paddw       m2,              m3
+    paddw       m2,              m4
+    psraw       m2,              7
+    packuswb    m1,              m2
+    movu        [r0 + r1],       m1
+
+    lea         r0,              [r0 + 2 * r1]
+    lea         r2,              [r2 + 4 * r3]
+    lea         r4,              [r4 + 4 * r5]
+
+    movu        m0,              [r2]
+    movu        m2,              [r4]
+    paddw       m0,              m2
+    paddw       m0,              m4
+    psraw       m0,              7
+
+    movu        m1,              [r2 + 16]
+    movu        m2,              [r4 + 16]
+    paddw       m1,              m2
+    paddw       m1,              m4
+    psraw       m1,              7
+    packuswb    m0,              m1
+    movu        [r0],            m0
+
+    movu        m1,              [r2 + 2 * r3]
+    movu        m3,              [r4 + 2 * r5]
+    paddw       m1,              m3
+    paddw       m1,              m4
+    psraw       m1,              7
+
+    movu        m2,              [r2 + 16 + 2 * r3]
+    movu        m3,              [r4 + 16 + 2 * r5]
+    paddw       m2,              m3
+    paddw       m2,              m4
+    psraw       m2,              7
+    packuswb    m1,              m2
+    movu        [r0 + r1],       m1
+
+    lea         r0,              [r0 + 2 * r1]
+    lea         r2,              [r2 + 4 * r3]
+    lea         r4,              [r4 + 4 * r5]
+
+    dec         r6d
+    jnz         .loop
+
+    RET
+
+%endmacro
+
+ADDAVG_W16_H4 16, 4
+ADDAVG_W16_H4 16, 8
+ADDAVG_W16_H4 16, 12
+ADDAVG_W16_H4 16, 16
+ADDAVG_W16_H4 16, 32
+ADDAVG_W16_H4 16, 64
+
+;-----------------------------------------------------------------------------
+
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W24_H2 2
+INIT_XMM sse2
+cglobal addAvg_%1x%2, 6,7,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+    mova        m4,          [OFFSET]
+
+    mov         r6d,         %2/2
+
+.loop:
+    movu        m0,              [r2]
+    movu        m2,              [r4]
+    paddw       m0,              m2
+    paddw       m0,              m4
+    psraw       m0,              7
+
+    movu        m1,              [r2 + 16]
+    movu        m2,              [r4 + 16]
+    paddw       m1,              m2
+    paddw       m1,              m4
+    psraw       m1,              7
+    packuswb    m0,              m1
+    movu        [r0],            m0
+
+    movu        m0,              [r2 + 32]
+    movu        m2,              [r4 + 32]
+    paddw       m0,              m2
+    paddw       m0,              m4
+    psraw       m0,              7
+    packuswb    m0,              m0
+    movh        [r0 + 16],       m0
+
+    movu        m1,              [r2 + 2 * r3]
+    movu        m3,              [r4 + 2 * r5]
+    paddw       m1,              m3
+    paddw       m1,              m4
+    psraw       m1,              7
+
+    movu        m2,              [r2 + 16 + 2 * r3]
+    movu        m3,              [r4 + 16 + 2 * r5]
+    paddw       m2,              m3
+    paddw       m2,              m4
+    psraw       m2,              7
+    packuswb    m1,              m2
+    movu        [r0 + r1],       m1
+
+    movu        m1,              [r2 + 32 + 2 * r3]
+    movu        m3,              [r4 + 32 + 2 * r5]
+    paddw       m1,              m3
+    paddw       m1,              m4
+    psraw       m1,              7
+    packuswb    m1,              m1
+    movh        [r0 + 16 + r1],  m1
+
+    lea         r0,              [r0 + 2 * r1]
+    lea         r2,              [r2 + 4 * r3]
+    lea         r4,              [r4 + 4 * r5]
+
+    dec         r6d
+    jnz         .loop
+
+    RET
+
+%endmacro
+
+ADDAVG_W24_H2 24, 32
+
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W32_H2 2
+INIT_XMM sse2
+cglobal addAvg_%1x%2, 6,7,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+    mova        m4,          [OFFSET]
+
+    mov         r6d,         %2/2
+
+.loop:
+    movu        m0,              [r2]
+    movu        m2,              [r4]
+    paddw       m0,              m2
+    paddw       m0,              m4
+    psraw       m0,              7
+
+    movu        m1,              [r2 + 16]
+    movu        m2,              [r4 + 16]
+    paddw       m1,              m2
+    paddw       m1,              m4
+    psraw       m1,              7
+    packuswb    m0,              m1
+    movu        [r0],            m0
+
+    movu        m0,              [r2 + 32]
+    movu        m2,              [r4 + 32]
+    paddw       m0,              m2
+    paddw       m0,              m4
+    psraw       m0,              7
+
+    movu        m1,              [r2 + 48]
+    movu        m2,              [r4 + 48]
+    paddw       m1,              m2
+    paddw       m1,              m4
+    psraw       m1,              7
+    packuswb    m0,              m1
+    movu        [r0 + 16],       m0
+
+    movu        m1,              [r2 + 2 * r3]
+    movu        m3,              [r4 + 2 * r5]
+    paddw       m1,              m3
+    paddw       m1,              m4
+    psraw       m1,              7
+
+    movu        m2,              [r2 + 16 + 2 * r3]
+    movu        m3,              [r4 + 16 + 2 * r5]
+    paddw       m2,              m3
+    paddw       m2,              m4
+    psraw       m2,              7
+    packuswb    m1,              m2
+    movu        [r0 + r1],       m1
+
+    movu        m1,              [r2 + 32 + 2 * r3]
+    movu        m3,              [r4 + 32 + 2 * r5]
+    paddw       m1,              m3
+    paddw       m1,              m4
+    psraw       m1,              7
+
+    movu        m2,              [r2 + 48 + 2 * r3]
+    movu        m3,              [r4 + 48 + 2 * r5]
+    paddw       m2,              m3
+    paddw       m2,              m4
+    psraw       m2,              7
+    packuswb    m1,              m2
+    movu        [r0 + 16 + r1],  m1
+
+    lea         r0,              [r0 + 2 * r1]
+    lea         r2,              [r2 + 4 * r3]
+    lea         r4,              [r4 + 4 * r5]
+
+    dec         r6d
+    jnz         .loop
+
+    RET
+
+%endmacro
+
+ADDAVG_W32_H2 32, 8
+ADDAVG_W32_H2 32, 16
+ADDAVG_W32_H2 32, 24
+ADDAVG_W32_H2 32, 32
+ADDAVG_W32_H2 32, 64
+
+;-----------------------------------------------------------------------------
+
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W48_H2 2
+INIT_XMM sse2
+cglobal addAvg_%1x%2, 6,7,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+    mova        m4,          [OFFSET]
+
+    mov         r6d,         %2/2
+
+.loop:
+    movu        m0,              [r2]
+    movu        m2,              [r4]
+    paddw       m0,              m2
+    paddw       m0,              m4
+    psraw       m0,              7
+
+    movu        m1,              [r2 + 16]
+    movu        m2,              [r4 + 16]
+    paddw       m1,              m2
+    paddw       m1,              m4
+    psraw       m1,              7
+    packuswb    m0,              m1
+    movu        [r0],            m0
+
+    movu        m0,              [r2 + 32]
+    movu        m2,              [r4 + 32]
+    paddw       m0,              m2
+    paddw       m0,              m4
+    psraw       m0,              7
+
+    movu        m1,              [r2 + 48]
+    movu        m2,              [r4 + 48]
+    paddw       m1,              m2
+    paddw       m1,              m4
+    psraw       m1,              7
+    packuswb    m0,              m1
+    movu        [r0 + 16],       m0
+
+    movu        m0,              [r2 + 64]
+    movu        m2,              [r4 + 64]
+    paddw       m0,              m2
+    paddw       m0,              m4
+    psraw       m0,              7
+
+    movu        m1,              [r2 + 80]
+    movu        m2,              [r4 + 80]
+    paddw       m1,              m2
+    paddw       m1,              m4
+    psraw       m1,              7
+    packuswb    m0,              m1
+    movu        [r0 + 32],       m0
+
+    movu        m1,              [r2 + 2 * r3]
+    movu        m3,              [r4 + 2 * r5]
+    paddw       m1,              m3
+    paddw       m1,              m4
+    psraw       m1,              7
+
+    movu        m2,              [r2 + 16 + 2 * r3]
+    movu        m3,              [r4 + 16 + 2 * r5]
+    paddw       m2,              m3
+    paddw       m2,              m4
+    psraw       m2,              7
+    packuswb    m1,              m2
+    movu        [r0 + r1],       m1
+
+    movu        m1,              [r2 + 32 + 2 * r3]
+    movu        m3,              [r4 + 32 + 2 * r5]
+    paddw       m1,              m3
+    paddw       m1,              m4
+    psraw       m1,              7
+
+    movu        m2,              [r2 + 48 + 2 * r3]
+    movu        m3,              [r4 + 48 + 2 * r5]
+    paddw       m2,              m3
+    paddw       m2,              m4
+    psraw       m2,              7
+    packuswb    m1,              m2
+    movu        [r0 + 16 + r1],  m1
+
+    movu        m1,              [r2 + 64 + 2 * r3]
+    movu        m3,              [r4 + 64 + 2 * r5]
+    paddw       m1,              m3
+    paddw       m1,              m4
+    psraw       m1,              7
+
+    movu        m2,              [r2 + 80 + 2 * r3]
+    movu        m3,              [r4 + 80 + 2 * r5]
+    paddw       m2,              m3
+    paddw       m2,              m4
+    psraw       m2,              7
+    packuswb    m1,              m2
+    movu        [r0 + 32 + r1],  m1
+
+    lea         r0,              [r0 + 2 * r1]
+    lea         r2,              [r2 + 4 * r3]
+    lea         r4,              [r4 + 4 * r5]
+
+    dec         r6d
+    jnz         .loop
+
+    RET
+
+%endmacro
+
+ADDAVG_W48_H2 48, 64
+
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W64_H1 2
+INIT_XMM sse2
+cglobal addAvg_%1x%2, 6,7,5, dst, dstStride, src0, src0Stride, src1, src1Stride
+
+    mova        m4,          [OFFSET]
+
+    mov         r6d,         %2
+
+.loop:
+    movu        m0,              [r2]
+    movu        m2,              [r4]
+    paddw       m0,              m2
+    paddw       m0,              m4
+    psraw       m0,              7
+
+    movu        m1,              [r2 + 16]
+    movu        m2,              [r4 + 16]
+    paddw       m1,              m2
+    paddw       m1,              m4
+    psraw       m1,              7
+    packuswb    m0,              m1
+    movu        [r0],            m0
+
+    movu        m0,              [r2 + 32]
+    movu        m2,              [r4 + 32]
+    paddw       m0,              m2
+    paddw       m0,              m4
+    psraw       m0,              7
+
+    movu        m1,              [r2 + 48]
+    movu        m2,              [r4 + 48]
+    paddw       m1,              m2
+    paddw       m1,              m4
+    psraw       m1,              7
+    packuswb    m0,              m1
+    movu        [r0 + 16],       m0
+
+    movu        m0,              [r2 + 64]
+    movu        m2,              [r4 + 64]
+    paddw       m0,              m2
+    paddw       m0,              m4
+    psraw       m0,              7
+
+    movu        m1,              [r2 + 80]
+    movu        m2,              [r4 + 80]
+    paddw       m1,              m2
+    paddw       m1,              m4
+    psraw       m1,              7
+    packuswb    m0,              m1
+    movu        [r0 + 32],       m0
+
+    movu        m0,              [r2 + 96]
+    movu        m2,              [r4 + 96]
+    paddw       m0,              m2
+    paddw       m0,              m4
+    psraw       m0,              7
+
+    movu        m1,              [r2 + 112]
+    movu        m2,              [r4 + 112]
+    paddw       m1,              m2
+    paddw       m1,              m4
+    psraw       m1,              7
+    packuswb    m0,              m1
+    movu        [r0 + 48],       m0
+
+    lea         r0,              [r0 + r1]
+    lea         r2,              [r2 + 2 * r3]
+    lea         r4,              [r4 + 2 * r5]
+
+    dec         r6d
+    jnz         .loop
+
+    RET
+
+%endmacro
+
+ADDAVG_W64_H1 64, 16
+ADDAVG_W64_H1 64, 32
+ADDAVG_W64_H1 64, 48
+ADDAVG_W64_H1 64, 64
+
+;-----------------------------------------------------------------------------
+
 ;=============================================================================
 ; implicit weighted biprediction
 ;=============================================================================
diff -r 2e4e32a50cd6 -r 806eb643b88c source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Sat Jan 18 12:01:44 2014 +0530
+++ b/source/common/x86/pixel.h	Sat Jan 18 13:05:30 2014 +0530
@@ -166,6 +166,41 @@
 int x265_pixel_ssd_64x48_sse4(pixel *, intptr_t, pixel *, intptr_t);
 int x265_pixel_ssd_64x64_sse4(pixel *, intptr_t, pixel *, intptr_t);
 
+#define ADDAVG(func, args)  \
+    void func ## _sse2 args; \
+    void func ## _sse4 args;
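+// Every size is declared for both ISAs; asm-primitives.cpp references only
+// the variants actually emitted in mc-a.asm (sse4 for the 2xN and 6x8
+// kernels, sse2 for the rest).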
+ADDAVG(x265_addAvg_2x4,    (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_2x8,    (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_4x2,    (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_4x4,    (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_4x8,    (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_4x16,   (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_6x8,    (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_8x2,    (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_8x4,    (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_8x6,    (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_8x8,    (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_8x16,   (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_8x32,   (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_12x16,  (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_16x4,   (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_16x8,   (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_16x12,  (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_16x16,  (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_16x32,  (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_16x64,  (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_24x32,  (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_32x8,   (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_32x16,  (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_32x24,  (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_32x32,  (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_32x64,  (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_64x16,  (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_64x32,  (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_64x48,  (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_64x64,  (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+ADDAVG(x265_addAvg_48x64,  (pixel*, intptr_t, int16_t*, intptr_t, int16_t*, intptr_t))
+
 #undef DECL_PIXELS
 #undef DECL_HEVC_SSD
 #undef DECL_X1


