[x265] [PATCH] asm: code for addAvg luma and chroma all sizes

dnyaneshwar at multicorewareinc.com
Sat Jan 18 11:02:51 CET 2014


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1390037965 -19800
#      Sat Jan 18 15:09:25 2014 +0530
# Node ID a72a0900a84d48627541ab343c9e6d8a35b3db71
# Parent  4f396ba19b4729bd1257b8cd898a0157f9ed2d53
asm: code for addAvg luma and chroma all sizes
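
(Not part of the patch, for review context: the addAvg primitives switch from a destination-first argument order to a sources-first order. The sketch below restates the old and new prototypes from primitives.h and mirrors the TComYuv::addAvg() hunks that follow; "pixel" is assumed to be uint8_t, i.e. an 8-bit build, and addAvg_old_t is just an illustrative name for the previous shape.)

#include <cstdint>

typedef uint8_t pixel;  // assumption: HIGH_BIT_DEPTH disabled

// Old shape: destination and its stride first, then each source with its stride.
typedef void (*addAvg_old_t)(pixel* dst, intptr_t dstStride,
                             int16_t* src0, intptr_t src0Stride,
                             int16_t* src1, intptr_t src1Stride);

// New shape (this patch): both sources, then the destination, then the three
// strides in the same order, matching the register layout documented in mc-a.asm.
typedef void (*addAvg_t)(int16_t* src0, int16_t* src1, pixel* dst,
                         intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);

// Call sites change accordingly, e.g. in TComYuv::addAvg():
//   before: primitives.luma_addAvg[part](dstY, dststride, srcY0, src0Stride, srcY1, src1Stride);
//   after:  primitives.luma_addAvg[part](srcY0, srcY1, dstY, src0Stride, src1Stride, dststride);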

diff -r 4f396ba19b47 -r a72a0900a84d source/Lib/TLibCommon/TComYuv.cpp
--- a/source/Lib/TLibCommon/TComYuv.cpp	Sat Jan 18 12:01:44 2014 +0530
+++ b/source/Lib/TLibCommon/TComYuv.cpp	Sat Jan 18 15:09:25 2014 +0530
@@ -594,7 +594,7 @@
         src1Stride = srcYuv1->m_width;
         dststride  = getStride();
 
-        primitives.luma_addAvg[part](dstY, dststride, srcY0, src0Stride, srcY1, src1Stride);
+        primitives.luma_addAvg[part](srcY0, srcY1, dstY, src0Stride, src1Stride, dststride);
     }
     if (bChroma)
     {
@@ -602,8 +602,8 @@
         src1Stride = srcYuv1->m_cwidth;
         dststride  = getCStride();
 
-        primitives.chroma_addAvg[part](dstU, dststride, srcU0, src0Stride, srcU1, src1Stride);
-        primitives.chroma_addAvg[part](dstV, dststride, srcV0, src0Stride, srcV1, src1Stride);
+        primitives.chroma_addAvg[part](srcU0, srcU1, dstU, src0Stride, src1Stride, dststride);
+        primitives.chroma_addAvg[part](srcV0, srcV1, dstV, src0Stride, src1Stride, dststride);
     }
 }
 
diff -r 4f396ba19b47 -r a72a0900a84d source/common/pixel.cpp
--- a/source/common/pixel.cpp	Sat Jan 18 12:01:44 2014 +0530
+++ b/source/common/pixel.cpp	Sat Jan 18 15:09:25 2014 +0530
@@ -802,7 +802,7 @@
 }
 
 template<int bx, int by>
-void addAvg(pixel* dst, intptr_t dstStride, int16_t* src0, intptr_t src0Stride, int16_t* src1, intptr_t src1Stride)
+void addAvg(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
 {
     int shiftNum, offset;
     shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
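
(Not part of the patch: the hunk above only shows the head of the C reference, so here is a minimal scalar model of what every addAvg_WxH kernel computes, using the new argument order. It assumes an 8-bit build, i.e. X265_DEPTH = 8, IF_INTERNAL_PREC = 14 and IF_INTERNAL_OFFS = 8192, giving shiftNum = 7 and offset = 64 + 2 * 8192 = 16448, the pw_16448 constant used by the assembly further down. Illustrative sketch, not the exact x265 source.)

#include <algorithm>
#include <cstdint>

typedef uint8_t pixel;  // assumption: 8-bit build

template<int bx, int by>
void addAvg_ref(int16_t* src0, int16_t* src1, pixel* dst,
                intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
{
    const int shiftNum = 14 + 1 - 8;                        // IF_INTERNAL_PREC + 1 - X265_DEPTH
    const int offset   = (1 << (shiftNum - 1)) + 2 * 8192;  // rounding term + 2 * IF_INTERNAL_OFFS

    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
        {
            // Sum the two 16-bit weighted-prediction intermediates, round,
            // shift back to pixel precision and clip to the 8-bit range.
            int v = (src0[x] + src1[x] + offset) >> shiftNum;
            dst[x] = (pixel)std::min(std::max(v, 0), 255);
        }

        src0 += src0Stride;
        src1 += src1Stride;
        dst  += dstStride;
    }
}
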
diff -r 4f396ba19b47 -r a72a0900a84d source/common/primitives.h
--- a/source/common/primitives.h	Sat Jan 18 12:01:44 2014 +0530
+++ b/source/common/primitives.h	Sat Jan 18 15:09:25 2014 +0530
@@ -203,7 +203,7 @@
 typedef void (*pixel_sub_ps_t)(int16_t *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
 typedef void (*pixel_add_ps_t)(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t sstride0, intptr_t sstride1);
 
-typedef void (*addAvg_t)(pixel* dst, intptr_t dstStride, int16_t* src0, intptr_t src0Stride, int16_t* src1, intptr_t src1Stride);
+typedef void (*addAvg_t)(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
 
 /* Define a structure containing function pointers to optimized encoder
  * primitives.  Each pointer can reference either an assembly routine,
diff -r 4f396ba19b47 -r a72a0900a84d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Sat Jan 18 12:01:44 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp	Sat Jan 18 15:09:25 2014 +0530
@@ -547,6 +547,62 @@
     SETUP_PIXEL_SSE_SP_DEF(64, 16, cpu); \
     SETUP_PIXEL_SSE_SP_DEF(16, 64, cpu);
 
+#define SETUP_LUMA_ADDAVG_FUNC_DEF(W, H, cpu) \
+    p.luma_addAvg[LUMA_## W ## x ## H] = x265_addAvg_## W ## x ## H ## cpu;
+
+#define LUMA_ADDAVG(cpu) \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(4,  4,  cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(4,  8,  cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(4,  16, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(8,  4,  cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(8,  8,  cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(8,  16, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(8,  32, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(12, 16, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 4,  cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 8,  cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 12, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 16, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 32, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 64, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(24, 32, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 8,  cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 16, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 24, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 32, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 64, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(48, 64, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(64, 16, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(64, 32, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(64, 48, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(64, 64, cpu);
+
+#define SETUP_CHROMA_ADDAVG_FUNC_DEF(W, H, cpu) \
+    p.chroma_addAvg[CHROMA_## W ## x ## H] = x265_addAvg_## W ## x ## H ## cpu;
+
+#define CHROMA_ADDAVG(cpu) \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(4,  2,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(4,  4,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(4,  8,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(4,  16, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  2,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  4,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  6,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  8,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  16, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  32, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(12, 16, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 4,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 8,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 12, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 16, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 32, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(24, 32, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 8,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 16, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 24, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 32, cpu);
+
 #define SETUP_INTRA_ANG4(mode, fno, cpu) \
     p.intra_pred[BLOCK_4x4][mode] = x265_intra_pred_ang4_ ## fno ## _ ## cpu;
 #define SETUP_INTRA_ANG8(mode, fno, cpu) \
@@ -883,6 +939,9 @@
         p.dct[DCT_4x4] = x265_dct4_sse2;
         p.idct[IDCT_4x4] = x265_idct4_sse2;
         p.idct[IDST_4x4] = x265_idst4_sse2;
+
+        LUMA_ADDAVG(_sse2);
+        CHROMA_ADDAVG(_sse2);
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
@@ -1012,6 +1071,10 @@
         SETUP_INTRA_ANG4(33, 3, sse4);
 
         p.dct[DCT_8x8] = x265_dct8_sse4;
+
+        p.chroma_addAvg[CHROMA_2x4]  = x265_addAvg_2x4_sse4;
+        p.chroma_addAvg[CHROMA_2x8]  = x265_addAvg_2x8_sse4;
+        p.chroma_addAvg[CHROMA_6x8]  = x265_addAvg_6x8_sse4;
     }
     if (cpuMask & X265_CPU_AVX)
     {
diff -r 4f396ba19b47 -r a72a0900a84d source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Sat Jan 18 12:01:44 2014 +0530
+++ b/source/common/x86/const-a.asm	Sat Jan 18 15:09:25 2014 +0530
@@ -33,10 +33,13 @@
 const pw_1,        times 16 dw 1
 const pw_16,       times 16 dw 16
 const pw_32,       times 16 dw 32
+const pw_128,      times 16 dw 128
+const pw_256,      times 16 dw 256
 const pw_512,      times 16 dw 512
 const pw_1024,     times 16 dw 1024
 const pw_4096,     times 16 dw 4096
 const pw_00ff,     times 16 dw 0x00ff
+const pw_16448,    times 16 dw 16448
 const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1)
 const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
 const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
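
(Not part of the patch: why these constants. The small, non-looped kernels in mc-a.asm (2x4 through 8x6) add pw_16448, i.e. 64 + 2 * 8192, the combined rounding offset at 8-bit depth, and then shift right by 7. The looped width macros instead use pmulhrsw with pw_256 followed by paddw with pw_128. The two sequences agree wherever the direct 16-bit addition does not overflow; the standalone check below models pmulhrsw per its documented behavior and verifies the identity.)

#include <cassert>
#include <cstdint>

// Model of PMULHRSW(a, 256): signed multiply into a 32-bit intermediate,
// then (((a * b) >> 14) + 1) >> 1, truncated back to 16 bits.
static int16_t pmulhrsw_256(int16_t a)
{
    int32_t t = ((int32_t)a * 256) >> 14;
    return (int16_t)((t + 1) >> 1);
}

int main()
{
    // For every sum s = src0[x] + src1[x] whose offset addition still fits in
    // int16_t, the two rounding sequences used in mc-a.asm give the same result:
    //   paddw pw_16448 ; psraw 7   ==   pmulhrsw pw_256 ; paddw pw_128
    for (int s = -32768; s <= 32767 - 16448; s++)
    {
        int direct = (int16_t)(s + 16448) >> 7;
        int mulhrs = pmulhrsw_256((int16_t)s) + 128;
        assert(direct == mulhrs);
    }

    return 0;
}
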
diff -r 4f396ba19b47 -r a72a0900a84d source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Sat Jan 18 12:01:44 2014 +0530
+++ b/source/common/x86/mc-a.asm	Sat Jan 18 15:09:25 2014 +0530
@@ -49,13 +49,1055 @@
 cextern pw_8
 cextern pw_32
 cextern pw_64
+cextern pw_128
+cextern pw_256
 cextern pw_512
 cextern pw_00ff
+cextern pw_16448
 cextern pw_pixel_max
 cextern sw_64
 cextern pd_32
 cextern deinterleave_shufd
 
+;====================================================================================================================
+;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
+;====================================================================================================================
+; r0 = src0,       r1 = src1
+; r2 = dst,        r3 = src0Stride
+; r4 = src1Stride, r5 = dstStride
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_2x4, 6,7,7, src0, src1, dst, src0Stride, src1Stride, dstStride
+
+    mova          m0,          [pw_16448]
+    add           r3,          r3
+    add           r4,          r4
+
+    movd          m1,          [r0]
+    movd          m2,          [r0 + r3]
+    movd          m3,          [r1]
+    movd          m4,          [r1 + r4]
+
+    punpckldq     m1,          m2
+    punpckldq     m3,          m4
+
+    lea           r0,          [r0 + 2 * r3]
+    lea           r1,          [r1 + 2 * r4]
+
+    movd          m2,          [r0]
+    movd          m4,          [r0 + r3]
+    movd          m5,          [r1]
+    movd          m6,          [r1 + r4]
+
+    punpckldq     m2,          m4
+    punpckldq     m5,          m6
+    punpcklqdq    m1,          m2
+    punpcklqdq    m3,          m5
+
+    paddw         m1,          m3
+    paddw         m1,          m0
+    psraw         m1,          7
+    packuswb      m1,          m1
+
+    pextrw        [r2],        m1, 0
+    pextrw        [r2 + r5],   m1, 1
+    lea           r2,          [r2 + 2 * r5]
+    pextrw        [r2],        m1, 2
+    pextrw        [r2 + r5],   m1, 3
+
+    RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_2x8, 6,7,7, src0, src1, dst, src0Stride, src1Stride, dstStride
+
+    mova          m0,          [pw_16448]
+    add           r3,          r3
+    add           r4,          r4
+
+    movd          m1,          [r0]
+    movd          m2,          [r0 + r3]
+    movd          m3,          [r1]
+    movd          m4,          [r1 + r4]
+
+    punpckldq     m1,          m2
+    punpckldq     m3,          m4
+
+    lea           r0,          [r0 + 2 * r3]
+    lea           r1,          [r1 + 2 * r4]
+
+    movd          m2,          [r0]
+    movd          m4,          [r0 + r3]
+    movd          m5,          [r1]
+    movd          m6,          [r1 + r4]
+
+    punpckldq     m2,          m4
+    punpckldq     m5,          m6
+    punpcklqdq    m1,          m2
+    punpcklqdq    m3,          m5
+
+    paddw         m1,          m3
+    paddw         m1,          m0
+    psraw         m1,          7
+    packuswb      m1,          m1
+
+    pextrw        [r2],        m1, 0
+    pextrw        [r2 + r5],   m1, 1
+    lea           r2,          [r2 + 2 * r5]
+    pextrw        [r2],        m1, 2
+    pextrw        [r2 + r5],   m1, 3
+
+    lea           r2,          [r2 + 2 * r5]
+    lea           r0,          [r0 + 2 * r3]
+    lea           r1,          [r1 + 2 * r4]
+
+    movd          m1,          [r0]
+    movd          m2,          [r0 + r3]
+    movd          m3,          [r1]
+    movd          m4,          [r1 + r4]
+
+    punpckldq     m1,          m2
+    punpckldq     m3,          m4
+
+    lea           r0,          [r0 + 2 * r3]
+    lea           r1,          [r1 + 2 * r4]
+
+    movd          m2,          [r0]
+    movd          m4,          [r0 + r3]
+    movd          m5,          [r1]
+    movd          m6,          [r1 + r4]
+
+    punpckldq     m2,          m4
+    punpckldq     m5,          m6
+    punpcklqdq    m1,          m2
+    punpcklqdq    m3,          m5
+
+    paddw         m1,          m3
+    paddw         m1,          m0
+    psraw         m1,          7
+    packuswb      m1,          m1
+
+    pextrw        [r2],        m1, 0
+    pextrw        [r2 + r5],   m1, 1
+    lea           r2,          [r2 + 2 * r5]
+    pextrw        [r2],        m1, 2
+    pextrw        [r2 + r5],   m1, 3
+
+    RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal addAvg_4x2, 6,6,5, src0, src1, dst, src0Stride, src1Stride, dstStride
+
+    mova           m4,          [pw_16448]
+    add            r3,           r3
+    add            r4,           r4
+
+    movh           m0,          [r0]
+    movh           m1,          [r0 + r3]
+    movh           m2,          [r1]
+    movh           m3,          [r1 + r4]
+
+    punpcklqdq     m0,          m1
+    punpcklqdq     m2,          m3
+
+    paddw          m0,          m2
+    paddw          m0,          m4
+    psraw          m0,          7
+
+    packuswb       m0,          m0
+    movd           [r2],        m0
+    pshufd         m0,          m0, 1
+    movd           [r2 + r5],   m0
+
+    RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W4_H4 1
+INIT_XMM sse2
+cglobal addAvg_4x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+    mova           m4,          [pw_256]
+    mova           m5,          [pw_128]
+    add            r3,          r3
+    add            r4,          r4
+
+    mov            r6d,         %1/4
+
+.loop:
+    movh           m0,          [r0]
+    movh           m1,          [r0 + r3]
+    movh           m2,          [r1]
+    movh           m3,          [r1 + r4]
+
+    punpcklqdq     m0,          m1
+    punpcklqdq     m2,          m3
+
+    paddw          m0,          m2
+    pmulhrsw       m0,          m4
+    paddw          m0,          m5
+
+    packuswb       m0,          m0
+    movd           [r2],        m0
+    pshufd         m0,          m0, 1
+    movd           [r2 + r5],   m0
+
+    lea            r2,          [r2 + 2 * r5]
+    lea            r0,          [r0 + 2 * r3]
+    lea            r1,          [r1 + 2 * r4]
+
+    movh           m0,          [r0]
+    movh           m1,          [r0 + r3]
+    movh           m2,          [r1]
+    movh           m3,          [r1 + r4]
+
+    punpcklqdq     m0,          m1
+    punpcklqdq     m2,          m3
+
+    paddw          m0,          m2
+    pmulhrsw       m0,          m4
+    paddw          m0,          m5
+
+    packuswb       m0,          m0
+    movd           [r2],        m0
+    pshufd         m0,          m0, 1
+    movd           [r2 + r5],   m0
+
+    lea            r2,          [r2 + 2 * r5]
+    lea            r0,          [r0 + 2 * r3]
+    lea            r1,          [r1 + 2 * r4]
+
+    dec            r6d
+    jnz            .loop
+    RET
+%endmacro
+
+ADDAVG_W4_H4 4
+ADDAVG_W4_H4 8
+ADDAVG_W4_H4 16
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_6x8, 6,7,5, src0, src1, dst, src0Stride, src1Stride, dstStride
+
+    mova        m4,             [pw_16448]
+    add         r3,             r3
+    add         r4,             r4
+
+    movu        m0,             [r0]
+    movu        m2,             [r1]
+    paddw       m0,             m2
+    paddw       m0,             m4
+    psraw       m0,             7
+    packuswb    m0,             m0
+    movd        [r2],           m0
+    pextrw      [r2 + 4],       m0, 2
+
+    movu        m1,             [r0 + r3]
+    movu        m3,             [r1 + r4]
+    paddw       m1,             m3
+    paddw       m1,             m4
+    psraw       m1,             7
+    packuswb    m1,             m1
+    movd        [r2 + r5],      m1
+    pextrw      [r2 + r5 + 4],  m1, 2
+
+    lea         r2,             [r2 + 2 * r5]
+    lea         r0,             [r0 + 2 * r3]
+    lea         r1,             [r1 + 2 * r4]
+
+    movu        m0,             [r0]
+    movu        m2,             [r1]
+    paddw       m0,             m2
+    paddw       m0,             m4
+    psraw       m0,             7
+    packuswb    m0,             m0
+    movd        [r2],           m0
+    pextrw      [r2 + 4],       m0, 2
+
+    movu        m1,             [r0 + r3]
+    movu        m3,             [r1 + r4]
+    paddw       m1,             m3
+    paddw       m1,             m4
+    psraw       m1,             7
+    packuswb    m1,             m1
+    movd        [r2 + r5],      m1
+    pextrw      [r2 + r5 + 4],  m1, 2
+
+    lea         r2,             [r2 + 2 * r5]
+    lea         r0,             [r0 + 2 * r3]
+    lea         r1,             [r1 + 2 * r4]
+
+    movu        m0,             [r0]
+    movu        m2,             [r1]
+    paddw       m0,             m2
+    paddw       m0,             m4
+    psraw       m0,             7
+    packuswb    m0,             m0
+    movd        [r2],           m0
+    pextrw      [r2 + 4],       m0, 2
+
+    movu        m1,             [r0 + r3]
+    movu        m3,             [r1 + r4]
+    paddw       m1,             m3
+    paddw       m1,             m4
+    psraw       m1,             7
+    packuswb    m1,             m1
+    movd        [r2 + r5],      m1
+    pextrw      [r2 + r5 + 4],  m1, 2
+
+    lea         r2,             [r2 + 2 * r5]
+    lea         r0,             [r0 + 2 * r3]
+    lea         r1,             [r1 + 2 * r4]
+
+    movu        m0,             [r0]
+    movu        m2,             [r1]
+    paddw       m0,             m2
+    paddw       m0,             m4
+    psraw       m0,             7
+    packuswb    m0,             m0
+    movd        [r2],           m0
+    pextrw      [r2 + 4],       m0, 2
+
+    movu        m1,             [r0 + r3]
+    movu        m3,             [r1 + r4]
+    paddw       m1,             m3
+    paddw       m1,             m4
+    psraw       m1,             7
+    packuswb    m1,             m1
+    movd        [r2 + r5],      m1
+    pextrw      [r2 + r5 + 4],  m1, 2
+
+    RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal addAvg_8x2, 6,6,5, src0, src1, dst, src0Stride, src1Stride, dstStride
+    mova        m4,          [pw_16448]
+    add         r3,          r3
+    add         r4,          r4
+
+    movu        m0,          [r0]
+    movu        m2,          [r1]
+    paddw       m0,          m2
+    paddw       m0,          m4
+    psraw       m0,          7
+    packuswb    m0,          m0
+    movh        [r2],        m0
+
+    movu        m1,          [r0 + r3]
+    movu        m3,          [r1 + r4]
+    paddw       m1,          m3
+    paddw       m1,          m4
+    psraw       m1,          7
+    packuswb    m1,          m1
+    movh        [r2 + r5],   m1
+
+    RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal addAvg_8x6, 6,6,5, src0, src1, dst, src0Stride, src1Stride, dstStride
+
+    mova        m4,          [pw_16448]
+    add         r3,          r3
+    add         r4,          r4
+
+    movu        m0,          [r0]
+    movu        m2,          [r1]
+    paddw       m0,          m2
+    paddw       m0,          m4
+    psraw       m0,          7
+    packuswb    m0,          m0
+    movh        [r2],        m0
+
+    movu        m1,          [r0 + r3]
+    movu        m3,          [r1 + r4]
+    paddw       m1,          m3
+    paddw       m1,          m4
+    psraw       m1,          7
+    packuswb    m1,          m1
+    movh        [r2 + r5],   m1
+
+    lea         r2,          [r2 + 2 * r5]
+    lea         r0,          [r0 + 2 * r3]
+    lea         r1,          [r1 + 2 * r4]
+
+    movu        m0,          [r0]
+    movu        m2,          [r1]
+    paddw       m0,          m2
+    paddw       m0,          m4
+    psraw       m0,          7
+    packuswb    m0,          m0
+    movh        [r2],        m0
+
+    movu        m1,          [r0 + r3]
+    movu        m3,          [r1 + r4]
+    paddw       m1,          m3
+    paddw       m1,          m4
+    psraw       m1,          7
+    packuswb    m1,          m1
+    movh        [r2 + r5],   m1
+
+    lea         r2,          [r2 + 2 * r5]
+    lea         r0,          [r0 + 2 * r3]
+    lea         r1,          [r1 + 2 * r4]
+
+    movu        m0,          [r0]
+    movu        m2,          [r1]
+    paddw       m0,          m2
+    paddw       m0,          m4
+    psraw       m0,          7
+    packuswb    m0,          m0
+    movh        [r2],        m0
+
+    movu        m1,          [r0 + r3]
+    movu        m3,          [r1 + r4]
+    paddw       m1,          m3
+    paddw       m1,          m4
+    psraw       m1,          7
+    packuswb    m1,          m1
+    movh        [r2 + r5],   m1
+
+    RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W8_H4 1
+INIT_XMM sse2
+cglobal addAvg_8x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+
+    mova        m4,          [pw_256]
+    mova        m5,          [pw_128]
+    add         r3,          r3
+    add         r4,          r4
+
+    mov         r6d,         %1/4
+
+.loop:
+    movu        m0,          [r0]
+    movu        m2,          [r1]
+    paddw       m0,          m2
+    pmulhrsw    m0,          m4
+    paddw       m0,          m5
+
+    packuswb    m0,          m0
+    movh        [r2],        m0
+
+    movu        m1,          [r0 + r3]
+    movu        m3,          [r1 + r4]
+    paddw       m1,          m3
+    pmulhrsw    m1,          m4
+    paddw       m1,          m5
+
+    packuswb    m1,          m1
+    movh        [r2 + r5],   m1
+
+    lea         r2,          [r2 + 2 * r5]
+    lea         r0,          [r0 + 2 * r3]
+    lea         r1,          [r1 + 2 * r4]
+
+    movu        m0,          [r0]
+    movu        m2,          [r1]
+    paddw       m0,          m2
+    pmulhrsw    m0,          m4
+    paddw       m0,          m5
+
+    packuswb    m0,          m0
+    movh        [r2],        m0
+
+    movu        m1,          [r0 + r3]
+    movu        m3,          [r1 + r4]
+    paddw       m1,          m3
+    pmulhrsw    m1,          m4
+    paddw       m1,          m5
+
+    packuswb    m1,          m1
+    movh        [r2 + r5],   m1
+
+    lea         r2,          [r2 + 2 * r5]
+    lea         r0,          [r0 + 2 * r3]
+    lea         r1,          [r1 + 2 * r4]
+
+    dec         r6d
+    jnz         .loop
+    RET
+%endmacro
+
+ADDAVG_W8_H4 4
+ADDAVG_W8_H4 8
+ADDAVG_W8_H4 16
+ADDAVG_W8_H4 32
+
+;-----------------------------------------------------------------------------
+
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W12_H4 1
+INIT_XMM sse2
+cglobal addAvg_12x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+    mova           m4,             [pw_256]
+    mova           m5,             [pw_128]
+    add            r3,             r3
+    add            r4,             r4
+
+    mov            r6d,            %1/4
+
+.loop:
+    movu           m0,             [r0]
+    movu           m2,             [r1]
+    paddw          m0,             m2
+    pmulhrsw       m0,             m4
+    paddw          m0,             m5
+    packuswb       m0,             m0
+    movh           [r2],           m0
+
+    movh           m0,             [r0 + 16]
+    movh           m1,             [r0 + 16 + r3]
+    movh           m2,             [r1 + 16]
+    movh           m3,             [r1 + 16 + r4]
+
+    punpcklqdq     m0,             m1
+    punpcklqdq     m2,             m3
+
+    paddw          m0,             m2
+    pmulhrsw       m0,             m4
+    paddw          m0,             m5
+
+    packuswb       m0,             m0
+    movd           [r2 + 8],       m0
+    pshufd         m0,             m0, 1
+    movd           [r2 + 8 + r5],  m0
+
+    movu           m1,             [r0 + r3]
+    movu           m3,             [r1 + r4]
+    paddw          m1,             m3
+    pmulhrsw       m1,             m4
+    paddw          m1,             m5
+
+    packuswb       m1,             m1
+    movh           [r2 + r5],      m1
+
+    lea            r2,             [r2 + 2 * r5]
+    lea            r0,             [r0 + 2 * r3]
+    lea            r1,             [r1 + 2 * r4]
+
+    movu           m0,             [r0]
+    movu           m2,             [r1]
+    paddw          m0,             m2
+    pmulhrsw       m0,             m4
+    paddw          m0,             m5
+
+    packuswb       m0,             m0
+    movh           [r2],           m0
+
+    movh           m0,             [r0 + 16]
+    movh           m1,             [r0 + 16 + r3]
+    movh           m2,             [r1 + 16]
+    movh           m3,             [r1 + 16 + r4]
+
+    punpcklqdq     m0,             m1
+    punpcklqdq     m2,             m3
+
+    paddw          m0,             m2
+    pmulhrsw       m0,             m4
+    paddw          m0,             m5
+
+    packuswb       m0,             m0
+    movd           [r2 + 8],       m0
+    pshufd         m0,             m0,  1
+    movd           [r2 + 8 + r5],  m0
+
+    movu           m1,             [r0 + r3]
+    movu           m3,             [r1 + r4]
+    paddw          m1,             m3
+    pmulhrsw       m1,             m4
+    paddw          m1,             m5
+
+    packuswb       m1,             m1
+    movh           [r2 + r5],      m1
+
+    lea            r2,             [r2 + 2 * r5]
+    lea            r0,             [r0 + 2 * r3]
+    lea            r1,             [r1 + 2 * r4]
+
+    dec            r6d
+    jnz            .loop
+    RET
+%endmacro
+
+ADDAVG_W12_H4 16
+
+;-----------------------------------------------------------------------------
+
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W16_H4 1
+INIT_XMM sse2
+cglobal addAvg_16x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+    mova        m4,              [pw_256]
+    mova        m5,              [pw_128]
+    add         r3,              r3
+    add         r4,              r4
+
+    mov         r6d,             %1/4
+
+.loop:
+    movu        m0,              [r0]
+    movu        m2,              [r1]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 16]
+    movu        m2,              [r1 + 16]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2],            m0
+
+    movu        m1,              [r0 + r3]
+    movu        m3,              [r1 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    movu        m2,              [r0 + 16 + r3]
+    movu        m3,              [r1 + 16 + r4]
+    paddw       m2,              m3
+    pmulhrsw    m2,              m4
+    paddw       m2,              m5
+
+    packuswb    m1,              m2
+    movu        [r2 + r5],       m1
+
+    lea         r2,              [r2 + 2 * r5]
+    lea         r0,              [r0 + 2 * r3]
+    lea         r1,              [r1 + 2 * r4]
+
+    movu        m0,              [r0]
+    movu        m2,              [r1]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 16]
+    movu        m2,              [r1 + 16]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2],            m0
+
+    movu        m1,              [r0 + r3]
+    movu        m3,              [r1 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    movu        m2,              [r0 + 16 + r3]
+    movu        m3,              [r1 + 16 + r4]
+    paddw       m2,              m3
+    pmulhrsw    m2,              m4
+    paddw       m2,              m5
+
+    packuswb    m1,              m2
+    movu        [r2 + r5],       m1
+
+    lea         r2,              [r2 + 2 * r5]
+    lea         r0,              [r0 + 2 * r3]
+    lea         r1,              [r1 + 2 * r4]
+
+    dec         r6d
+    jnz         .loop
+    RET
+%endmacro
+
+ADDAVG_W16_H4 4
+ADDAVG_W16_H4 8
+ADDAVG_W16_H4 12
+ADDAVG_W16_H4 16
+ADDAVG_W16_H4 32
+ADDAVG_W16_H4 64
+
+;-----------------------------------------------------------------------------
+
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W24_H2 2
+INIT_XMM sse2
+cglobal addAvg_%1x%2, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+    mova        m4,              [pw_256]
+    mova        m5,              [pw_128]
+    add         r3,              r3
+    add         r4,              r4
+
+    mov         r6d,             %2/2
+
+.loop:
+    movu        m0,              [r0]
+    movu        m2,              [r1]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 16]
+    movu        m2,              [r1 + 16]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2],            m0
+
+    movu        m0,              [r0 + 32]
+    movu        m2,              [r1 + 32]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    packuswb    m0,              m0
+    movh        [r2 + 16],       m0
+
+    movu        m1,              [r0 + r3]
+    movu        m3,              [r1 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    movu        m2,              [r0 + 16 + r3]
+    movu        m3,              [r1 + 16 + r4]
+    paddw       m2,              m3
+    pmulhrsw    m2,              m4
+    paddw       m2,              m5
+
+    packuswb    m1,              m2
+    movu        [r2 + r5],       m1
+
+    movu        m1,              [r0 + 32 + r3]
+    movu        m3,              [r1 + 32 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m1,              m1
+    movh        [r2 + 16 + r5],  m1
+
+    lea         r2,              [r2 + 2 * r5]
+    lea         r0,              [r0 + 2 * r3]
+    lea         r1,              [r1 + 2 * r4]
+
+    dec         r6d
+    jnz         .loop
+    RET
+%endmacro
+
+ADDAVG_W24_H2 24, 32
+
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W32_H2 1
+INIT_XMM sse2
+cglobal addAvg_32x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+    mova        m4,              [pw_256]
+    mova        m5,              [pw_128]
+    add         r3,              r3
+    add         r4,              r4
+
+    mov         r6d,             %1/2
+
+.loop:
+    movu        m0,              [r0]
+    movu        m2,              [r1]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 16]
+    movu        m2,              [r1 + 16]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2],            m0
+
+    movu        m0,              [r0 + 32]
+    movu        m2,              [r1 + 32]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 48]
+    movu        m2,              [r1 + 48]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2 + 16],       m0
+
+    movu        m1,              [r0 + r3]
+    movu        m3,              [r1 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    movu        m2,              [r0 + 16 + r3]
+    movu        m3,              [r1 + 16 + r4]
+    paddw       m2,              m3
+    pmulhrsw    m2,              m4
+    paddw       m2,              m5
+
+    packuswb    m1,              m2
+    movu        [r2 + r5],       m1
+
+    movu        m1,              [r0 + 32 + r3]
+    movu        m3,              [r1 + 32 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    movu        m2,              [r0 + 48 + r3]
+    movu        m3,              [r1 + 48 + r4]
+    paddw       m2,              m3
+    pmulhrsw    m2,              m4
+    paddw       m2,              m5
+
+    packuswb    m1,              m2
+    movu        [r2 + 16 + r5],  m1
+
+    lea         r2,              [r2 + 2 * r5]
+    lea         r0,              [r0 + 2 * r3]
+    lea         r1,              [r1 + 2 * r4]
+
+    dec         r6d
+    jnz        .loop
+    RET
+%endmacro
+
+ADDAVG_W32_H2 8
+ADDAVG_W32_H2 16
+ADDAVG_W32_H2 24
+ADDAVG_W32_H2 32
+ADDAVG_W32_H2 64
+
+;-----------------------------------------------------------------------------
+
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W48_H2 1
+INIT_XMM sse2
+cglobal addAvg_48x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+
+    mova        m4,              [pw_256]
+    mova        m5,              [pw_128]
+    add         r3,              r3
+    add         r4,              r4
+
+    mov         r6d,             %1/2
+
+.loop:
+    movu        m0,              [r0]
+    movu        m2,              [r1]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 16]
+    movu        m2,              [r1 + 16]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2],            m0
+
+    movu        m0,              [r0 + 32]
+    movu        m2,              [r1 + 32]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 48]
+    movu        m2,              [r1 + 48]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2 + 16],       m0
+
+    movu        m0,              [r0 + 64]
+    movu        m2,              [r1 + 64]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 80]
+    movu        m2,              [r1 + 80]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2 + 32],       m0
+
+    movu        m1,              [r0 + r3]
+    movu        m3,              [r1 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    movu        m2,              [r0 + 16 + r3]
+    movu        m3,              [r1 + 16 + r4]
+    paddw       m2,              m3
+    pmulhrsw    m2,              m4
+    paddw       m2,              m5
+
+    packuswb    m1,              m2
+    movu        [r2 + r5],       m1
+
+    movu        m1,              [r0 + 32 + r3]
+    movu        m3,              [r1 + 32 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    movu        m2,              [r0 + 48 + r3]
+    movu        m3,              [r1 + 48 + r4]
+    paddw       m2,              m3
+    pmulhrsw    m2,              m4
+    paddw       m2,              m5
+
+    packuswb    m1,              m2
+    movu        [r2 + 16 + r5],  m1
+
+    movu        m1,              [r0 + 64 + r3]
+    movu        m3,              [r1 + 64 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    movu        m2,              [r0 + 80 + r3]
+    movu        m3,              [r1 + 80 + r4]
+    paddw       m2,              m3
+    pmulhrsw    m2,              m4
+    paddw       m2,              m5
+
+    packuswb    m1,              m2
+    movu        [r2 + 32 + r5],  m1
+
+    lea         r2,              [r2 + 2 * r5]
+    lea         r0,              [r0 + 2 * r3]
+    lea         r1,              [r1 + 2 * r4]
+
+    dec         r6d
+    jnz         .loop
+    RET
+%endmacro
+
+ADDAVG_W48_H2 64
+
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W64_H1 1
+INIT_XMM sse2
+cglobal addAvg_64x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+    mova        m4,              [pw_256]
+    mova        m5,              [pw_128]
+    add         r3,              r3
+    add         r4,              r4
+
+    mov         r6d,             %1
+
+.loop:
+    movu        m0,              [r0]
+    movu        m2,              [r1]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 16]
+    movu        m2,              [r1 + 16]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2],            m0
+
+    movu        m0,              [r0 + 32]
+    movu        m2,              [r1 + 32]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 48]
+    movu        m2,              [r1 + 48]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2 + 16],       m0
+
+    movu        m0,              [r0 + 64]
+    movu        m2,              [r1 + 64]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 80]
+    movu        m2,              [r1 + 80]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2 + 32],       m0
+
+    movu        m0,              [r0 + 96]
+    movu        m2,              [r1 + 96]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 112]
+    movu        m2,              [r1 + 112]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2 + 48],       m0
+
+    add         r2,              r5
+    add         r0,              r3
+    add         r1,              r4
+
+    dec         r6d
+    jnz         .loop
+    RET
+%endmacro
+
+ADDAVG_W64_H1 16
+ADDAVG_W64_H1 32
+ADDAVG_W64_H1 48
+ADDAVG_W64_H1 64
+;-----------------------------------------------------------------------------
+
 ;=============================================================================
 ; implicit weighted biprediction
 ;=============================================================================
diff -r 4f396ba19b47 -r a72a0900a84d source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Sat Jan 18 12:01:44 2014 +0530
+++ b/source/common/x86/pixel.h	Sat Jan 18 15:09:25 2014 +0530
@@ -166,6 +166,41 @@
 int x265_pixel_ssd_64x48_sse4(pixel *, intptr_t, pixel *, intptr_t);
 int x265_pixel_ssd_64x64_sse4(pixel *, intptr_t, pixel *, intptr_t);
 
+#define ADDAVG(func, args)  \
+    void func ## _sse2 args; \
+    void func ## _sse4 args;
+ADDAVG(x265_addAvg_2x4,    (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_2x8,    (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_4x2,    (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_4x4,    (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_4x8,    (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_4x16,   (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_6x8,    (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_8x2,    (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_8x4,    (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_8x6,    (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_8x8,    (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_8x16,   (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_8x32,   (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_12x16,  (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_16x4,   (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_16x8,   (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_16x12,  (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_16x16,  (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_16x32,  (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_16x64,  (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_24x32,  (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_32x8,   (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_32x16,  (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_32x24,  (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_32x32,  (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_32x64,  (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_64x16,  (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_64x32,  (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_64x48,  (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_64x64,  (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+ADDAVG(x265_addAvg_48x64,  (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t))
+
 #undef DECL_PIXELS
 #undef DECL_HEVC_SSD
 #undef DECL_X1
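
(Not part of the patch: the ADDAVG helper above declares both an _sse2 and an _sse4 prototype for every block size, although mc-a.asm implements only one flavor per size (sse4 for 2x4, 2x8 and 6x8, sse2 for the rest); the unused declarations are never referenced and are harmless. For example, the 8x8 entry expands to the two prototypes below; the typedef is only there to make the snippet self-contained.)

#include <cstdint>

typedef uint8_t pixel;  // assumption: 8-bit build

// ADDAVG(x265_addAvg_8x8, (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t)) expands to:
void x265_addAvg_8x8_sse2(int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t);
void x265_addAvg_8x8_sse4(int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t);
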
diff -r 4f396ba19b47 -r a72a0900a84d source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Sat Jan 18 12:01:44 2014 +0530
+++ b/source/test/pixelharness.cpp	Sat Jan 18 15:09:25 2014 +0530
@@ -763,28 +763,28 @@
     return true;
 }
 
-bool PixelHarness::check_addAvg(addAvg_t ref, addAvg_t opt)
-{
-    ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
-    ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
-
-    int j = 0;
-
-    memset(ref_dest, 0xCD, sizeof(ref_dest));
-    memset(opt_dest, 0xCD, sizeof(opt_dest));
-
-    for (int i = 0; i < ITERS; i++)
-    {
-        ref(ref_dest, STRIDE, sbuf1 + j, STRIDE, sbuf2 + j, STRIDE);
-        opt(opt_dest, STRIDE, sbuf1 + j, STRIDE, sbuf2 + j, STRIDE);
-
-        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
-            return false;
-
-        j += INCR;
-    }
-
-    return true;
+bool PixelHarness::check_addAvg(addAvg_t ref, addAvg_t opt)
+{
+    ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
+    ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
+
+    int j = 0;
+
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+    for (int i = 0; i < ITERS; i++)
+    {
+        ref(sbuf1 + j, sbuf2 + j, ref_dest, STRIDE, STRIDE, STRIDE);
+        opt(sbuf1 + j, sbuf2 + j, opt_dest, STRIDE, STRIDE, STRIDE);
+
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
+            return false;
+
+        j += INCR;
+    }
+
+    return true;
 }
 
 bool PixelHarness::testPartition(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
@@ -959,22 +959,22 @@
         }
     }
 
-    if (opt.luma_addAvg[part])
-    {
-        if (!check_addAvg(ref.luma_addAvg[part], opt.luma_addAvg[part]))
-        {
-            printf("luma_addAvg[%s] failed\n", lumaPartStr[part]);
-            return false;
-        }
-    }
-
-    if (opt.chroma_addAvg[part])
-    {
-        if (!check_addAvg(ref.chroma_addAvg[part], opt.chroma_addAvg[part]))
-        {
-            printf("chroma_addAvg[%s] failed\n", chromaPartStr[part]);
-            return false;
-        }
+    if (opt.luma_addAvg[part])
+    {
+        if (!check_addAvg(ref.luma_addAvg[part], opt.luma_addAvg[part]))
+        {
+            printf("luma_addAvg[%s] failed\n", lumaPartStr[part]);
+            return false;
+        }
+    }
+
+    if (opt.chroma_addAvg[part])
+    {
+        if (!check_addAvg(ref.chroma_addAvg[part], opt.chroma_addAvg[part]))
+        {
+            printf("chroma_addAvg[%s] failed\n", chromaPartStr[part]);
+            return false;
+        }
     }
 
     return true;
@@ -1300,16 +1300,16 @@
         }
     }
 
-    if (opt.luma_addAvg[part])
-    {
-        printf("luma_addAvg[%s]", lumaPartStr[part]);
-        REPORT_SPEEDUP(opt.luma_addAvg[part], ref.luma_addAvg[part], pbuf1, STRIDE, sbuf1, STRIDE, sbuf2, STRIDE);
-    }
-
-    if (opt.chroma_addAvg[part])
-    {
-        printf("chroma_addAvg[%s]", chromaPartStr[part]);
-        REPORT_SPEEDUP(opt.chroma_addAvg[part], ref.chroma_addAvg[part], pbuf1, STRIDE, sbuf1, STRIDE, sbuf2, STRIDE);
+    if (opt.luma_addAvg[part])
+    {
+        printf("luma_addAvg[%s]", lumaPartStr[part]);
+        REPORT_SPEEDUP(opt.luma_addAvg[part], ref.luma_addAvg[part], sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
+    }
+
+    if (opt.chroma_addAvg[part])
+    {
+        printf("chroma_addAvg[%s]", chromaPartStr[part]);
+        REPORT_SPEEDUP(opt.chroma_addAvg[part], ref.chroma_addAvg[part], sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
     }
 
 #undef HEADER


