[x265] [PATCH] asm: code for addAvg luma and chroma all sizes

dnyaneshwar at multicorewareinc.com
Mon Feb 3 09:09:23 CET 2014


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1391413535 -19800
#      Mon Feb 03 13:15:35 2014 +0530
# Node ID b1471c2ff0b4fd3ff0357a3b497bcd6b462ac3c6
# Parent  7ad3e3504ea6e5f7355b21c4c7de44ad9e1c0a2a
asm: code for addAvg luma and chroma all sizes
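
For reference, a minimal C sketch of what the primitive computes with the new
argument order (illustration only, not the committed template in pixel.cpp,
which takes the block size as template parameters; it assumes 8-bit pixels and
the HM-derived constants IF_INTERNAL_PREC == 14, IF_INTERNAL_OFFS == 8192):

    #include <stdint.h>

    typedef uint8_t pixel;

    /* bi-prediction average: combine two 16-bit intermediates into 8-bit pixels */
    static void addAvg_c(int16_t* src0, int16_t* src1, pixel* dst,
                         intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride,
                         int bx, int by)
    {
        const int shiftNum = 14 + 1 - 8;                        /* IF_INTERNAL_PREC + 1 - bitDepth */
        const int offset   = (1 << (shiftNum - 1)) + 2 * 8192;  /* rounding term + 2 * IF_INTERNAL_OFFS */

        for (int y = 0; y < by; y++)
        {
            for (int x = 0; x < bx; x++)
            {
                int v = (src0[x] + src1[x] + offset) >> shiftNum;
                dst[x] = (pixel)(v < 0 ? 0 : (v > 255 ? 255 : v));
            }

            src0 += src0Stride;
            src1 += src1Stride;
            dst  += dstStride;
        }
    }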

diff -r 7ad3e3504ea6 -r b1471c2ff0b4 source/Lib/TLibCommon/TComYuv.cpp
--- a/source/Lib/TLibCommon/TComYuv.cpp	Mon Feb 03 12:32:41 2014 +0530
+++ b/source/Lib/TLibCommon/TComYuv.cpp	Mon Feb 03 13:15:35 2014 +0530
@@ -594,7 +594,7 @@
         src1Stride = srcYuv1->m_width;
         dststride  = getStride();
 
-        primitives.luma_addAvg[part](dstY, dststride, srcY0, src0Stride, srcY1, src1Stride);
+        primitives.luma_addAvg[part](srcY0, srcY1, dstY, src0Stride, src1Stride, dststride);
     }
     if (bChroma)
     {
@@ -602,8 +602,8 @@
         src1Stride = srcYuv1->m_cwidth;
         dststride  = getCStride();
 
-        primitives.chroma[m_csp].addAvg[part](dstU, dststride, srcU0, src0Stride, srcU1, src1Stride);
-        primitives.chroma[m_csp].addAvg[part](dstV, dststride, srcV0, src0Stride, srcV1, src1Stride);
+        primitives.chroma[m_csp].addAvg[part](srcU0, srcU1, dstU, src0Stride, src1Stride, dststride);
+        primitives.chroma[m_csp].addAvg[part](srcV0, srcV1, dstV, src0Stride, src1Stride, dststride);
     }
 }
 
diff -r 7ad3e3504ea6 -r b1471c2ff0b4 source/common/pixel.cpp
--- a/source/common/pixel.cpp	Mon Feb 03 12:32:41 2014 +0530
+++ b/source/common/pixel.cpp	Mon Feb 03 13:15:35 2014 +0530
@@ -802,7 +802,7 @@
 }
 
 template<int bx, int by>
-void addAvg(pixel* dst, intptr_t dstStride, int16_t* src0, intptr_t src0Stride, int16_t* src1, intptr_t src1Stride)
+void addAvg(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
 {
     int shiftNum, offset;
 
diff -r 7ad3e3504ea6 -r b1471c2ff0b4 source/common/primitives.h
--- a/source/common/primitives.h	Mon Feb 03 12:32:41 2014 +0530
+++ b/source/common/primitives.h	Mon Feb 03 13:15:35 2014 +0530
@@ -184,7 +184,7 @@
 typedef void (*pixel_sub_ps_t)(int16_t *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
 typedef void (*pixel_add_ps_t)(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t sstride0, intptr_t sstride1);
 
-typedef void (*addAvg_t)(pixel* dst, intptr_t dstStride, int16_t* src0, intptr_t src0Stride, int16_t* src1, intptr_t src1Stride);
+typedef void (*addAvg_t)(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
 
 /* Define a structure containing function pointers to optimized encoder
  * primitives.  Each pointer can reference either an assembly routine,
diff -r 7ad3e3504ea6 -r b1471c2ff0b4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Feb 03 12:32:41 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Feb 03 13:15:35 2014 +0530
@@ -563,6 +563,62 @@
     SETUP_PIXEL_SSE_SP_DEF(64, 16, cpu); \
     SETUP_PIXEL_SSE_SP_DEF(16, 64, cpu);
 
+#define SETUP_LUMA_ADDAVG_FUNC_DEF(W, H, cpu) \
+    p.luma_addAvg[LUMA_## W ## x ## H] = x265_addAvg_## W ## x ## H ## cpu;
+
+#define LUMA_ADDAVG(cpu) \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(4,  4,  cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(4,  8,  cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(4,  16, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(8,  4,  cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(8,  8,  cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(8,  16, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(8,  32, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(12, 16, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 4,  cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 8,  cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 12, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 16, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 32, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(24, 32, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(16, 64, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 8,  cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 16, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 24, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 32, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(32, 64, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(48, 64, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(64, 16, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(64, 32, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(64, 48, cpu); \
+    SETUP_LUMA_ADDAVG_FUNC_DEF(64, 64, cpu);
+
+#define SETUP_CHROMA_ADDAVG_FUNC_DEF(W, H, cpu) \
+    p.chroma[X265_CSP_I420].addAvg[CHROMA_## W ## x ## H] = x265_addAvg_## W ## x ## H ## cpu;
+
+#define CHROMA_ADDAVG(cpu) \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(4,  2,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(4,  4,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(4,  8,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(4,  16, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  2,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  4,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  6,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  8,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  16, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  32, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(12, 16, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 4,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 8,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 12, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 16, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(16, 32, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(24, 32, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 8,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 16, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 24, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(32, 32, cpu);
+
 #define SETUP_INTRA_ANG4(mode, fno, cpu) \
     p.intra_pred[BLOCK_4x4][mode] = x265_intra_pred_ang4_ ## fno ## _ ## cpu;
 #define SETUP_INTRA_ANG8(mode, fno, cpu) \
@@ -887,6 +943,9 @@
         p.dct[DCT_4x4] = x265_dct4_sse2;
         p.idct[IDCT_4x4] = x265_idct4_sse2;
         p.idct[IDST_4x4] = x265_idst4_sse2;
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
+        LUMA_ADDAVG(_ssse3);
+        CHROMA_ADDAVG(_ssse3);
+
@@ -1043,6 +1102,10 @@
         SETUP_INTRA_ANG32(33, 33, sse4);
 
         p.dct[DCT_8x8] = x265_dct8_sse4;
+
+        p.chroma[X265_CSP_I420].addAvg[CHROMA_2x4]  = x265_addAvg_2x4_sse4;
+        p.chroma[X265_CSP_I420].addAvg[CHROMA_2x8]  = x265_addAvg_2x8_sse4;
+        p.chroma[X265_CSP_I420].addAvg[CHROMA_6x8]  = x265_addAvg_6x8_sse4;
     }
     if (cpuMask & X265_CPU_AVX)
     {
diff -r 7ad3e3504ea6 -r b1471c2ff0b4 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Mon Feb 03 12:32:41 2014 +0530
+++ b/source/common/x86/const-a.asm	Mon Feb 03 13:15:35 2014 +0530
@@ -33,6 +33,8 @@
 const pw_1,        times 16 dw 1
 const pw_16,       times 16 dw 16
 const pw_32,       times 16 dw 32
+const pw_128,      times 16 dw 128
+const pw_256,      times 16 dw 256
 const pw_512,      times 16 dw 512
 const pw_1024,     times 16 dw 1024
 const pw_4096,     times 16 dw 4096
diff -r 7ad3e3504ea6 -r b1471c2ff0b4 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Mon Feb 03 12:32:41 2014 +0530
+++ b/source/common/x86/mc-a.asm	Mon Feb 03 13:15:35 2014 +0530
@@ -49,6 +49,8 @@
 cextern pw_8
 cextern pw_32
 cextern pw_64
+cextern pw_128
+cextern pw_256
 cextern pw_512
 cextern pw_00ff
 cextern pw_pixel_max
@@ -56,6 +58,1042 @@
 cextern pd_32
 cextern deinterleave_shufd
 
+;====================================================================================================================
+;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
+;====================================================================================================================
+; r0 = src0,       r1 = src1
+; r2 = dst,        r3 = src0Stride
+; r4 = src1Stride, r5 = dstStride
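+;
+; Rounding note (assuming 8-bit pixel depth): the C reference computes
+;     dst = clip((src0 + src1 + 64 + 2 * 8192) >> 7)
+; The routines below realize this as pmulhrsw(src0 + src1, pw_256), which equals
+; (src0 + src1 + 64) >> 7, followed by paddw with pw_128 (the (2 * 8192) >> 7
+; term) and packuswb to clamp the result to [0, 255].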
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_2x4, 6,6,8, src0, src1, dst, src0Stride, src1Stride, dstStride
+
+    mova          m0,          [pw_256]
+    mova          m7,          [pw_128]
+    add           r3,          r3
+    add           r4,          r4
+
+    movd          m1,          [r0]
+    movd          m2,          [r0 + r3]
+    movd          m3,          [r1]
+    movd          m4,          [r1 + r4]
+
+    punpckldq     m1,          m2
+    punpckldq     m3,          m4
+
+    lea           r0,          [r0 + 2 * r3]
+    lea           r1,          [r1 + 2 * r4]
+
+    movd          m2,          [r0]
+    movd          m4,          [r0 + r3]
+    movd          m5,          [r1]
+    movd          m6,          [r1 + r4]
+
+    punpckldq     m2,          m4
+    punpckldq     m5,          m6
+    punpcklqdq    m1,          m2
+    punpcklqdq    m3,          m5
+
+    paddw         m1,          m3
+    pmulhrsw      m1,          m0
+    paddw         m1,          m7
+    packuswb      m1,          m1
+
+    pextrw        [r2],        m1, 0
+    pextrw        [r2 + r5],   m1, 1
+    lea           r2,          [r2 + 2 * r5]
+    pextrw        [r2],        m1, 2
+    pextrw        [r2 + r5],   m1, 3
+
+    RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_2x8, 6,6,8, src0, src1, dst, src0Stride, src1Stride, dstStride
+
+    mova          m0,          [pw_256]
+    mova          m7,          [pw_128]
+    add           r3,          r3
+    add           r4,          r4
+
+    movd          m1,          [r0]
+    movd          m2,          [r0 + r3]
+    movd          m3,          [r1]
+    movd          m4,          [r1 + r4]
+
+    punpckldq     m1,          m2
+    punpckldq     m3,          m4
+
+    lea           r0,          [r0 + 2 * r3]
+    lea           r1,          [r1 + 2 * r4]
+
+    movd          m2,          [r0]
+    movd          m4,          [r0 + r3]
+    movd          m5,          [r1]
+    movd          m6,          [r1 + r4]
+
+    punpckldq     m2,          m4
+    punpckldq     m5,          m6
+    punpcklqdq    m1,          m2
+    punpcklqdq    m3,          m5
+
+    paddw         m1,          m3
+    pmulhrsw      m1,          m0
+    paddw         m1,          m7
+    packuswb      m1,          m1
+
+    pextrw        [r2],        m1, 0
+    pextrw        [r2 + r5],   m1, 1
+    lea           r2,          [r2 + 2 * r5]
+    pextrw        [r2],        m1, 2
+    pextrw        [r2 + r5],   m1, 3
+
+    lea           r2,          [r2 + 2 * r5]
+    lea           r0,          [r0 + 2 * r3]
+    lea           r1,          [r1 + 2 * r4]
+
+    movd          m1,          [r0]
+    movd          m2,          [r0 + r3]
+    movd          m3,          [r1]
+    movd          m4,          [r1 + r4]
+
+    punpckldq     m1,          m2
+    punpckldq     m3,          m4
+
+    lea           r0,          [r0 + 2 * r3]
+    lea           r1,          [r1 + 2 * r4]
+
+    movd          m2,          [r0]
+    movd          m4,          [r0 + r3]
+    movd          m5,          [r1]
+    movd          m6,          [r1 + r4]
+
+    punpckldq     m2,          m4
+    punpckldq     m5,          m6
+    punpcklqdq    m1,          m2
+    punpcklqdq    m3,          m5
+
+    paddw         m1,          m3
+    pmulhrsw      m1,          m0
+    paddw         m1,          m7
+    packuswb      m1,          m1
+
+    pextrw        [r2],        m1, 0
+    pextrw        [r2 + r5],   m1, 1
+    lea           r2,          [r2 + 2 * r5]
+    pextrw        [r2],        m1, 2
+    pextrw        [r2 + r5],   m1, 3
+
+    RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal addAvg_4x2, 6,6,4, src0, src1, dst, src0Stride, src1Stride, dstStride
+
+    mova           m1,          [pw_256]
+    mova           m3,          [pw_128]
+    add            r3,          r3
+    add            r4,          r4
+
+    movh           m0,          [r0]
+    movhps         m0,          [r0 + r3]
+    movh           m2,          [r1]
+    movhps         m2,          [r1 + r4]
+
+    paddw          m0,          m2
+    pmulhrsw       m0,          m1
+    paddw          m0,          m3
+
+    packuswb       m0,          m0
+    movd           [r2],        m0
+    pshufd         m0,          m0, 1
+    movd           [r2 + r5],   m0
+
+    RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W4_H4 1
+INIT_XMM ssse3
+cglobal addAvg_4x%1, 6,7,4, src0, src1, dst, src0Stride, src1Stride, dstStride
+    mova           m1,          [pw_256]
+    mova           m3,          [pw_128]
+    add            r3,          r3
+    add            r4,          r4
+
+    mov            r6d,         %1/4
+
+.loop
+    movh           m0,          [r0]
+    movhps         m0,          [r0 + r3]
+    movh           m2,          [r1]
+    movhps         m2,          [r1 + r4]
+
+    paddw          m0,          m2
+    pmulhrsw       m0,          m1
+    paddw          m0,          m3
+
+    packuswb       m0,          m0
+    movd           [r2],        m0
+    pshufd         m0,          m0, 1
+    movd           [r2 + r5],   m0
+
+    lea            r2,          [r2 + 2 * r5]
+    lea            r0,          [r0 + 2 * r3]
+    lea            r1,          [r1 + 2 * r4]
+
+    movh           m0,          [r0]
+    movhps         m0,          [r0 + r3]
+    movh           m2,          [r1]
+    movhps         m2,          [r1 + r4]
+
+    paddw          m0,          m2
+    pmulhrsw       m0,          m1
+    paddw          m0,          m3
+
+    packuswb       m0,          m0
+    movd           [r2],        m0
+    pshufd         m0,          m0, 1
+    movd           [r2 + r5],   m0
+
+    lea            r2,          [r2 + 2 * r5]
+    lea            r0,          [r0 + 2 * r3]
+    lea            r1,          [r1 + 2 * r4]
+
+    dec            r6d
+    jnz            .loop
+    RET
+%endmacro
+
+ADDAVG_W4_H4 4
+ADDAVG_W4_H4 8
+ADDAVG_W4_H4 16
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_6x8, 6,6,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+
+    mova        m4,             [pw_256]
+    mova        m5,             [pw_128]
+    add         r3,             r3
+    add         r4,             r4
+
+    movu        m0,             [r0]
+    movu        m2,             [r1]
+    paddw       m0,             m2
+    pmulhrsw    m0,             m4
+    paddw       m0,             m5
+    packuswb    m0,             m0
+    movd        [r2],           m0
+    pextrw      [r2 + 4],       m0, 2
+
+    movu        m1,             [r0 + r3]
+    movu        m3,             [r1 + r4]
+    paddw       m1,             m3
+    pmulhrsw    m1,             m4
+    paddw       m1,             m5
+    packuswb    m1,             m1
+    movd        [r2 + r5],      m1
+    pextrw      [r2 + r5 + 4],  m1, 2
+
+    lea         r2,             [r2 + 2 * r5]
+    lea         r0,             [r0 + 2 * r3]
+    lea         r1,             [r1 + 2 * r4]
+
+    movu        m0,             [r0]
+    movu        m2,             [r1]
+    paddw       m0,             m2
+    pmulhrsw    m0,             m4
+    paddw       m0,             m5
+    packuswb    m0,             m0
+    movd        [r2],           m0
+    pextrw      [r2 + 4],       m0, 2
+
+    movu        m1,             [r0 + r3]
+    movu        m3,             [r1 + r4]
+    paddw       m1,             m3
+    pmulhrsw    m1,             m4
+    paddw       m1,             m5
+    packuswb    m1,             m1
+    movd        [r2 + r5],      m1
+    pextrw      [r2 + r5 + 4],  m1, 2
+
+    lea         r2,             [r2 + 2 * r5]
+    lea         r0,             [r0 + 2 * r3]
+    lea         r1,             [r1 + 2 * r4]
+
+    movu        m0,             [r0]
+    movu        m2,             [r1]
+    paddw       m0,             m2
+    pmulhrsw    m0,             m4
+    paddw       m0,             m5
+    packuswb    m0,             m0
+    movd        [r2],           m0
+    pextrw      [r2 + 4],       m0, 2
+
+    movu        m1,             [r0 + r3]
+    movu        m3,             [r1 + r4]
+    paddw       m1,             m3
+    pmulhrsw    m1,             m4
+    paddw       m1,             m5
+    packuswb    m1,             m1
+    movd        [r2 + r5],      m1
+    pextrw      [r2 + r5 + 4],  m1, 2
+
+    lea         r2,             [r2 + 2 * r5]
+    lea         r0,             [r0 + 2 * r3]
+    lea         r1,             [r1 + 2 * r4]
+
+    movu        m0,             [r0]
+    movu        m2,             [r1]
+    paddw       m0,             m2
+    pmulhrsw    m0,             m4
+    paddw       m0,             m5
+    packuswb    m0,             m0
+    movd        [r2],           m0
+    pextrw      [r2 + 4],       m0, 2
+
+    movu        m1,             [r0 + r3]
+    movu        m3,             [r1 + r4]
+    paddw       m1,             m3
+    pmulhrsw    m1,             m4
+    paddw       m1,             m5
+    packuswb    m1,             m1
+    movd        [r2 + r5],      m1
+    pextrw      [r2 + r5 + 4],  m1, 2
+
+    RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal addAvg_8x2, 6,6,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+    mova        m4,          [pw_256]
+    mova        m5,          [pw_128]
+    add         r3,          r3
+    add         r4,          r4
+
+    movu        m0,          [r0]
+    movu        m2,          [r1]
+    paddw       m0,          m2
+    pmulhrsw    m0,          m4
+    paddw       m0,          m5
+    packuswb    m0,          m0
+    movh        [r2],        m0
+
+    movu        m1,          [r0 + r3]
+    movu        m3,          [r1 + r4]
+    paddw       m1,          m3
+    pmulhrsw    m1,          m4
+    paddw       m1,          m5
+    packuswb    m1,          m1
+    movh        [r2 + r5],   m1
+
+    RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal addAvg_8x6, 6,6,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+
+    mova        m4,          [pw_256]
+    mova        m5,          [pw_128]
+    add         r3,          r3
+    add         r4,          r4
+
+    movu        m0,          [r0]
+    movu        m2,          [r1]
+    paddw       m0,          m2
+    pmulhrsw    m0,          m4
+    paddw       m0,          m5
+    packuswb    m0,          m0
+    movh        [r2],        m0
+
+    movu        m1,          [r0 + r3]
+    movu        m3,          [r1 + r4]
+    paddw       m1,          m3
+    pmulhrsw    m1,          m4
+    paddw       m1,          m5
+    packuswb    m1,          m1
+    movh        [r2 + r5],   m1
+
+    lea         r2,          [r2 + 2 * r5]
+    lea         r0,          [r0 + 2 * r3]
+    lea         r1,          [r1 + 2 * r4]
+
+    movu        m0,          [r0]
+    movu        m2,          [r1]
+    paddw       m0,          m2
+    pmulhrsw    m0,          m4
+    paddw       m0,          m5
+    packuswb    m0,          m0
+    movh        [r2],        m0
+
+    movu        m1,          [r0 + r3]
+    movu        m3,          [r1 + r4]
+    paddw       m1,          m3
+    pmulhrsw    m1,          m4
+    paddw       m1,          m5
+    packuswb    m1,          m1
+    movh        [r2 + r5],   m1
+
+    lea         r2,          [r2 + 2 * r5]
+    lea         r0,          [r0 + 2 * r3]
+    lea         r1,          [r1 + 2 * r4]
+
+    movu        m0,          [r0]
+    movu        m2,          [r1]
+    paddw       m0,          m2
+    pmulhrsw    m0,          m4
+    paddw       m0,          m5
+    packuswb    m0,          m0
+    movh        [r2],        m0
+
+    movu        m1,          [r0 + r3]
+    movu        m3,          [r1 + r4]
+    paddw       m1,          m3
+    pmulhrsw    m1,          m4
+    paddw       m1,          m5
+    packuswb    m1,          m1
+    movh        [r2 + r5],   m1
+
+    RET
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W8_H4 1
+INIT_XMM ssse3
+cglobal addAvg_8x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+
+    mova        m4,          [pw_256]
+    mova        m5,          [pw_128]
+    add         r3,          r3
+    add         r4,          r4
+
+    mov         r6d,         %1/4
+
+.loop
+    movu        m0,          [r0]
+    movu        m2,          [r1]
+    paddw       m0,          m2
+    pmulhrsw    m0,          m4
+    paddw       m0,          m5
+
+    packuswb    m0,          m0
+    movh        [r2],        m0
+
+    movu        m1,          [r0 + r3]
+    movu        m3,          [r1 + r4]
+    paddw       m1,          m3
+    pmulhrsw    m1,          m4
+    paddw       m1,          m5
+
+    packuswb    m1,          m1
+    movh        [r2 + r5],   m1
+
+    lea         r2,          [r2 + 2 * r5]
+    lea         r0,          [r0 + 2 * r3]
+    lea         r1,          [r1 + 2 * r4]
+
+    movu        m0,          [r0]
+    movu        m2,          [r1]
+    paddw       m0,          m2
+    pmulhrsw    m0,          m4
+    paddw       m0,          m5
+
+    packuswb    m0,          m0
+    movh        [r2],        m0
+
+    movu        m1,          [r0 + r3]
+    movu        m3,          [r1 + r4]
+    paddw       m1,          m3
+    pmulhrsw    m1,          m4
+    paddw       m1,          m5
+
+    packuswb    m1,          m1
+    movh        [r2 + r5],   m1
+
+    lea         r2,          [r2 + 2 * r5]
+    lea         r0,          [r0 + 2 * r3]
+    lea         r1,          [r1 + 2 * r4]
+
+    dec         r6d
+    jnz         .loop
+    RET
+%endmacro
+
+ADDAVG_W8_H4 4
+ADDAVG_W8_H4 8
+ADDAVG_W8_H4 16
+ADDAVG_W8_H4 32
+
+;-----------------------------------------------------------------------------
+
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W12_H4 1
+INIT_XMM ssse3
+cglobal addAvg_12x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+    mova           m4,             [pw_256]
+    mova           m5,             [pw_128]
+    add            r3,             r3
+    add            r4,             r4
+
+    mov            r6d,            %1/4
+
+.loop
+    movu           m0,             [r0]
+    movu           m2,             [r1]
+    paddw          m0,             m2
+    pmulhrsw       m0,             m4
+    paddw          m0,             m5
+    packuswb       m0,             m0
+    movh           [r2],           m0
+
+    movh           m0,             [r0 + 16]
+    movhps         m0,             [r0 + 16 + r3]
+    movh           m2,             [r1 + 16]
+    movhps         m2,             [r1 + 16 + r4]
+
+    paddw          m0,             m2
+    pmulhrsw       m0,             m4
+    paddw          m0,             m5
+
+    packuswb       m0,             m0
+    movd           [r2 + 8],       m0
+    pshufd         m0,             m0, 1
+    movd           [r2 + 8 + r5],  m0
+
+    movu           m1,             [r0 + r3]
+    movu           m3,             [r1 + r4]
+    paddw          m1,             m3
+    pmulhrsw       m1,             m4
+    paddw          m1,             m5
+
+    packuswb       m1,             m1
+    movh           [r2 + r5],      m1
+
+    lea            r2,             [r2 + 2 * r5]
+    lea            r0,             [r0 + 2 * r3]
+    lea            r1,             [r1 + 2 * r4]
+
+    movu           m0,             [r0]
+    movu           m2,             [r1]
+    paddw          m0,             m2
+    pmulhrsw       m0,             m4
+    paddw          m0,             m5
+
+    packuswb       m0,             m0
+    movh           [r2],           m0
+
+    movh           m0,             [r0 + 16]
+    movhps         m0,             [r0 + 16 + r3]
+    movh           m2,             [r1 + 16]
+    movhps         m2,             [r1 + 16 + r4]
+
+    paddw          m0,             m2
+    pmulhrsw       m0,             m4
+    paddw          m0,             m5
+
+    packuswb       m0,             m0
+    movd           [r2 + 8],       m0
+    pshufd         m0,             m0,  1
+    movd           [r2 + 8 + r5],  m0
+
+    movu           m1,             [r0 + r3]
+    movu           m3,             [r1 + r4]
+    paddw          m1,             m3
+    pmulhrsw       m1,             m4
+    paddw          m1,             m5
+
+    packuswb       m1,             m1
+    movh           [r2 + r5],      m1
+
+    lea            r2,             [r2 + 2 * r5]
+    lea            r0,             [r0 + 2 * r3]
+    lea            r1,             [r1 + 2 * r4]
+
+    dec            r6d
+    jnz            .loop
+    RET
+%endmacro
+
+ADDAVG_W12_H4 16
+
+;-----------------------------------------------------------------------------
+
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W16_H4 1
+INIT_XMM ssse3
+cglobal addAvg_16x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+    mova        m4,              [pw_256]
+    mova        m5,              [pw_128]
+    add         r3,              r3
+    add         r4,              r4
+
+    mov         r6d,             %1/4
+
+.loop
+    movu        m0,              [r0]
+    movu        m2,              [r1]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 16]
+    movu        m2,              [r1 + 16]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2],            m0
+
+    movu        m1,              [r0 + r3]
+    movu        m3,              [r1 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    movu        m2,              [r0 + 16 + r3]
+    movu        m3,              [r1 + 16 + r4]
+    paddw       m2,              m3
+    pmulhrsw    m2,              m4
+    paddw       m2,              m5
+
+    packuswb    m1,              m2
+    movu        [r2 + r5],       m1
+
+    lea         r2,              [r2 + 2 * r5]
+    lea         r0,              [r0 + 2 * r3]
+    lea         r1,              [r1 + 2 * r4]
+
+    movu        m0,              [r0]
+    movu        m2,              [r1]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 16]
+    movu        m2,              [r1 + 16]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2],            m0
+
+    movu        m1,              [r0 + r3]
+    movu        m3,              [r1 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    movu        m2,              [r0 + 16 + r3]
+    movu        m3,              [r1 + 16 + r4]
+    paddw       m2,              m3
+    pmulhrsw    m2,              m4
+    paddw       m2,              m5
+
+    packuswb    m1,              m2
+    movu        [r2 + r5],       m1
+
+    lea         r2,              [r2 + 2 * r5]
+    lea         r0,              [r0 + 2 * r3]
+    lea         r1,              [r1 + 2 * r4]
+
+    dec         r6d
+    jnz         .loop
+    RET
+%endmacro
+
+ADDAVG_W16_H4 4
+ADDAVG_W16_H4 8
+ADDAVG_W16_H4 12
+ADDAVG_W16_H4 16
+ADDAVG_W16_H4 32
+ADDAVG_W16_H4 64
+
+;-----------------------------------------------------------------------------
+
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W24_H2 2
+INIT_XMM ssse3
+cglobal addAvg_%1x%2, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+    mova        m4,              [pw_256]
+    mova        m5,              [pw_128]
+    add         r3,              r3
+    add         r4,              r4
+
+    mov         r6d,             %2/2
+
+.loop
+    movu        m0,              [r0]
+    movu        m2,              [r1]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 16]
+    movu        m2,              [r1 + 16]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2],            m0
+
+    movu        m0,              [r0 + 32]
+    movu        m2,              [r1 + 32]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    packuswb    m0,              m0
+    movh        [r2 + 16],       m0
+
+    movu        m1,              [r0 + r3]
+    movu        m3,              [r1 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    movu        m2,              [r0 + 16 + r3]
+    movu        m3,              [r1 + 16 + r4]
+    paddw       m2,              m3
+    pmulhrsw    m2,              m4
+    paddw       m2,              m5
+
+    packuswb    m1,              m2
+    movu        [r2 + r5],       m1
+
+    movu        m1,              [r0 + 32 + r3]
+    movu        m3,              [r1 + 32 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m1,              m1
+    movh        [r2 + 16 + r5],  m1
+
+    lea         r2,              [r2 + 2 * r5]
+    lea         r0,              [r0 + 2 * r3]
+    lea         r1,              [r1 + 2 * r4]
+
+    dec         r6d
+    jnz         .loop
+    RET
+%endmacro
+
+ADDAVG_W24_H2 24, 32
+
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W32_H2 1
+INIT_XMM ssse3
+cglobal addAvg_32x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+    mova        m4,              [pw_256]
+    mova        m5,              [pw_128]
+    add         r3,              r3
+    add         r4,              r4
+
+    mov         r6d,             %1/2
+
+.loop
+    movu        m0,              [r0]
+    movu        m2,              [r1]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 16]
+    movu        m2,              [r1 + 16]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2],            m0
+
+    movu        m0,              [r0 + 32]
+    movu        m2,              [r1 + 32]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 48]
+    movu        m2,              [r1 + 48]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2 + 16],       m0
+
+    movu        m1,              [r0 + r3]
+    movu        m3,              [r1 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    movu        m2,              [r0 + 16 + r3]
+    movu        m3,              [r1 + 16 + r4]
+    paddw       m2,              m3
+    pmulhrsw    m2,              m4
+    paddw       m2,              m5
+
+    packuswb    m1,              m2
+    movu        [r2 + r5],       m1
+
+    movu        m1,              [r0 + 32 + r3]
+    movu        m3,              [r1 + 32 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    movu        m2,              [r0 + 48 + r3]
+    movu        m3,              [r1 + 48 + r4]
+    paddw       m2,              m3
+    pmulhrsw    m2,              m4
+    paddw       m2,              m5
+
+    packuswb    m1,              m2
+    movu        [r2 + 16 + r5],  m1
+
+    lea         r2,              [r2 + 2 * r5]
+    lea         r0,              [r0 + 2 * r3]
+    lea         r1,              [r1 + 2 * r4]
+
+    dec         r6d
+    jnz        .loop
+    RET
+%endmacro
+
+ADDAVG_W32_H2 8
+ADDAVG_W32_H2 16
+ADDAVG_W32_H2 24
+ADDAVG_W32_H2 32
+ADDAVG_W32_H2 64
+
+;-----------------------------------------------------------------------------
+
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W48_H2 1
+INIT_XMM ssse3
+cglobal addAvg_48x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+
+    mova        m4,              [pw_256]
+    mova        m5,              [pw_128]
+    add         r3,              r3
+    add         r4,              r4
+
+    mov         r6d,             %1/2
+
+.loop
+    movu        m0,              [r0]
+    movu        m2,              [r1]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 16]
+    movu        m2,              [r1 + 16]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2],            m0
+
+    movu        m0,              [r0 + 32]
+    movu        m2,              [r1 + 32]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 48]
+    movu        m2,              [r1 + 48]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2 + 16],       m0
+
+    movu        m0,              [r0 + 64]
+    movu        m2,              [r1 + 64]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 80]
+    movu        m2,              [r1 + 80]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2 + 32],       m0
+
+    movu        m1,              [r0 + r3]
+    movu        m3,              [r1 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    movu        m2,              [r0 + 16 + r3]
+    movu        m3,              [r1 + 16 + r4]
+    paddw       m2,              m3
+    pmulhrsw    m2,              m4
+    paddw       m2,              m5
+
+    packuswb    m1,              m2
+    movu        [r2 + r5],       m1
+
+    movu        m1,              [r0 + 32 + r3]
+    movu        m3,              [r1 + 32 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    movu        m2,              [r0 + 48 + r3]
+    movu        m3,              [r1 + 48 + r4]
+    paddw       m2,              m3
+    pmulhrsw    m2,              m4
+    paddw       m2,              m5
+
+    packuswb    m1,              m2
+    movu        [r2 + 16 + r5],  m1
+
+    movu        m1,              [r0 + 64 + r3]
+    movu        m3,              [r1 + 64 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    movu        m2,              [r0 + 80 + r3]
+    movu        m3,              [r1 + 80 + r4]
+    paddw       m2,              m3
+    pmulhrsw    m2,              m4
+    paddw       m2,              m5
+
+    packuswb    m1,              m2
+    movu        [r2 + 32 + r5],  m1
+
+    lea         r2,              [r2 + 2 * r5]
+    lea         r0,              [r0 + 2 * r3]
+    lea         r1,              [r1 + 2 * r4]
+
+    dec         r6d
+    jnz         .loop
+    RET
+%endmacro
+
+ADDAVG_W48_H2 64
+
+;-----------------------------------------------------------------------------
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W64_H1 1
+INIT_XMM ssse3
+cglobal addAvg_64x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+    mova        m4,              [pw_256]
+    mova        m5,              [pw_128]
+    add         r3,              r3
+    add         r4,              r4
+
+    mov         r6d,             %1
+
+.loop
+    movu        m0,              [r0]
+    movu        m2,              [r1]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 16]
+    movu        m2,              [r1 + 16]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2],            m0
+
+    movu        m0,              [r0 + 32]
+    movu        m2,              [r1 + 32]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 48]
+    movu        m2,              [r1 + 48]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2 + 16],       m0
+
+    movu        m0,              [r0 + 64]
+    movu        m2,              [r1 + 64]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 80]
+    movu        m2,              [r1 + 80]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2 + 32],       m0
+
+    movu        m0,              [r0 + 96]
+    movu        m2,              [r1 + 96]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m4
+    paddw       m0,              m5
+
+    movu        m1,              [r0 + 112]
+    movu        m2,              [r1 + 112]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m4
+    paddw       m1,              m5
+
+    packuswb    m0,              m1
+    movu        [r2 + 48],       m0
+
+    add         r2,              r5
+    add         r0,              r3
+    add         r1,              r4
+
+    dec         r6d
+    jnz         .loop
+    RET
+%endmacro
+
+ADDAVG_W64_H1 16
+ADDAVG_W64_H1 32
+ADDAVG_W64_H1 48
+ADDAVG_W64_H1 64
+;-----------------------------------------------------------------------------
+
 ;=============================================================================
 ; implicit weighted biprediction
 ;=============================================================================
diff -r 7ad3e3504ea6 -r b1471c2ff0b4 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Mon Feb 03 12:32:41 2014 +0530
+++ b/source/common/x86/pixel.h	Mon Feb 03 13:15:35 2014 +0530
@@ -166,6 +166,41 @@
 int x265_pixel_ssd_64x48_sse4(pixel *, intptr_t, pixel *, intptr_t);
 int x265_pixel_ssd_64x64_sse4(pixel *, intptr_t, pixel *, intptr_t);
 
+#define ADDAVG(func)  \
+    void x265_ ## func ## _ssse3(int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
+    void x265_ ## func ## _sse4 (int16_t*, int16_t*, pixel*, intptr_t, intptr_t, intptr_t);
+ADDAVG(addAvg_2x4)
+ADDAVG(addAvg_2x8)
+ADDAVG(addAvg_4x2)
+ADDAVG(addAvg_4x4)
+ADDAVG(addAvg_4x8)
+ADDAVG(addAvg_4x16)
+ADDAVG(addAvg_6x8)
+ADDAVG(addAvg_8x2)
+ADDAVG(addAvg_8x4)
+ADDAVG(addAvg_8x6)
+ADDAVG(addAvg_8x8)
+ADDAVG(addAvg_8x16)
+ADDAVG(addAvg_8x32)
+ADDAVG(addAvg_12x16)
+ADDAVG(addAvg_16x4)
+ADDAVG(addAvg_16x8)
+ADDAVG(addAvg_16x12)
+ADDAVG(addAvg_16x16)
+ADDAVG(addAvg_16x32)
+ADDAVG(addAvg_16x64)
+ADDAVG(addAvg_24x32)
+ADDAVG(addAvg_32x8)
+ADDAVG(addAvg_32x16)
+ADDAVG(addAvg_32x24)
+ADDAVG(addAvg_32x32)
+ADDAVG(addAvg_32x64)
+ADDAVG(addAvg_48x64)
+ADDAVG(addAvg_64x16)
+ADDAVG(addAvg_64x32)
+ADDAVG(addAvg_64x48)
+ADDAVG(addAvg_64x64)
+
 #undef DECL_PIXELS
 #undef DECL_HEVC_SSD
 #undef DECL_X1
diff -r 7ad3e3504ea6 -r b1471c2ff0b4 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Mon Feb 03 12:32:41 2014 +0530
+++ b/source/test/pixelharness.cpp	Mon Feb 03 13:15:35 2014 +0530
@@ -776,8 +776,8 @@
 
     for (int i = 0; i < ITERS; i++)
     {
-        ref(ref_dest, STRIDE, sbuf1 + j, STRIDE, sbuf2 + j, STRIDE);
-        opt(opt_dest, STRIDE, sbuf1 + j, STRIDE, sbuf2 + j, STRIDE);
+        ref(sbuf1 + j, sbuf2 + j, ref_dest, STRIDE, STRIDE, STRIDE);
+        opt(sbuf1 + j, sbuf2 + j, opt_dest, STRIDE, STRIDE, STRIDE);
 
         if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
             return false;
@@ -1301,14 +1301,14 @@
         if (opt.chroma[i].addAvg[part])
         {
             HEADER("[%s]  add_ps[%s]", x265_source_csp_names[i], chromaPartStr[part]);
-            REPORT_SPEEDUP(opt.chroma[i].addAvg[part], ref.chroma[i].addAvg[part], pbuf1, STRIDE, sbuf1, STRIDE, sbuf2, STRIDE);
+            REPORT_SPEEDUP(opt.chroma[i].addAvg[part], ref.chroma[i].addAvg[part], sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
         }
     }
 
     if (opt.luma_addAvg[part])
     {
         printf("luma_addAvg[%s]", lumaPartStr[part]);
-        REPORT_SPEEDUP(opt.luma_addAvg[part], ref.luma_addAvg[part], pbuf1, STRIDE, sbuf1, STRIDE, sbuf2, STRIDE);
+        REPORT_SPEEDUP(opt.luma_addAvg[part], ref.luma_addAvg[part], sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
     }
 
 #undef HEADER


