[x265] [PATCH] asm-16bpp: code for addAvg luma and chroma all sizes

dnyaneshwar at multicorewareinc.com
Wed Feb 19 11:53:47 CET 2014


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1392807092 -19800
#      Wed Feb 19 16:21:32 2014 +0530
# Node ID cede20cde62ba0a96ac181bcf78a508097de0e7c
# Parent  6150985c3d535f0ea7a1dc0b8f3c69e65e30d25b
asm-16bpp: code for addAvg luma and chroma all sizes
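
For context, the new HIGH_BIT_DEPTH kernels implement the bi-prediction
average on the 16-bit intermediate (shifted) samples. The following scalar
sketch of the per-pixel operation they vectorize is illustrative only, not
the project's C primitive verbatim; it assumes x265's IF_INTERNAL_PREC of 14
(hence IF_INTERNAL_OFFS = 8192) and a 10-bit output clamp:

    #include <stdint.h>
    #include <algorithm>

    static const int IF_INTERNAL_PREC = 14;                          // assumed intermediate precision
    static const int IF_INTERNAL_OFFS = 1 << (IF_INTERNAL_PREC - 1); // 8192

    // Reference sketch: average two blocks of 16-bit intermediate samples into
    // clamped output pixels (names and signature are illustrative).
    void addAvg_ref(const int16_t* src0, const int16_t* src1, uint16_t* dst,
                    intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride,
                    int width, int height, int bitDepth)
    {
        int shiftNum = IF_INTERNAL_PREC + 1 - bitDepth;              // 5 for 10-bit
        int offset   = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS; // 16 + 16384 = 16400
        int maxVal   = (1 << bitDepth) - 1;                          // 1023, i.e. pw_1023

        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = (uint16_t)std::min(std::max((src0[x] + src1[x] + offset) >> shiftNum, 0), maxVal);

            src0 += src0Stride;
            src1 += src1Stride;
            dst  += dstStride;
        }
    }

With bitDepth = 10 this gives shiftNum = 5, offset = 16400 and maxVal = 1023,
matching the paddw with pw_16400, psraw by 5 and clamp against pw_1023 in the
small-block kernels; the wider macro kernels compute the same result with a
pmulhrsw/paddw pair (see the comment above ADDAVG_W4_H4 in mc-a.asm).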

diff -r 6150985c3d53 -r cede20cde62b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Feb 19 12:21:13 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Feb 19 16:21:32 2014 +0530
@@ -679,10 +679,13 @@
     p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H] = x265_addAvg_ ## W ## x ## H ## cpu;
 
 #define CHROMA_ADDAVG(cpu) \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(2,  4,  cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(2,  8,  cpu); \
     SETUP_CHROMA_ADDAVG_FUNC_DEF(4,  2,  cpu); \
     SETUP_CHROMA_ADDAVG_FUNC_DEF(4,  4,  cpu); \
     SETUP_CHROMA_ADDAVG_FUNC_DEF(4,  8,  cpu); \
     SETUP_CHROMA_ADDAVG_FUNC_DEF(4,  16, cpu); \
+    SETUP_CHROMA_ADDAVG_FUNC_DEF(6,  8,  cpu); \
     SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  2,  cpu); \
     SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  4,  cpu); \
     SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  6,  cpu); \
@@ -831,6 +834,9 @@
     }
     if (cpuMask & X265_CPU_SSE4)
     {
+        LUMA_ADDAVG(_sse4);
+        CHROMA_ADDAVG(_sse4);
+
         p.dct[DCT_8x8] = x265_dct8_sse4;
         p.quant = x265_quant_sse4;
         p.dequant_normal = x265_dequant_normal_sse4;
@@ -1330,10 +1336,6 @@
         SETUP_INTRA_ANG32(33, 33, sse4);
 
         p.dct[DCT_8x8] = x265_dct8_sse4;
-
-        p.chroma[X265_CSP_I420].addAvg[CHROMA_2x4]  = x265_addAvg_2x4_sse4;
-        p.chroma[X265_CSP_I420].addAvg[CHROMA_2x8]  = x265_addAvg_2x8_sse4;
-        p.chroma[X265_CSP_I420].addAvg[CHROMA_6x8]  = x265_addAvg_6x8_sse4;
     }
     if (cpuMask & X265_CPU_AVX)
     {
diff -r 6150985c3d53 -r cede20cde62b source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Wed Feb 19 12:21:13 2014 +0530
+++ b/source/common/x86/const-a.asm	Wed Feb 19 16:21:32 2014 +0530
@@ -36,8 +36,10 @@
 const pw_128,      times 16 dw 128
 const pw_256,      times 16 dw 256
 const pw_512,      times 16 dw 512
+const pw_1023,     times 8  dw 1023
 const pw_1024,     times 16 dw 1024
 const pw_4096,     times 16 dw 4096
+const pw_16400,    times 8  dw 16400
 const pw_00ff,     times 16 dw 0x00ff
 const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1)
 const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
diff -r 6150985c3d53 -r cede20cde62b source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Wed Feb 19 12:21:13 2014 +0530
+++ b/source/common/x86/intrapred16.asm	Wed Feb 19 16:21:32 2014 +0530
@@ -45,7 +45,6 @@
 const c_mode32_10_0,        db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1
 
 const pw_unpackwdq, times 8 db 0,1
-const pw_1023,      times 8 dw 1023
 const pw_ang8_12,   db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 0, 1
 const pw_ang8_13,   db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 8, 9, 0, 1
 const pw_ang8_14,   db 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 10, 11, 4, 5, 0, 1
@@ -58,6 +57,7 @@
 
 cextern pw_1
 cextern pw_8
+cextern pw_1023
 cextern pd_16
 cextern pd_32
 cextern pw_4096
diff -r 6150985c3d53 -r cede20cde62b source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Wed Feb 19 12:21:13 2014 +0530
+++ b/source/common/x86/mc-a.asm	Wed Feb 19 16:21:32 2014 +0530
@@ -52,6 +52,9 @@
 cextern pw_128
 cextern pw_256
 cextern pw_512
+cextern pw_1023
+cextern pw_1024
+cextern pw_16400
 cextern pw_00ff
 cextern pw_pixel_max
 cextern sw_64
@@ -65,6 +68,873 @@
 ; r2 = pDst,     r3 = iStride0
 ; r4 = iStride1, r5 = iDstStride
 
+%if HIGH_BIT_DEPTH
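+;-----------------------------------------------------------------------------
+; 10-bit path: dst[x] = clip(0, 1023, (pSrc0[x] + pSrc1[x] + 16400) >> 5)
+; where 16400 = 2 * IF_INTERNAL_OFFS + (1 << (shiftNum - 1)) with shiftNum = 5
+;-----------------------------------------------------------------------------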
+INIT_XMM sse4
+cglobal addAvg_2x4, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova          m7,          [pw_16400]
+    mova          m0,          [pw_1023]
+    add           r3,          r3
+    add           r4,          r4
+    add           r5,          r5
+
+    movd          m1,          [r0]
+    movd          m2,          [r0 + r3]
+    movd          m3,          [r1]
+    movd          m4,          [r1 + r4]
+
+    punpckldq     m1,          m2
+    punpckldq     m3,          m4
+
+    lea           r0,          [r0 + 2 * r3]
+    lea           r1,          [r1 + 2 * r4]
+
+    movd          m2,          [r0]
+    movd          m4,          [r0 + r3]
+    movd          m5,          [r1]
+    movd          m6,          [r1 + r4]
+
+    punpckldq     m2,          m4
+    punpckldq     m5,          m6
+    punpcklqdq    m1,          m2
+    punpcklqdq    m3,          m5
+
+    paddw         m1,          m3
+    paddw         m1,          m7
+    psraw         m1,          5
+    pxor          m6,          m6
+    pmaxsw        m1,          m6
+    pminsw        m1,          m0
+
+    movd          [r2],        m1
+    pextrd        [r2 + r5],   m1, 1
+    lea           r2,          [r2 + 2 * r5]
+    pextrd        [r2],        m1, 2
+    pextrd        [r2 + r5],   m1, 3
+
+    RET
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_2x8, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+
+    mova          m7,          [pw_16400]
+    mova          m0,          [pw_1023]
+
+    add           r3,          r3
+    add           r4,          r4
+    add           r5,          r5
+
+%rep 2
+    movd          m1,          [r0]
+    movd          m2,          [r0 + r3]
+    movd          m3,          [r1]
+    movd          m4,          [r1 + r4]
+
+    punpckldq     m1,          m2
+    punpckldq     m3,          m4
+
+    lea           r0,          [r0 + 2 * r3]
+    lea           r1,          [r1 + 2 * r4]
+
+    movd          m2,          [r0]
+    movd          m4,          [r0 + r3]
+    movd          m5,          [r1]
+    movd          m6,          [r1 + r4]
+
+    punpckldq     m2,          m4
+    punpckldq     m5,          m6
+    punpcklqdq    m1,          m2
+    punpcklqdq    m3,          m5
+
+    paddw         m1,          m3
+    paddw         m1,          m7
+    psraw         m1,          5
+    pxor          m6,          m6
+    pmaxsw        m1,          m6
+    pminsw        m1,          m0
+
+    movd          [r2],        m1
+    pextrd        [r2 + r5],   m1, 1
+    lea           r2,          [r2 + 2 * r5]
+    pextrd        [r2],        m1, 2
+    pextrd        [r2 + r5],   m1, 3
+
+    lea           r0,          [r0 + 2 * r3]
+    lea           r1,          [r1 + 2 * r4]
+    lea           r2,          [r2 + 2 * r5]
+%endrep
+    RET
+
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_4x2, 6,6,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+
+    mova           m4,          [pw_16400]
+    mova           m5,          [pw_1023]
+    pxor           m6,          m6
+    add            r3,          r3
+    add            r4,          r4
+    add            r5,          r5
+
+    movh           m0,          [r0]
+    movh           m1,          [r0 + r3]
+    movh           m2,          [r1]
+    movh           m3,          [r1 + r4]
+
+    punpcklqdq     m0,          m1
+    punpcklqdq     m2,          m3
+
+    paddw          m0,          m2
+    paddw          m0,          m4
+    psraw          m0,          5
+    pmaxsw         m0,          m6
+    pminsw         m0,          m5
+
+    movh           [r2],        m0
+    movhps         [r2 + r5],   m0
+    RET
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_6x8, 6,7,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4,             [pw_16400]
+    mova        m5,             [pw_1023]
+    pxor        m6,             m6
+    add         r3,             r3
+    add         r4,             r4
+    add         r5,             r5
+
+%rep 4
+    movu        m0,             [r0]
+    movu        m2,             [r1]
+    paddw       m0,             m2
+    paddw       m0,             m4
+    psraw       m0,             5
+    pmaxsw      m0,             m6
+    pminsw      m0,             m5
+    movh        [r2],           m0
+    pextrd      [r2 + 8],       m0, 2
+
+    movu        m1,             [r0 + r3]
+    movu        m3,             [r1 + r4]
+    paddw       m1,             m3
+    paddw       m1,             m4
+    psraw       m1,             5
+    pmaxsw      m1,             m6
+    pminsw      m1,             m5
+    movh        [r2 + r5],      m1
+    pextrd      [r2 + r5 + 8],  m1, 2
+
+    lea         r2,             [r2 + 2 * r5]
+    lea         r0,             [r0 + 2 * r3]
+    lea         r1,             [r1 + 2 * r4]
+%endrep
+    RET
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_8x2, 6,6,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4,          [pw_16400]
+    mova        m5,          [pw_1023]
+    pxor        m6,          m6
+    add         r3,          r3
+    add         r4,          r4
+    add         r5,          r5
+
+    movu        m0,          [r0]
+    movu        m2,          [r1]
+    paddw       m0,          m2
+    paddw       m0,          m4
+    psraw       m0,          5
+    pmaxsw      m0,          m6
+    pminsw      m0,          m5
+    movu        [r2],        m0
+
+    movu        m1,          [r0 + r3]
+    movu        m3,          [r1 + r4]
+    paddw       m1,          m3
+    paddw       m1,          m4
+    psraw       m1,          5
+    pmaxsw      m1,          m6
+    pminsw      m1,          m5
+    movu        [r2 + r5],   m1
+    RET
+
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal addAvg_8x6, 6,6,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4,          [pw_16400]
+    mova        m5,          [pw_1023]
+    pxor        m6,          m6
+    add         r3,          r3
+    add         r4,          r4
+    add         r5,          r5
+
+%rep 3
+    movu        m0,          [r0]
+    movu        m2,          [r1]
+    paddw       m0,          m2
+    paddw       m0,          m4
+    psraw       m0,          5
+    pmaxsw      m0,          m6
+    pminsw      m0,          m5
+    movu        [r2],        m0
+
+    movu        m1,          [r0 + r3]
+    movu        m3,          [r1 + r4]
+    paddw       m1,          m3
+    paddw       m1,          m4
+    psraw       m1,          5
+    pmaxsw      m1,          m6
+    pminsw      m1,          m5
+    movu        [r2 + r5],   m1
+
+    lea         r2,          [r2 + 2 * r5]
+    lea         r0,          [r0 + 2 * r3]
+    lea         r1,          [r1 + 2 * r4]
+%endrep
+    RET
+
+;-----------------------------------------------------------------------------
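+; The macro kernels below evaluate the same expression as the fixed-size
+; kernels above, but as ((a + b + 16) >> 5) + 512: pmulhrsw with pw_1024 is a
+; rounded arithmetic shift right by 5, and paddw with pw_512 adds the rest of
+; the offset. This equals (a + b + 16400) >> 5 exactly, since 16384 is a
+; multiple of 32, and presumably avoids folding the large offset into the
+; signed 16-bit sum before the shift.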
+%macro ADDAVG_W4_H4 1
+INIT_XMM sse4
+cglobal addAvg_4x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova           m4,          [pw_512]
+    mova           m5,          [pw_1023]
+    mova           m7,          [pw_1024]
+    pxor           m6,          m6
+    add            r3,          r3
+    add            r4,          r4
+    add            r5,          r5
+
+    mov            r6d,         %1/4
+
+.loop
+%rep 2
+    movh           m0,          [r0]
+    movh           m1,          [r0 + r3]
+    movh           m2,          [r1]
+    movh           m3,          [r1 + r4]
+
+    punpcklqdq     m0,          m1
+    punpcklqdq     m2,          m3
+
+    paddw          m0,          m2
+    pmulhrsw       m0,          m7
+    paddw          m0,          m4
+
+    pmaxsw         m0,          m6
+    pminsw         m0,          m5
+
+    movh           [r2],        m0
+    movhps         [r2 + r5],   m0
+
+    lea            r2,          [r2 + 2 * r5]
+    lea            r0,          [r0 + 2 * r3]
+    lea            r1,          [r1 + 2 * r4]
+%endrep
+
+    dec            r6d
+    jnz            .loop
+    RET
+%endmacro
+
+ADDAVG_W4_H4 4
+ADDAVG_W4_H4 8
+ADDAVG_W4_H4 16
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W8_H4 1
+INIT_XMM sse4
+cglobal addAvg_8x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4,          [pw_512]
+    mova        m5,          [pw_1023]
+    mova        m7,          [pw_1024]
+    pxor        m6,          m6
+    add         r3,          r3
+    add         r4,          r4
+    add         r5,          r5
+    mov         r6d,         %1/4
+
+.loop
+%rep 2
+    movu        m0,          [r0]
+    movu        m2,          [r1]
+    paddw       m0,          m2
+    pmulhrsw    m0,          m7
+    paddw       m0,          m4
+    pmaxsw      m0,          m6
+    pminsw      m0,          m5
+    movu        [r2],        m0
+
+    movu        m1,          [r0 + r3]
+    movu        m3,          [r1 + r4]
+    paddw       m1,          m3
+    pmulhrsw    m1,          m7
+    paddw       m1,          m4
+    pmaxsw      m1,          m6
+    pminsw      m1,          m5
+    movu        [r2 + r5],   m1
+
+    lea         r2,          [r2 + 2 * r5]
+    lea         r0,          [r0 + 2 * r3]
+    lea         r1,          [r1 + 2 * r4]
+%endrep
+    dec         r6d
+    jnz         .loop
+    RET
+%endmacro
+
+ADDAVG_W8_H4 4
+ADDAVG_W8_H4 8
+ADDAVG_W8_H4 16
+ADDAVG_W8_H4 32
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W12_H4 1
+INIT_XMM sse4
+cglobal addAvg_12x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova           m4,             [pw_512]
+    mova           m5,             [pw_1023]
+    mova           m7,             [pw_1024]
+    pxor           m6,             m6
+    add            r3,             r3
+    add            r4,             r4
+    add            r5,             r5
+    mov            r6d,            %1/4
+
+.loop
+%rep 2
+    movu           m0,             [r0]
+    movu           m2,             [r1]
+    paddw          m0,             m2
+    pmulhrsw       m0,             m7
+    paddw          m0,             m4
+    pmaxsw         m0,             m6
+    pminsw         m0,             m5
+    movu           [r2],           m0
+
+    movh           m0,             [r0 + 16]
+    movh           m1,             [r0 + 16 + r3]
+    movh           m2,             [r1 + 16]
+    movh           m3,             [r1 + 16 + r4]
+
+    punpcklqdq     m0,             m1
+    punpcklqdq     m2,             m3
+
+    paddw          m0,             m2
+    pmulhrsw       m0,             m7
+    paddw          m0,             m4
+    pmaxsw         m0,             m6
+    pminsw         m0,             m5
+    movh           [r2 + 16],       m0
+    movhps         [r2 + r5 + 16],  m0
+
+    movu           m1,             [r0 + r3]
+    movu           m3,             [r1 + r4]
+    paddw          m1,             m3
+    pmulhrsw       m1,             m7
+    paddw          m1,             m4
+    pmaxsw         m1,             m6
+    pminsw         m1,             m5
+    movu           [r2 + r5],      m1
+
+    lea            r2,             [r2 + 2 * r5]
+    lea            r0,             [r0 + 2 * r3]
+    lea            r1,             [r1 + 2 * r4]
+%endrep
+    dec            r6d
+    jnz            .loop
+    RET
+%endmacro
+
+ADDAVG_W12_H4 16
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W16_H4 1
+INIT_XMM sse4
+cglobal addAvg_16x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4,              [pw_512]
+    mova        m5,              [pw_1023]
+    mova        m7,              [pw_1024]
+    pxor        m6,              m6
+    add         r3,              r3
+    add         r4,              r4
+    add         r5,              r5
+    mov         r6d,             %1/4
+
+.loop
+%rep 2
+    movu        m0,              [r0]
+    movu        m2,              [r1]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m7
+    paddw       m0,              m4
+    pmaxsw      m0,              m6
+    pminsw      m0,              m5
+    movu        [r2],            m0
+
+    movu        m1,              [r0 + 16]
+    movu        m2,              [r1 + 16]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m7
+    paddw       m1,              m4
+    pmaxsw      m1,              m6
+    pminsw      m1,              m5
+    movu        [r2 + 16],       m1
+
+    movu        m1,              [r0 + r3]
+    movu        m3,              [r1 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m7
+    paddw       m1,              m4
+    pmaxsw      m1,              m6
+    pminsw      m1,              m5
+    movu        [r2 + r5],       m1
+
+    movu        m2,              [r0 + 16 + r3]
+    movu        m3,              [r1 + 16 + r4]
+    paddw       m2,              m3
+    pmulhrsw    m2,              m7
+    paddw       m2,              m4
+    pmaxsw      m2,              m6
+    pminsw      m2,              m5
+    movu        [r2 + r5 + 16],  m2
+
+    lea         r2,              [r2 + 2 * r5]
+    lea         r0,              [r0 + 2 * r3]
+    lea         r1,              [r1 + 2 * r4]
+%endrep
+    dec         r6d
+    jnz         .loop
+    RET
+%endmacro
+
+ADDAVG_W16_H4 4
+ADDAVG_W16_H4 8
+ADDAVG_W16_H4 12
+ADDAVG_W16_H4 16
+ADDAVG_W16_H4 32
+ADDAVG_W16_H4 64
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W24_H2 2
+INIT_XMM sse4
+cglobal addAvg_%1x%2, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4,              [pw_512]
+    mova        m5,              [pw_1023]
+    mova        m7,              [pw_1024]
+    pxor        m6,              m6
+    add         r3,              r3
+    add         r4,              r4
+    add         r5,              r5
+
+    mov         r6d,             %2/2
+
+.loop
+    movu        m0,              [r0]
+    movu        m2,              [r1]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m7
+    paddw       m0,              m4
+    pmaxsw      m0,              m6
+    pminsw      m0,              m5
+    movu        [r2],            m0
+
+    movu        m1,              [r0 + 16]
+    movu        m2,              [r1 + 16]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m7
+    paddw       m1,              m4
+    pmaxsw      m1,              m6
+    pminsw      m1,              m5
+    movu        [r2 + 16],       m1
+
+    movu        m0,              [r0 + 32]
+    movu        m2,              [r1 + 32]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m7
+    paddw       m0,              m4
+    pmaxsw      m0,              m6
+    pminsw      m0,              m5
+    movu        [r2 + 32],       m0
+
+    movu        m1,              [r0 + r3]
+    movu        m3,              [r1 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m7
+    paddw       m1,              m4
+    pmaxsw      m1,              m6
+    pminsw      m1,              m5
+    movu        [r2 + r5],       m1
+
+    movu        m2,              [r0 + r3 + 16]
+    movu        m3,              [r1 + r4 + 16]
+    paddw       m2,              m3
+    pmulhrsw    m2,              m7
+    paddw       m2,              m4
+    pmaxsw      m2,              m6
+    pminsw      m2,              m5
+    movu        [r2 + r5 + 16],  m2
+
+    movu        m1,              [r0 + r3 + 32]
+    movu        m3,              [r1 + r4 + 32]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m7
+    paddw       m1,              m4
+    pmaxsw      m1,              m6
+    pminsw      m1,              m5
+    movu        [r2 + r5 + 32],  m1
+
+    lea         r2,              [r2 + 2 * r5]
+    lea         r0,              [r0 + 2 * r3]
+    lea         r1,              [r1 + 2 * r4]
+
+    dec         r6d
+    jnz         .loop
+    RET
+%endmacro
+
+ADDAVG_W24_H2 24, 32
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W32_H2 1
+INIT_XMM sse4
+cglobal addAvg_32x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4,              [pw_512]
+    mova        m5,              [pw_1023]
+    mova        m7,              [pw_1024]
+    pxor        m6,              m6
+    add         r3,              r3
+    add         r4,              r4
+    add         r5,              r5
+
+    mov         r6d,             %1/2
+
+.loop
+    movu        m0,              [r0]
+    movu        m2,              [r1]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m7
+    paddw       m0,              m4
+    pmaxsw      m0,              m6
+    pminsw      m0,              m5
+    movu        [r2],            m0
+
+    movu        m1,              [r0 + 16]
+    movu        m2,              [r1 + 16]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m7
+    paddw       m1,              m4
+    pmaxsw      m1,              m6
+    pminsw      m1,              m5
+    movu        [r2 + 16],       m1
+
+    movu        m0,              [r0 + 32]
+    movu        m2,              [r1 + 32]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m7
+    paddw       m0,              m4
+    pmaxsw      m0,              m6
+    pminsw      m0,              m5
+    movu        [r2 + 32],       m0
+
+    movu        m1,              [r0 + 48]
+    movu        m2,              [r1 + 48]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m7
+    paddw       m1,              m4
+    pmaxsw      m1,              m6
+    pminsw      m1,              m5
+    movu        [r2 + 48],       m1
+
+    movu        m1,              [r0 + r3]
+    movu        m3,              [r1 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m7
+    paddw       m1,              m4
+    pmaxsw      m1,              m6
+    pminsw      m1,              m5
+    movu        [r2 + r5],       m1
+
+    movu        m2,              [r0 + 16 + r3]
+    movu        m3,              [r1 + 16 + r4]
+    paddw       m2,              m3
+    pmulhrsw    m2,              m7
+    paddw       m2,              m4
+    pmaxsw      m2,              m6
+    pminsw      m2,              m5
+    movu        [r2 + r5 + 16],  m2
+
+    movu        m1,              [r0 + 32 + r3]
+    movu        m3,              [r1 + 32 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m7
+    paddw       m1,              m4
+    pmaxsw      m1,              m6
+    pminsw      m1,              m5
+    movu        [r2 + r5 + 32],  m1
+
+    movu        m2,              [r0 + 48 + r3]
+    movu        m3,              [r1 + 48 + r4]
+    paddw       m2,              m3
+    pmulhrsw    m2,              m7
+    paddw       m2,              m4
+    pmaxsw      m2,              m6
+    pminsw      m2,              m5
+    movu        [r2 + r5 + 48],  m2
+
+    lea         r2,              [r2 + 2 * r5]
+    lea         r0,              [r0 + 2 * r3]
+    lea         r1,              [r1 + 2 * r4]
+
+    dec         r6d
+    jnz        .loop
+    RET
+%endmacro
+
+ADDAVG_W32_H2 8
+ADDAVG_W32_H2 16
+ADDAVG_W32_H2 24
+ADDAVG_W32_H2 32
+ADDAVG_W32_H2 64
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W48_H2 1
+INIT_XMM sse4
+cglobal addAvg_48x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4,              [pw_512]
+    mova        m5,              [pw_1023]
+    mova        m7,              [pw_1024]
+    pxor        m6,              m6
+    add         r3,              r3
+    add         r4,              r4
+    add         r5,              r5
+
+    mov         r6d,             %1/2
+
+.loop
+    movu        m0,              [r0]
+    movu        m2,              [r1]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m7
+    paddw       m0,              m4
+    pmaxsw      m0,              m6
+    pminsw      m0,              m5
+    movu        [r2],            m0
+
+    movu        m1,              [r0 + 16]
+    movu        m2,              [r1 + 16]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m7
+    paddw       m1,              m4
+    pmaxsw      m1,              m6
+    pminsw      m1,              m5
+    movu        [r2 + 16],       m1
+
+    movu        m0,              [r0 + 32]
+    movu        m2,              [r1 + 32]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m7
+    paddw       m0,              m4
+    pmaxsw      m0,              m6
+    pminsw      m0,              m5
+    movu        [r2 + 32],       m0
+
+    movu        m1,              [r0 + 48]
+    movu        m2,              [r1 + 48]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m7
+    paddw       m1,              m4
+    pmaxsw      m1,              m6
+    pminsw      m1,              m5
+    movu        [r2 + 48],       m1
+
+    movu        m0,              [r0 + 64]
+    movu        m2,              [r1 + 64]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m7
+    paddw       m0,              m4
+    pmaxsw      m0,              m6
+    pminsw      m0,              m5
+    movu        [r2 + 64],       m0
+
+    movu        m1,              [r0 + 80]
+    movu        m2,              [r1 + 80]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m7
+    paddw       m1,              m4
+    pmaxsw      m1,              m6
+    pminsw      m1,              m5
+    movu        [r2 + 80],       m1
+
+    movu        m1,              [r0 + r3]
+    movu        m3,              [r1 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m7
+    paddw       m1,              m4
+    pmaxsw      m1,              m6
+    pminsw      m1,              m5
+    movu        [r2 + r5],       m1
+
+    movu        m2,              [r0 + 16 + r3]
+    movu        m3,              [r1 + 16 + r4]
+    paddw       m2,              m3
+    pmulhrsw    m2,              m7
+    paddw       m2,              m4
+    pmaxsw      m2,              m6
+    pminsw      m2,              m5
+    movu        [r2 + 16 + r5],  m2
+
+    movu        m1,              [r0 + 32 + r3]
+    movu        m3,              [r1 + 32 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m7
+    paddw       m1,              m4
+    pmaxsw      m1,              m6
+    pminsw      m1,              m5
+    movu        [r2 + 32 + r5],  m1
+
+    movu        m2,              [r0 + 48 + r3]
+    movu        m3,              [r1 + 48 + r4]
+    paddw       m2,              m3
+    pmulhrsw    m2,              m7
+    paddw       m2,              m4
+    pmaxsw      m2,              m6
+    pminsw      m2,              m5
+    movu        [r2 + 48 + r5],  m2
+
+    movu        m1,              [r0 + 64 + r3]
+    movu        m3,              [r1 + 64 + r4]
+    paddw       m1,              m3
+    pmulhrsw    m1,              m7
+    paddw       m1,              m4
+    pmaxsw      m1,              m6
+    pminsw      m1,              m5
+    movu        [r2 + 64 + r5],  m1
+
+    movu        m2,              [r0 + 80 + r3]
+    movu        m3,              [r1 + 80 + r4]
+    paddw       m2,              m3
+    pmulhrsw    m2,              m7
+    paddw       m2,              m4
+    pmaxsw      m2,              m6
+    pminsw      m2,              m5
+    movu        [r2 + 80 + r5],  m2
+
+    lea         r2,              [r2 + 2 * r5]
+    lea         r0,              [r0 + 2 * r3]
+    lea         r1,              [r1 + 2 * r4]
+
+    dec         r6d
+    jnz         .loop
+    RET
+%endmacro
+
+ADDAVG_W48_H2 64
+
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W64_H1 1
+INIT_XMM sse4
+cglobal addAvg_64x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4,              [pw_512]
+    mova        m5,              [pw_1023]
+    mova        m7,              [pw_1024]
+    pxor        m6,              m6
+    add         r3,              r3
+    add         r4,              r4
+    add         r5,              r5
+    mov         r6d,             %1
+
+.loop
+    movu        m0,              [r0]
+    movu        m2,              [r1]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m7
+    paddw       m0,              m4
+    pmaxsw      m0,              m6
+    pminsw      m0,              m5
+    movu        [r2],            m0
+
+    movu        m1,              [r0 + 16]
+    movu        m2,              [r1 + 16]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m7
+    paddw       m1,              m4
+    pmaxsw      m1,              m6
+    pminsw      m1,              m5
+    movu        [r2 + 16],       m1
+
+    movu        m0,              [r0 + 32]
+    movu        m2,              [r1 + 32]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m7
+    paddw       m0,              m4
+    pmaxsw      m0,              m6
+    pminsw      m0,              m5
+    movu        [r2 + 32],       m0
+
+    movu        m1,              [r0 + 48]
+    movu        m2,              [r1 + 48]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m7
+    paddw       m1,              m4
+    pmaxsw      m1,              m6
+    pminsw      m1,              m5
+    movu        [r2 + 48],       m1
+
+    movu        m0,              [r0 + 64]
+    movu        m2,              [r1 + 64]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m7
+    paddw       m0,              m4
+    pmaxsw      m0,              m6
+    pminsw      m0,              m5
+    movu        [r2 + 64],       m0
+
+    movu        m1,              [r0 + 80]
+    movu        m2,              [r1 + 80]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m7
+    paddw       m1,              m4
+    pmaxsw      m1,              m6
+    pminsw      m1,              m5
+    movu        [r2 + 80],       m1
+
+    movu        m0,              [r0 + 96]
+    movu        m2,              [r1 + 96]
+    paddw       m0,              m2
+    pmulhrsw    m0,              m7
+    paddw       m0,              m4
+    pmaxsw      m0,              m6
+    pminsw      m0,              m5
+    movu        [r2 + 96],       m0
+
+    movu        m1,              [r0 + 112]
+    movu        m2,              [r1 + 112]
+    paddw       m1,              m2
+    pmulhrsw    m1,              m7
+    paddw       m1,              m4
+    pmaxsw      m1,              m6
+    pminsw      m1,              m5
+    movu        [r2 + 112],      m1
+
+    add         r2,              r5
+    add         r0,              r3
+    add         r1,              r4
+
+    dec         r6d
+    jnz         .loop
+    RET
+%endmacro
+
+ADDAVG_W64_H1 16
+ADDAVG_W64_H1 32
+ADDAVG_W64_H1 48
+ADDAVG_W64_H1 64
+;-----------------------------------------------------------------------------
+%else ; !HIGH_BIT_DEPTH
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal addAvg_2x4, 6,6,8, src0, src1, dst, src0Stride, src1tride, dstStride
@@ -1087,6 +1957,7 @@
 ADDAVG_W64_H1 48
 ADDAVG_W64_H1 64
 ;-----------------------------------------------------------------------------
+%endif ; HIGH_BIT_DEPTH
 
 ;=============================================================================
 ; implicit weighted biprediction
diff -r 6150985c3d53 -r cede20cde62b source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Wed Feb 19 12:21:13 2014 +0530
+++ b/source/test/pixelharness.cpp	Wed Feb 19 16:21:32 2014 +0530
@@ -881,6 +881,11 @@
 
 bool PixelHarness::check_addAvg(addAvg_t ref, addAvg_t opt)
 {
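+    // the 10-bit SSE4 kernels clamp against pw_1023, so run the comparison at a
+    // bit depth of 10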
+#if HIGH_BIT_DEPTH
+    int old_depth = X265_DEPTH;
+    X265_DEPTH = 10;
+#endif
+
     ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
     ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
 
@@ -897,11 +902,19 @@
         opt(short_test_buff[index1] + j, short_test_buff[index2] + j, opt_dest, STRIDE, STRIDE, STRIDE);
 
         if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
+        {
+#if HIGH_BIT_DEPTH
+            X265_DEPTH = old_depth;
+#endif
             return false;
+        }
 
         j += INCR;
     }
 
+#if HIGH_BIT_DEPTH
+    X265_DEPTH = old_depth;
+#endif
     return true;
 }
 

