[x265] [PATCH] added C primitive, asm code and unit test code for chroma filter

praveen at multicorewareinc.com praveen at multicorewareinc.com
Wed Oct 16 17:01:17 CEST 2013


# HG changeset patch
# User Praveen Tiwari
# Date 1381935659 -19800
# Node ID 86f1dcc53a1a101b7acd169d608b1d089efeb888
# Parent  a9b9cdf89eec614043b849068aabdf92b56fd777
added C primitive, asm code and unit test code for chroma filter

diff -r a9b9cdf89eec -r 86f1dcc53a1a source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp	Wed Oct 16 20:23:47 2013 +0530
+++ b/source/common/ipfilter.cpp	Wed Oct 16 20:30:59 2013 +0530
@@ -442,13 +442,90 @@
         txt += stride;
     }
 }
+
+// C reference for the N-tap horizontal interpolation filter: reads pixels, writes pixels.
+// Filters a width x height block using the coefficient row g_chromaFilter[coeffIdx];
+// src and dst advance by srcStride and dstStride after each row.
+template<int N, int width, int height>
+void interp_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+{
+    int cStride = 1;
+    short const * coeff = g_chromaFilter[coeffIdx];
+    src -= (N / 2 - 1) * cStride; // step back to the first tap of the filter window
+    int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
+    int offset = (1 << (headRoom - 1)); // rounding term for the final right shift
+    short maxVal = (1 << X265_DEPTH) - 1;
+
+    for (int row = 0; row < height; row++)
+    {
+        for (int col = 0; col < width; col++)
+        {
+            int sum = src[col + 0 * cStride] * coeff[0];
+            sum += src[col + 1 * cStride] * coeff[1];
+            if (N >= 4)
+            {
+                sum += src[col + 2 * cStride] * coeff[2];
+                sum += src[col + 3 * cStride] * coeff[3];
+            }
+            if (N >= 6)
+            {
+                sum += src[col + 4 * cStride] * coeff[4];
+                sum += src[col + 5 * cStride] * coeff[5];
+            }
+            if (N == 8)
+            {
+                sum += src[col + 6 * cStride] * coeff[6];
+                sum += src[col + 7 * cStride] * coeff[7];
+            }
+            // Shift the full int sum first and narrow afterwards: a cast binds tighter
+            // than >>, so casting first would truncate sums that do not fit in 16 bits.
+            short val = (short)((sum + offset) >> headRoom);
+
+            // Clamp to the valid pixel range [0, maxVal].
+            if (val < 0) val = 0;
+            if (val > maxVal) val = maxVal;
+            dst[col] = (pixel)val;
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
 }
 
 namespace x265 {
 // x265 private namespace
 
+    #define SETUP_PARTITION(W, H) \
+    p.chroma_hpp[CHROMA_PARTITION_ ##W ##x ##H] = interp_horiz_pp<4, W, H>; // 4-tap C reference; the template name is not token-pasted with '<'
+
 void Setup_C_IPFilterPrimitives(EncoderPrimitives& p)
 {
+
+    SETUP_PARTITION(2, 4);
+    SETUP_PARTITION(2, 8);
+    SETUP_PARTITION(4, 2);
+    SETUP_PARTITION(4, 4);
+    SETUP_PARTITION(4, 8);
+    SETUP_PARTITION(4, 16);
+    SETUP_PARTITION(6, 8);
+    SETUP_PARTITION(8, 2);
+    SETUP_PARTITION(8, 4);
+    SETUP_PARTITION(8, 6);
+    SETUP_PARTITION(8, 8);
+    SETUP_PARTITION(8, 16);
+    SETUP_PARTITION(8, 32);
+    SETUP_PARTITION(12, 16);
+    SETUP_PARTITION(16, 4);
+    SETUP_PARTITION(16, 8);
+    SETUP_PARTITION(16, 12);
+    SETUP_PARTITION(16, 16);
+    SETUP_PARTITION(16, 32);
+    SETUP_PARTITION(32, 8);
+    SETUP_PARTITION(32, 16);
+    SETUP_PARTITION(32, 24);
+    SETUP_PARTITION(32, 32);
+
     p.ipfilter_pp[FILTER_H_P_P_8] = filterHorizontal_p_p<8>;
     p.ipfilter_ps[FILTER_H_P_S_8] = filterHorizontal_p_s<8>;
     p.ipfilter_ps[FILTER_V_P_S_8] = filterVertical_p_s<8>;
diff -r a9b9cdf89eec -r 86f1dcc53a1a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Oct 16 20:23:47 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Oct 16 20:30:59 2013 +0530
@@ -91,7 +91,32 @@
 DECL_SUF( x265_pixel_avg_4x8,   ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
 DECL_SUF( x265_pixel_avg_4x4,   ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
 
-void x265_filterHorizontal_p_p_4_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff);
+#define SETUP_CHROMA_FUN_DEF(W, H) \
+    void x265_interp_4tap_horiz_pp_ ##W ##x ##H ##_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+
+    SETUP_CHROMA_FUN_DEF(2, 4);
+    SETUP_CHROMA_FUN_DEF(2, 8);
+    SETUP_CHROMA_FUN_DEF(4, 2);
+    SETUP_CHROMA_FUN_DEF(4, 4);
+    SETUP_CHROMA_FUN_DEF(4, 8);
+    SETUP_CHROMA_FUN_DEF(4, 16);
+    SETUP_CHROMA_FUN_DEF(6, 8);
+    SETUP_CHROMA_FUN_DEF(8, 2);
+    SETUP_CHROMA_FUN_DEF(8, 4);
+    SETUP_CHROMA_FUN_DEF(8, 6);
+    SETUP_CHROMA_FUN_DEF(8, 8);
+    SETUP_CHROMA_FUN_DEF(8, 16);
+    SETUP_CHROMA_FUN_DEF(8, 32);
+    SETUP_CHROMA_FUN_DEF(12, 16);
+    SETUP_CHROMA_FUN_DEF(16, 4);
+    SETUP_CHROMA_FUN_DEF(16, 8);
+    SETUP_CHROMA_FUN_DEF(16, 12);
+    SETUP_CHROMA_FUN_DEF(16, 16);
+    SETUP_CHROMA_FUN_DEF(16, 32);
+    SETUP_CHROMA_FUN_DEF(32, 8);
+    SETUP_CHROMA_FUN_DEF(32, 16);
+    SETUP_CHROMA_FUN_DEF(32, 24);
+    SETUP_CHROMA_FUN_DEF(32, 32);
 }
 
 using namespace x265;
@@ -274,9 +299,32 @@
         p.sa8d[BLOCK_16x16] = x265_pixel_sa8d_16x16_sse4;
         SA8D_INTER_FROM_BLOCK(sse4);
 
-#if !defined(X86_64)
-        p.ipfilter_pp[FILTER_H_P_P_4] = x265_filterHorizontal_p_p_4_sse4;
-#endif
+    #define SETUP_CHROMA_PARTITION(W, H) \
+    p.chroma_hpp[CHROMA_PARTITION_ ##W ##x ##H] = x265_interp_4tap_horiz_pp_ ##W ##x ##H ## _sse4;
+
+    SETUP_CHROMA_PARTITION(2, 4);
+    SETUP_CHROMA_PARTITION(2, 8);
+    SETUP_CHROMA_PARTITION(4, 2);
+    SETUP_CHROMA_PARTITION(4, 4);
+    SETUP_CHROMA_PARTITION(4, 8);
+    SETUP_CHROMA_PARTITION(4, 16);
+    SETUP_CHROMA_PARTITION(6, 8);
+    SETUP_CHROMA_PARTITION(8, 2);
+    SETUP_CHROMA_PARTITION(8, 4);
+    SETUP_CHROMA_PARTITION(8, 6);
+    SETUP_CHROMA_PARTITION(8, 8);
+    SETUP_CHROMA_PARTITION(8, 16);
+    SETUP_CHROMA_PARTITION(8, 32);
+    SETUP_CHROMA_PARTITION(12, 16);
+    SETUP_CHROMA_PARTITION(16, 4);
+    SETUP_CHROMA_PARTITION(16, 8);
+    SETUP_CHROMA_PARTITION(16, 12);
+    SETUP_CHROMA_PARTITION(16, 16);
+    SETUP_CHROMA_PARTITION(16, 32);
+    SETUP_CHROMA_PARTITION(32, 8);
+    SETUP_CHROMA_PARTITION(32, 16);
+    SETUP_CHROMA_PARTITION(32, 24);
+    SETUP_CHROMA_PARTITION(32, 32);
     }
     if (cpuMask & X265_CPU_AVX)
     {
diff -r a9b9cdf89eec -r 86f1dcc53a1a source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Wed Oct 16 20:23:47 2013 +0530
+++ b/source/common/x86/ipfilter8.asm	Wed Oct 16 20:30:59 2013 +0530
@@ -26,109 +26,446 @@
 %include "x86inc.asm"
 %include "x86util.asm"
 
-%if ARCH_X86_64 == 0
-
 SECTION_RODATA 32
-tab_leftmask:   db -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0
-
-tab_Tm:     db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+tab_Tm:     db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+            db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
 
 tab_c_512:  times 8 dw 512
 
+tab_coeff:    db  0, 64,  0,  0
+              db -2, 58, 10, -2
+              db -4, 54, 16, -2
+              db -6, 46, 28, -4
+              db -4, 36, 36, -4
+              db -4, 28, 46, -6
+              db -2, 16, 54, -4
+              db -2, 10, 58, -2
+
 SECTION .text
 
-%macro FILTER_H4 2
-    movu        %1, [src + col - 1]
-    pshufb      %1, Tm4
-    pmaddubsw   %1, coef2
-    movu        %2, [src + col + 1]
-    pshufb      %2, Tm4
-    pmaddubsw   %2, coef3
-    paddw       %1, %2
-    pmulhrsw    %1, c512
-    packuswb    %1, %1
+%macro FILTER_H4_w2_2 3
+    movu        %1, [srcq - 1]                  ; row 0 source bytes, starting one pixel left of srcq
+    pshufb      %2, %1, Tm0
+    pmaddubsw   %2, coef2
+    movu        %1, [srcq + srcstrideq - 1]     ; row 1 source bytes
+    pshufb      %1, %1, Tm0
+    pmaddubsw   %1, coef2
+    phaddw      %2, %1                          ; words 0-3 = row 0 sums, words 4-7 = row 1 sums
+    pmulhrsw    %2, %3                          ; third argument is the rounding multiplier
+    packuswb    %2, %2                          ; bytes 0-3 = row 0 pixels, bytes 4-7 = row 1 pixels
+    pextrw      [dstq], %2, 0                   ; row 0, pixels 0-1
+    pextrw      [dstq + dststrideq], %2, 2      ; row 1, pixels 0-1 are word 2 (word 1 holds row 0, pixels 2-3)
 %endmacro
 
 ;-----------------------------------------------------------------------------
-; void filterHorizontal_p_p_4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff)
+; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal filterHorizontal_p_p_4, 0, 7, 8
-%define src         r0
-%define dst         r1
-%define row         r2
-%define col         r3
-%define width       r4
-%define widthleft   r5
-%define mask_offset r6
-%define coef2       m7
-%define coef3       m6
-%define Tm4         m5
-%define c512        m4
-%define x2          m3
-%define x1          m2
-%define x0          m1
-%define leftmask    m0
-%define tmp         r0
-%define tmp1        r1
- 
-    mov         tmp,        r6m
-    movd        coef2,      [tmp    ]
-    movd        coef3,      [tmp + 4]
-    pshufd      coef2,      coef2,  0
-    pshufd      coef3,      coef3,  0
-    packsswb    coef2,      coef2
-    packsswb    coef3,      coef3
+cglobal interp_4tap_horiz_pp_2x4, 4, 6, 6, src, srcstride, dst, dststride
+%define coef2       m5
+%define Tm0         m4
+%define Tm1         m3
+%define t2          m2
+%define t1          m1
+%define t0          m0
 
-    mov         width,      r4m
-    mov         widthleft,  width
-    and         width,      ~7
-    and         widthleft,  7
-    mov         mask_offset,  widthleft
-    neg         mask_offset
+mov         r4d,        r4m
 
-    movq        leftmask,   [tab_leftmask + (7 + mask_offset)]
-    mova        Tm4,        [tab_Tm]
-    mova        c512,       [tab_c_512]
+%ifdef PIC
+lea         r5,          [tab_coeff]
+movd        coef2,       [r5 + r4 * 4]
+%else
+movd        coef2,       [tab_coeff + r4 * 4]
+%endif
 
-    mov         src,        r0m
-    mov         dst,        r2m
-    mov         row,        r5m
+pshufd      coef2,       coef2,      0
+mova        t2,          [tab_c_512]
+mova        Tm0,         [tab_Tm]
+mova        Tm1,         [tab_Tm + 16]
 
-_loop_row:
-    xor         col,        col
- 
-_loop_col:
-    cmp         col,        width
-    jge         _end_col
+%rep 2
+FILTER_H4_w2_2   t0, t1, t2
+lea         srcq,       [srcq + srcstrideq * 2]
+lea         dstq,       [dstq + dststrideq * 2]
+%endrep
+
+RET
 
-    FILTER_H4   x0, x1
-    movh        [dst + col], x0
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_horiz_pp_2x8, 4, 6, 6, src, srcstride, dst, dststride
+%define coef2       m5
+%define Tm0         m4
+%define Tm1         m3
+%define t2          m2
+%define t1          m1
+%define t0          m0
 
-    add         col,         8
-    jmp         _loop_col
+mov         r4d,        r4m
 
-_end_col:
-    test        widthleft,  widthleft
-    jz          _next_row
+%ifdef PIC
+lea         r5,          [tab_coeff]
+movd        coef2,       [r5 + r4 * 4]
+%else
+movd        coef2,       [tab_coeff + r4 * 4]
+%endif
 
-    movq        x2, [dst + col]
-    FILTER_H4   x0, x1
-    pblendvb    x2, x2, x0, leftmask
-    movh        [dst + col], x2
+pshufd      coef2,       coef2,      0
+mova        t2,          [tab_c_512]
+mova        Tm0,         [tab_Tm]
+mova        Tm1,         [tab_Tm + 16]
 
-_next_row:
-    add         src,        r1m
-    add         dst,        r3m
-    dec         row
+%rep 4
+FILTER_H4_w2_2   t0, t1, t2
+lea         srcq,       [srcq + srcstrideq * 2]
+lea         dstq,       [dstq + dststrideq * 2]
+%endrep
+
+RET
 
-    test        row,        row
-    jz          _end_row
+%macro FILTER_H4_w4_2 3
+    movu        %1, [srcq - 1]
+    pshufb      %2, %1, Tm0
+    pmaddubsw   %2, coef2
+    movu        %1, [srcq + srcstrideq - 1]
+    pshufb      %1, %1, Tm0
+    pmaddubsw   %1, coef2
+    phaddw      %2, %1
+    pmulhrsw    %2, %3
+    packuswb    %2, %2
+    movd        [dstq],      %2
+    pextrd      [dstq + dststrideq], %2,  1
+%endmacro
 
-    jmp         _loop_row
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_horiz_pp_4x2, 4, 6, 6, src, srcstride, dst, dststride
+%define coef2       m5
+%define Tm0         m4
+%define Tm1         m3
+%define t2          m2
+%define t1          m1
+%define t0          m0
 
-_end_row:
+mov         r4d,        r4m
 
-    RET
+%ifdef PIC
+lea         r5,          [tab_coeff]
+movd        coef2,       [r5 + r4 * 4]
+%else
+movd        coef2,       [tab_coeff + r4 * 4]
+%endif
 
-%endif  ; ARCH_X86_64 == 0
+pshufd      coef2,       coef2,      0
+mova        t2,          [tab_c_512]
+mova        Tm0,         [tab_Tm]
+mova        Tm1,         [tab_Tm + 16]
+
+FILTER_H4_w4_2   t0, t1, t2
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_horiz_pp_4x4, 4, 6, 6, src, srcstride, dst, dststride
+%define coef2       m5
+%define Tm0         m4
+%define Tm1         m3
+%define t2          m2
+%define t1          m1
+%define t0          m0
+
+mov         r4d,        r4m
+
+%ifdef PIC
+lea         r5,          [tab_coeff]
+movd        coef2,       [r5 + r4 * 4]
+%else
+movd        coef2,       [tab_coeff + r4 * 4]
+%endif
+
+pshufd      coef2,       coef2,      0
+mova        t2,          [tab_c_512]
+mova        Tm0,         [tab_Tm]
+mova        Tm1,         [tab_Tm + 16]
+
+%rep 2
+FILTER_H4_w4_2   t0, t1, t2
+lea         srcq,       [srcq + srcstrideq * 2]
+lea         dstq,       [dstq + dststrideq * 2]
+%endrep
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_horiz_pp_4x8, 4, 6, 6, src, srcstride, dst, dststride
+%define coef2       m5
+%define Tm0         m4
+%define Tm1         m3
+%define t2          m2
+%define t1          m1
+%define t0          m0
+
+mov         r4d,        r4m
+
+%ifdef PIC
+lea         r5,          [tab_coeff]
+movd        coef2,       [r5 + r4 * 4]
+%else
+movd        coef2,       [tab_coeff + r4 * 4]
+%endif
+
+pshufd      coef2,       coef2,      0
+mova        t2,          [tab_c_512]
+mova        Tm0,         [tab_Tm]
+mova        Tm1,         [tab_Tm + 16]
+
+%rep 4
+FILTER_H4_w4_2   t0, t1, t2
+lea         srcq,       [srcq + srcstrideq * 2]
+lea         dstq,       [dstq + dststrideq * 2]
+%endrep
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_horiz_pp_4x16, 4, 6, 6, src, srcstride, dst, dststride
+%define coef2       m5
+%define Tm0         m4
+%define Tm1         m3
+%define t2          m2
+%define t1          m1
+%define t0          m0
+
+mov         r4d,        r4m
+
+%ifdef PIC
+lea         r5,          [tab_coeff]
+movd        coef2,       [r5 + r4 * 4]
+%else
+movd        coef2,       [tab_coeff + r4 * 4]
+%endif
+
+pshufd      coef2,       coef2,      0
+mova        t2,          [tab_c_512]
+mova        Tm0,         [tab_Tm]
+mova        Tm1,         [tab_Tm + 16]
+
+%rep 8
+FILTER_H4_w4_2   t0, t1, t2
+lea         srcq,       [srcq + srcstrideq * 2]
+lea         dstq,       [dstq + dststrideq * 2]
+%endrep
+
+RET
+
+
+%macro FILTER_H4_w6 3
+    movu        %1, [srcq - 1]
+    pshufb      %2, %1, Tm0
+    pmaddubsw   %2, coef2
+    pshufb      %1, %1, Tm1
+    pmaddubsw   %1, coef2
+    phaddw      %2, %1
+    pmulhrsw    %2, %3
+    packuswb    %2, %2
+    movd        [dstq],      %2
+    pextrw      [dstq + 4], %2, 2
+%endmacro
+
+%macro FILTER_H4_w8 3
+    movu        %1, [srcq - 1]
+    pshufb      %2, %1, Tm0
+    pmaddubsw   %2, coef2
+    pshufb      %1, %1, Tm1
+    pmaddubsw   %1, coef2
+    phaddw      %2, %1
+    pmulhrsw    %2, %3
+    packuswb    %2, %2
+    movh        [dstq],      %2
+%endmacro
+
+%macro FILTER_H4_w12 3
+    movu        %1, [srcq - 1]
+    pshufb      %2, %1, Tm0
+    pmaddubsw   %2, coef2
+    pshufb      %1, %1, Tm1
+    pmaddubsw   %1, coef2
+    phaddw      %2, %1
+    pmulhrsw    %2, %3
+    movu        %1, [srcq - 1 + 8]
+    pshufb      %1, %1, Tm0
+    pmaddubsw   %1, coef2
+    phaddw      %1, %1
+    pmulhrsw    %1, %3
+    packuswb    %2, %1
+    movh        [dstq],      %2
+    pextrd      [dstq + 8], %2, 2
+%endmacro
+
+%macro FILTER_H4_w16 4
+    movu        %1, [srcq - 1]
+    pshufb      %2, %1, Tm0
+    pmaddubsw   %2, coef2
+    pshufb      %1, %1, Tm1
+    pmaddubsw   %1, coef2
+    phaddw      %2, %1
+    movu        %1, [srcq - 1 + 8]
+    pshufb      %4, %1, Tm0
+    pmaddubsw   %4, coef2
+    pshufb      %1, %1, Tm1
+    pmaddubsw   %1, coef2
+    phaddw      %4, %1
+    pmulhrsw    %2, %3
+    pmulhrsw    %4, %3
+    packuswb    %2, %4
+    movu        [dstq],      %2
+%endmacro
+
+%macro FILTER_H4_w32 4
+    movu        %1, [srcq - 1]
+    pshufb      %2, %1, Tm0
+    pmaddubsw   %2, coef2
+    pshufb      %1, %1, Tm1
+    pmaddubsw   %1, coef2
+    phaddw      %2, %1
+    movu        %1, [srcq - 1 + 8]
+    pshufb      %4, %1, Tm0
+    pmaddubsw   %4, coef2
+    pshufb      %1, %1, Tm1
+    pmaddubsw   %1, coef2
+    phaddw      %4, %1
+    pmulhrsw    %2, %3
+    pmulhrsw    %4, %3
+    packuswb    %2, %4
+    movu        [dstq],      %2
+    movu        %1, [srcq - 1 + 16]
+    pshufb      %2, %1, Tm0
+    pmaddubsw   %2, coef2
+    pshufb      %1, %1, Tm1
+    pmaddubsw   %1, coef2
+    phaddw      %2, %1
+    movu        %1, [srcq - 1 + 24]
+    pshufb      %4, %1, Tm0
+    pmaddubsw   %4, coef2
+    pshufb      %1, %1, Tm1
+    pmaddubsw   %1, coef2
+    phaddw      %4, %1
+    pmulhrsw    %2, %3
+    pmulhrsw    %4, %3
+    packuswb    %2, %4
+    movu        [dstq + 16],      %2
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro IPFILTER_CHROMA 2
+INIT_XMM sse4
+cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
+%define coef2       m5
+%define Tm0         m4
+%define Tm1         m3
+%define t2          m2
+%define t1          m1
+%define t0          m0
+; r4 receives the fifth argument (coeffIdx); it selects one 4-byte row of tab_coeff
+mov         r4d,        r4m
+
+%ifdef PIC
+lea         r5,          [tab_coeff]
+movd        coef2,       [r5 + r4 * 4]
+%else
+movd        coef2,       [tab_coeff + r4 * 4]
+%endif
+; r5 is free again after the coefficient load, so it is reused as the row counter
+mov           r5d,       %2
+
+pshufd      coef2,       coef2,      0          ; replicate the four tap bytes into every dword
+mova        t2,          [tab_c_512]            ; eight words of 512, handed to the row macro
+mova        Tm0,         [tab_Tm]               ; first 16 bytes of the shuffle table
+mova        Tm1,         [tab_Tm + 16]          ; second 16 bytes of the shuffle table
+; one iteration per row: run the width-specific row macro, then step both pointers
+.loop
+FILTER_H4_w%1   t0, t1, t2
+add         srcq,        srcstrideq
+add         dstq,        dststrideq
+
+dec          r5d
+jnz         .loop
+
+RET
+%endmacro
+
+
+IPFILTER_CHROMA 6, 8
+IPFILTER_CHROMA 8, 2
+IPFILTER_CHROMA 8, 4
+IPFILTER_CHROMA 8, 6
+IPFILTER_CHROMA 8, 8
+IPFILTER_CHROMA 8, 16
+IPFILTER_CHROMA 8, 32
+IPFILTER_CHROMA 12, 16
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro IPFILTER_CHROMA_W 2
+INIT_XMM sse4
+cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 7, src, srcstride, dst, dststride
+%define coef2       m6
+%define Tm0         m5
+%define Tm1         m4
+%define t3          m3
+%define t2          m2
+%define t1          m1
+%define t0          m0
+; r4 receives the fifth argument (coeffIdx); it selects one 4-byte row of tab_coeff
+mov         r4d,        r4m
+
+%ifdef PIC
+lea         r5,          [tab_coeff]
+movd        coef2,       [r5 + r4 * 4]
+%else
+movd        coef2,       [tab_coeff + r4 * 4]
+%endif
+; r5 is free again after the coefficient load, so it is reused as the row counter
+mov           r5d,       %2
+
+pshufd      coef2,       coef2,      0          ; replicate the four tap bytes into every dword
+mova        t2,          [tab_c_512]            ; eight words of 512, handed to the row macro
+mova        Tm0,         [tab_Tm]               ; first 16 bytes of the shuffle table
+mova        Tm1,         [tab_Tm + 16]          ; second 16 bytes of the shuffle table
+; one iteration per row; the wide row macros take t3 as an extra scratch register
+.loop
+FILTER_H4_w%1   t0, t1, t2, t3
+add         srcq,        srcstrideq
+add         dstq,        dststrideq
+
+dec          r5d
+jnz         .loop
+
+RET
+%endmacro
+
+IPFILTER_CHROMA_W 16, 4
+IPFILTER_CHROMA_W 16, 8
+IPFILTER_CHROMA_W 16, 12
+IPFILTER_CHROMA_W 16, 16
+IPFILTER_CHROMA_W 16, 32
+IPFILTER_CHROMA_W 32, 8
+IPFILTER_CHROMA_W 32, 16
+IPFILTER_CHROMA_W 32, 24
+IPFILTER_CHROMA_W 32, 32
diff -r a9b9cdf89eec -r 86f1dcc53a1a source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp	Wed Oct 16 20:23:47 2013 +0530
+++ b/source/test/ipfilterharness.cpp	Wed Oct 16 20:30:59 2013 +0530
@@ -39,6 +39,12 @@
     "ipfilterV_pp<4>"
 };
 
+const char* ChromaFilterPPNames[] = // printable partition names, indexed with the same value as chroma_hpp
+{
+   "2x4", "2x8", "4x2", "4x4", "4x8", "4x16", "6x8", "8x2", "8x4", "8x6", "8x8", "8x16", "8x32",
+   "12x16", "16x4", "16x8", "16x12", "16x16", "16x32", "24x32", "32x8", "32x16", "32x24", "32x32"
+}; // NOTE(review): "24x32" has no SETUP_PARTITION / SETUP_CHROMA_PARTITION entry in this patch -- confirm the order matches the CHROMA_PARTITION_* enum
+
 IPFilterHarness::IPFilterHarness()
 {
     ipf_t_size = 200 * 200;
@@ -262,6 +268,44 @@
     return true;
 }
 
+bool IPFilterHarness::check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt)
+{
+    // Feed both implementations the same random coefficient index and strides,
+    // then require their output buffers to match byte for byte.
+    for (int iter = 0; iter <= 100; iter++)
+    {
+        // rand() is consumed in this order: coefficient index, source stride,
+        // destination stride.
+        const int idx = rand() % 8;
+        int strideSrc = rand() % 100;
+        int strideDst = rand() % 100;
+
+        // Strides below 32 are raised to 32.
+        strideSrc = (strideSrc < 32) ? 32 : strideSrc;
+        strideDst = (strideDst < 32) ? 32 : strideDst;
+
+        // Optimized implementation first, reference second, into separate outputs.
+        opt(pixel_buff + 3 * strideSrc,
+            strideSrc,
+            IPF_vec_output_p,
+            strideDst,
+            idx);
+        ref(pixel_buff + 3 * strideSrc,
+            strideSrc,
+            IPF_C_output_p,
+            strideDst,
+            idx);
+
+        // Any difference within the compared buffer range fails the check.
+        const bool identical = memcmp(IPF_vec_output_p, IPF_C_output_p, ipf_t_size) == 0;
+        if (!identical)
+            return false;
+    }
+
+    return true;
+}
+
+
 bool IPFilterHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
     for (int value = 0; value < NUM_IPFILTER_P_P; value++)
@@ -318,6 +362,18 @@
         }
     }
 
+    for (int value = 0; value < NUM_CHROMA_PARTITIONS; value++)
+    {
+        if (opt.chroma_hpp[value])
+        {
+            if (!check_IPFilterChroma_primitive(ref.chroma_hpp[value], opt.chroma_hpp[value]))
+            {
+                 printf("interp_4tap_horiz_pp[%s]", ChromaFilterPPNames[value]);
+                return false;
+            }
+        }
+    }
+
     return true;
 }
 
@@ -372,4 +428,15 @@
         REPORT_SPEEDUP(opt.ipfilter_s2p, ref.ipfilter_s2p,
                        short_buff, srcStride, IPF_vec_output_p, dstStride, width, height);
     }
+
+    for (int value = 0; value < NUM_CHROMA_PARTITIONS; value++)
+    {
+        if (opt.chroma_hpp[value])
+        {
+            printf("interp_4tap_horiz_pp[%s]", ChromaFilterPPNames[value]);
+            REPORT_SPEEDUP(opt.chroma_hpp[value], ref.chroma_hpp[value],
+                           pixel_buff + 3 * srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
+        }
+    }
+
 }
diff -r a9b9cdf89eec -r 86f1dcc53a1a source/test/ipfilterharness.h
--- a/source/test/ipfilterharness.h	Wed Oct 16 20:23:47 2013 +0530
+++ b/source/test/ipfilterharness.h	Wed Oct 16 20:30:59 2013 +0530
@@ -45,6 +45,7 @@
     bool check_IPFilter_primitive(ipfilter_sp_t ref, ipfilter_sp_t opt);
     bool check_IPFilter_primitive(ipfilter_p2s_t ref, ipfilter_p2s_t opt);
     bool check_IPFilter_primitive(ipfilter_s2p_t ref, ipfilter_s2p_t opt);
+    bool check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt);
 
 public:
 


More information about the x265-devel mailing list