[x265] [PATCH 29 of 29] intrapred: remove deprecated intrapred angular functions

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Tue Jan 13 08:11:37 CET 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1421048129 -19800
#      Mon Jan 12 13:05:29 2015 +0530
# Node ID 9ff0b1b684eaea47b204e2bb4c7c987d3a10def0
# Parent  72b600d94c2a7a446d5dce118e54871dbc463187
intrapred: remove deprecated intrapred angular functions

Removed intra_pred_ang_c, the intra_pred_ang4_* (4x4) asm code, and the corresponding unit test code.

diff -r 72b600d94c2a -r 9ff0b1b684ea source/common/intrapred.cpp
--- a/source/common/intrapred.cpp	Fri Jan 09 11:02:49 2015 +0530
+++ b/source/common/intrapred.cpp	Mon Jan 12 13:05:29 2015 +0530
@@ -77,111 +77,6 @@
             dst[y * dstStride + x] = (pixel) (((blkSize - 1 - x) * left[y] + (blkSize - 1 -y) * above[x] + (x + 1) * topRight + (y + 1) * bottomLeft + blkSize) >> (log2Size + 1));
 }
 template<int width>
-void intra_pred_ang_c(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-{
-    // Map the mode index to main prediction direction and angle
-    int k, l;
-    bool modeHor       = (dirMode < 18);
-    bool modeVer       = !modeHor;
-    int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
-    int absAng         = abs(intraPredAngle);
-    int signAng        = intraPredAngle < 0 ? -1 : 1;
-
-    // Set bitshifts and scale the angle parameter to block size
-    static const int angTable[9]    = { 0,    2,    5,   9,  13,  17,  21,  26,  32 };
-    static const int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
-    int invAngle       = invAngTable[absAng];
-
-    absAng             = angTable[absAng];
-    intraPredAngle     = signAng * absAng;
-
-    // Do angular predictions
-    {
-        pixel* refMain;
-        pixel* refSide;
-
-        // Initialise the Main and Left reference array.
-        if (intraPredAngle < 0)
-        {
-            refMain = (modeVer ? refAbove : refLeft); // + (width - 1);
-            refSide = (modeVer ? refLeft : refAbove); // + (width - 1);
-
-            // Extend the Main reference to the left.
-            int invAngleSum    = 128; // rounding for (shift by 8)
-            for (k = -1; k > width * intraPredAngle >> 5; k--)
-            {
-                invAngleSum += invAngle;
-                refMain[k] = refSide[invAngleSum >> 8];
-            }
-        }
-        else
-        {
-            refMain = modeVer ? refAbove : refLeft;
-            refSide = modeVer ? refLeft  : refAbove;
-        }
-
-        if (intraPredAngle == 0)
-        {
-            for (k = 0; k < width; k++)
-            {
-                for (l = 0; l < width; l++)
-                    dst[k * dstStride + l] = refMain[l + 1];
-            }
-
-            if (bFilter)
-            {
-                for (k = 0; k < width; k++)
-                    dst[k * dstStride] = x265_clip((int16_t)((dst[k * dstStride]) + ((refSide[k + 1] - refSide[0]) >> 1)));
-            }
-        }
-        else
-        {
-            int deltaPos = 0;
-            int deltaInt;
-            int deltaFract;
-            int refMainIndex;
-
-            for (k = 0; k < width; k++)
-            {
-                deltaPos += intraPredAngle;
-                deltaInt   = deltaPos >> 5;
-                deltaFract = deltaPos & (32 - 1);
-
-                if (deltaFract)
-                {
-                    // Do linear filtering
-                    for (l = 0; l < width; l++)
-                    {
-                        refMainIndex = l + deltaInt + 1;
-                        dst[k * dstStride + l] = (pixel)(((32 - deltaFract) * refMain[refMainIndex] + deltaFract * refMain[refMainIndex + 1] + 16) >> 5);
-                    }
-                }
-                else
-                {
-                    // Just copy the integer samples
-                    for (l = 0; l < width; l++)
-                        dst[k * dstStride + l] = refMain[l + deltaInt + 1];
-                }
-            }
-        }
-
-        // Flip the block if this is the horizontal mode
-        if (modeHor)
-        {
-            for (k = 0; k < width - 1; k++)
-            {
-                for (l = k + 1; l < width; l++)
-                {
-                    pixel tmp              = dst[k * dstStride + l];
-                    dst[k * dstStride + l] = dst[l * dstStride + k];
-                    dst[l * dstStride + k] = tmp;
-                }
-            }
-        }
-    }
-}
-
-template<int width>
 void intra_pred_ang_c_new(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
 {
     int width2 = width << 1;
@@ -330,11 +225,6 @@
 
     for (int i = 2; i < NUM_INTRA_MODE; i++)
     {
-        p.intra_pred[i][BLOCK_4x4] = intra_pred_ang_c<4>;
-        p.intra_pred[i][BLOCK_8x8] = intra_pred_ang_c<8>;
-        p.intra_pred[i][BLOCK_16x16] = intra_pred_ang_c<16>;
-        p.intra_pred[i][BLOCK_32x32] = intra_pred_ang_c<32>;
-
         p.intra_pred_new[i][BLOCK_4x4] = intra_pred_ang_c_new<4>;
         p.intra_pred_new[i][BLOCK_8x8] = intra_pred_ang_c_new<8>;
         p.intra_pred_new[i][BLOCK_16x16] = intra_pred_ang_c_new<16>;
diff -r 72b600d94c2a -r 9ff0b1b684ea source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Jan 09 11:02:49 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Jan 12 13:05:29 2015 +0530
@@ -945,23 +945,12 @@
     SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 48, cpu); \
     SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 64, cpu);
 
-#define SETUP_INTRA_ANG_COMMON(mode, fno, cpu) \
-    p.intra_pred[mode][BLOCK_4x4] = x265_intra_pred_ang4_ ## fno ## _ ## cpu; \
-
 #define SETUP_INTRA_ANG_COMMON_NEW(mode, fno, cpu) \
     p.intra_pred_new[mode][BLOCK_4x4] = x265_intra_pred_ang4_ ## fno ## _new_ ## cpu; \
     p.intra_pred_new[mode][BLOCK_8x8] = x265_intra_pred_ang8_ ## fno ## _new_ ## cpu; \
     p.intra_pred_new[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _new_ ## cpu; \
     p.intra_pred_new[mode][BLOCK_32x32] = x265_intra_pred_ang32_ ## fno ## _new_ ## cpu;
 
-#define SETUP_INTRA_ANG_HIGH(mode, fno, cpu) \
-    p.intra_pred_new[mode][BLOCK_8x8] = x265_intra_pred_ang8_ ## fno ## _new_ ## cpu; \
-    p.intra_pred_new[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _new_ ## cpu; \
-    p.intra_pred_new[mode][BLOCK_32x32] = x265_intra_pred_ang32_ ## fno ## _new_ ## cpu;
-
-#define SETUP_INTRA_ANG4(mode, fno, cpu) \
-    p.intra_pred[mode][BLOCK_4x4] = x265_intra_pred_ang4_ ## fno ## _ ## cpu;
-
 #define SETUP_INTRA_ANG4_NEW(mode, fno, cpu) \
     p.intra_pred_new[mode][BLOCK_4x4] = x265_intra_pred_ang4_ ## fno ## _new_ ## cpu;
 
@@ -969,39 +958,14 @@
     p.intra_pred_new[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _new_ ## cpu; \
     p.intra_pred_new[mode][BLOCK_32x32] = x265_intra_pred_ang32_ ## fno ## _new_ ## cpu;
 
-#define SETUP_INTRA_ANG4_8(mode, fno, cpu) \
-    p.intra_pred[mode][BLOCK_4x4] = x265_intra_pred_ang4_ ## fno ## _ ## cpu; \
-
 #define SETUP_INTRA_ANG4_8_NEW(mode, fno, cpu) \
     p.intra_pred_new[mode][BLOCK_4x4] = x265_intra_pred_ang4_ ## fno ## _new_ ## cpu; \
     p.intra_pred_new[mode][BLOCK_8x8] = x265_intra_pred_ang8_ ## fno ## _new_ ## cpu;
 
-#define INTRA_ANG_SSSE3(cpu) \
-    SETUP_INTRA_ANG_COMMON(2, 2, cpu); \
-    SETUP_INTRA_ANG_COMMON(34, 2, cpu);
-
 #define INTRA_ANG_SSSE3_NEW(cpu) \
     SETUP_INTRA_ANG_COMMON_NEW(2, 2, cpu); \
     SETUP_INTRA_ANG_COMMON_NEW(34, 2, cpu);
 
-#define INTRA_ANG_SSE4_COMMON(cpu) \
-    SETUP_INTRA_ANG_COMMON(3,  3,  cpu); \
-    SETUP_INTRA_ANG_COMMON(4,  4,  cpu); \
-    SETUP_INTRA_ANG_COMMON(5,  5,  cpu); \
-    SETUP_INTRA_ANG_COMMON(6,  6,  cpu); \
-    SETUP_INTRA_ANG_COMMON(7,  7,  cpu); \
-    SETUP_INTRA_ANG_COMMON(8,  8,  cpu); \
-    SETUP_INTRA_ANG_COMMON(9,  9,  cpu); \
-    SETUP_INTRA_ANG_COMMON(10, 10, cpu); \
-    SETUP_INTRA_ANG_COMMON(11, 11, cpu); \
-    SETUP_INTRA_ANG_COMMON(12, 12, cpu); \
-    SETUP_INTRA_ANG_COMMON(13, 13, cpu); \
-    SETUP_INTRA_ANG_COMMON(14, 14, cpu); \
-    SETUP_INTRA_ANG_COMMON(15, 15, cpu); \
-    SETUP_INTRA_ANG_COMMON(16, 16, cpu); \
-    SETUP_INTRA_ANG_COMMON(17, 17, cpu); \
-    SETUP_INTRA_ANG_COMMON(18, 18, cpu);
-
 #define INTRA_ANG_SSE4_COMMON_NEW(cpu) \
     SETUP_INTRA_ANG_COMMON_NEW(3,  3,  cpu); \
     SETUP_INTRA_ANG_COMMON_NEW(4,  4,  cpu); \
@@ -1020,22 +984,10 @@
     SETUP_INTRA_ANG_COMMON_NEW(17, 17, cpu); \
     SETUP_INTRA_ANG_COMMON_NEW(18, 18, cpu);
 
-#define INTRA_ANG_SSE4_HIGH(cpu) \
-    SETUP_INTRA_ANG4(19, 17, cpu); \
-    SETUP_INTRA_ANG4(20, 16, cpu); \
-    SETUP_INTRA_ANG4(21, 15, cpu); \
-    SETUP_INTRA_ANG4(22, 14, cpu); \
-    SETUP_INTRA_ANG4(23, 13, cpu); \
-    SETUP_INTRA_ANG4(24, 12, cpu); \
-    SETUP_INTRA_ANG4(25, 11, cpu); \
-    SETUP_INTRA_ANG4(26, 26, cpu); \
-    SETUP_INTRA_ANG4(27, 9, cpu); \
-    SETUP_INTRA_ANG4(28, 8, cpu); \
-    SETUP_INTRA_ANG4(29, 7, cpu); \
-    SETUP_INTRA_ANG4(30, 6, cpu); \
-    SETUP_INTRA_ANG4(31, 5, cpu); \
-    SETUP_INTRA_ANG4(32, 4, cpu); \
-    SETUP_INTRA_ANG4(33, 3, cpu);
+#define SETUP_INTRA_ANG_HIGH(mode, fno, cpu) \
+    p.intra_pred_new[mode][BLOCK_8x8] = x265_intra_pred_ang8_ ## fno ## _new_ ## cpu; \
+    p.intra_pred_new[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _new_ ## cpu; \
+    p.intra_pred_new[mode][BLOCK_32x32] = x265_intra_pred_ang32_ ## fno ## _new_ ## cpu;
 
 #define INTRA_ANG_SSE4_HIGH_NEW(cpu) \
     SETUP_INTRA_ANG_HIGH(19, 19, cpu); \
@@ -1069,23 +1021,6 @@
     SETUP_INTRA_ANG4_NEW(32, 4, cpu); \
     SETUP_INTRA_ANG4_NEW(33, 3, cpu);
 
-#define INTRA_ANG_SSE4(cpu) \
-    SETUP_INTRA_ANG4_8(19, 17, cpu); \
-    SETUP_INTRA_ANG4_8(20, 16, cpu); \
-    SETUP_INTRA_ANG4_8(21, 15, cpu); \
-    SETUP_INTRA_ANG4_8(22, 14, cpu); \
-    SETUP_INTRA_ANG4_8(23, 13, cpu); \
-    SETUP_INTRA_ANG4_8(24, 12, cpu); \
-    SETUP_INTRA_ANG4_8(25, 11, cpu); \
-    SETUP_INTRA_ANG4_8(26, 26, cpu); \
-    SETUP_INTRA_ANG4_8(27, 9, cpu); \
-    SETUP_INTRA_ANG4_8(28, 8, cpu); \
-    SETUP_INTRA_ANG4_8(29, 7, cpu); \
-    SETUP_INTRA_ANG4_8(30, 6, cpu); \
-    SETUP_INTRA_ANG4_8(31, 5, cpu); \
-    SETUP_INTRA_ANG4_8(32, 4, cpu); \
-    SETUP_INTRA_ANG4_8(33, 3, cpu);
-
 #define INTRA_ANG_SSE4_NEW(cpu) \
     SETUP_INTRA_ANG4_8_NEW(19, 17, cpu); \
     SETUP_INTRA_ANG4_8_NEW(20, 16, cpu); \
@@ -1452,7 +1387,6 @@
         p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
         p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
 
-        INTRA_ANG_SSSE3(ssse3);
         INTRA_ANG_SSSE3_NEW(ssse3);
 
         p.dst4x4 = x265_dst4_ssse3;
@@ -1488,9 +1422,6 @@
 
         p.planecopy_cp = x265_upShift_8_sse4;
 
-        INTRA_ANG_SSE4_COMMON(sse4);
-        INTRA_ANG_SSE4_HIGH(sse4);
-
         INTRA_ANG_SSE4_COMMON_NEW(sse4);
         INTRA_ANG_SSE4_HIGH_NEW(sse4);
 
@@ -1671,7 +1602,6 @@
         PIXEL_AVG(ssse3);
         PIXEL_AVG_W4(ssse3);
 
-        INTRA_ANG_SSSE3(ssse3);
         INTRA_ANG_SSSE3_NEW(ssse3);
 
         p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
@@ -1778,9 +1708,6 @@
         p.intra_pred_new[1][BLOCK_16x16] = x265_intra_pred_dc16_new_sse4;
         p.intra_pred_new[1][BLOCK_32x32] = x265_intra_pred_dc32_new_sse4;
 
-        INTRA_ANG_SSE4_COMMON(sse4);
-        INTRA_ANG_SSE4(sse4);
-
         INTRA_ANG_SSE4_COMMON_NEW(sse4);
         INTRA_ANG_SSE4_NEW(sse4);
 
diff -r 72b600d94c2a -r 9ff0b1b684ea source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Fri Jan 09 11:02:49 2015 +0530
+++ b/source/common/x86/intrapred.h	Mon Jan 12 13:05:29 2015 +0530
@@ -37,7 +37,6 @@
 void x265_intra_pred_planar32_new_sse4(pixel* dst, intptr_t dstStride, pixel* above, int, int);
 
 #define DECL_ANG(bsize, mode, cpu) \
-    void x265_intra_pred_ang ## bsize ## _ ## mode ## _ ## cpu(pixel * dst, intptr_t dstStride, pixel * refLeft, pixel * refAbove, int dirMode, int bFilter); \
     void x265_intra_pred_ang ## bsize ## _ ## mode ## _new_ ## cpu(pixel* dst, intptr_t dstStride, pixel* above, int dirMode, int bFilter);
 
 DECL_ANG(4, 2, ssse3);
diff -r 72b600d94c2a -r 9ff0b1b684ea source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Fri Jan 09 11:02:49 2015 +0530
+++ b/source/common/x86/intrapred16.asm	Mon Jan 12 13:05:29 2015 +0530
@@ -857,426 +857,6 @@
     mov             rsp, r6
     RET
 
-;-----------------------------------------------------------------------------
-; void intraPredAng(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------
-INIT_XMM ssse3
-cglobal intra_pred_ang4_2, 3,3,4
-    cmp         r4m,           byte 34
-    cmove       r2,            r3mp
-    add         r1,            r1
-    movu        m0,            [r2 + 4]
-    movh        [r0],          m0
-    palignr     m1,            m0, 2
-    movh        [r0 + r1],     m1
-    palignr     m2,            m0, 4
-    movh        [r0 + r1 * 2], m2
-    lea         r1,            [r1 * 3]
-    psrldq      m0,            6
-    movh        [r0 + r1],     m0
-    RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang4_3, 3,4,8
-    cmp         r4m, byte 33
-    cmove       r2, r3mp
-    lea         r3, [ang_table + 20 * 16]
-    movu        m0, [r2 + 2]    ; [8 7 6 5 4 3 2 1]
-    palignr     m1, m0, 2       ; [x 8 7 6 5 4 3 2]
-    punpcklwd   m2, m0, m1      ; [5 4 4 3 3 2 2 1]
-    palignr     m5, m0, 4       ; [x x 8 7 6 5 4 3]
-    punpcklwd   m3, m1, m5      ; [6 5 5 4 4 3 3 2]
-    palignr     m1, m0, 6       ; [x x x 8 7 6 5 4]
-    punpcklwd   m4, m5 ,m1      ; [7 6 6 5 5 4 4 3]
-    movhlps     m0, m0          ; [x x x x 8 7 6 5]
-    punpcklwd   m5, m1, m0      ; [8 7 7 6 6 5 5 4]
-
-    mova        m0, [r3 + 6 * 16]   ; [26]
-    mova        m1, [r3]            ; [20]
-    mova        m6, [r3 - 6 * 16]   ; [14]
-    mova        m7, [r3 - 12 * 16]  ; [ 8]
-    jmp        .do_filter4x4
-
-ALIGN 16
-.do_filter4x4:
-    pmaddwd m2, m0
-    paddd   m2, [pd_16]
-    psrld   m2, 5
-
-    pmaddwd m3, m1
-    paddd   m3, [pd_16]
-    psrld   m3, 5
-    packusdw m2, m3
-
-    pmaddwd m4, m6
-    paddd   m4, [pd_16]
-    psrld   m4, 5
-
-    pmaddwd m5, m7
-    paddd   m5, [pd_16]
-    psrld   m5, 5
-    packusdw m4, m5
-
-    jz         .store
-
-    ; transpose 4x4
-    punpckhwd    m0, m2, m4
-    punpcklwd    m2, m4
-    punpckhwd    m4, m2, m0
-    punpcklwd    m2, m0
-
-.store:
-    add         r1, r1
-    movh        [r0], m2
-    movhps      [r0 + r1], m2
-    movh        [r0 + r1 * 2], m4
-    lea         r1, [r1 * 3]
-    movhps      [r0 + r1], m4
-    RET
-
-cglobal intra_pred_ang4_4, 3,4,8
-    cmp         r4m, byte 32
-    cmove       r2, r3mp
-    lea         r3, [ang_table + 18 * 16]
-    movu        m0, [r2 + 2]    ; [8 7 6 5 4 3 2 1]
-    palignr     m1, m0, 2       ; [x 8 7 6 5 4 3 2]
-    punpcklwd   m2, m0, m1      ; [5 4 4 3 3 2 2 1]
-    palignr     m6, m0, 4       ; [x x 8 7 6 5 4 3]
-    punpcklwd   m3, m1, m6      ; [6 5 5 4 4 3 3 2]
-    mova        m4, m3
-    palignr     m7, m0, 6       ; [x x x 8 7 6 5 4]
-    punpcklwd   m5, m6, m7      ; [7 6 6 5 5 4 4 3]
-
-    mova        m0, [r3 +  3 * 16]  ; [21]
-    mova        m1, [r3 -  8 * 16]  ; [10]
-    mova        m6, [r3 + 13 * 16]  ; [31]
-    mova        m7, [r3 +  2 * 16]  ; [20]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_5, 3,4,8
-    cmp         r4m, byte 31
-    cmove       r2, r3mp
-    lea         r3, [ang_table + 10 * 16]
-    movu        m0, [r2 + 2]    ; [8 7 6 5 4 3 2 1]
-    palignr     m1, m0, 2       ; [x 8 7 6 5 4 3 2]
-    punpcklwd   m2, m0, m1      ; [5 4 4 3 3 2 2 1]
-    palignr     m6, m0, 4       ; [x x 8 7 6 5 4 3]
-    punpcklwd   m3, m1, m6      ; [6 5 5 4 4 3 3 2]
-    mova        m4, m3
-    palignr     m7, m0, 6       ; [x x x 8 7 6 5 4]
-    punpcklwd   m5, m6, m7      ; [7 6 6 5 5 4 4 3]
-
-    mova        m0, [r3 +  7 * 16]  ; [17]
-    mova        m1, [r3 -  8 * 16]  ; [ 2]
-    mova        m6, [r3 +  9 * 16]  ; [19]
-    mova        m7, [r3 -  6 * 16]  ; [ 4]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_6, 3,4,8
-    cmp         r4m, byte 30
-    cmove       r2, r3mp
-    lea         r3, [ang_table + 19 * 16]
-    movu        m0, [r2 + 2]    ; [8 7 6 5 4 3 2 1]
-    palignr     m1, m0, 2       ; [x 8 7 6 5 4 3 2]
-    punpcklwd   m2, m0, m1      ; [5 4 4 3 3 2 2 1]
-    mova        m3, m2
-    palignr     m6, m0, 4       ; [x x 8 7 6 5 4 3]
-    punpcklwd   m4, m1, m6      ; [6 5 5 4 4 3 3 2]
-    mova        m5, m4
-
-    mova        m0, [r3 -  6 * 16]  ; [13]
-    mova        m1, [r3 +  7 * 16]  ; [26]
-    mova        m6, [r3 - 12 * 16]  ; [ 7]
-    mova        m7, [r3 +  1 * 16]  ; [20]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_7, 3,4,8
-    cmp         r4m, byte 29
-    cmove       r2, r3mp
-    lea         r3, [ang_table + 20 * 16]
-    movu        m0, [r2 + 2]    ; [8 7 6 5 4 3 2 1]
-    palignr     m1, m0, 2       ; [x 8 7 6 5 4 3 2]
-    punpcklwd   m2, m0, m1      ; [5 4 4 3 3 2 2 1]
-    mova        m3, m2
-    mova        m4, m2
-    palignr     m6, m0, 4       ; [x x 8 7 6 5 4 3]
-    punpcklwd   m5, m1, m6      ; [6 5 5 4 4 3 3 2]
-
-    mova        m0, [r3 - 11 * 16]  ; [ 9]
-    mova        m1, [r3 -  2 * 16]  ; [18]
-    mova        m6, [r3 +  7 * 16]  ; [27]
-    mova        m7, [r3 - 16 * 16]  ; [ 4]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_8, 3,4,8
-    cmp         r4m, byte 28
-    cmove       r2, r3mp
-    lea         r3, [ang_table + 13 * 16]
-    movu        m0, [r2 + 2]    ; [8 7 6 5 4 3 2 1]
-    palignr     m1, m0, 2       ; [x 8 7 6 5 4 3 2]
-    punpcklwd   m2, m0, m1      ; [5 4 4 3 3 2 2 1]
-    mova        m3, m2
-    mova        m4, m2
-    mova        m5, m2
-
-    mova        m0, [r3 -  8 * 16]  ; [ 5]
-    mova        m1, [r3 -  3 * 16]  ; [10]
-    mova        m6, [r3 +  2 * 16]  ; [15]
-    mova        m7, [r3 +  7 * 16]  ; [20]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_9, 3,4,8
-    cmp         r4m, byte 27
-    cmove       r2, r3mp
-    lea         r3, [ang_table + 4 * 16]
-    movu        m0, [r2 + 2]    ; [8 7 6 5 4 3 2 1]
-    palignr     m1, m0, 2       ; [x 8 7 6 5 4 3 2]
-    punpcklwd   m2, m0, m1      ; [5 4 4 3 3 2 2 1]
-    mova        m3, m2
-    mova        m4, m2
-    mova        m5, m2
-
-    mova        m0, [r3 -  2 * 16]  ; [ 2]
-    mova        m1, [r3 -  0 * 16]  ; [ 4]
-    mova        m6, [r3 +  2 * 16]  ; [ 6]
-    mova        m7, [r3 +  4 * 16]  ; [ 8]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_10, 3,3,4
-    movh        m0,             [r2 + 2]            ; [4 3 2 1]
-    pshufb      m2,             m0, [pb_unpackwq2]  ; [4 4 4 4 3 3 3 3]
-    pshufb      m0,             [pb_unpackwq1]      ; [2 2 2 2 1 1 1 1]
-    add         r1,             r1
-    movhlps     m1,             m0                  ; [2 2 2 2]
-    movhlps     m3,             m2                  ; [4 4 4 4]
-    movh        [r0 + r1],      m1
-    movh        [r0 + r1 * 2],  m2
-    lea         r1,             [r1 * 3]
-    movh        [r0 + r1],      m3
-
-    cmp         r5m,            byte 0
-    jz         .quit
-
-    ; filter
-    mov         r2,             r3mp
-    movu        m1,             [r2]                ; [7 6 5 4 3 2 1 0]
-    pshufb      m2,             m1, [pb_unpackwq1]  ; [0 0 0 0]
-    palignr     m1,             m1, 2               ; [4 3 2 1]
-    psubw       m1,             m2
-    psraw       m1,             1
-    paddw       m0,             m1
-    pxor        m1,             m1
-    pmaxsw      m0,             m1
-    pminsw      m0,             [pw_1023]
-
-.quit:
-    movh        [r0],           m0
-    RET
-
-cglobal intra_pred_ang4_26, 4,4,3
-    movh        m0,             [r3 + 2]            ; [8 7 6 5 4 3 2 1]
-    add         r1,             r1
-    ; store
-    movh        [r0],           m0
-    movh        [r0 + r1],      m0
-    movh        [r0 + r1 * 2],  m0
-    lea         r3,             [r1 * 3]
-    movh        [r0 + r3],      m0
-
-    ; filter
-    cmp         r5m,            byte 0
-    jz         .quit
-
-    pshufb      m0,             [pb_unpackwq1]      ; [2 2 2 2 1 1 1 1]
-    movu        m1,             [r2]                ; [7 6 5 4 3 2 1 0]
-    pshufb      m2,             m1, [pb_unpackwq1]  ; [0 0 0 0]
-    palignr     m1,             m1, 2               ; [4 3 2 1]
-    psubw       m1,             m2
-    psraw       m1,             1
-    paddw       m0,             m1
-    pxor        m1,             m1
-    pmaxsw      m0,             m1
-    pminsw      m0,             [pw_1023]
-
-    pextrw      [r0],           m0, 0
-    pextrw      [r0 + r1],      m0, 1
-    pextrw      [r0 + r1 * 2],  m0, 2
-    pextrw      [r0 + r3],      m0, 3
-
-.quit:
-    RET
-
-cglobal intra_pred_ang4_11, 3,4,8
-    cmp         r4m, byte 25
-    cmove       r2, r3mp
-    lea         r3, [ang_table + 24 * 16]
-    movu        m2, [r2]        ; [x x x 4 3 2 1 0]
-    palignr     m1, m2, 2       ; [x x x x 4 3 2 1]
-    punpcklwd   m2, m1          ; [4 3 3 2 2 1 1 0]
-    mova        m3, m2
-    mova        m4, m2
-    mova        m5, m2
-
-    mova        m0, [r3 +  6 * 16]  ; [24]
-    mova        m1, [r3 +  4 * 16]  ; [26]
-    mova        m6, [r3 +  2 * 16]  ; [28]
-    mova        m7, [r3 +  0 * 16]  ; [30]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_12, 3,4,8
-    cmp         r4m, byte 24
-    cmove       r2, r3mp
-    lea         r3, [ang_table + 20 * 16]
-    movu        m2, [r2]        ; [x x x 4 3 2 1 0]
-    palignr     m1, m2, 2       ; [x x x x 4 3 2 1]
-    punpcklwd   m2, m1          ; [4 3 3 2 2 1 1 0]
-    mova        m3, m2
-    mova        m4, m2
-    mova        m5, m2
-
-    mova        m0, [r3 +  7 * 16]  ; [27]
-    mova        m1, [r3 +  2 * 16]  ; [22]
-    mova        m6, [r3 -  3 * 16]  ; [17]
-    mova        m7, [r3 -  8 * 16]  ; [12]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_13, 4,4,8
-    cmp         r4m, byte 23
-    jnz        .load
-    xchg        r2, r3
-.load:
-    movu        m5, [r2 - 2]    ; [x x 4 3 2 1 0 x]
-    palignr     m2, m5, 2       ; [x x x 4 3 2 1 0]
-    palignr     m0, m5, 4       ; [x x x x 4 3 2 1]
-    pinsrw      m5, [r3 + 8], 0
-    punpcklwd   m5, m2          ; [3 2 2 1 1 0 0 x]
-    punpcklwd   m2, m0          ; [4 3 3 2 2 1 1 0]
-    mova        m3, m2
-    mova        m4, m2
-
-    lea         r3, [ang_table + 21 * 16]
-    mova        m0, [r3 +  2 * 16]  ; [23]
-    mova        m1, [r3 -  7 * 16]  ; [14]
-    mova        m6, [r3 - 16 * 16]  ; [ 5]
-    mova        m7, [r3 +  7 * 16]  ; [28]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_14, 4,4,8
-    cmp         r4m, byte 22
-    jnz        .load
-    xchg        r2, r3
-.load:
-    movu        m5, [r2 - 2]    ; [x x 4 3 2 1 0 x]
-    palignr     m2, m5, 2       ; [x x x 4 3 2 1 0]
-    palignr     m0, m5, 4       ; [x x x x 4 3 2 1]
-    pinsrw      m5, [r3 + 4], 0
-    punpcklwd   m5, m2          ; [3 2 2 1 1 0 0 x]
-    punpcklwd   m2, m0          ; [4 3 3 2 2 1 1 0]
-    mova        m3, m2
-    mova        m4, m5
-
-    lea         r3, [ang_table + 19 * 16]
-    mova        m0, [r3 +  0 * 16]  ; [19]
-    mova        m1, [r3 - 13 * 16]  ; [ 6]
-    mova        m6, [r3 +  6 * 16]  ; [25]
-    mova        m7, [r3 -  7 * 16]  ; [12]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_15, 4,4,8
-    cmp         r4m, byte 21
-    jnz        .load
-    xchg        r2, r3
-.load:
-    movu        m3, [r2 - 2]    ; [x x 4 3 2 1 0 x]
-    palignr     m2, m3, 2       ; [x x x 4 3 2 1 0]
-    palignr     m0, m3, 4       ; [x x x x 4 3 2 1]
-    pinsrw      m3, [r3 + 4], 0
-    pslldq      m5, m3, 2       ; [x 4 3 2 1 0 x y]
-    pinsrw      m5, [r3 + 8], 0
-    punpcklwd   m5, m3          ; [2 1 1 0 0 x x y]
-    punpcklwd   m3, m2          ; [3 2 2 1 1 0 0 x]
-    punpcklwd   m2, m0          ; [4 3 3 2 2 1 1 0]
-    mova        m4, m3
-
-    lea         r3, [ang_table + 23 * 16]
-    mova        m0, [r3 -  8 * 16]  ; [15]
-    mova        m1, [r3 +  7 * 16]  ; [30]
-    mova        m6, [r3 - 10 * 16]  ; [13]
-    mova        m7, [r3 +  5 * 16]  ; [28]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_16, 4,4,8
-    cmp         r4m, byte 20
-    jnz        .load
-    xchg        r2, r3
-.load:
-    movu        m3, [r2 - 2]    ; [x x 4 3 2 1 0 x]
-    palignr     m2, m3, 2       ; [x x x 4 3 2 1 0]
-    palignr     m0, m3, 4       ; [x x x x 4 3 2 1]
-    pinsrw      m3, [r3 + 4], 0
-    pslldq      m5, m3, 2       ; [x 4 3 2 1 0 x y]
-    pinsrw      m5, [r3 + 6], 0
-    punpcklwd   m5, m3          ; [2 1 1 0 0 x x y]
-    punpcklwd   m3, m2          ; [3 2 2 1 1 0 0 x]
-    punpcklwd   m2, m0          ; [4 3 3 2 2 1 1 0]
-    mova        m4, m3
-
-    lea         r3, [ang_table + 19 * 16]
-    mova        m0, [r3 -  8 * 16]  ; [11]
-    mova        m1, [r3 +  3 * 16]  ; [22]
-    mova        m6, [r3 - 18 * 16]  ; [ 1]
-    mova        m7, [r3 -  7 * 16]  ; [12]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_17, 4,4,8
-    cmp         r4m, byte 19
-    jnz        .load
-    xchg        r2, r3
-.load:
-    movu        m6, [r2 - 2]    ; [- - 4 3 2 1 0 x]
-    palignr     m2, m6, 2       ; [- - - 4 3 2 1 0]
-    palignr     m1, m6, 4       ; [- - - - 4 3 2 1]
-    mova        m4, m2
-    punpcklwd   m2, m1          ; [4 3 3 2 2 1 1 0]
-
-    pinsrw      m6, [r3 + 2], 0
-    punpcklwd   m3, m6, m4      ; [3 2 2 1 1 0 0 x]
-
-    pslldq      m4, m6, 2       ; [- 4 3 2 1 0 x y]
-    pinsrw      m4, [r3 + 4], 0
-    pslldq      m5, m4, 2       ; [4 3 2 1 0 x y z]
-    pinsrw      m5, [r3 + 8], 0
-    punpcklwd   m5, m4          ; [1 0 0 x x y y z]
-    punpcklwd   m4, m6          ; [2 1 1 0 0 x x y]
-
-    lea         r3, [ang_table + 14 * 16]
-    mova        m0, [r3 -  8 * 16]  ; [ 6]
-    mova        m1, [r3 -  2 * 16]  ; [12]
-    mova        m6, [r3 +  4 * 16]  ; [18]
-    mova        m7, [r3 + 10 * 16]  ; [24]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_18, 4,4,1
-    movh        m0, [r2]
-    pshufb      m0, [pw_swap]
-    movhps      m0, [r3 + 2]
-    add         r1, r1
-    lea         r2, [r1 * 3]
-    movh        [r0 + r2], m0
-    psrldq      m0, 2
-    movh        [r0 + r1 * 2], m0
-    psrldq      m0, 2
-    movh        [r0 + r1], m0
-    psrldq      m0, 2
-    movh        [r0], m0
-    RET
-
 ;-----------------------------------------------------------------------------------------
 ; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
 ;-----------------------------------------------------------------------------------------
diff -r 72b600d94c2a -r 9ff0b1b684ea source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Fri Jan 09 11:02:49 2015 +0530
+++ b/source/common/x86/intrapred8.asm	Mon Jan 12 13:05:29 2015 +0530
@@ -710,418 +710,6 @@
     jnz             .loop
     RET
 
-;-----------------------------------------------------------------------------
-; void intraPredAng(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------
-INIT_XMM ssse3
-cglobal intra_pred_ang4_2, 3,3,4
-    cmp         r4m, byte 34
-    cmove       r2, r3mp
-    movh        m0, [r2 + 2]
-    movd        [r0], m0
-    palignr     m1, m0, 1
-    movd        [r0 + r1], m1
-    palignr     m2, m0, 2
-    movd        [r0 + r1 * 2], m2
-    lea         r1, [r1 * 3]
-    psrldq      m0, 3
-    movd        [r0 + r1], m0
-    RET
-
-
-INIT_XMM sse4
-cglobal intra_pred_ang4_3, 3,4,5
-    cmp         r4m, byte 33
-    cmove       r2, r3mp
-    lea         r3, [ang_table + 20 * 16]
-    movh        m0, [r2 + 1]    ; [8 7 6 5 4 3 2 1]
-    palignr     m1, m0, 1       ; [x 8 7 6 5 4 3 2]
-    punpcklbw   m0, m1          ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
-    palignr     m1, m0, 2       ; [x x x x x x x x 6 5 5 4 4 3 3 2]
-    palignr     m2, m0, 4       ; [x x x x x x x x 7 6 6 5 5 4 4 3]
-    palignr     m3, m0, 6       ; [x x x x x x x x 8 7 7 6 6 5 5 4]
-    punpcklqdq  m0, m1
-    punpcklqdq  m2, m3
-
-    movh        m3, [r3 + 6 * 16]   ; [26]
-    movhps      m3, [r3]            ; [20]
-    movh        m4, [r3 - 6 * 16]   ; [14]
-    movhps      m4, [r3 - 12 * 16]  ; [ 8]
-    jmp        .do_filter4x4
-
-    ; NOTE: share path, input is m0=[1 0], m2=[3 2], m3,m4=coef, flag_z=no_transpose
-ALIGN 16
-.do_filter4x4:
-    mova        m1, [pw_1024]
-
-    pmaddubsw   m0, m3
-    pmulhrsw    m0, m1
-    pmaddubsw   m2, m4
-    pmulhrsw    m2, m1
-    packuswb    m0, m2
-
-    ; NOTE: mode 33 doesn't reorde, UNSAFE but I don't use any instruction that affect eflag register before
-    jz         .store
-
-    ; transpose 4x4
-    pshufb      m0, [c_trans_4x4]
-
-.store:
-    ; TODO: use pextrd here after intrinsic ssse3 removed
-    movd        [r0], m0
-    pextrd      [r0 + r1], m0, 1
-    pextrd      [r0 + r1 * 2], m0, 2
-    lea         r1, [r1 * 3]
-    pextrd      [r0 + r1], m0, 3
-    RET
-
-
-cglobal intra_pred_ang4_4, 3,4,5
-    cmp         r4m, byte 32
-    cmove       r2, r3mp
-    lea         r3, [ang_table + 18 * 16]
-    movh        m0, [r2 + 1]    ; [8 7 6 5 4 3 2 1]
-    palignr     m1, m0, 1       ; [x 8 7 6 5 4 3 2]
-    punpcklbw   m0, m1          ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
-    palignr     m1, m0, 2       ; [x x x x x x x x 6 5 5 4 4 3 3 2]
-    palignr     m3, m0, 4       ; [x x x x x x x x 7 6 6 5 5 4 4 3]
-    punpcklqdq  m0, m1
-    punpcklqdq  m2, m1, m3
-
-    movh        m3, [r3 +  3 * 16]  ; [21]
-    movhps      m3, [r3 -  8 * 16]  ; [10]
-    movh        m4, [r3 + 13 * 16]  ; [31]
-    movhps      m4, [r3 +  2 * 16]  ; [20]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_5, 3,4,5
-    cmp         r4m, byte 31
-    cmove       r2, r3mp
-    lea         r3, [ang_table + 10 * 16]
-    movh        m0, [r2 + 1]    ; [8 7 6 5 4 3 2 1]
-    palignr     m1, m0, 1       ; [x 8 7 6 5 4 3 2]
-    punpcklbw   m0, m1          ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
-    palignr     m1, m0, 2       ; [x x x x x x x x 6 5 5 4 4 3 3 2]
-    palignr     m3, m0, 4       ; [x x x x x x x x 7 6 6 5 5 4 4 3]
-    punpcklqdq  m0, m1
-    punpcklqdq  m2, m1, m3
-
-    movh        m3, [r3 +  7 * 16]  ; [17]
-    movhps      m3, [r3 -  8 * 16]  ; [ 2]
-    movh        m4, [r3 +  9 * 16]  ; [19]
-    movhps      m4, [r3 -  6 * 16]  ; [ 4]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_6, 3,4,5
-    cmp         r4m, byte 30
-    cmove       r2, r3mp
-    lea         r3, [ang_table + 19 * 16]
-    movh        m0, [r2 + 1]    ; [8 7 6 5 4 3 2 1]
-    palignr     m1, m0, 1       ; [x 8 7 6 5 4 3 2]
-    punpcklbw   m0, m1          ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
-    palignr     m2, m0, 2       ; [x x x x x x x x 6 5 5 4 4 3 3 2]
-    punpcklqdq  m0, m0
-    punpcklqdq  m2, m2
-
-    movh        m3, [r3 -  6 * 16]  ; [13]
-    movhps      m3, [r3 +  7 * 16]  ; [26]
-    movh        m4, [r3 - 12 * 16]  ; [ 7]
-    movhps      m4, [r3 +  1 * 16]  ; [20]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_7, 3,4,5
-    cmp         r4m, byte 29
-    cmove       r2, r3mp
-    lea         r3, [ang_table + 20 * 16]
-    movh        m0, [r2 + 1]    ; [8 7 6 5 4 3 2 1]
-    palignr     m1, m0, 1       ; [x 8 7 6 5 4 3 2]
-    punpcklbw   m0, m1          ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
-    palignr     m3, m0, 2       ; [x x x x x x x x 6 5 5 4 4 3 3 2]
-    punpcklqdq  m2, m0, m3
-    punpcklqdq  m0, m0
-
-    movh        m3, [r3 - 11 * 16]  ; [ 9]
-    movhps      m3, [r3 -  2 * 16]  ; [18]
-    movh        m4, [r3 +  7 * 16]  ; [27]
-    movhps      m4, [r3 - 16 * 16]  ; [ 4]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_8, 3,4,5
-    cmp         r4m, byte 28
-    cmove       r2, r3mp
-    lea         r3, [ang_table + 13 * 16]
-    movh        m0, [r2 + 1]    ; [8 7 6 5 4 3 2 1]
-    palignr     m1, m0, 1       ; [x 8 7 6 5 4 3 2]
-    punpcklbw   m0, m1          ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
-    punpcklqdq  m0, m0
-    mova        m2, m0
-
-    movh        m3, [r3 -  8 * 16]  ; [ 5]
-    movhps      m3, [r3 -  3 * 16]  ; [10]
-    movh        m4, [r3 +  2 * 16]  ; [15]
-    movhps      m4, [r3 +  7 * 16]  ; [20]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_9, 3,4,5
-    cmp         r4m, byte 27
-    cmove       r2, r3mp
-    lea         r3, [ang_table + 4 * 16]
-    movh        m0, [r2 + 1]    ; [8 7 6 5 4 3 2 1]
-    palignr     m1, m0, 1       ; [x 8 7 6 5 4 3 2]
-    punpcklbw   m0, m1          ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
-    punpcklqdq  m0, m0
-    mova        m2, m0
-
-    movh        m3, [r3 -  2 * 16]  ; [ 2]
-    movhps      m3, [r3 -  0 * 16]  ; [ 4]
-    movh        m4, [r3 +  2 * 16]  ; [ 6]
-    movhps      m4, [r3 +  4 * 16]  ; [ 8]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_10, 3,3,4
-    movd        m0, [r2 + 1]            ; [8 7 6 5 4 3 2 1]
-    pshufb      m0, [pb_unpackbd1]
-
-    pshufd      m1, m0, 1
-    movhlps     m2, m0
-    pshufd      m3, m0, 3
-    movd        [r0 + r1], m1
-    movd        [r0 + r1 * 2], m2
-    lea         r1, [r1 * 3]
-    movd        [r0 + r1], m3
-
-    cmp         r5m, byte 0
-    jz         .quit
-
-    ; filter
-    mov         r2, r3mp
-    pmovzxbw    m0, m0                  ; [-1 -1 -1 -1]
-    movh        m1, [r2]                ; [4 3 2 1 0]
-    pshufb      m2, m1, [pb_0_8]        ; [0 0 0 0]
-    pshufb      m1, [pb_unpackbw1]      ; [4 3 2 1]
-    psubw       m1, m2
-    psraw       m1, 1
-    paddw       m0, m1
-    packuswb    m0, m0
-
-.quit:
-    movd        [r0], m0
-    RET
-
-
-INIT_XMM sse4
-cglobal intra_pred_ang4_26, 4,4,3
-    movd        m0, [r3 + 1]            ; [8 7 6 5 4 3 2 1]
-
-    ; store
-    movd        [r0], m0
-    movd        [r0 + r1], m0
-    movd        [r0 + r1 * 2], m0
-    lea         r3, [r1 * 3]
-    movd        [r0 + r3], m0
-
-    ; filter
-    cmp         r5m, byte 0
-    jz         .quit
-
-    pshufb      m0, [pb_0_8]            ; [ 1  1  1  1]
-    movh        m1, [r2]                ; [-4 -3 -2 -1 0]
-    pshufb      m2, m1, [pb_0_8]        ; [0 0 0 0]
-    pshufb      m1, [pb_unpackbw1]      ; [-4 -3 -2 -1]
-    psubw       m1, m2
-    psraw       m1, 1
-    paddw       m0, m1
-    packuswb    m0, m0
-
-    pextrb      [r0], m0, 0
-    pextrb      [r0 + r1], m0, 1
-    pextrb      [r0 + r1 * 2], m0, 2
-    pextrb      [r0 + r3], m0, 3
-
-.quit:
-    RET
-
-
-cglobal intra_pred_ang4_11, 3,4,5
-    cmp         r4m, byte 25
-    cmove       r2, r3mp
-    lea         r3, [ang_table + 24 * 16]
-    movh        m0, [r2]        ; [x x x 4 3 2 1 0]
-    palignr     m1, m0, 1       ; [x x x x 4 3 2 1]
-    punpcklbw   m0, m1          ; [x x x x x x x x 4 3 3 2 2 1 1 0]
-    punpcklqdq  m0, m0
-    mova        m2, m0
-
-    movh        m3, [r3 +  6 * 16]  ; [24]
-    movhps      m3, [r3 +  4 * 16]  ; [26]
-    movh        m4, [r3 +  2 * 16]  ; [28]
-    movhps      m4, [r3 +  0 * 16]  ; [30]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_12, 3,4,5
-    cmp         r4m, byte 24
-    cmove       r2, r3mp
-    lea         r3, [ang_table + 20 * 16]
-    movh        m0, [r2]        ; [x x x 4 3 2 1 0]
-    palignr     m1, m0, 1       ; [x x x x 4 3 2 1]
-    punpcklbw   m0, m1          ; [x x x x x x x x 4 3 3 2 2 1 1 0]
-    punpcklqdq  m0, m0
-    mova        m2, m0
-
-    movh        m3, [r3 +  7 * 16]  ; [27]
-    movhps      m3, [r3 +  2 * 16]  ; [22]
-    movh        m4, [r3 -  3 * 16]  ; [17]
-    movhps      m4, [r3 -  8 * 16]  ; [12]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_13, 4,4,5
-    cmp         r4m, byte 23
-    jnz        .load
-    xchg        r2, r3
-.load:
-    movh        m1, [r2 - 1]    ; [x x 4 3 2 1 0 x]
-    palignr     m0, m1, 1       ; [x x x 4 3 2 1 0]
-    palignr     m2, m1, 2       ; [x x x x 4 3 2 1]
-    pinsrb      m1, [r3 + 4], 0
-    punpcklbw   m1, m0          ; [3 2 2 1 1 0 0 x]
-    punpcklbw   m0, m2          ; [4 3 3 2 2 1 1 0]
-    punpcklqdq  m2, m0, m1
-    punpcklqdq  m0, m0
-
-    lea         r3, [ang_table + 21 * 16]
-    movh        m3, [r3 +  2 * 16]  ; [23]
-    movhps      m3, [r3 -  7 * 16]  ; [14]
-    movh        m4, [r3 - 16 * 16]  ; [ 5]
-    movhps      m4, [r3 +  7 * 16]  ; [28]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_14, 4,4,5
-    cmp         r4m, byte 22
-    jnz        .load
-    xchg        r2, r3
-.load:
-    movh        m2, [r2 - 1]    ; [x x 4 3 2 1 0 x]
-    palignr     m0, m2, 1       ; [x x x 4 3 2 1 0]
-    palignr     m1, m2, 2       ; [x x x x 4 3 2 1]
-    pinsrb      m2, [r3 + 2], 0
-    punpcklbw   m2, m0          ; [3 2 2 1 1 0 0 x]
-    punpcklbw   m0, m1          ; [4 3 3 2 2 1 1 0]
-    punpcklqdq  m0, m0
-    punpcklqdq  m2, m2
-
-    lea         r3, [ang_table + 19 * 16]
-    movh        m3, [r3 +  0 * 16]  ; [19]
-    movhps      m3, [r3 - 13 * 16]  ; [ 6]
-    movh        m4, [r3 +  6 * 16]  ; [25]
-    movhps      m4, [r3 -  7 * 16]  ; [12]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_15, 4,4,5
-    cmp         r4m, byte 21
-    jnz        .load
-    xchg        r2, r3
-.load:
-    movh        m2, [r2 - 1]    ; [x x 4 3 2 1 0 x]
-    palignr     m0, m2, 1       ; [x x x 4 3 2 1 0]
-    palignr     m1, m2, 2       ; [x x x x 4 3 2 1]
-    pinsrb      m2, [r3 + 2], 0
-    pslldq      m3, m2, 1       ; [x 4 3 2 1 0 x y]
-    pinsrb      m3, [r3 + 4], 0
-    punpcklbw   m4, m3, m2      ; [2 1 1 0 0 x x y]
-    punpcklbw   m2, m0          ; [3 2 2 1 1 0 0 x]
-    punpcklbw   m0, m1          ; [4 3 3 2 2 1 1 0]
-    punpcklqdq  m0, m2
-    punpcklqdq  m2, m4
-
-    lea         r3, [ang_table + 23 * 16]
-    movh        m3, [r3 -  8 * 16]  ; [15]
-    movhps      m3, [r3 +  7 * 16]  ; [30]
-    movh        m4, [r3 - 10 * 16]  ; [13]
-    movhps      m4, [r3 +  5 * 16]  ; [28]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_16, 4,4,5
-    cmp         r4m, byte 20
-    jnz        .load
-    xchg        r2, r3
-.load:
-    movh        m2, [r2 - 1]    ; [x x 4 3 2 1 0 x]
-    palignr     m0, m2, 1       ; [x x x 4 3 2 1 0]
-    palignr     m1, m2, 2       ; [x x x x 4 3 2 1]
-    pinsrb      m2, [r3 + 2], 0
-    pslldq      m3, m2, 1       ; [x 4 3 2 1 0 x y]
-    pinsrb      m3, [r3 + 3], 0
-    punpcklbw   m4, m3, m2      ; [2 1 1 0 0 x x y]
-    punpcklbw   m2, m0          ; [3 2 2 1 1 0 0 x]
-    punpcklbw   m0, m1          ; [4 3 3 2 2 1 1 0]
-    punpcklqdq  m0, m2
-    punpcklqdq  m2, m4
-
-    lea         r3, [ang_table + 19 * 16]
-    movh        m3, [r3 -  8 * 16]  ; [11]
-    movhps      m3, [r3 +  3 * 16]  ; [22]
-    movh        m4, [r3 - 18 * 16]  ; [ 1]
-    movhps      m4, [r3 -  7 * 16]  ; [12]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_17, 4,4,5
-    cmp         r4m, byte 19
-    jnz        .load
-    xchg        r2, r3
-.load:
-    movh        m3, [r2 - 1]    ; [- - 4 3 2 1 0 x]
-    palignr     m0, m3, 1       ; [- - - 4 3 2 1 0]
-    palignr     m1, m3, 2       ; [- - - - 4 3 2 1]
-    mova        m4, m0
-    punpcklbw   m0, m1          ; [4 3 3 2 2 1 1 0]
-
-    pinsrb      m3, [r3 + 1], 0
-    punpcklbw   m1, m3, m4      ; [3 2 2 1 1 0 0 x]
-    punpcklqdq  m0, m1
-
-    pslldq      m2, m3, 1       ; [- 4 3 2 1 0 x y]
-    pinsrb      m2, [r3 + 2], 0
-    pslldq      m1, m2, 1       ; [4 3 2 1 0 x y z]
-    pinsrb      m1, [r3 + 4], 0
-    punpcklbw   m1, m2          ; [1 0 0 x x y y z]
-    punpcklbw   m2, m3          ; [2 1 1 0 0 x x y]
-    punpcklqdq  m2, m1
-
-    lea         r3, [ang_table + 14 * 16]
-    movh        m3, [r3 -  8 * 16]  ; [ 6]
-    movhps      m3, [r3 -  2 * 16]  ; [12]
-    movh        m4, [r3 +  4 * 16]  ; [18]
-    movhps      m4, [r3 + 10 * 16]  ; [24]
-    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_18, 4,4,1
-    mov         r2d, [r2]
-    bswap       r2d
-    movd        m0, r2d
-    pinsrd      m0, [r3 + 1], 1     ; [- 3 2 1 0 -1 -2 -3]
-    lea         r2, [r1 * 3]
-    movd        [r0 + r2], m0
-    psrldq      m0, 1
-    movd        [r0 + r1 * 2], m0
-    psrldq      m0, 1
-    movd        [r0 + r1], m0
-    psrldq      m0, 1
-    movd        [r0], m0
-    RET
-
 ;-----------------------------------------------------------------------------------------
 ; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
 ;-----------------------------------------------------------------------------------------
diff -r 72b600d94c2a -r 9ff0b1b684ea source/test/intrapredharness.cpp
--- a/source/test/intrapredharness.cpp	Fri Jan 09 11:02:49 2015 +0530
+++ b/source/test/intrapredharness.cpp	Mon Jan 12 13:05:29 2015 +0530
@@ -93,50 +93,6 @@
     return true;
 }
 
-bool IntraPredHarness::check_angular_primitive(const intra_pred_t ref[][NUM_TR_SIZE], const intra_pred_t opt[][NUM_TR_SIZE])
-{
-    int j = Predict::ADI_BUF_STRIDE;
-    intptr_t stride = FENC_STRIDE;
-
-#if _DEBUG
-    memset(pixel_out_vec, 0xCD, OUTPUT_SIZE);
-    memset(pixel_out_c, 0xCD, OUTPUT_SIZE);
-#endif
-
-    for (int size = 2; size <= 5; size++)
-    {
-        int width = (1 << size);
-        for (int i = 0; i <= 100; i++)
-        {
-            int bFilter = (width <= 16) && (rand() % 2);
-            for (int pmode = 2; pmode <= 34; pmode++)
-            {
-                if (!opt[pmode][size - 2])
-                    continue;
-
-                pixel * refAbove = pixel_buff + j;
-                pixel * refLeft = refAbove + 3 * width;
-                refLeft[0] = refAbove[0];
-
-                checked(opt[pmode][size - 2], pixel_out_vec, stride, refLeft, refAbove, pmode, bFilter);
-                ref[pmode][size - 2](pixel_out_c, stride, refLeft, refAbove, pmode, bFilter);
-
-                for (int k = 0; k < width; k++)
-                {
-                    if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width * sizeof(pixel)))
-                        return false;
-                }
-
-                reportfail();
-            }
-
-            j += FENC_STRIDE;
-        }
-    }
-
-    return true;
-}
-
 bool IntraPredHarness::check_angular_primitive(const intra_pred_new_t ref[][NUM_TR_SIZE], const intra_pred_new_t opt[][NUM_TR_SIZE])
 {
     int j = Predict::ADI_BUF_STRIDE;
@@ -254,12 +210,6 @@
     }
 
     // NOTE: always call since this function have check pointer in loop
-    if (!check_angular_primitive(ref.intra_pred, opt.intra_pred))
-    {
-        printf("intra_angular failed\n");
-        return false;
-    }
-
     if (!check_angular_primitive(ref.intra_pred_new, opt.intra_pred_new))
     {
         printf("intra_angular failed\n");
@@ -321,25 +271,6 @@
         for (int p = 2; p <= 34; p += 1)
         {
             int pmode = p;  //(rand()%33)+2;
-            if (opt.intra_pred[pmode][ii - 2])
-            {
-                width = (1 << ii);
-                bool bFilter = (width <= 16);
-                pixel * refAbove = pixel_buff + srcStride;
-                pixel * refLeft = refAbove + 3 * width;
-                refLeft[0] = refAbove[0];
-                printf("intra_ang%dx%d[%2d]", width, width, pmode);
-                REPORT_SPEEDUP(opt.intra_pred[pmode][ii - 2], ref.intra_pred[pmode][ii - 2],
-                               pixel_out_vec, FENC_STRIDE, refAbove, refLeft, pmode, bFilter);
-            }
-        }
-    }
-
-    for (int ii = 2; ii <= 5; ii++)
-    {
-        for (int p = 2; p <= 34; p += 1)
-        {
-            int pmode = p;  //(rand()%33)+2;
             if (opt.intra_pred_new[pmode][ii - 2])
             {
                 width = (1 << ii);
diff -r 72b600d94c2a -r 9ff0b1b684ea source/test/intrapredharness.h
--- a/source/test/intrapredharness.h	Fri Jan 09 11:02:49 2015 +0530
+++ b/source/test/intrapredharness.h	Mon Jan 12 13:05:29 2015 +0530
@@ -43,7 +43,6 @@
 
     bool check_dc_primitive(intra_pred_new_t ref, intra_pred_new_t opt, int width);
     bool check_planar_primitive(intra_pred_new_t ref, intra_pred_new_t opt, int width);
-    bool check_angular_primitive(const intra_pred_t ref[][NUM_TR_SIZE], const intra_pred_t opt[][NUM_TR_SIZE]);
     bool check_angular_primitive(const intra_pred_new_t ref[][NUM_TR_SIZE], const intra_pred_new_t opt[][NUM_TR_SIZE]);
     bool check_allangs_new_primitive(const intra_allangs_new_t ref[], const intra_allangs_new_t opt[]);
 



More information about the x265-devel mailing list