[x265] [PATCH 29 of 29] intrapred: remove deprecated intrapred angular functions
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Tue Jan 13 08:11:37 CET 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1421048129 -19800
# Mon Jan 12 13:05:29 2015 +0530
# Node ID 9ff0b1b684eaea47b204e2bb4c7c987d3a10def0
# Parent 72b600d94c2a7a446d5dce118e54871dbc463187
intrapred: remove deprecated intrapred angular functions
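Removed the deprecated intra_pred_ang_c C primitive, the intra_pred_ang4_* assembly routines in intrapred8.asm and intrapred16.asm, their setup macros and declarations, and the corresponding unit test code.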
diff -r 72b600d94c2a -r 9ff0b1b684ea source/common/intrapred.cpp
--- a/source/common/intrapred.cpp Fri Jan 09 11:02:49 2015 +0530
+++ b/source/common/intrapred.cpp Mon Jan 12 13:05:29 2015 +0530
@@ -77,111 +77,6 @@
dst[y * dstStride + x] = (pixel) (((blkSize - 1 - x) * left[y] + (blkSize - 1 -y) * above[x] + (x + 1) * topRight + (y + 1) * bottomLeft + blkSize) >> (log2Size + 1));
}
template<int width>
-void intra_pred_ang_c(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-{
- // Map the mode index to main prediction direction and angle
- int k, l;
- bool modeHor = (dirMode < 18);
- bool modeVer = !modeHor;
- int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
- int absAng = abs(intraPredAngle);
- int signAng = intraPredAngle < 0 ? -1 : 1;
-
- // Set bitshifts and scale the angle parameter to block size
- static const int angTable[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 };
- static const int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 }; // (256 * 32) / Angle
- int invAngle = invAngTable[absAng];
-
- absAng = angTable[absAng];
- intraPredAngle = signAng * absAng;
-
- // Do angular predictions
- {
- pixel* refMain;
- pixel* refSide;
-
- // Initialise the Main and Left reference array.
- if (intraPredAngle < 0)
- {
- refMain = (modeVer ? refAbove : refLeft); // + (width - 1);
- refSide = (modeVer ? refLeft : refAbove); // + (width - 1);
-
- // Extend the Main reference to the left.
- int invAngleSum = 128; // rounding for (shift by 8)
- for (k = -1; k > width * intraPredAngle >> 5; k--)
- {
- invAngleSum += invAngle;
- refMain[k] = refSide[invAngleSum >> 8];
- }
- }
- else
- {
- refMain = modeVer ? refAbove : refLeft;
- refSide = modeVer ? refLeft : refAbove;
- }
-
- if (intraPredAngle == 0)
- {
- for (k = 0; k < width; k++)
- {
- for (l = 0; l < width; l++)
- dst[k * dstStride + l] = refMain[l + 1];
- }
-
- if (bFilter)
- {
- for (k = 0; k < width; k++)
- dst[k * dstStride] = x265_clip((int16_t)((dst[k * dstStride]) + ((refSide[k + 1] - refSide[0]) >> 1)));
- }
- }
- else
- {
- int deltaPos = 0;
- int deltaInt;
- int deltaFract;
- int refMainIndex;
-
- for (k = 0; k < width; k++)
- {
- deltaPos += intraPredAngle;
- deltaInt = deltaPos >> 5;
- deltaFract = deltaPos & (32 - 1);
-
- if (deltaFract)
- {
- // Do linear filtering
- for (l = 0; l < width; l++)
- {
- refMainIndex = l + deltaInt + 1;
- dst[k * dstStride + l] = (pixel)(((32 - deltaFract) * refMain[refMainIndex] + deltaFract * refMain[refMainIndex + 1] + 16) >> 5);
- }
- }
- else
- {
- // Just copy the integer samples
- for (l = 0; l < width; l++)
- dst[k * dstStride + l] = refMain[l + deltaInt + 1];
- }
- }
- }
-
- // Flip the block if this is the horizontal mode
- if (modeHor)
- {
- for (k = 0; k < width - 1; k++)
- {
- for (l = k + 1; l < width; l++)
- {
- pixel tmp = dst[k * dstStride + l];
- dst[k * dstStride + l] = dst[l * dstStride + k];
- dst[l * dstStride + k] = tmp;
- }
- }
- }
- }
-}
-
-template<int width>
void intra_pred_ang_c_new(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
{
int width2 = width << 1;
@@ -330,11 +225,6 @@
for (int i = 2; i < NUM_INTRA_MODE; i++)
{
- p.intra_pred[i][BLOCK_4x4] = intra_pred_ang_c<4>;
- p.intra_pred[i][BLOCK_8x8] = intra_pred_ang_c<8>;
- p.intra_pred[i][BLOCK_16x16] = intra_pred_ang_c<16>;
- p.intra_pred[i][BLOCK_32x32] = intra_pred_ang_c<32>;
-
p.intra_pred_new[i][BLOCK_4x4] = intra_pred_ang_c_new<4>;
p.intra_pred_new[i][BLOCK_8x8] = intra_pred_ang_c_new<8>;
p.intra_pred_new[i][BLOCK_16x16] = intra_pred_ang_c_new<16>;
diff -r 72b600d94c2a -r 9ff0b1b684ea source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Jan 09 11:02:49 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Jan 12 13:05:29 2015 +0530
@@ -945,23 +945,12 @@
SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 48, cpu); \
SETUP_CHROMA_ADDAVG_FUNC_DEF_422(32, 64, cpu);
-#define SETUP_INTRA_ANG_COMMON(mode, fno, cpu) \
- p.intra_pred[mode][BLOCK_4x4] = x265_intra_pred_ang4_ ## fno ## _ ## cpu; \
-
#define SETUP_INTRA_ANG_COMMON_NEW(mode, fno, cpu) \
p.intra_pred_new[mode][BLOCK_4x4] = x265_intra_pred_ang4_ ## fno ## _new_ ## cpu; \
p.intra_pred_new[mode][BLOCK_8x8] = x265_intra_pred_ang8_ ## fno ## _new_ ## cpu; \
p.intra_pred_new[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _new_ ## cpu; \
p.intra_pred_new[mode][BLOCK_32x32] = x265_intra_pred_ang32_ ## fno ## _new_ ## cpu;
-#define SETUP_INTRA_ANG_HIGH(mode, fno, cpu) \
- p.intra_pred_new[mode][BLOCK_8x8] = x265_intra_pred_ang8_ ## fno ## _new_ ## cpu; \
- p.intra_pred_new[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _new_ ## cpu; \
- p.intra_pred_new[mode][BLOCK_32x32] = x265_intra_pred_ang32_ ## fno ## _new_ ## cpu;
-
-#define SETUP_INTRA_ANG4(mode, fno, cpu) \
- p.intra_pred[mode][BLOCK_4x4] = x265_intra_pred_ang4_ ## fno ## _ ## cpu;
-
#define SETUP_INTRA_ANG4_NEW(mode, fno, cpu) \
p.intra_pred_new[mode][BLOCK_4x4] = x265_intra_pred_ang4_ ## fno ## _new_ ## cpu;
@@ -969,39 +958,14 @@
p.intra_pred_new[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _new_ ## cpu; \
p.intra_pred_new[mode][BLOCK_32x32] = x265_intra_pred_ang32_ ## fno ## _new_ ## cpu;
-#define SETUP_INTRA_ANG4_8(mode, fno, cpu) \
- p.intra_pred[mode][BLOCK_4x4] = x265_intra_pred_ang4_ ## fno ## _ ## cpu; \
-
#define SETUP_INTRA_ANG4_8_NEW(mode, fno, cpu) \
p.intra_pred_new[mode][BLOCK_4x4] = x265_intra_pred_ang4_ ## fno ## _new_ ## cpu; \
p.intra_pred_new[mode][BLOCK_8x8] = x265_intra_pred_ang8_ ## fno ## _new_ ## cpu;
-#define INTRA_ANG_SSSE3(cpu) \
- SETUP_INTRA_ANG_COMMON(2, 2, cpu); \
- SETUP_INTRA_ANG_COMMON(34, 2, cpu);
-
#define INTRA_ANG_SSSE3_NEW(cpu) \
SETUP_INTRA_ANG_COMMON_NEW(2, 2, cpu); \
SETUP_INTRA_ANG_COMMON_NEW(34, 2, cpu);
-#define INTRA_ANG_SSE4_COMMON(cpu) \
- SETUP_INTRA_ANG_COMMON(3, 3, cpu); \
- SETUP_INTRA_ANG_COMMON(4, 4, cpu); \
- SETUP_INTRA_ANG_COMMON(5, 5, cpu); \
- SETUP_INTRA_ANG_COMMON(6, 6, cpu); \
- SETUP_INTRA_ANG_COMMON(7, 7, cpu); \
- SETUP_INTRA_ANG_COMMON(8, 8, cpu); \
- SETUP_INTRA_ANG_COMMON(9, 9, cpu); \
- SETUP_INTRA_ANG_COMMON(10, 10, cpu); \
- SETUP_INTRA_ANG_COMMON(11, 11, cpu); \
- SETUP_INTRA_ANG_COMMON(12, 12, cpu); \
- SETUP_INTRA_ANG_COMMON(13, 13, cpu); \
- SETUP_INTRA_ANG_COMMON(14, 14, cpu); \
- SETUP_INTRA_ANG_COMMON(15, 15, cpu); \
- SETUP_INTRA_ANG_COMMON(16, 16, cpu); \
- SETUP_INTRA_ANG_COMMON(17, 17, cpu); \
- SETUP_INTRA_ANG_COMMON(18, 18, cpu);
-
#define INTRA_ANG_SSE4_COMMON_NEW(cpu) \
SETUP_INTRA_ANG_COMMON_NEW(3, 3, cpu); \
SETUP_INTRA_ANG_COMMON_NEW(4, 4, cpu); \
@@ -1020,22 +984,10 @@
SETUP_INTRA_ANG_COMMON_NEW(17, 17, cpu); \
SETUP_INTRA_ANG_COMMON_NEW(18, 18, cpu);
-#define INTRA_ANG_SSE4_HIGH(cpu) \
- SETUP_INTRA_ANG4(19, 17, cpu); \
- SETUP_INTRA_ANG4(20, 16, cpu); \
- SETUP_INTRA_ANG4(21, 15, cpu); \
- SETUP_INTRA_ANG4(22, 14, cpu); \
- SETUP_INTRA_ANG4(23, 13, cpu); \
- SETUP_INTRA_ANG4(24, 12, cpu); \
- SETUP_INTRA_ANG4(25, 11, cpu); \
- SETUP_INTRA_ANG4(26, 26, cpu); \
- SETUP_INTRA_ANG4(27, 9, cpu); \
- SETUP_INTRA_ANG4(28, 8, cpu); \
- SETUP_INTRA_ANG4(29, 7, cpu); \
- SETUP_INTRA_ANG4(30, 6, cpu); \
- SETUP_INTRA_ANG4(31, 5, cpu); \
- SETUP_INTRA_ANG4(32, 4, cpu); \
- SETUP_INTRA_ANG4(33, 3, cpu);
+#define SETUP_INTRA_ANG_HIGH(mode, fno, cpu) \
+ p.intra_pred_new[mode][BLOCK_8x8] = x265_intra_pred_ang8_ ## fno ## _new_ ## cpu; \
+ p.intra_pred_new[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _new_ ## cpu; \
+ p.intra_pred_new[mode][BLOCK_32x32] = x265_intra_pred_ang32_ ## fno ## _new_ ## cpu;
#define INTRA_ANG_SSE4_HIGH_NEW(cpu) \
SETUP_INTRA_ANG_HIGH(19, 19, cpu); \
@@ -1069,23 +1021,6 @@
SETUP_INTRA_ANG4_NEW(32, 4, cpu); \
SETUP_INTRA_ANG4_NEW(33, 3, cpu);
-#define INTRA_ANG_SSE4(cpu) \
- SETUP_INTRA_ANG4_8(19, 17, cpu); \
- SETUP_INTRA_ANG4_8(20, 16, cpu); \
- SETUP_INTRA_ANG4_8(21, 15, cpu); \
- SETUP_INTRA_ANG4_8(22, 14, cpu); \
- SETUP_INTRA_ANG4_8(23, 13, cpu); \
- SETUP_INTRA_ANG4_8(24, 12, cpu); \
- SETUP_INTRA_ANG4_8(25, 11, cpu); \
- SETUP_INTRA_ANG4_8(26, 26, cpu); \
- SETUP_INTRA_ANG4_8(27, 9, cpu); \
- SETUP_INTRA_ANG4_8(28, 8, cpu); \
- SETUP_INTRA_ANG4_8(29, 7, cpu); \
- SETUP_INTRA_ANG4_8(30, 6, cpu); \
- SETUP_INTRA_ANG4_8(31, 5, cpu); \
- SETUP_INTRA_ANG4_8(32, 4, cpu); \
- SETUP_INTRA_ANG4_8(33, 3, cpu);
-
#define INTRA_ANG_SSE4_NEW(cpu) \
SETUP_INTRA_ANG4_8_NEW(19, 17, cpu); \
SETUP_INTRA_ANG4_8_NEW(20, 16, cpu); \
@@ -1452,7 +1387,6 @@
p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
- INTRA_ANG_SSSE3(ssse3);
INTRA_ANG_SSSE3_NEW(ssse3);
p.dst4x4 = x265_dst4_ssse3;
@@ -1488,9 +1422,6 @@
p.planecopy_cp = x265_upShift_8_sse4;
- INTRA_ANG_SSE4_COMMON(sse4);
- INTRA_ANG_SSE4_HIGH(sse4);
-
INTRA_ANG_SSE4_COMMON_NEW(sse4);
INTRA_ANG_SSE4_HIGH_NEW(sse4);
@@ -1671,7 +1602,6 @@
PIXEL_AVG(ssse3);
PIXEL_AVG_W4(ssse3);
- INTRA_ANG_SSSE3(ssse3);
INTRA_ANG_SSSE3_NEW(ssse3);
p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
@@ -1778,9 +1708,6 @@
p.intra_pred_new[1][BLOCK_16x16] = x265_intra_pred_dc16_new_sse4;
p.intra_pred_new[1][BLOCK_32x32] = x265_intra_pred_dc32_new_sse4;
- INTRA_ANG_SSE4_COMMON(sse4);
- INTRA_ANG_SSE4(sse4);
-
INTRA_ANG_SSE4_COMMON_NEW(sse4);
INTRA_ANG_SSE4_NEW(sse4);
diff -r 72b600d94c2a -r 9ff0b1b684ea source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Fri Jan 09 11:02:49 2015 +0530
+++ b/source/common/x86/intrapred.h Mon Jan 12 13:05:29 2015 +0530
@@ -37,7 +37,6 @@
void x265_intra_pred_planar32_new_sse4(pixel* dst, intptr_t dstStride, pixel* above, int, int);
#define DECL_ANG(bsize, mode, cpu) \
- void x265_intra_pred_ang ## bsize ## _ ## mode ## _ ## cpu(pixel * dst, intptr_t dstStride, pixel * refLeft, pixel * refAbove, int dirMode, int bFilter); \
void x265_intra_pred_ang ## bsize ## _ ## mode ## _new_ ## cpu(pixel* dst, intptr_t dstStride, pixel* above, int dirMode, int bFilter);
DECL_ANG(4, 2, ssse3);
diff -r 72b600d94c2a -r 9ff0b1b684ea source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Fri Jan 09 11:02:49 2015 +0530
+++ b/source/common/x86/intrapred16.asm Mon Jan 12 13:05:29 2015 +0530
@@ -857,426 +857,6 @@
mov rsp, r6
RET
-;-----------------------------------------------------------------------------
-; void intraPredAng(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------
-INIT_XMM ssse3
-cglobal intra_pred_ang4_2, 3,3,4
- cmp r4m, byte 34
- cmove r2, r3mp
- add r1, r1
- movu m0, [r2 + 4]
- movh [r0], m0
- palignr m1, m0, 2
- movh [r0 + r1], m1
- palignr m2, m0, 4
- movh [r0 + r1 * 2], m2
- lea r1, [r1 * 3]
- psrldq m0, 6
- movh [r0 + r1], m0
- RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang4_3, 3,4,8
- cmp r4m, byte 33
- cmove r2, r3mp
- lea r3, [ang_table + 20 * 16]
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
- punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
- palignr m5, m0, 4 ; [x x 8 7 6 5 4 3]
- punpcklwd m3, m1, m5 ; [6 5 5 4 4 3 3 2]
- palignr m1, m0, 6 ; [x x x 8 7 6 5 4]
- punpcklwd m4, m5 ,m1 ; [7 6 6 5 5 4 4 3]
- movhlps m0, m0 ; [x x x x 8 7 6 5]
- punpcklwd m5, m1, m0 ; [8 7 7 6 6 5 5 4]
-
- mova m0, [r3 + 6 * 16] ; [26]
- mova m1, [r3] ; [20]
- mova m6, [r3 - 6 * 16] ; [14]
- mova m7, [r3 - 12 * 16] ; [ 8]
- jmp .do_filter4x4
-
-ALIGN 16
-.do_filter4x4:
- pmaddwd m2, m0
- paddd m2, [pd_16]
- psrld m2, 5
-
- pmaddwd m3, m1
- paddd m3, [pd_16]
- psrld m3, 5
- packusdw m2, m3
-
- pmaddwd m4, m6
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m5, m7
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m4, m5
-
- jz .store
-
- ; transpose 4x4
- punpckhwd m0, m2, m4
- punpcklwd m2, m4
- punpckhwd m4, m2, m0
- punpcklwd m2, m0
-
-.store:
- add r1, r1
- movh [r0], m2
- movhps [r0 + r1], m2
- movh [r0 + r1 * 2], m4
- lea r1, [r1 * 3]
- movhps [r0 + r1], m4
- RET
-
-cglobal intra_pred_ang4_4, 3,4,8
- cmp r4m, byte 32
- cmove r2, r3mp
- lea r3, [ang_table + 18 * 16]
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
- punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
- palignr m6, m0, 4 ; [x x 8 7 6 5 4 3]
- punpcklwd m3, m1, m6 ; [6 5 5 4 4 3 3 2]
- mova m4, m3
- palignr m7, m0, 6 ; [x x x 8 7 6 5 4]
- punpcklwd m5, m6, m7 ; [7 6 6 5 5 4 4 3]
-
- mova m0, [r3 + 3 * 16] ; [21]
- mova m1, [r3 - 8 * 16] ; [10]
- mova m6, [r3 + 13 * 16] ; [31]
- mova m7, [r3 + 2 * 16] ; [20]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_5, 3,4,8
- cmp r4m, byte 31
- cmove r2, r3mp
- lea r3, [ang_table + 10 * 16]
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
- punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
- palignr m6, m0, 4 ; [x x 8 7 6 5 4 3]
- punpcklwd m3, m1, m6 ; [6 5 5 4 4 3 3 2]
- mova m4, m3
- palignr m7, m0, 6 ; [x x x 8 7 6 5 4]
- punpcklwd m5, m6, m7 ; [7 6 6 5 5 4 4 3]
-
- mova m0, [r3 + 7 * 16] ; [17]
- mova m1, [r3 - 8 * 16] ; [ 2]
- mova m6, [r3 + 9 * 16] ; [19]
- mova m7, [r3 - 6 * 16] ; [ 4]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_6, 3,4,8
- cmp r4m, byte 30
- cmove r2, r3mp
- lea r3, [ang_table + 19 * 16]
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
- punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
- mova m3, m2
- palignr m6, m0, 4 ; [x x 8 7 6 5 4 3]
- punpcklwd m4, m1, m6 ; [6 5 5 4 4 3 3 2]
- mova m5, m4
-
- mova m0, [r3 - 6 * 16] ; [13]
- mova m1, [r3 + 7 * 16] ; [26]
- mova m6, [r3 - 12 * 16] ; [ 7]
- mova m7, [r3 + 1 * 16] ; [20]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_7, 3,4,8
- cmp r4m, byte 29
- cmove r2, r3mp
- lea r3, [ang_table + 20 * 16]
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
- punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
- mova m3, m2
- mova m4, m2
- palignr m6, m0, 4 ; [x x 8 7 6 5 4 3]
- punpcklwd m5, m1, m6 ; [6 5 5 4 4 3 3 2]
-
- mova m0, [r3 - 11 * 16] ; [ 9]
- mova m1, [r3 - 2 * 16] ; [18]
- mova m6, [r3 + 7 * 16] ; [27]
- mova m7, [r3 - 16 * 16] ; [ 4]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_8, 3,4,8
- cmp r4m, byte 28
- cmove r2, r3mp
- lea r3, [ang_table + 13 * 16]
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
- punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
- mova m3, m2
- mova m4, m2
- mova m5, m2
-
- mova m0, [r3 - 8 * 16] ; [ 5]
- mova m1, [r3 - 3 * 16] ; [10]
- mova m6, [r3 + 2 * 16] ; [15]
- mova m7, [r3 + 7 * 16] ; [20]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_9, 3,4,8
- cmp r4m, byte 27
- cmove r2, r3mp
- lea r3, [ang_table + 4 * 16]
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
- punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
- mova m3, m2
- mova m4, m2
- mova m5, m2
-
- mova m0, [r3 - 2 * 16] ; [ 2]
- mova m1, [r3 - 0 * 16] ; [ 4]
- mova m6, [r3 + 2 * 16] ; [ 6]
- mova m7, [r3 + 4 * 16] ; [ 8]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_10, 3,3,4
- movh m0, [r2 + 2] ; [4 3 2 1]
- pshufb m2, m0, [pb_unpackwq2] ; [4 4 4 4 3 3 3 3]
- pshufb m0, [pb_unpackwq1] ; [2 2 2 2 1 1 1 1]
- add r1, r1
- movhlps m1, m0 ; [2 2 2 2]
- movhlps m3, m2 ; [4 4 4 4]
- movh [r0 + r1], m1
- movh [r0 + r1 * 2], m2
- lea r1, [r1 * 3]
- movh [r0 + r1], m3
-
- cmp r5m, byte 0
- jz .quit
-
- ; filter
- mov r2, r3mp
- movu m1, [r2] ; [7 6 5 4 3 2 1 0]
- pshufb m2, m1, [pb_unpackwq1] ; [0 0 0 0]
- palignr m1, m1, 2 ; [4 3 2 1]
- psubw m1, m2
- psraw m1, 1
- paddw m0, m1
- pxor m1, m1
- pmaxsw m0, m1
- pminsw m0, [pw_1023]
-
-.quit:
- movh [r0], m0
- RET
-
-cglobal intra_pred_ang4_26, 4,4,3
- movh m0, [r3 + 2] ; [8 7 6 5 4 3 2 1]
- add r1, r1
- ; store
- movh [r0], m0
- movh [r0 + r1], m0
- movh [r0 + r1 * 2], m0
- lea r3, [r1 * 3]
- movh [r0 + r3], m0
-
- ; filter
- cmp r5m, byte 0
- jz .quit
-
- pshufb m0, [pb_unpackwq1] ; [2 2 2 2 1 1 1 1]
- movu m1, [r2] ; [7 6 5 4 3 2 1 0]
- pshufb m2, m1, [pb_unpackwq1] ; [0 0 0 0]
- palignr m1, m1, 2 ; [4 3 2 1]
- psubw m1, m2
- psraw m1, 1
- paddw m0, m1
- pxor m1, m1
- pmaxsw m0, m1
- pminsw m0, [pw_1023]
-
- pextrw [r0], m0, 0
- pextrw [r0 + r1], m0, 1
- pextrw [r0 + r1 * 2], m0, 2
- pextrw [r0 + r3], m0, 3
-
-.quit:
- RET
-
-cglobal intra_pred_ang4_11, 3,4,8
- cmp r4m, byte 25
- cmove r2, r3mp
- lea r3, [ang_table + 24 * 16]
- movu m2, [r2] ; [x x x 4 3 2 1 0]
- palignr m1, m2, 2 ; [x x x x 4 3 2 1]
- punpcklwd m2, m1 ; [4 3 3 2 2 1 1 0]
- mova m3, m2
- mova m4, m2
- mova m5, m2
-
- mova m0, [r3 + 6 * 16] ; [24]
- mova m1, [r3 + 4 * 16] ; [26]
- mova m6, [r3 + 2 * 16] ; [28]
- mova m7, [r3 + 0 * 16] ; [30]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_12, 3,4,8
- cmp r4m, byte 24
- cmove r2, r3mp
- lea r3, [ang_table + 20 * 16]
- movu m2, [r2] ; [x x x 4 3 2 1 0]
- palignr m1, m2, 2 ; [x x x x 4 3 2 1]
- punpcklwd m2, m1 ; [4 3 3 2 2 1 1 0]
- mova m3, m2
- mova m4, m2
- mova m5, m2
-
- mova m0, [r3 + 7 * 16] ; [27]
- mova m1, [r3 + 2 * 16] ; [22]
- mova m6, [r3 - 3 * 16] ; [17]
- mova m7, [r3 - 8 * 16] ; [12]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_13, 4,4,8
- cmp r4m, byte 23
- jnz .load
- xchg r2, r3
-.load:
- movu m5, [r2 - 2] ; [x x 4 3 2 1 0 x]
- palignr m2, m5, 2 ; [x x x 4 3 2 1 0]
- palignr m0, m5, 4 ; [x x x x 4 3 2 1]
- pinsrw m5, [r3 + 8], 0
- punpcklwd m5, m2 ; [3 2 2 1 1 0 0 x]
- punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0]
- mova m3, m2
- mova m4, m2
-
- lea r3, [ang_table + 21 * 16]
- mova m0, [r3 + 2 * 16] ; [23]
- mova m1, [r3 - 7 * 16] ; [14]
- mova m6, [r3 - 16 * 16] ; [ 5]
- mova m7, [r3 + 7 * 16] ; [28]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_14, 4,4,8
- cmp r4m, byte 22
- jnz .load
- xchg r2, r3
-.load:
- movu m5, [r2 - 2] ; [x x 4 3 2 1 0 x]
- palignr m2, m5, 2 ; [x x x 4 3 2 1 0]
- palignr m0, m5, 4 ; [x x x x 4 3 2 1]
- pinsrw m5, [r3 + 4], 0
- punpcklwd m5, m2 ; [3 2 2 1 1 0 0 x]
- punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0]
- mova m3, m2
- mova m4, m5
-
- lea r3, [ang_table + 19 * 16]
- mova m0, [r3 + 0 * 16] ; [19]
- mova m1, [r3 - 13 * 16] ; [ 6]
- mova m6, [r3 + 6 * 16] ; [25]
- mova m7, [r3 - 7 * 16] ; [12]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_15, 4,4,8
- cmp r4m, byte 21
- jnz .load
- xchg r2, r3
-.load:
- movu m3, [r2 - 2] ; [x x 4 3 2 1 0 x]
- palignr m2, m3, 2 ; [x x x 4 3 2 1 0]
- palignr m0, m3, 4 ; [x x x x 4 3 2 1]
- pinsrw m3, [r3 + 4], 0
- pslldq m5, m3, 2 ; [x 4 3 2 1 0 x y]
- pinsrw m5, [r3 + 8], 0
- punpcklwd m5, m3 ; [2 1 1 0 0 x x y]
- punpcklwd m3, m2 ; [3 2 2 1 1 0 0 x]
- punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0]
- mova m4, m3
-
- lea r3, [ang_table + 23 * 16]
- mova m0, [r3 - 8 * 16] ; [15]
- mova m1, [r3 + 7 * 16] ; [30]
- mova m6, [r3 - 10 * 16] ; [13]
- mova m7, [r3 + 5 * 16] ; [28]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_16, 4,4,8
- cmp r4m, byte 20
- jnz .load
- xchg r2, r3
-.load:
- movu m3, [r2 - 2] ; [x x 4 3 2 1 0 x]
- palignr m2, m3, 2 ; [x x x 4 3 2 1 0]
- palignr m0, m3, 4 ; [x x x x 4 3 2 1]
- pinsrw m3, [r3 + 4], 0
- pslldq m5, m3, 2 ; [x 4 3 2 1 0 x y]
- pinsrw m5, [r3 + 6], 0
- punpcklwd m5, m3 ; [2 1 1 0 0 x x y]
- punpcklwd m3, m2 ; [3 2 2 1 1 0 0 x]
- punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0]
- mova m4, m3
-
- lea r3, [ang_table + 19 * 16]
- mova m0, [r3 - 8 * 16] ; [11]
- mova m1, [r3 + 3 * 16] ; [22]
- mova m6, [r3 - 18 * 16] ; [ 1]
- mova m7, [r3 - 7 * 16] ; [12]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_17, 4,4,8
- cmp r4m, byte 19
- jnz .load
- xchg r2, r3
-.load:
- movu m6, [r2 - 2] ; [- - 4 3 2 1 0 x]
- palignr m2, m6, 2 ; [- - - 4 3 2 1 0]
- palignr m1, m6, 4 ; [- - - - 4 3 2 1]
- mova m4, m2
- punpcklwd m2, m1 ; [4 3 3 2 2 1 1 0]
-
- pinsrw m6, [r3 + 2], 0
- punpcklwd m3, m6, m4 ; [3 2 2 1 1 0 0 x]
-
- pslldq m4, m6, 2 ; [- 4 3 2 1 0 x y]
- pinsrw m4, [r3 + 4], 0
- pslldq m5, m4, 2 ; [4 3 2 1 0 x y z]
- pinsrw m5, [r3 + 8], 0
- punpcklwd m5, m4 ; [1 0 0 x x y y z]
- punpcklwd m4, m6 ; [2 1 1 0 0 x x y]
-
- lea r3, [ang_table + 14 * 16]
- mova m0, [r3 - 8 * 16] ; [ 6]
- mova m1, [r3 - 2 * 16] ; [12]
- mova m6, [r3 + 4 * 16] ; [18]
- mova m7, [r3 + 10 * 16] ; [24]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_18, 4,4,1
- movh m0, [r2]
- pshufb m0, [pw_swap]
- movhps m0, [r3 + 2]
- add r1, r1
- lea r2, [r1 * 3]
- movh [r0 + r2], m0
- psrldq m0, 2
- movh [r0 + r1 * 2], m0
- psrldq m0, 2
- movh [r0 + r1], m0
- psrldq m0, 2
- movh [r0], m0
- RET
-
;-----------------------------------------------------------------------------------------
; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------
diff -r 72b600d94c2a -r 9ff0b1b684ea source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm Fri Jan 09 11:02:49 2015 +0530
+++ b/source/common/x86/intrapred8.asm Mon Jan 12 13:05:29 2015 +0530
@@ -710,418 +710,6 @@
jnz .loop
RET
-;-----------------------------------------------------------------------------
-; void intraPredAng(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------
-INIT_XMM ssse3
-cglobal intra_pred_ang4_2, 3,3,4
- cmp r4m, byte 34
- cmove r2, r3mp
- movh m0, [r2 + 2]
- movd [r0], m0
- palignr m1, m0, 1
- movd [r0 + r1], m1
- palignr m2, m0, 2
- movd [r0 + r1 * 2], m2
- lea r1, [r1 * 3]
- psrldq m0, 3
- movd [r0 + r1], m0
- RET
-
-
-INIT_XMM sse4
-cglobal intra_pred_ang4_3, 3,4,5
- cmp r4m, byte 33
- cmove r2, r3mp
- lea r3, [ang_table + 20 * 16]
- movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
- palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
- punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
- palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
- palignr m2, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3]
- palignr m3, m0, 6 ; [x x x x x x x x 8 7 7 6 6 5 5 4]
- punpcklqdq m0, m1
- punpcklqdq m2, m3
-
- movh m3, [r3 + 6 * 16] ; [26]
- movhps m3, [r3] ; [20]
- movh m4, [r3 - 6 * 16] ; [14]
- movhps m4, [r3 - 12 * 16] ; [ 8]
- jmp .do_filter4x4
-
- ; NOTE: share path, input is m0=[1 0], m2=[3 2], m3,m4=coef, flag_z=no_transpose
-ALIGN 16
-.do_filter4x4:
- mova m1, [pw_1024]
-
- pmaddubsw m0, m3
- pmulhrsw m0, m1
- pmaddubsw m2, m4
- pmulhrsw m2, m1
- packuswb m0, m2
-
- ; NOTE: mode 33 doesn't reorde, UNSAFE but I don't use any instruction that affect eflag register before
- jz .store
-
- ; transpose 4x4
- pshufb m0, [c_trans_4x4]
-
-.store:
- ; TODO: use pextrd here after intrinsic ssse3 removed
- movd [r0], m0
- pextrd [r0 + r1], m0, 1
- pextrd [r0 + r1 * 2], m0, 2
- lea r1, [r1 * 3]
- pextrd [r0 + r1], m0, 3
- RET
-
-
-cglobal intra_pred_ang4_4, 3,4,5
- cmp r4m, byte 32
- cmove r2, r3mp
- lea r3, [ang_table + 18 * 16]
- movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
- palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
- punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
- palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
- palignr m3, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3]
- punpcklqdq m0, m1
- punpcklqdq m2, m1, m3
-
- movh m3, [r3 + 3 * 16] ; [21]
- movhps m3, [r3 - 8 * 16] ; [10]
- movh m4, [r3 + 13 * 16] ; [31]
- movhps m4, [r3 + 2 * 16] ; [20]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_5, 3,4,5
- cmp r4m, byte 31
- cmove r2, r3mp
- lea r3, [ang_table + 10 * 16]
- movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
- palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
- punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
- palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
- palignr m3, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3]
- punpcklqdq m0, m1
- punpcklqdq m2, m1, m3
-
- movh m3, [r3 + 7 * 16] ; [17]
- movhps m3, [r3 - 8 * 16] ; [ 2]
- movh m4, [r3 + 9 * 16] ; [19]
- movhps m4, [r3 - 6 * 16] ; [ 4]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_6, 3,4,5
- cmp r4m, byte 30
- cmove r2, r3mp
- lea r3, [ang_table + 19 * 16]
- movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
- palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
- punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
- palignr m2, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
- punpcklqdq m0, m0
- punpcklqdq m2, m2
-
- movh m3, [r3 - 6 * 16] ; [13]
- movhps m3, [r3 + 7 * 16] ; [26]
- movh m4, [r3 - 12 * 16] ; [ 7]
- movhps m4, [r3 + 1 * 16] ; [20]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_7, 3,4,5
- cmp r4m, byte 29
- cmove r2, r3mp
- lea r3, [ang_table + 20 * 16]
- movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
- palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
- punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
- palignr m3, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
- punpcklqdq m2, m0, m3
- punpcklqdq m0, m0
-
- movh m3, [r3 - 11 * 16] ; [ 9]
- movhps m3, [r3 - 2 * 16] ; [18]
- movh m4, [r3 + 7 * 16] ; [27]
- movhps m4, [r3 - 16 * 16] ; [ 4]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_8, 3,4,5
- cmp r4m, byte 28
- cmove r2, r3mp
- lea r3, [ang_table + 13 * 16]
- movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
- palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
- punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
- punpcklqdq m0, m0
- mova m2, m0
-
- movh m3, [r3 - 8 * 16] ; [ 5]
- movhps m3, [r3 - 3 * 16] ; [10]
- movh m4, [r3 + 2 * 16] ; [15]
- movhps m4, [r3 + 7 * 16] ; [20]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_9, 3,4,5
- cmp r4m, byte 27
- cmove r2, r3mp
- lea r3, [ang_table + 4 * 16]
- movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
- palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
- punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
- punpcklqdq m0, m0
- mova m2, m0
-
- movh m3, [r3 - 2 * 16] ; [ 2]
- movhps m3, [r3 - 0 * 16] ; [ 4]
- movh m4, [r3 + 2 * 16] ; [ 6]
- movhps m4, [r3 + 4 * 16] ; [ 8]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_10, 3,3,4
- movd m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
- pshufb m0, [pb_unpackbd1]
-
- pshufd m1, m0, 1
- movhlps m2, m0
- pshufd m3, m0, 3
- movd [r0 + r1], m1
- movd [r0 + r1 * 2], m2
- lea r1, [r1 * 3]
- movd [r0 + r1], m3
-
- cmp r5m, byte 0
- jz .quit
-
- ; filter
- mov r2, r3mp
- pmovzxbw m0, m0 ; [-1 -1 -1 -1]
- movh m1, [r2] ; [4 3 2 1 0]
- pshufb m2, m1, [pb_0_8] ; [0 0 0 0]
- pshufb m1, [pb_unpackbw1] ; [4 3 2 1]
- psubw m1, m2
- psraw m1, 1
- paddw m0, m1
- packuswb m0, m0
-
-.quit:
- movd [r0], m0
- RET
-
-
-INIT_XMM sse4
-cglobal intra_pred_ang4_26, 4,4,3
- movd m0, [r3 + 1] ; [8 7 6 5 4 3 2 1]
-
- ; store
- movd [r0], m0
- movd [r0 + r1], m0
- movd [r0 + r1 * 2], m0
- lea r3, [r1 * 3]
- movd [r0 + r3], m0
-
- ; filter
- cmp r5m, byte 0
- jz .quit
-
- pshufb m0, [pb_0_8] ; [ 1 1 1 1]
- movh m1, [r2] ; [-4 -3 -2 -1 0]
- pshufb m2, m1, [pb_0_8] ; [0 0 0 0]
- pshufb m1, [pb_unpackbw1] ; [-4 -3 -2 -1]
- psubw m1, m2
- psraw m1, 1
- paddw m0, m1
- packuswb m0, m0
-
- pextrb [r0], m0, 0
- pextrb [r0 + r1], m0, 1
- pextrb [r0 + r1 * 2], m0, 2
- pextrb [r0 + r3], m0, 3
-
-.quit:
- RET
-
-
-cglobal intra_pred_ang4_11, 3,4,5
- cmp r4m, byte 25
- cmove r2, r3mp
- lea r3, [ang_table + 24 * 16]
- movh m0, [r2] ; [x x x 4 3 2 1 0]
- palignr m1, m0, 1 ; [x x x x 4 3 2 1]
- punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0]
- punpcklqdq m0, m0
- mova m2, m0
-
- movh m3, [r3 + 6 * 16] ; [24]
- movhps m3, [r3 + 4 * 16] ; [26]
- movh m4, [r3 + 2 * 16] ; [28]
- movhps m4, [r3 + 0 * 16] ; [30]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_12, 3,4,5
- cmp r4m, byte 24
- cmove r2, r3mp
- lea r3, [ang_table + 20 * 16]
- movh m0, [r2] ; [x x x 4 3 2 1 0]
- palignr m1, m0, 1 ; [x x x x 4 3 2 1]
- punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0]
- punpcklqdq m0, m0
- mova m2, m0
-
- movh m3, [r3 + 7 * 16] ; [27]
- movhps m3, [r3 + 2 * 16] ; [22]
- movh m4, [r3 - 3 * 16] ; [17]
- movhps m4, [r3 - 8 * 16] ; [12]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_13, 4,4,5
- cmp r4m, byte 23
- jnz .load
- xchg r2, r3
-.load:
- movh m1, [r2 - 1] ; [x x 4 3 2 1 0 x]
- palignr m0, m1, 1 ; [x x x 4 3 2 1 0]
- palignr m2, m1, 2 ; [x x x x 4 3 2 1]
- pinsrb m1, [r3 + 4], 0
- punpcklbw m1, m0 ; [3 2 2 1 1 0 0 x]
- punpcklbw m0, m2 ; [4 3 3 2 2 1 1 0]
- punpcklqdq m2, m0, m1
- punpcklqdq m0, m0
-
- lea r3, [ang_table + 21 * 16]
- movh m3, [r3 + 2 * 16] ; [23]
- movhps m3, [r3 - 7 * 16] ; [14]
- movh m4, [r3 - 16 * 16] ; [ 5]
- movhps m4, [r3 + 7 * 16] ; [28]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_14, 4,4,5
- cmp r4m, byte 22
- jnz .load
- xchg r2, r3
-.load:
- movh m2, [r2 - 1] ; [x x 4 3 2 1 0 x]
- palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
- palignr m1, m2, 2 ; [x x x x 4 3 2 1]
- pinsrb m2, [r3 + 2], 0
- punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
- punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
- punpcklqdq m0, m0
- punpcklqdq m2, m2
-
- lea r3, [ang_table + 19 * 16]
- movh m3, [r3 + 0 * 16] ; [19]
- movhps m3, [r3 - 13 * 16] ; [ 6]
- movh m4, [r3 + 6 * 16] ; [25]
- movhps m4, [r3 - 7 * 16] ; [12]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_15, 4,4,5
- cmp r4m, byte 21
- jnz .load
- xchg r2, r3
-.load:
- movh m2, [r2 - 1] ; [x x 4 3 2 1 0 x]
- palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
- palignr m1, m2, 2 ; [x x x x 4 3 2 1]
- pinsrb m2, [r3 + 2], 0
- pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y]
- pinsrb m3, [r3 + 4], 0
- punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y]
- punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
- punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
- punpcklqdq m0, m2
- punpcklqdq m2, m4
-
- lea r3, [ang_table + 23 * 16]
- movh m3, [r3 - 8 * 16] ; [15]
- movhps m3, [r3 + 7 * 16] ; [30]
- movh m4, [r3 - 10 * 16] ; [13]
- movhps m4, [r3 + 5 * 16] ; [28]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_16, 4,4,5
- cmp r4m, byte 20
- jnz .load
- xchg r2, r3
-.load:
- movh m2, [r2 - 1] ; [x x 4 3 2 1 0 x]
- palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
- palignr m1, m2, 2 ; [x x x x 4 3 2 1]
- pinsrb m2, [r3 + 2], 0
- pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y]
- pinsrb m3, [r3 + 3], 0
- punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y]
- punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
- punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
- punpcklqdq m0, m2
- punpcklqdq m2, m4
-
- lea r3, [ang_table + 19 * 16]
- movh m3, [r3 - 8 * 16] ; [11]
- movhps m3, [r3 + 3 * 16] ; [22]
- movh m4, [r3 - 18 * 16] ; [ 1]
- movhps m4, [r3 - 7 * 16] ; [12]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_17, 4,4,5
- cmp r4m, byte 19
- jnz .load
- xchg r2, r3
-.load:
- movh m3, [r2 - 1] ; [- - 4 3 2 1 0 x]
- palignr m0, m3, 1 ; [- - - 4 3 2 1 0]
- palignr m1, m3, 2 ; [- - - - 4 3 2 1]
- mova m4, m0
- punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
-
- pinsrb m3, [r3 + 1], 0
- punpcklbw m1, m3, m4 ; [3 2 2 1 1 0 0 x]
- punpcklqdq m0, m1
-
- pslldq m2, m3, 1 ; [- 4 3 2 1 0 x y]
- pinsrb m2, [r3 + 2], 0
- pslldq m1, m2, 1 ; [4 3 2 1 0 x y z]
- pinsrb m1, [r3 + 4], 0
- punpcklbw m1, m2 ; [1 0 0 x x y y z]
- punpcklbw m2, m3 ; [2 1 1 0 0 x x y]
- punpcklqdq m2, m1
-
- lea r3, [ang_table + 14 * 16]
- movh m3, [r3 - 8 * 16] ; [ 6]
- movhps m3, [r3 - 2 * 16] ; [12]
- movh m4, [r3 + 4 * 16] ; [18]
- movhps m4, [r3 + 10 * 16] ; [24]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-
-cglobal intra_pred_ang4_18, 4,4,1
- mov r2d, [r2]
- bswap r2d
- movd m0, r2d
- pinsrd m0, [r3 + 1], 1 ; [- 3 2 1 0 -1 -2 -3]
- lea r2, [r1 * 3]
- movd [r0 + r2], m0
- psrldq m0, 1
- movd [r0 + r1 * 2], m0
- psrldq m0, 1
- movd [r0 + r1], m0
- psrldq m0, 1
- movd [r0], m0
- RET
-
;-----------------------------------------------------------------------------------------
; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------
diff -r 72b600d94c2a -r 9ff0b1b684ea source/test/intrapredharness.cpp
--- a/source/test/intrapredharness.cpp Fri Jan 09 11:02:49 2015 +0530
+++ b/source/test/intrapredharness.cpp Mon Jan 12 13:05:29 2015 +0530
@@ -93,50 +93,6 @@
return true;
}
-bool IntraPredHarness::check_angular_primitive(const intra_pred_t ref[][NUM_TR_SIZE], const intra_pred_t opt[][NUM_TR_SIZE])
-{
- int j = Predict::ADI_BUF_STRIDE;
- intptr_t stride = FENC_STRIDE;
-
-#if _DEBUG
- memset(pixel_out_vec, 0xCD, OUTPUT_SIZE);
- memset(pixel_out_c, 0xCD, OUTPUT_SIZE);
-#endif
-
- for (int size = 2; size <= 5; size++)
- {
- int width = (1 << size);
- for (int i = 0; i <= 100; i++)
- {
- int bFilter = (width <= 16) && (rand() % 2);
- for (int pmode = 2; pmode <= 34; pmode++)
- {
- if (!opt[pmode][size - 2])
- continue;
-
- pixel * refAbove = pixel_buff + j;
- pixel * refLeft = refAbove + 3 * width;
- refLeft[0] = refAbove[0];
-
- checked(opt[pmode][size - 2], pixel_out_vec, stride, refLeft, refAbove, pmode, bFilter);
- ref[pmode][size - 2](pixel_out_c, stride, refLeft, refAbove, pmode, bFilter);
-
- for (int k = 0; k < width; k++)
- {
- if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width * sizeof(pixel)))
- return false;
- }
-
- reportfail();
- }
-
- j += FENC_STRIDE;
- }
- }
-
- return true;
-}
-
bool IntraPredHarness::check_angular_primitive(const intra_pred_new_t ref[][NUM_TR_SIZE], const intra_pred_new_t opt[][NUM_TR_SIZE])
{
int j = Predict::ADI_BUF_STRIDE;
@@ -254,12 +210,6 @@
}
// NOTE: always call since this function have check pointer in loop
- if (!check_angular_primitive(ref.intra_pred, opt.intra_pred))
- {
- printf("intra_angular failed\n");
- return false;
- }
-
if (!check_angular_primitive(ref.intra_pred_new, opt.intra_pred_new))
{
printf("intra_angular failed\n");
@@ -321,25 +271,6 @@
for (int p = 2; p <= 34; p += 1)
{
int pmode = p; //(rand()%33)+2;
- if (opt.intra_pred[pmode][ii - 2])
- {
- width = (1 << ii);
- bool bFilter = (width <= 16);
- pixel * refAbove = pixel_buff + srcStride;
- pixel * refLeft = refAbove + 3 * width;
- refLeft[0] = refAbove[0];
- printf("intra_ang%dx%d[%2d]", width, width, pmode);
- REPORT_SPEEDUP(opt.intra_pred[pmode][ii - 2], ref.intra_pred[pmode][ii - 2],
- pixel_out_vec, FENC_STRIDE, refAbove, refLeft, pmode, bFilter);
- }
- }
- }
-
- for (int ii = 2; ii <= 5; ii++)
- {
- for (int p = 2; p <= 34; p += 1)
- {
- int pmode = p; //(rand()%33)+2;
if (opt.intra_pred_new[pmode][ii - 2])
{
width = (1 << ii);
diff -r 72b600d94c2a -r 9ff0b1b684ea source/test/intrapredharness.h
--- a/source/test/intrapredharness.h Fri Jan 09 11:02:49 2015 +0530
+++ b/source/test/intrapredharness.h Mon Jan 12 13:05:29 2015 +0530
@@ -43,7 +43,6 @@
bool check_dc_primitive(intra_pred_new_t ref, intra_pred_new_t opt, int width);
bool check_planar_primitive(intra_pred_new_t ref, intra_pred_new_t opt, int width);
- bool check_angular_primitive(const intra_pred_t ref[][NUM_TR_SIZE], const intra_pred_t opt[][NUM_TR_SIZE]);
bool check_angular_primitive(const intra_pred_new_t ref[][NUM_TR_SIZE], const intra_pred_new_t opt[][NUM_TR_SIZE]);
bool check_allangs_new_primitive(const intra_allangs_new_t ref[], const intra_allangs_new_t opt[]);
More information about the x265-devel
mailing list