[x265] [PATCH 06 of 29] intra_pred_dc_new: updated asm and unit test code
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Tue Jan 13 08:11:14 CET 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1418718739 -19800
# Tue Dec 16 14:02:19 2014 +0530
# Node ID 70b4e0c84320df0b7443e5aea6e110c1bf483684
# Parent f4daa8744d08b569ae652737c4506b397dfb55cb
intra_pred_dc_new: updated asm and unit test code
diff -r f4daa8744d08 -r 70b4e0c84320 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jan 12 12:34:37 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Dec 16 14:02:19 2014 +0530
@@ -1705,6 +1705,11 @@
p.intra_pred[1][BLOCK_16x16] = x265_intra_pred_dc16_sse4;
p.intra_pred[1][BLOCK_32x32] = x265_intra_pred_dc32_sse4;
+ p.intra_pred_new[1][BLOCK_4x4] = x265_intra_pred_dc4_new_sse4; // slot [1] receives the DC predictors, mirroring p.intra_pred[1] above
+ p.intra_pred_new[1][BLOCK_8x8] = x265_intra_pred_dc8_new_sse4;
+ p.intra_pred_new[1][BLOCK_16x16] = x265_intra_pred_dc16_new_sse4;
+ p.intra_pred_new[1][BLOCK_32x32] = x265_intra_pred_dc32_new_sse4;
+
INTRA_ANG_SSE4_COMMON(sse4);
INTRA_ANG_SSE4(sse4);
diff -r f4daa8744d08 -r 70b4e0c84320 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Mon Jan 12 12:34:37 2015 +0530
+++ b/source/common/x86/intrapred.h Tue Dec 16 14:02:19 2014 +0530
@@ -31,6 +31,11 @@
void x265_intra_pred_dc16_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int filter);
void x265_intra_pred_dc32_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int filter);
+void x265_intra_pred_dc4_new_sse4(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter); // srcPix: one neighbour buffer; the asm reads both the top row and the left column from it
+void x265_intra_pred_dc8_new_sse4(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter);
+void x265_intra_pred_dc16_new_sse4(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter);
+void x265_intra_pred_dc32_new_sse4(pixel* dst, intptr_t dstStride, pixel* srcPix, int, int filter);
+
void x265_intra_pred_planar4_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int);
void x265_intra_pred_planar8_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int);
void x265_intra_pred_planar16_sse4(pixel* dst, intptr_t dstStride, pixel* above, pixel* left, int, int);
diff -r f4daa8744d08 -r 70b4e0c84320 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm Mon Jan 12 12:34:37 2015 +0530
+++ b/source/common/x86/intrapred8.asm Tue Dec 16 14:02:19 2014 +0530
@@ -136,6 +136,67 @@
.end:
RET
+;---------------------------------------------------------------------------------------------
+; void intra_pred_dc4_new(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
+;---------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_pred_dc4_new, 5,5,3
+ inc r2 ; r2 = srcPix + 1 (first top pixel)
+ pxor m0, m0
+ movd m1, [r2] ; 4 top pixels
+ movd m2, [r2 + 8] ; 4 left pixels (srcPix + 9)
+ punpckldq m1, m2
+ psadbw m1, m0 ; m1 = sum of the 8 neighbour pixels
+
+ test r4d, r4d ; ZF = (bFilter == 0); consumed by jz below, nothing in between writes EFLAGS
+
+ mov r4d, 4096 ; pmulhrsw by 4096 yields (x * 4096 + 0x4000) >> 15 = (x + 4) >> 3
+ movd m2, r4d
+ pmulhrsw m1, m2 ; m1 = (sum + 4) / 8
+ movd r4d, m1 ; r4d = dc_val
+ pshufb m1, m0 ; m1 = byte [dc_val ...]
+
+ ; store DC 4x4
+ lea r3, [r1 * 3]
+ movd [r0], m1
+ movd [r0 + r1], m1
+ movd [r0 + r1 * 2], m1
+ movd [r0 + r3], m1
+
+ ; do DC filter
+ jz .end ; bFilter == 0: the flat DC block stored above is the result
+ lea r3d, [r4d * 2 + 2] ; r3d = DC * 2 + 2
+ add r4d, r3d ; r4d = DC * 3 + 2
+ movd m1, r4d
+ pshuflw m1, m1, 0 ; m1 = pixDCx3
+
+ ; filter top
+ pmovzxbw m2, [r2]
+ paddw m2, m1
+ psraw m2, 2 ; (top[x] + DC * 3 + 2) >> 2
+ packuswb m2, m2
+ movd [r0], m2 ; overwrite top-left pixel, we will update it later
+
+ ; filter top-left
+ movzx r4d, byte [r2 + 8] ; left[0]
+ add r3d, r4d
+ movzx r4d, byte [r2] ; top[0]
+ add r4d, r3d
+ shr r4d, 2 ; (left[0] + top[0] + DC * 2 + 2) >> 2
+ mov [r0], r4b
+
+ ; filter left
+ add r0, r1 ; r0 = row 1
+ pmovzxbw m2, [r2 + 9] ; left[1] onwards
+ paddw m2, m1
+ psraw m2, 2 ; (left[y] + DC * 3 + 2) >> 2
+ packuswb m2, m2
+ pextrb [r0], m2, 0 ; row 1, column 0
+ pextrb [r0 + r1], m2, 1 ; row 2, column 0
+ pextrb [r0 + r1 * 2], m2, 2 ; row 3, column 0
+
+.end:
+ RET
;-------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
@@ -217,6 +278,85 @@
.end:
RET
+;---------------------------------------------------------------------------------------------
+; void intra_pred_dc8_new(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
+;---------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_pred_dc8_new, 5, 7, 3
+ lea r3, [r2 + 17] ; r3 = srcPix + 17 (first left pixel)
+ inc r2 ; r2 = srcPix + 1 (first top pixel)
+ pxor m0, m0
+ movh m1, [r2] ; 8 top pixels
+ movh m2, [r3] ; 8 left pixels
+ punpcklqdq m1, m2
+ psadbw m1, m0 ; one partial sum per 64-bit half
+ pshufd m2, m1, 2
+ paddw m1, m2 ; low word = sum of all 16 neighbour pixels
+
+ movd r5d, m1
+ add r5d, 8 ; rounding term
+ shr r5d, 4 ; r5d = dc_val = (sum + 8) / 16
+ movd m1, r5d
+ pshufb m1, m0 ; m1 = byte [dc_val ...]
+
+ test r4d, r4d ; ZF = (bFilter == 0); consumed by jz below, the mov/movh/lea in between leave EFLAGS alone
+
+ ; store DC 8x8
+ mov r6, r0 ; keep dst for the filter pass, r0 is advanced by the stores
+ movh [r0], m1
+ movh [r0 + r1], m1
+ lea r0, [r0 + r1 * 2]
+ movh [r0], m1
+ movh [r0 + r1], m1
+ lea r0, [r0 + r1 * 2]
+ movh [r0], m1
+ movh [r0 + r1], m1
+ lea r0, [r0 + r1 * 2]
+ movh [r0], m1
+ movh [r0 + r1], m1
+
+ ; Do DC Filter
+ jz .end ; bFilter == 0: the flat DC block stored above is the result
+ lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2
+ add r5d, r4d ; r5d = DC * 3 + 2
+ movd m1, r5d
+ pshuflw m1, m1, 0 ; m1 = pixDCx3
+ pshufd m1, m1, 0
+
+ ; filter top
+ pmovzxbw m2, [r2]
+ paddw m2, m1
+ psraw m2, 2 ; (top[x] + DC * 3 + 2) >> 2
+ packuswb m2, m2
+ movh [r6], m2 ; column 0 is rewritten by the top-left filter below
+
+ ; filter top-left
+ movzx r5d, byte [r3] ; left[0]
+ add r4d, r5d
+ movzx r5d, byte [r2] ; top[0]
+ add r5d, r4d
+ shr r5d, 2 ; (left[0] + top[0] + DC * 2 + 2) >> 2
+ mov [r6], r5b
+
+ ; filter left
+ add r6, r1 ; r6 = row 1
+ pmovzxbw m2, [r3 + 1] ; left[1] onwards
+ paddw m2, m1
+ psraw m2, 2 ; (left[y] + DC * 3 + 2) >> 2
+ packuswb m2, m2
+ pextrb [r6], m2, 0 ; row 1
+ pextrb [r6 + r1], m2, 1 ; row 2
+ pextrb [r6 + 2 * r1], m2, 2 ; row 3
+ lea r6, [r6 + r1 * 2] ; r6 = row 3
+ pextrb [r6 + r1], m2, 3 ; row 4
+ pextrb [r6 + r1 * 2], m2, 4 ; row 5
+ pextrb [r6 + r1 * 4], m2, 6 ; row 7
+ lea r1, [r1 * 3]
+ pextrb [r6 + r1], m2, 5 ; row 6
+
+.end:
+ RET
+
;-------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
;-------------------------------------------------------------------------------------------
@@ -332,6 +472,120 @@
.end:
RET
+;--------------------------------------------------------------------------------------------
+; void intra_pred_dc16_new(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
+;--------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_pred_dc16_new, 5, 7, 4
+ lea r3, [r2 + 33] ; r3 = srcPix + 33 (first left pixel)
+ inc r2 ; r2 = srcPix + 1 (first top pixel)
+ pxor m0, m0
+ movu m1, [r2] ; 16 top pixels
+ movu m2, [r3] ; 16 left pixels
+ psadbw m1, m0
+ psadbw m2, m0
+ paddw m1, m2
+ pshufd m2, m1, 2
+ paddw m1, m2 ; low word = sum of all 32 neighbour pixels
+
+ movd r5d, m1
+ add r5d, 16 ; rounding term
+ shr r5d, 5 ; r5d = dc_val = (sum + 16) / 32
+ movd m1, r5d
+ pshufb m1, m0 ; m1 = byte [dc_val ...]
+
+ test r4d, r4d ; ZF = (bFilter == 0); consumed by jz below, the mov/movu/lea in between leave EFLAGS alone
+
+ ; store DC 16x16
+ mov r6, r0 ; keep dst for the filter pass, r0 is advanced by the stores
+ movu [r0], m1
+ movu [r0 + r1], m1
+ lea r0, [r0 + r1 * 2]
+ movu [r0], m1
+ movu [r0 + r1], m1
+ lea r0, [r0 + r1 * 2]
+ movu [r0], m1
+ movu [r0 + r1], m1
+ lea r0, [r0 + r1 * 2]
+ movu [r0], m1
+ movu [r0 + r1], m1
+ lea r0, [r0 + r1 * 2]
+ movu [r0], m1
+ movu [r0 + r1], m1
+ lea r0, [r0 + r1 * 2]
+ movu [r0], m1
+ movu [r0 + r1], m1
+ lea r0, [r0 + r1 * 2]
+ movu [r0], m1
+ movu [r0 + r1], m1
+ lea r0, [r0 + r1 * 2]
+ movu [r0], m1
+ movu [r0 + r1], m1
+
+ ; Do DC Filter
+ jz .end ; bFilter == 0: the flat DC block stored above is the result
+ lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2
+ add r5d, r4d ; r5d = DC * 3 + 2
+ movd m1, r5d
+ pshuflw m1, m1, 0 ; m1 = pixDCx3
+ pshufd m1, m1, 0
+
+ ; filter top
+ pmovzxbw m2, [r2]
+ paddw m2, m1
+ psraw m2, 2 ; (top[x] + DC * 3 + 2) >> 2, columns 0-7
+ packuswb m2, m2
+ movh [r6], m2 ; column 0 is rewritten by the top-left filter below
+ pmovzxbw m3, [r2 + 8]
+ paddw m3, m1
+ psraw m3, 2 ; same filter, columns 8-15
+ packuswb m3, m3
+ movh [r6 + 8], m3
+
+ ; filter top-left
+ movzx r5d, byte [r3] ; left[0]
+ add r4d, r5d
+ movzx r5d, byte [r2] ; top[0]
+ add r5d, r4d
+ shr r5d, 2 ; (left[0] + top[0] + DC * 2 + 2) >> 2
+ mov [r6], r5b
+
+ ; filter left
+ add r6, r1 ; r6 = row 1
+ pmovzxbw m2, [r3 + 1] ; left[1..8]
+ paddw m2, m1
+ psraw m2, 2 ; (left[y] + DC * 3 + 2) >> 2
+ packuswb m2, m2
+ pextrb [r6], m2, 0 ; row 1
+ pextrb [r6 + r1], m2, 1 ; row 2
+ pextrb [r6 + r1 * 2], m2, 2 ; row 3
+ lea r6, [r6 + r1 * 2] ; r6 = row 3
+ pextrb [r6 + r1], m2, 3 ; row 4
+ pextrb [r6 + r1 * 2], m2, 4 ; row 5
+ lea r6, [r6 + r1 * 2] ; r6 = row 5
+ pextrb [r6 + r1], m2, 5 ; row 6
+ pextrb [r6 + r1 * 2], m2, 6 ; row 7
+ lea r6, [r6 + r1 * 2] ; r6 = row 7
+ pextrb [r6 + r1], m2, 7 ; row 8
+
+ pmovzxbw m3, [r3 + 9] ; left[9..16], only the first 7 results are stored
+ paddw m3, m1
+ psraw m3, 2
+ packuswb m3, m3
+ pextrb [r6 + r1 * 2], m3, 0 ; row 9
+ lea r6, [r6 + r1 * 2] ; r6 = row 9
+ pextrb [r6 + r1], m3, 1 ; row 10
+ pextrb [r6 + r1 * 2], m3, 2 ; row 11
+ lea r6, [r6 + r1 * 2] ; r6 = row 11
+ pextrb [r6 + r1], m3, 3 ; row 12
+ pextrb [r6 + r1 * 2], m3, 4 ; row 13
+ lea r6, [r6 + r1 * 2] ; r6 = row 13
+ pextrb [r6 + r1], m3, 5 ; row 14
+ pextrb [r6 + r1 * 2], m3, 6 ; row 15
+
+.end:
+ RET
+
;-------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
;-------------------------------------------------------------------------------------------
@@ -406,6 +660,80 @@
RET
+;---------------------------------------------------------------------------------------------
+; void intra_pred_dc32_new(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
+;---------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_pred_dc32_new, 3, 5, 5 ; only 3 args are loaded: this routine has no filter pass and never tests bFilter
+ lea r3, [r2 + 65] ; r3 = srcPix + 65 (presumably the left column, as in the smaller sizes - confirm)
+ inc r2 ; r2 = srcPix + 1
+ pxor m0, m0
+ movu m1, [r2] ; 32 pixels starting at srcPix + 1
+ movu m2, [r2 + 16]
+ movu m3, [r3] ; 32 pixels starting at srcPix + 65
+ movu m4, [r3 + 16]
+ psadbw m1, m0
+ psadbw m2, m0
+ psadbw m3, m0
+ psadbw m4, m0
+ paddw m1, m2
+ paddw m3, m4
+ paddw m1, m3
+ pshufd m2, m1, 2
+ paddw m1, m2 ; low word = sum of all 64 neighbour pixels
+
+ movd r4d, m1
+ add r4d, 32 ; rounding term
+ shr r4d, 6 ; r4d = dc_val = (sum + 32) / 64
+ movd m1, r4d
+ pshufb m1, m0 ; m1 = byte [dc_val ...]
+
+%rep 2
+ ; store 16 rows of 32 DC pixels (two 16-byte stores per row); the two repetitions cover all 32 rows
+ movu [r0], m1
+ movu [r0 + r1], m1
+ movu [r0 + 16], m1
+ movu [r0 + r1 + 16],m1
+ lea r0, [r0 + 2 * r1]
+ movu [r0], m1
+ movu [r0 + r1], m1
+ movu [r0 + 16], m1
+ movu [r0 + r1 + 16],m1
+ lea r0, [r0 + 2 * r1]
+ movu [r0], m1
+ movu [r0 + r1], m1
+ movu [r0 + 16], m1
+ movu [r0 + r1 + 16],m1
+ lea r0, [r0 + 2 * r1]
+ movu [r0], m1
+ movu [r0 + r1], m1
+ movu [r0 + 16], m1
+ movu [r0 + r1 + 16],m1
+ lea r0, [r0 + 2 * r1]
+ movu [r0], m1
+ movu [r0 + r1], m1
+ movu [r0 + 16], m1
+ movu [r0 + r1 + 16],m1
+ lea r0, [r0 + 2 * r1]
+ movu [r0], m1
+ movu [r0 + r1], m1
+ movu [r0 + 16], m1
+ movu [r0 + r1 + 16],m1
+ lea r0, [r0 + 2 * r1]
+ movu [r0], m1
+ movu [r0 + r1], m1
+ movu [r0 + 16], m1
+ movu [r0 + r1 + 16],m1
+ lea r0, [r0 + 2 * r1]
+ movu [r0], m1
+ movu [r0 + r1], m1
+ movu [r0 + 16], m1
+ movu [r0 + r1 + 16],m1
+ lea r0, [r0 + 2 * r1]
+%endrep
+
+ RET
+
;-----------------------------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
;-----------------------------------------------------------------------------------------------------------
diff -r f4daa8744d08 -r 70b4e0c84320 source/test/intrapredharness.cpp
--- a/source/test/intrapredharness.cpp Mon Jan 12 12:34:37 2015 +0530
+++ b/source/test/intrapredharness.cpp Tue Dec 16 14:02:19 2014 +0530
@@ -71,6 +71,38 @@
return true;
}
+// Runs the reference and optimised DC predictors on the same input for 101 offsets into
+// pixel_buff and compares the leading width x width region of both outputs row by row.
+bool IntraPredHarness::check_dc_primitive(intra_pred_new_t ref, intra_pred_new_t opt, int width)
+{
+    int j = Predict::ADI_BUF_STRIDE;
+    intptr_t stride = FENC_STRIDE;
+#if _DEBUG
+    memset(pixel_out_vec, 0xCD, OUTPUT_SIZE * sizeof(pixel)); // memset counts bytes: scale by sizeof(pixel), as the memcmp below does
+    memset(pixel_out_c, 0xCD, OUTPUT_SIZE * sizeof(pixel));
+#endif
+
+    for (int i = 0; i <= 100; i++)
+    {
+        int rand_filter = rand() & 1; // filter flag is randomised only for widths up to 16
+        if (width > 16)
+            rand_filter = 0;
+
+        ref(pixel_out_c, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, 0, rand_filter);
+        opt(pixel_out_vec, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, 0, rand_filter); // NOTE(review): opt is called directly yet reportfail() follows - confirm whether the harness call wrapper should be used here
+
+        for (int k = 0; k < width; k++)
+        {
+            if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width * sizeof(pixel)))
+                return false;
+        }
+
+        reportfail();
+        j += FENC_STRIDE; // next iteration reads its input FENC_STRIDE pixels further into pixel_buff
+    }
+    return true;
+}
+
bool IntraPredHarness::check_planar_primitive(intra_pred_t ref, intra_pred_t opt, int width)
{
int j = Predict::ADI_BUF_STRIDE;
@@ -222,6 +254,15 @@
return false;
}
}
+ if (opt.intra_pred_new[1][i])
+ {
+     const int size = (1 << (i + 2)); // block width derived from the size index i
+     if (!check_dc_primitive(ref.intra_pred_new[1][i], opt.intra_pred_new[1][i], size))
+     {
+         printf("intra_dc_new %dx%d failed\n", size, size); // names the intra_pred_new table, matching the intra_dc_new_ benchmark labels
+         return false;
+     }
+ }
}
// NOTE: always call since this function have check pointer in loop
@@ -279,6 +320,18 @@
REPORT_SPEEDUP(opt.intra_pred_allangs[i], ref.intra_pred_allangs[i],
pixel_out_33_vec, refAbove, refLeft, refAbove, refLeft, bFilter);
}
+ if (opt.intra_pred_new[1][i])
+ {
+ printf("intra_dc_new_%dx%d[f=0]", size, size);
+ REPORT_SPEEDUP(opt.intra_pred_new[1][i], ref.intra_pred_new[1][i],
+ pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, 0, 0); // final argument 0: filter off
+ if (size <= 16) // the filtered variant is timed only for sizes up to 16x16
+ {
+ printf("intra_dc_new_%dx%d[f=1]", size, size);
+ REPORT_SPEEDUP(opt.intra_pred_new[1][i], ref.intra_pred_new[1][i],
+ pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, 0, 1); // final argument 1: filter on
+ }
+ }
}
for (int ii = 2; ii <= 5; ii++)
diff -r f4daa8744d08 -r 70b4e0c84320 source/test/intrapredharness.h
--- a/source/test/intrapredharness.h Mon Jan 12 12:34:37 2015 +0530
+++ b/source/test/intrapredharness.h Tue Dec 16 14:02:19 2014 +0530
@@ -42,6 +42,7 @@
pixel pixel_out_33_vec[OUTPUT_SIZE_33];
bool check_dc_primitive(intra_pred_t ref, intra_pred_t opt, int width);
+ bool check_dc_primitive(intra_pred_new_t ref, intra_pred_new_t opt, int width); // overload for intra_pred_new_t primitives
bool check_planar_primitive(intra_pred_t ref, intra_pred_t opt, int width);
bool check_angular_primitive(const intra_pred_t ref[][NUM_TR_SIZE], const intra_pred_t opt[][NUM_TR_SIZE]);
bool check_allangs_primitive(const intra_allangs_t ref[], const intra_allangs_t opt[]);
More information about the x265-devel
mailing list