[x265] [PATCH 06 of 29] intra_pred_dc_new: updated asm and unit test code
chen
chenm003 at 163.com
Tue Jan 13 09:29:18 CET 2015
At 2015-01-13 15:11:14,dnyaneshwar at multicorewareinc.com wrote:
># HG changeset patch
># User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
># Date 1418718739 -19800
># Tue Dec 16 14:02:19 2014 +0530
># Node ID 70b4e0c84320df0b7443e5aea6e110c1bf483684
># Parent f4daa8744d08b569ae652737c4506b397dfb55cb
>intra_pred_dc_new: updated asm and unit test code
>
>+;---------------------------------------------------------------------------------------------
>+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
>+;---------------------------------------------------------------------------------------------
>+INIT_XMM sse4
>+cglobal intra_pred_dc4_new, 5,5,3
>+ inc r2
>+ pxor m0, m0
>+ movd m1, [r2]
>+ movd m2, [r2 + 8]
>+ punpckldq m1, m2
>+ psadbw m1, m0 ; m1 = sum
>+
>+ test r4d, r4d
>+
>+ mov r4d, 4096
>+ movd m2, r4d
>+ pmulhrsw m1, m2 ; m1 = (sum + 4) / 8
There is already a constant pw_4096 that can be used here.
>+ movd r4d, m1 ; r4d = dc_val
>+ pshufb m1, m0 ; m1 = byte [dc_val ...]
>+
>+ ; store DC 4x4
>+ lea r3, [r1 * 3]
>+ movd [r0], m1
>+ movd [r0 + r1], m1
>+ movd [r0 + r1 * 2], m1
>+ movd [r0 + r3], m1
>+
>+ ; do DC filter
>+ jz .end
>+ lea r3d, [r4d * 2 + 2] ; r3d = DC * 2 + 2
>+ add r4d, r3d ; r4d = DC * 3 + 2
>+ movd m1, r4d
>+ pshuflw m1, m1, 0 ; m1 = pixDCx3
We have more free registers, so we can keep the DC value in a register to reduce the number of operations.
>+
>+ ; filter top
>+ pmovzxbw m2, [r2]
>+ paddw m2, m1
>+ psraw m2, 2
Combined with the modification above, we can compute (x + 2) >> 2 with pmulhrsw.
>+ packuswb m2, m2
>+ movd [r0], m2 ; overwrite top-left pixel, we will update it later
>+
>+ ; filter top-left
>+ movzx r4d, byte [r2 + 8]
>+ add r3d, r4d
>+ movzx r4d, byte [r2]
>+ add r4d, r3d
>+ shr r4d, 2
>+ mov [r0], r4b
>+
>+ ; filter left
>+ add r0, r1
>+ pmovzxbw m2, [r2 + 9]
>+ paddw m2, m1
>+ psraw m2, 2
>+ packuswb m2, m2
>+ pextrb [r0], m2, 0
>+ pextrb [r0 + r1], m2, 1
>+ pextrb [r0 + r1 * 2], m2, 2
>+
>+.end:
>+ RET
>
> ;-------------------------------------------------------------------------------------------
> ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
>@@ -217,6 +278,85 @@
> .end:
> RET
>
>+;---------------------------------------------------------------------------------------------
>+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
>+;---------------------------------------------------------------------------------------------
>+INIT_XMM sse4
>+cglobal intra_pred_dc8_new, 5, 7, 3
>+ lea r3, [r2 + 17]
>+ inc r2
>+ pxor m0, m0
>+ movh m1, [r2]
>+ movh m2, [r3]
>+ punpcklqdq m1, m2
How about using movhps here instead?
>+ psadbw m1, m0
>+ pshufd m2, m1, 2
>+ paddw m1, m2
Should this be paddd?
>+
>+ movd r5d, m1
>+ add r5d, 8
>+ shr r5d, 4 ; sum = sum / 16
Same as 4x4: we can reduce the conversions between general-purpose and SSE registers.
>+ movd m1, r5d
>+ pshufb m1, m0 ; m1 = byte [dc_val ...]
>+
>+ test r4d, r4d
>+
>+ ; store DC 8x8
>+ mov r6, r0
>+ movh [r0], m1
>+ movh [r0 + r1], m1
>+ lea r0, [r0 + r1 * 2]
>+ movh [r0], m1
>+ movh [r0 + r1], m1
>+ lea r0, [r0 + r1 * 2]
>+ movh [r0], m1
>+ movh [r0 + r1], m1
>+ lea r0, [r0 + r1 * 2]
>+ movh [r0], m1
>+ movh [r0 + r1], m1
>+
>+ ; Do DC Filter
>+ jz .end
>+ lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2
>+ add r5d, r4d ; r5d = DC * 3 + 2
>+ movd m1, r5d
>+ pshuflw m1, m1, 0 ; m1 = pixDCx3
>+ pshufd m1, m1, 0
>+
>+ ; filter top
>+ pmovzxbw m2, [r2]
>+ paddw m2, m1
>+ psraw m2, 2
>+ packuswb m2, m2
>+ movh [r6], m2
>+
>+ ; filter top-left
>+ movzx r5d, byte [r3]
>+ add r4d, r5d
>+ movzx r5d, byte [r2]
>+ add r5d, r4d
>+ shr r5d, 2
>+ mov [r6], r5b
>+
>+ ; filter left
>+ add r6, r1
>+ pmovzxbw m2, [r3 + 1]
>+ paddw m2, m1
>+ psraw m2, 2
>+ packuswb m2, m2
>+ pextrb [r6], m2, 0
>+ pextrb [r6 + r1], m2, 1
>+ pextrb [r6 + 2 * r1], m2, 2
>+ lea r6, [r6 + r1 * 2]
>+ pextrb [r6 + r1], m2, 3
>+ pextrb [r6 + r1 * 2], m2, 4
>+ pextrb [r6 + r1 * 4], m2, 6
>+ lea r1, [r1 * 3]
How about reusing r0 or another register to avoid the `lea r6` instructions above?
>+ pextrb [r6 + r1], m2, 5
>+
>+.end:
>+ RET
>+
> ;-------------------------------------------------------------------------------------------
> ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
> ;-------------------------------------------------------------------------------------------
>@@ -332,6 +472,120 @@
> .end:
> RET
>
>+;--------------------------------------------------------------------------------------------
>+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
>+;--------------------------------------------------------------------------------------------
>+INIT_XMM sse4
>+cglobal intra_pred_dc16_new, 5, 7, 4
>+ lea r3, [r2 + 33]
>+ inc r2
>+ pxor m0, m0
>+ movu m1, [r2]
>+ movu m2, [r3]
>+ psadbw m1, m0
>+ psadbw m2, m0
>+ paddw m1, m2
>+ pshufd m2, m1, 2
>+ paddw m1, m2
>+
>+ movd r5d, m1
>+ add r5d, 16
>+ shr r5d, 5 ; sum = sum / 32
>+ movd m1, r5d
>+ pshufb m1, m0 ; m1 = byte [dc_val ...]
>+
>+ test r4d, r4d
>+
>+ ; store DC 16x16
>+ mov r6, r0
>+ movu [r0], m1
>+ movu [r0 + r1], m1
>+ lea r0, [r0 + r1 * 2]
>+ movu [r0], m1
>+ movu [r0 + r1], m1
>+ lea r0, [r0 + r1 * 2]
>+ movu [r0], m1
>+ movu [r0 + r1], m1
>+ lea r0, [r0 + r1 * 2]
>+ movu [r0], m1
>+ movu [r0 + r1], m1
>+ lea r0, [r0 + r1 * 2]
>+ movu [r0], m1
>+ movu [r0 + r1], m1
>+ lea r0, [r0 + r1 * 2]
>+ movu [r0], m1
>+ movu [r0 + r1], m1
>+ lea r0, [r0 + r1 * 2]
>+ movu [r0], m1
>+ movu [r0 + r1], m1
>+ lea r0, [r0 + r1 * 2]
>+ movu [r0], m1
>+ movu [r0 + r1], m1
>+
>+ ; Do DC Filter
>+ jz .end
>+ lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2
>+ add r5d, r4d ; r5d = DC * 3 + 2
>+ movd m1, r5d
>+ pshuflw m1, m1, 0 ; m1 = pixDCx3
>+ pshufd m1, m1, 0
>+
>+ ; filter top
>+ pmovzxbw m2, [r2]
>+ paddw m2, m1
>+ psraw m2, 2
>+ packuswb m2, m2
>+ movh [r6], m2
>+ pmovzxbw m3, [r2 + 8]
>+ paddw m3, m1
>+ psraw m3, 2
>+ packuswb m3, m3
>+ movh [r6 + 8], m3
>+
>+ ; filter top-left
>+ movzx r5d, byte [r3]
>+ add r4d, r5d
>+ movzx r5d, byte [r2]
>+ add r5d, r4d
>+ shr r5d, 2
>+ mov [r6], r5b
>+
>+ ; filter left
>+ add r6, r1
>+ pmovzxbw m2, [r3 + 1]
>+ paddw m2, m1
>+ psraw m2, 2
>+ packuswb m2, m2
>+ pextrb [r6], m2, 0
>+ pextrb [r6 + r1], m2, 1
>+ pextrb [r6 + r1 * 2], m2, 2
>+ lea r6, [r6 + r1 * 2]
>+ pextrb [r6 + r1], m2, 3
>+ pextrb [r6 + r1 * 2], m2, 4
>+ lea r6, [r6 + r1 * 2]
>+ pextrb [r6 + r1], m2, 5
>+ pextrb [r6 + r1 * 2], m2, 6
>+ lea r6, [r6 + r1 * 2]
>+ pextrb [r6 + r1], m2, 7
>+
>+ pmovzxbw m3, [r3 + 9]
>+ paddw m3, m1
>+ psraw m3, 2
>+ packuswb m3, m3
>+ pextrb [r6 + r1 * 2], m3, 0
>+ lea r6, [r6 + r1 * 2]
>+ pextrb [r6 + r1], m3, 1
>+ pextrb [r6 + r1 * 2], m3, 2
>+ lea r6, [r6 + r1 * 2]
>+ pextrb [r6 + r1], m3, 3
>+ pextrb [r6 + r1 * 2], m3, 4
>+ lea r6, [r6 + r1 * 2]
>+ pextrb [r6 + r1], m3, 5
>+ pextrb [r6 + r1 * 2], m3, 6
>+
>+.end:
>+ RET
>+
> ;-------------------------------------------------------------------------------------------
> ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
> ;-------------------------------------------------------------------------------------------
>@@ -406,6 +660,80 @@
>
> RET
>
>+;---------------------------------------------------------------------------------------------
>+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
>+;---------------------------------------------------------------------------------------------
>+INIT_XMM sse4
>+cglobal intra_pred_dc32_new, 3, 5, 5
>+ lea r3, [r2 + 65]
>+ inc r2
>+ pxor m0, m0
>+ movu m1, [r2]
>+ movu m2, [r2 + 16]
>+ movu m3, [r3]
>+ movu m4, [r3 + 16]
>+ psadbw m1, m0
>+ psadbw m2, m0
>+ psadbw m3, m0
>+ psadbw m4, m0
>+ paddw m1, m2
>+ paddw m3, m4
>+ paddw m1, m3
>+ pshufd m2, m1, 2
>+ paddw m1, m2
>+
>+ movd r4d, m1
>+ add r4d, 32
>+ shr r4d, 6 ; sum = sum / 64
>+ movd m1, r4d
>+ pshufb m1, m0 ; m1 = byte [dc_val ...]
>+
>+%rep 2
>+ ; store DC 16x16
>+ movu [r0], m1
>+ movu [r0 + r1], m1
>+ movu [r0 + 16], m1
>+ movu [r0 + r1 + 16],m1
>+ lea r0, [r0 + 2 * r1]
>+ movu [r0], m1
>+ movu [r0 + r1], m1
>+ movu [r0 + 16], m1
>+ movu [r0 + r1 + 16],m1
>+ lea r0, [r0 + 2 * r1]
>+ movu [r0], m1
>+ movu [r0 + r1], m1
>+ movu [r0 + 16], m1
>+ movu [r0 + r1 + 16],m1
>+ lea r0, [r0 + 2 * r1]
>+ movu [r0], m1
>+ movu [r0 + r1], m1
>+ movu [r0 + 16], m1
>+ movu [r0 + r1 + 16],m1
>+ lea r0, [r0 + 2 * r1]
>+ movu [r0], m1
>+ movu [r0 + r1], m1
>+ movu [r0 + 16], m1
>+ movu [r0 + r1 + 16],m1
>+ lea r0, [r0 + 2 * r1]
>+ movu [r0], m1
>+ movu [r0 + r1], m1
>+ movu [r0 + 16], m1
>+ movu [r0 + r1 + 16],m1
>+ lea r0, [r0 + 2 * r1]
>+ movu [r0], m1
>+ movu [r0 + r1], m1
>+ movu [r0 + 16], m1
>+ movu [r0 + r1 + 16],m1
>+ lea r0, [r0 + 2 * r1]
>+ movu [r0], m1
>+ movu [r0 + r1], m1
>+ movu [r0 + 16], m1
>+ movu [r0 + r1 + 16],m1
>+ lea r0, [r0 + 2 * r1]
>+%endrep
>+
>+ RET
>+
> ;-----------------------------------------------------------------------------------------------------------
> ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
> ;-----------------------------------------------------------------------------------------------------------
>diff -r f4daa8744d08 -r 70b4e0c84320 source/test/intrapredharness.cpp
>--- a/source/test/intrapredharness.cpp Mon Jan 12 12:34:37 2015 +0530
>+++ b/source/test/intrapredharness.cpp Tue Dec 16 14:02:19 2014 +0530
>@@ -71,6 +71,38 @@
> return true;
> }
>
>+bool IntraPredHarness::check_dc_primitive(intra_pred_new_t ref, intra_pred_new_t opt, int width)
>+{
>+ int j = Predict::ADI_BUF_STRIDE;
>+ intptr_t stride = FENC_STRIDE;
>+
>+#if _DEBUG
>+ memset(pixel_out_vec, 0xCD, OUTPUT_SIZE);
>+ memset(pixel_out_c, 0xCD, OUTPUT_SIZE);
>+#endif
>+
>+ for (int i = 0; i <= 100; i++)
>+ {
>+ int rand_filter = rand() & 1;
>+ if (width > 16)
>+ rand_filter = 0;
>+
>+ ref(pixel_out_c, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, 0, rand_filter);
>+ opt(pixel_out_vec, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, 0, rand_filter);
>+
>+ for (int k = 0; k < width; k++)
>+ {
>+ if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width * sizeof(pixel)))
>+ return false;
>+ }
>+
>+ reportfail();
>+ j += FENC_STRIDE;
>+ }
>+
>+ return true;
>+}
>+
> bool IntraPredHarness::check_planar_primitive(intra_pred_t ref, intra_pred_t opt, int width)
> {
> int j = Predict::ADI_BUF_STRIDE;
>@@ -222,6 +254,15 @@
> return false;
> }
> }
>+ if (opt.intra_pred_new[1][i])
>+ {
>+ const int size = (1 << (i + 2));
>+ if (!check_dc_primitive(ref.intra_pred_new[1][i], opt.intra_pred_new[1][i], size))
>+ {
>+ printf("intra_dc %dx%d failed\n", size, size);
>+ return false;
>+ }
>+ }
> }
>
> // NOTE: always call since this function have check pointer in loop
>@@ -279,6 +320,18 @@
> REPORT_SPEEDUP(opt.intra_pred_allangs[i], ref.intra_pred_allangs[i],
> pixel_out_33_vec, refAbove, refLeft, refAbove, refLeft, bFilter);
> }
>+ if (opt.intra_pred_new[1][i])
>+ {
>+ printf("intra_dc_new_%dx%d[f=0]", size, size);
>+ REPORT_SPEEDUP(opt.intra_pred_new[1][i], ref.intra_pred_new[1][i],
>+ pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, 0, 0);
>+ if (size <= 16)
>+ {
>+ printf("intra_dc_new_%dx%d[f=1]", size, size);
>+ REPORT_SPEEDUP(opt.intra_pred_new[1][i], ref.intra_pred_new[1][i],
>+ pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, 0, 1);
>+ }
>+ }
> }
>
> for (int ii = 2; ii <= 5; ii++)
>diff -r f4daa8744d08 -r 70b4e0c84320 source/test/intrapredharness.h
>--- a/source/test/intrapredharness.h Mon Jan 12 12:34:37 2015 +0530
>+++ b/source/test/intrapredharness.h Tue Dec 16 14:02:19 2014 +0530
>@@ -42,6 +42,7 @@
> pixel pixel_out_33_vec[OUTPUT_SIZE_33];
>
> bool check_dc_primitive(intra_pred_t ref, intra_pred_t opt, int width);
>+ bool check_dc_primitive(intra_pred_new_t ref, intra_pred_new_t opt, int width);
> bool check_planar_primitive(intra_pred_t ref, intra_pred_t opt, int width);
> bool check_angular_primitive(const intra_pred_t ref[][NUM_TR_SIZE], const intra_pred_t opt[][NUM_TR_SIZE]);
> bool check_allangs_primitive(const intra_allangs_t ref[], const intra_allangs_t opt[]);
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150113/979e7621/attachment-0001.html>
More information about the x265-devel
mailing list