[x265] [PATCH 06 of 29] intra_pred_dc_new: updated asm and unit test code
Dnyaneshwar Gorade
dnyaneshwar at multicorewareinc.com
Tue Jan 13 10:46:11 CET 2015
ok Min, I will modify the code accordingly.
On Tue, Jan 13, 2015 at 1:59 PM, chen <chenm003 at 163.com> wrote:
>
>
> At 2015-01-13 15:11:14,dnyaneshwar at multicorewareinc.com wrote:
> ># HG changeset patch
> ># User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
> ># Date 1418718739 -19800
> ># Tue Dec 16 14:02:19 2014 +0530
> ># Node ID 70b4e0c84320df0b7443e5aea6e110c1bf483684
> ># Parent f4daa8744d08b569ae652737c4506b397dfb55cb
> >intra_pred_dc_new: updated asm and unit test code
> >
>
> >+;---------------------------------------------------------------------------------------------
>
> >+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
>
> >+;---------------------------------------------------------------------------------------------
> >+INIT_XMM sse4
> >+cglobal intra_pred_dc4_new, 5,5,3
> >+ inc r2
> >+ pxor m0, m0
> >+ movd m1, [r2]
> >+ movd m2, [r2 + 8]
> >+ punpckldq m1, m2
> >+ psadbw m1, m0 ; m1 = sum
> >+
> >+ test r4d, r4d
> >+
> >+ mov r4d, 4096
> >+ movd m2, r4d
> >+ pmulhrsw m1, m2 ; m1 = (sum + 4) / 8
> There is already a constant pw_4096 that can be used here.
>
> >+ movd r4d, m1 ; r4d = dc_val
> >+ pshufb m1, m0 ; m1 = byte [dc_val ...]
> >+
> >+ ; store DC 4x4
> >+ lea r3, [r1 * 3]
> >+ movd [r0], m1
> >+ movd [r0 + r1], m1
> >+ movd [r0 + r1 * 2], m1
> >+ movd [r0 + r3], m1
> >+
> >+ ; do DC filter
> >+ jz .end
> >+ lea r3d, [r4d * 2 + 2] ; r3d = DC * 2 + 2
> >+ add r4d, r3d ; r4d = DC * 3 + 2
> >+ movd m1, r4d
> >+ pshuflw m1, m1, 0 ; m1 = pixDCx3
> We have more free registers, so we can keep the DC value in a register to
> reduce the number of operations.
>
> >+
> >+ ; filter top
> >+ pmovzxbw m2, [r2]
> >+ paddw m2, m1
> >+ psraw m2, 2
> Combined with the modification above, we can compute (x + 2) >> 2 with pmulhrsw.
>
> >+ packuswb m2, m2
>
> >+ movd [r0], m2 ; overwrite top-left pixel, we will update it later
> >+
> >+ ; filter top-left
> >+ movzx r4d, byte [r2 + 8]
> >+ add r3d, r4d
> >+ movzx r4d, byte [r2]
> >+ add r4d, r3d
> >+ shr r4d, 2
> >+ mov [r0], r4b
> >+
> >+ ; filter left
> >+ add r0, r1
> >+ pmovzxbw m2, [r2 + 9]
> >+ paddw m2, m1
> >+ psraw m2, 2
> >+ packuswb m2, m2
> >+ pextrb [r0], m2, 0
> >+ pextrb [r0 + r1], m2, 1
> >+ pextrb [r0 + r1 * 2], m2, 2
> >+
> >+.end:
> >+ RET
> >
>
> > ;-------------------------------------------------------------------------------------------
>
> > ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
> >@@ -217,6 +278,85 @@
> > .end:
> > RET
> >
>
> >+;---------------------------------------------------------------------------------------------
>
> >+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
>
> >+;---------------------------------------------------------------------------------------------
> >+INIT_XMM sse4
> >+cglobal intra_pred_dc8_new, 5, 7, 3
> >+ lea r3, [r2 + 17]
> >+ inc r2
> >+ pxor m0, m0
> >+ movh m1, [r2]
> >+ movh m2, [r3]
> >+ punpcklqdq m1, m2
> how about movhps?
>
> >+ psadbw m1, m0
> >+ pshufd m2, m1, 2
> >+ paddw m1, m2
> paddd ?
>
> >+
> >+ movd r5d, m1
> >+ add r5d, 8
> >+ shr r5d, 4 ; sum = sum / 16
> Same as for 4x4: we can reduce the conversions between general-purpose and SSE registers.
>
> >+ movd m1, r5d
> >+ pshufb m1, m0 ; m1 = byte [dc_val ...]
> >+
> >+ test r4d, r4d
> >+
> >+ ; store DC 8x8
> >+ mov r6, r0
> >+ movh [r0], m1
> >+ movh [r0 + r1], m1
> >+ lea r0, [r0 + r1 * 2]
> >+ movh [r0], m1
> >+ movh [r0 + r1], m1
> >+ lea r0, [r0 + r1 * 2]
> >+ movh [r0], m1
> >+ movh [r0 + r1], m1
> >+ lea r0, [r0 + r1 * 2]
> >+ movh [r0], m1
> >+ movh [r0 + r1], m1
> >+
> >+ ; Do DC Filter
> >+ jz .end
> >+ lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2
> >+ add r5d, r4d ; r5d = DC * 3 + 2
> >+ movd m1, r5d
> >+ pshuflw m1, m1, 0 ; m1 = pixDCx3
> >+ pshufd m1, m1, 0
> >+
> >+ ; filter top
> >+ pmovzxbw m2, [r2]
> >+ paddw m2, m1
> >+ psraw m2, 2
> >+ packuswb m2, m2
> >+ movh [r6], m2
> >+
> >+ ; filter top-left
> >+ movzx r5d, byte [r3]
> >+ add r4d, r5d
> >+ movzx r5d, byte [r2]
> >+ add r5d, r4d
> >+ shr r5d, 2
> >+ mov [r6], r5b
> >+
> >+ ; filter left
> >+ add r6, r1
> >+ pmovzxbw m2, [r3 + 1]
> >+ paddw m2, m1
> >+ psraw m2, 2
> >+ packuswb m2, m2
> >+ pextrb [r6], m2, 0
> >+ pextrb [r6 + r1], m2, 1
> >+ pextrb [r6 + 2 * r1], m2, 2
> >+ lea r6, [r6 + r1 * 2]
> >+ pextrb [r6 + r1], m2, 3
> >+ pextrb [r6 + r1 * 2], m2, 4
> >+ pextrb [r6 + r1 * 4], m2, 6
> >+ lea r1, [r1 * 3]
> How about reusing r0 or another free register to eliminate the lea on r6 above?
>
> >+ pextrb [r6 + r1], m2, 5
> >+
> >+.end:
> >+ RET
> >+
>
> > ;-------------------------------------------------------------------------------------------
>
> > ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
>
> > ;-------------------------------------------------------------------------------------------
> >@@ -332,6 +472,120 @@
> > .end:
> > RET
> >
>
> >+;--------------------------------------------------------------------------------------------
>
> >+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
>
> >+;--------------------------------------------------------------------------------------------
> >+INIT_XMM sse4
> >+cglobal intra_pred_dc16_new, 5, 7, 4
> >+ lea r3, [r2 + 33]
> >+ inc r2
> >+ pxor m0, m0
> >+ movu m1, [r2]
> >+ movu m2, [r3]
> >+ psadbw m1, m0
> >+ psadbw m2, m0
> >+ paddw m1, m2
> >+ pshufd m2, m1, 2
> >+ paddw m1, m2
> >+
> >+ movd r5d, m1
> >+ add r5d, 16
> >+ shr r5d, 5 ; sum = sum / 32
> >+ movd m1, r5d
> >+ pshufb m1, m0 ; m1 = byte [dc_val ...]
> >+
> >+ test r4d, r4d
> >+
> >+ ; store DC 16x16
> >+ mov r6, r0
> >+ movu [r0], m1
> >+ movu [r0 + r1], m1
> >+ lea r0, [r0 + r1 * 2]
> >+ movu [r0], m1
> >+ movu [r0 + r1], m1
> >+ lea r0, [r0 + r1 * 2]
> >+ movu [r0], m1
> >+ movu [r0 + r1], m1
> >+ lea r0, [r0 + r1 * 2]
> >+ movu [r0], m1
> >+ movu [r0 + r1], m1
> >+ lea r0, [r0 + r1 * 2]
> >+ movu [r0], m1
> >+ movu [r0 + r1], m1
> >+ lea r0, [r0 + r1 * 2]
> >+ movu [r0], m1
> >+ movu [r0 + r1], m1
> >+ lea r0, [r0 + r1 * 2]
> >+ movu [r0], m1
> >+ movu [r0 + r1], m1
> >+ lea r0, [r0 + r1 * 2]
> >+ movu [r0], m1
> >+ movu [r0 + r1], m1
> >+
> >+ ; Do DC Filter
> >+ jz .end
> >+ lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2
> >+ add r5d, r4d ; r5d = DC * 3 + 2
> >+ movd m1, r5d
> >+ pshuflw m1, m1, 0 ; m1 = pixDCx3
> >+ pshufd m1, m1, 0
> >+
> >+ ; filter top
> >+ pmovzxbw m2, [r2]
> >+ paddw m2, m1
> >+ psraw m2, 2
> >+ packuswb m2, m2
> >+ movh [r6], m2
> >+ pmovzxbw m3, [r2 + 8]
> >+ paddw m3, m1
> >+ psraw m3, 2
> >+ packuswb m3, m3
> >+ movh [r6 + 8], m3
> >+
> >+ ; filter top-left
> >+ movzx r5d, byte [r3]
> >+ add r4d, r5d
> >+ movzx r5d, byte [r2]
> >+ add r5d, r4d
> >+ shr r5d, 2
> >+ mov [r6], r5b
> >+
> >+ ; filter left
> >+ add r6, r1
> >+ pmovzxbw m2, [r3 + 1]
> >+ paddw m2, m1
> >+ psraw m2, 2
> >+ packuswb m2, m2
> >+ pextrb [r6], m2, 0
> >+ pextrb [r6 + r1], m2, 1
> >+ pextrb [r6 + r1 * 2], m2, 2
> >+ lea r6, [r6 + r1 * 2]
> >+ pextrb [r6 + r1], m2, 3
> >+ pextrb [r6 + r1 * 2], m2, 4
> >+ lea r6, [r6 + r1 * 2]
> >+ pextrb [r6 + r1], m2, 5
> >+ pextrb [r6 + r1 * 2], m2, 6
> >+ lea r6, [r6 + r1 * 2]
> >+ pextrb [r6 + r1], m2, 7
> >+
> >+ pmovzxbw m3, [r3 + 9]
> >+ paddw m3, m1
> >+ psraw m3, 2
> >+ packuswb m3, m3
> >+ pextrb [r6 + r1 * 2], m3, 0
> >+ lea r6, [r6 + r1 * 2]
> >+ pextrb [r6 + r1], m3, 1
> >+ pextrb [r6 + r1 * 2], m3, 2
> >+ lea r6, [r6 + r1 * 2]
> >+ pextrb [r6 + r1], m3, 3
> >+ pextrb [r6 + r1 * 2], m3, 4
> >+ lea r6, [r6 + r1 * 2]
> >+ pextrb [r6 + r1], m3, 5
> >+ pextrb [r6 + r1 * 2], m3, 6
> >+
> >+.end:
> >+ RET
> >+
>
> > ;-------------------------------------------------------------------------------------------
>
> > ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
>
> > ;-------------------------------------------------------------------------------------------
> >@@ -406,6 +660,80 @@
> >
> > RET
> >
>
> >+;---------------------------------------------------------------------------------------------
>
> >+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
>
> >+;---------------------------------------------------------------------------------------------
> >+INIT_XMM sse4
> >+cglobal intra_pred_dc32_new, 3, 5, 5
> >+ lea r3, [r2 + 65]
> >+ inc r2
> >+ pxor m0, m0
> >+ movu m1, [r2]
> >+ movu m2, [r2 + 16]
> >+ movu m3, [r3]
> >+ movu m4, [r3 + 16]
> >+ psadbw m1, m0
> >+ psadbw m2, m0
> >+ psadbw m3, m0
> >+ psadbw m4, m0
> >+ paddw m1, m2
> >+ paddw m3, m4
> >+ paddw m1, m3
> >+ pshufd m2, m1, 2
> >+ paddw m1, m2
> >+
> >+ movd r4d, m1
> >+ add r4d, 32
> >+ shr r4d, 6 ; sum = sum / 64
> >+ movd m1, r4d
> >+ pshufb m1, m0 ; m1 = byte [dc_val ...]
> >+
> >+%rep 2
> >+ ; store DC 16x16
> >+ movu [r0], m1
> >+ movu [r0 + r1], m1
> >+ movu [r0 + 16], m1
> >+ movu [r0 + r1 + 16],m1
> >+ lea r0, [r0 + 2 * r1]
> >+ movu [r0], m1
> >+ movu [r0 + r1], m1
> >+ movu [r0 + 16], m1
> >+ movu [r0 + r1 + 16],m1
> >+ lea r0, [r0 + 2 * r1]
> >+ movu [r0], m1
> >+ movu [r0 + r1], m1
> >+ movu [r0 + 16], m1
> >+ movu [r0 + r1 + 16],m1
> >+ lea r0, [r0 + 2 * r1]
> >+ movu [r0], m1
> >+ movu [r0 + r1], m1
> >+ movu [r0 + 16], m1
> >+ movu [r0 + r1 + 16],m1
> >+ lea r0, [r0 + 2 * r1]
> >+ movu [r0], m1
> >+ movu [r0 + r1], m1
> >+ movu [r0 + 16], m1
> >+ movu [r0 + r1 + 16],m1
> >+ lea r0, [r0 + 2 * r1]
> >+ movu [r0], m1
> >+ movu [r0 + r1], m1
> >+ movu [r0 + 16], m1
> >+ movu [r0 + r1 + 16],m1
> >+ lea r0, [r0 + 2 * r1]
> >+ movu [r0], m1
> >+ movu [r0 + r1], m1
> >+ movu [r0 + 16], m1
> >+ movu [r0 + r1 + 16],m1
> >+ lea r0, [r0 + 2 * r1]
> >+ movu [r0], m1
> >+ movu [r0 + r1], m1
> >+ movu [r0 + 16], m1
> >+ movu [r0 + r1 + 16],m1
> >+ lea r0, [r0 + 2 * r1]
> >+%endrep
> >+
> >+ RET
> >+
>
> > ;-----------------------------------------------------------------------------------------------------------
>
> > ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
>
> > ;-----------------------------------------------------------------------------------------------------------
> >diff -r f4daa8744d08 -r 70b4e0c84320 source/test/intrapredharness.cpp
> >--- a/source/test/intrapredharness.cpp Mon Jan 12 12:34:37 2015 +0530
> >+++ b/source/test/intrapredharness.cpp Tue Dec 16 14:02:19 2014 +0530
> >@@ -71,6 +71,38 @@
> > return true;
> > }
> >
>
> >+bool IntraPredHarness::check_dc_primitive(intra_pred_new_t ref, intra_pred_new_t opt, int width)
> >+{
> >+ int j = Predict::ADI_BUF_STRIDE;
> >+ intptr_t stride = FENC_STRIDE;
> >+
> >+#if _DEBUG
> >+ memset(pixel_out_vec, 0xCD, OUTPUT_SIZE);
> >+ memset(pixel_out_c, 0xCD, OUTPUT_SIZE);
> >+#endif
> >+
> >+ for (int i = 0; i <= 100; i++)
> >+ {
> >+ int rand_filter = rand() & 1;
> >+ if (width > 16)
> >+ rand_filter = 0;
> >+
>
> >+ ref(pixel_out_c, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, 0, rand_filter);
>
> >+ opt(pixel_out_vec, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, 0, rand_filter);
> >+
> >+ for (int k = 0; k < width; k++)
> >+ {
>
> >+ if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width * sizeof(pixel)))
> >+ return false;
> >+ }
> >+
> >+ reportfail();
> >+ j += FENC_STRIDE;
> >+ }
> >+
> >+ return true;
> >+}
> >+
>
> > bool IntraPredHarness::check_planar_primitive(intra_pred_t ref, intra_pred_t opt, int width)
> > {
> > int j = Predict::ADI_BUF_STRIDE;
> >@@ -222,6 +254,15 @@
> > return false;
> > }
> > }
> >+ if (opt.intra_pred_new[1][i])
> >+ {
> >+ const int size = (1 << (i + 2));
>
> >+ if (!check_dc_primitive(ref.intra_pred_new[1][i], opt.intra_pred_new[1][i], size))
> >+ {
> >+ printf("intra_dc %dx%d failed\n", size, size);
> >+ return false;
> >+ }
> >+ }
> > }
> >
> > // NOTE: always call since this function have check pointer in loop
> >@@ -279,6 +320,18 @@
>
> > REPORT_SPEEDUP(opt.intra_pred_allangs[i], ref.intra_pred_allangs[i],
>
> > pixel_out_33_vec, refAbove, refLeft, refAbove, refLeft, bFilter);
> > }
> >+ if (opt.intra_pred_new[1][i])
> >+ {
> >+ printf("intra_dc_new_%dx%d[f=0]", size, size);
>
> >+ REPORT_SPEEDUP(opt.intra_pred_new[1][i], ref.intra_pred_new[1][i],
>
> >+ pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, 0, 0);
> >+ if (size <= 16)
> >+ {
> >+ printf("intra_dc_new_%dx%d[f=1]", size, size);
>
> >+ REPORT_SPEEDUP(opt.intra_pred_new[1][i], ref.intra_pred_new[1][i],
>
> >+ pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, 0, 1);
> >+ }
> >+ }
> > }
> >
> > for (int ii = 2; ii <= 5; ii++)
> >diff -r f4daa8744d08 -r 70b4e0c84320 source/test/intrapredharness.h
> >--- a/source/test/intrapredharness.h Mon Jan 12 12:34:37 2015 +0530
> >+++ b/source/test/intrapredharness.h Tue Dec 16 14:02:19 2014 +0530
> >@@ -42,6 +42,7 @@
> > pixel pixel_out_33_vec[OUTPUT_SIZE_33];
> >
>
> > bool check_dc_primitive(intra_pred_t ref, intra_pred_t opt, int width);
>
> >+ bool check_dc_primitive(intra_pred_new_t ref, intra_pred_new_t opt, int width);
>
> > bool check_planar_primitive(intra_pred_t ref, intra_pred_t opt, int width);
>
> > bool check_angular_primitive(const intra_pred_t ref[][NUM_TR_SIZE], const intra_pred_t opt[][NUM_TR_SIZE]);
>
> > bool check_allangs_primitive(const intra_allangs_t ref[], const intra_allangs_t opt[]);
> >_______________________________________________
> >x265-devel mailing list
> >x265-devel at videolan.org
> >https://mailman.videolan.org/listinfo/x265-devel
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150113/bc76ef4b/attachment-0001.html>
More information about the x265-devel
mailing list