<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><div><br><br>At 2015-01-13 15:11:14,dnyaneshwar@multicorewareinc.com wrote:<br>># HG changeset patch<br>># User Dnyaneshwar G <dnyaneshwar@multicorewareinc.com><br>># Date 1418718739 -19800<br>># Tue Dec 16 14:02:19 2014 +0530<br>># Node ID 70b4e0c84320df0b7443e5aea6e110c1bf483684<br>># Parent f4daa8744d08b569ae652737c4506b397dfb55cb<br>>intra_pred_dc_new: updated asm and unit test code<br>><br>>+;---------------------------------------------------------------------------------------------<br>>+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)<br>>+;---------------------------------------------------------------------------------------------<br>>+INIT_XMM sse4<br>>+cglobal intra_pred_dc4_new, 5,5,3<br>>+ inc r2<br>>+ pxor m0, m0<br>>+ movd m1, [r2]<br>>+ movd m2, [r2 + 8]<br>>+ punpckldq m1, m2<br>>+ psadbw m1, m0 ; m1 = sum<br>>+<br>>+ test r4d, r4d<br>>+<br>>+ mov r4d, 4096<br>>+ movd m2, r4d<br>>+ pmulhrsw m1, m2 ; m1 = (sum + 4) / 8<br>There is already a constant pw_4096 that can be used here.<br><br>>+ movd r4d, m1 ; r4d = dc_val<br>>+ pshufb m1, m0 ; m1 = byte [dc_val ...]<br>>+<br>>+ ; store DC 4x4<br>>+ lea r3, [r1 * 3]<br>>+ movd [r0], m1<br>>+ movd [r0 + r1], m1<br>>+ movd [r0 + r1 * 2], m1<br>>+ movd [r0 + r3], m1<br>>+<br>>+ ; do DC filter<br>>+ jz .end<br>>+ lea r3d, [r4d * 2 + 2] ; r3d = DC * 2 + 2<br>>+ add r4d, r3d ; r4d = DC * 3 + 2<br>>+ movd m1, r4d</div>
<div>>+ pshuflw m1, m1, 0 ; m1 = pixDCx3<br>We have more free registers, so we can keep the DC value in a register to reduce the number of operations.</div>
<div> </div>
<div>>+<br>>+ ; filter top<br>>+ pmovzxbw m2, [r2]<br>>+ paddw m2, m1<br>>+ psraw m2, 2</div>
<div>Combined with the change suggested above, we can compute (x + 2)>>2 with pmulhrsw.</div>
<div><br>>+ packuswb m2, m2<br>>+ movd [r0], m2 ; overwrite top-left pixel, we will update it later<br>>+<br>>+ ; filter top-left<br>>+ movzx r4d, byte [r2 + 8]<br>>+ add r3d, r4d<br>>+ movzx r4d, byte [r2]<br>>+ add r4d, r3d<br>>+ shr r4d, 2<br>>+ mov [r0], r4b<br>>+<br>>+ ; filter left<br>>+ add r0, r1<br>>+ pmovzxbw m2, [r2 + 9]<br>>+ paddw m2, m1<br>>+ psraw m2, 2<br>>+ packuswb m2, m2<br>>+ pextrb [r0], m2, 0<br>>+ pextrb [r0 + r1], m2, 1<br>>+ pextrb [r0 + r1 * 2], m2, 2<br>>+<br>>+.end:<br>>+ RET<br>> <br>> ;-------------------------------------------------------------------------------------------<br>> ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)<br>>@@ -217,6 +278,85 @@<br>> .end:<br>> RET<br>> <br>>+;---------------------------------------------------------------------------------------------<br>>+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)<br>>+;---------------------------------------------------------------------------------------------<br>>+INIT_XMM sse4<br>>+cglobal intra_pred_dc8_new, 5, 7, 3<br>>+ lea r3, [r2 + 17]<br>>+ inc r2<br>>+ pxor m0, m0<br>>+ movh m1, [r2]<br>>+ movh m2, [r3]<br>>+ punpcklqdq m1, m2</div>
<div>How about using movhps to load the high half directly, instead of movh + punpcklqdq?</div>
<div><br>>+ psadbw m1, m0<br>>+ pshufd m2, m1, 2<br>>+ paddw m1, m2</div>
<div>Should this be paddd instead of paddw?</div>
<div><br>>+<br>>+ movd r5d, m1<br>>+ add r5d, 8<br>>+ shr r5d, 4 ; sum = sum / 16</div>
<div>Same as 4x4: we can reduce the conversions between general-purpose and SSE registers.</div>
<div><br>>+ movd m1, r5d<br>>+ pshufb m1, m0 ; m1 = byte [dc_val ...]<br>>+<br>>+ test r4d, r4d<br>>+<br>>+ ; store DC 8x8<br>>+ mov r6, r0<br>>+ movh [r0], m1<br>>+ movh [r0 + r1], m1<br>>+ lea r0, [r0 + r1 * 2]<br>>+ movh [r0], m1<br>>+ movh [r0 + r1], m1<br>>+ lea r0, [r0 + r1 * 2]<br>>+ movh [r0], m1<br>>+ movh [r0 + r1], m1<br>>+ lea r0, [r0 + r1 * 2]<br>>+ movh [r0], m1<br>>+ movh [r0 + r1], m1<br>>+<br>>+ ; Do DC Filter<br>>+ jz .end<br>>+ lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2<br>>+ add r5d, r4d ; r5d = DC * 3 + 2<br>>+ movd m1, r5d<br>>+ pshuflw m1, m1, 0 ; m1 = pixDCx3<br>>+ pshufd m1, m1, 0<br>>+<br>>+ ; filter top<br>>+ pmovzxbw m2, [r2]<br>>+ paddw m2, m1<br>>+ psraw m2, 2<br>>+ packuswb m2, m2<br>>+ movh [r6], m2<br>>+<br>>+ ; filter top-left<br>>+ movzx r5d, byte [r3]<br>>+ add r4d, r5d<br>>+ movzx r5d, byte [r2]<br>>+ add r5d, r4d<br>>+ shr r5d, 2<br>>+ mov [r6], r5b<br>>+<br>>+ ; filter left<br>>+ add r6, r1<br>>+ pmovzxbw m2, [r3 + 1]<br>>+ paddw m2, m1<br>>+ psraw m2, 2<br>>+ packuswb m2, m2<br>>+ pextrb [r6], m2, 0<br>>+ pextrb [r6 + r1], m2, 1<br>>+ pextrb [r6 + 2 * r1], m2, 2<br>>+ lea r6, [r6 + r1 * 2]<br>>+ pextrb [r6 + r1], m2, 3<br>>+ pextrb [r6 + r1 * 2], m2, 4<br>>+ pextrb [r6 + r1 * 4], m2, 6<br>>+ lea r1, [r1 * 3]</div>
<div>How about reusing r0 (or another free register) to eliminate the lea r6 instructions above?</div>
<div><br>>+ pextrb [r6 + r1], m2, 5<br>>+<br>>+.end:<br>>+ RET<br>>+<br>> ;-------------------------------------------------------------------------------------------<br>> ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)<br>> ;-------------------------------------------------------------------------------------------<br>>@@ -332,6 +472,120 @@<br>> .end:<br>> RET<br>> <br>>+;--------------------------------------------------------------------------------------------<br>>+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)<br>>+;--------------------------------------------------------------------------------------------<br>>+INIT_XMM sse4<br>>+cglobal intra_pred_dc16_new, 5, 7, 4<br>>+ lea r3, [r2 + 33]<br>>+ inc r2<br>>+ pxor m0, m0<br>>+ movu m1, [r2]<br>>+ movu m2, [r3]<br>>+ psadbw m1, m0<br>>+ psadbw m2, m0<br>>+ paddw m1, m2<br>>+ pshufd m2, m1, 2<br>>+ paddw m1, m2<br>>+<br>>+ movd r5d, m1<br>>+ add r5d, 16<br>>+ shr r5d, 5 ; sum = sum / 32<br>>+ movd m1, r5d<br>>+ pshufb m1, m0 ; m1 = byte [dc_val ...]<br>>+<br>>+ test r4d, r4d<br>>+<br>>+ ; store DC 16x16<br>>+ mov r6, r0<br>>+ movu [r0], m1<br>>+ movu [r0 + r1], m1<br>>+ lea r0, [r0 + r1 * 2]<br>>+ movu [r0], m1<br>>+ movu [r0 + r1], m1<br>>+ lea r0, [r0 + r1 * 2]<br>>+ movu [r0], m1<br>>+ movu [r0 + r1], m1<br>>+ lea r0, [r0 + r1 * 2]<br>>+ movu [r0], m1<br>>+ movu [r0 + r1], m1<br>>+ lea r0, [r0 + r1 * 2]<br>>+ movu [r0], m1<br>>+ movu [r0 + r1], m1<br>>+ lea r0, [r0 + r1 * 2]<br>>+ movu [r0], m1<br>>+ movu [r0 + r1], m1<br>>+ lea r0, [r0 + r1 * 2]<br>>+ movu [r0], m1<br>>+ movu [r0 + r1], m1<br>>+ lea r0, [r0 + r1 * 2]<br>>+ movu [r0], m1<br>>+ movu [r0 + r1], m1<br>>+<br>>+ ; Do DC Filter<br>>+ jz .end<br>>+ lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2<br>>+ add r5d, r4d ; r5d = DC * 3 + 2<br>>+ movd m1, r5d<br>>+ pshuflw m1, m1, 0 ; m1 = pixDCx3<br>>+ pshufd m1, m1, 0<br>>+<br>>+ ; filter top<br>>+ pmovzxbw 
m2, [r2]<br>>+ paddw m2, m1<br>>+ psraw m2, 2<br>>+ packuswb m2, m2<br>>+ movh [r6], m2<br>>+ pmovzxbw m3, [r2 + 8]<br>>+ paddw m3, m1<br>>+ psraw m3, 2<br>>+ packuswb m3, m3<br>>+ movh [r6 + 8], m3<br>>+<br>>+ ; filter top-left<br>>+ movzx r5d, byte [r3]<br>>+ add r4d, r5d<br>>+ movzx r5d, byte [r2]<br>>+ add r5d, r4d<br>>+ shr r5d, 2<br>>+ mov [r6], r5b<br>>+<br>>+ ; filter left<br>>+ add r6, r1<br>>+ pmovzxbw m2, [r3 + 1]<br>>+ paddw m2, m1<br>>+ psraw m2, 2<br>>+ packuswb m2, m2<br>>+ pextrb [r6], m2, 0<br>>+ pextrb [r6 + r1], m2, 1<br>>+ pextrb [r6 + r1 * 2], m2, 2<br>>+ lea r6, [r6 + r1 * 2]<br>>+ pextrb [r6 + r1], m2, 3<br>>+ pextrb [r6 + r1 * 2], m2, 4<br>>+ lea r6, [r6 + r1 * 2]<br>>+ pextrb [r6 + r1], m2, 5<br>>+ pextrb [r6 + r1 * 2], m2, 6<br>>+ lea r6, [r6 + r1 * 2]<br>>+ pextrb [r6 + r1], m2, 7<br>>+<br>>+ pmovzxbw m3, [r3 + 9]<br>>+ paddw m3, m1<br>>+ psraw m3, 2<br>>+ packuswb m3, m3<br>>+ pextrb [r6 + r1 * 2], m3, 0<br>>+ lea r6, [r6 + r1 * 2]<br>>+ pextrb [r6 + r1], m3, 1<br>>+ pextrb [r6 + r1 * 2], m3, 2<br>>+ lea r6, [r6 + r1 * 2]<br>>+ pextrb [r6 + r1], m3, 3<br>>+ pextrb [r6 + r1 * 2], m3, 4<br>>+ lea r6, [r6 + r1 * 2]<br>>+ pextrb [r6 + r1], m3, 5<br>>+ pextrb [r6 + r1 * 2], m3, 6<br>>+<br>>+.end:<br>>+ RET<br>>+<br>> ;-------------------------------------------------------------------------------------------<br>> ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)<br>> ;-------------------------------------------------------------------------------------------<br>>@@ -406,6 +660,80 @@<br>> <br>> RET<br>> <br>>+;---------------------------------------------------------------------------------------------<br>>+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)<br>>+;---------------------------------------------------------------------------------------------<br>>+INIT_XMM sse4<br>>+cglobal intra_pred_dc32_new, 3, 5, 5<br>>+ lea r3, [r2 + 65]<br>>+ 
inc r2<br>>+ pxor m0, m0<br>>+ movu m1, [r2]<br>>+ movu m2, [r2 + 16]<br>>+ movu m3, [r3]<br>>+ movu m4, [r3 + 16]<br>>+ psadbw m1, m0<br>>+ psadbw m2, m0<br>>+ psadbw m3, m0<br>>+ psadbw m4, m0<br>>+ paddw m1, m2<br>>+ paddw m3, m4<br>>+ paddw m1, m3<br>>+ pshufd m2, m1, 2<br>>+ paddw m1, m2<br>>+<br>>+ movd r4d, m1<br>>+ add r4d, 32<br>>+ shr r4d, 6 ; sum = sum / 64<br>>+ movd m1, r4d<br>>+ pshufb m1, m0 ; m1 = byte [dc_val ...]<br>>+<br>>+%rep 2<br>>+ ; store DC 16x16<br>>+ movu [r0], m1<br>>+ movu [r0 + r1], m1<br>>+ movu [r0 + 16], m1<br>>+ movu [r0 + r1 + 16],m1<br>>+ lea r0, [r0 + 2 * r1]<br>>+ movu [r0], m1<br>>+ movu [r0 + r1], m1<br>>+ movu [r0 + 16], m1<br>>+ movu [r0 + r1 + 16],m1<br>>+ lea r0, [r0 + 2 * r1]<br>>+ movu [r0], m1<br>>+ movu [r0 + r1], m1<br>>+ movu [r0 + 16], m1<br>>+ movu [r0 + r1 + 16],m1<br>>+ lea r0, [r0 + 2 * r1]<br>>+ movu [r0], m1<br>>+ movu [r0 + r1], m1<br>>+ movu [r0 + 16], m1<br>>+ movu [r0 + r1 + 16],m1<br>>+ lea r0, [r0 + 2 * r1]<br>>+ movu [r0], m1<br>>+ movu [r0 + r1], m1<br>>+ movu [r0 + 16], m1<br>>+ movu [r0 + r1 + 16],m1<br>>+ lea r0, [r0 + 2 * r1]<br>>+ movu [r0], m1<br>>+ movu [r0 + r1], m1<br>>+ movu [r0 + 16], m1<br>>+ movu [r0 + r1 + 16],m1<br>>+ lea r0, [r0 + 2 * r1]<br>>+ movu [r0], m1<br>>+ movu [r0 + r1], m1<br>>+ movu [r0 + 16], m1<br>>+ movu [r0 + r1 + 16],m1<br>>+ lea r0, [r0 + 2 * r1]<br>>+ movu [r0], m1<br>>+ movu [r0 + r1], m1<br>>+ movu [r0 + 16], m1<br>>+ movu [r0 + r1 + 16],m1<br>>+ lea r0, [r0 + 2 * r1]<br>>+%endrep<br>>+<br>>+ RET<br>>+<br>> ;-----------------------------------------------------------------------------------------------------------<br>> ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)<br>> ;-----------------------------------------------------------------------------------------------------------<br>>diff -r f4daa8744d08 -r 70b4e0c84320 source/test/intrapredharness.cpp<br>>--- a/source/test/intrapredharness.cpp Mon Jan 12 
12:34:37 2015 +0530<br>>+++ b/source/test/intrapredharness.cpp Tue Dec 16 14:02:19 2014 +0530<br>>@@ -71,6 +71,38 @@<br>> return true;<br>> }<br>> <br>>+bool IntraPredHarness::check_dc_primitive(intra_pred_new_t ref, intra_pred_new_t opt, int width)<br>>+{<br>>+ int j = Predict::ADI_BUF_STRIDE;<br>>+ intptr_t stride = FENC_STRIDE;<br>>+<br>>+#if _DEBUG<br>>+ memset(pixel_out_vec, 0xCD, OUTPUT_SIZE);<br>>+ memset(pixel_out_c, 0xCD, OUTPUT_SIZE);<br>>+#endif<br>>+<br>>+ for (int i = 0; i <= 100; i++)<br>>+ {<br>>+ int rand_filter = rand() & 1;<br>>+ if (width > 16)<br>>+ rand_filter = 0;<br>>+<br>>+ ref(pixel_out_c, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, 0, rand_filter);<br>>+ opt(pixel_out_vec, stride, pixel_buff + j - Predict::ADI_BUF_STRIDE, 0, rand_filter);<br>>+<br>>+ for (int k = 0; k < width; k++)<br>>+ {<br>>+ if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width * sizeof(pixel)))<br>>+ return false;<br>>+ }<br>>+<br>>+ reportfail();<br>>+ j += FENC_STRIDE;<br>>+ }<br>>+<br>>+ return true;<br>>+}<br>>+<br>> bool IntraPredHarness::check_planar_primitive(intra_pred_t ref, intra_pred_t opt, int width)<br>> {<br>> int j = Predict::ADI_BUF_STRIDE;<br>>@@ -222,6 +254,15 @@<br>> return false;<br>> }<br>> }<br>>+ if (opt.intra_pred_new[1][i])<br>>+ {<br>>+ const int size = (1 << (i + 2));<br>>+ if (!check_dc_primitive(ref.intra_pred_new[1][i], opt.intra_pred_new[1][i], size))<br>>+ {<br>>+ printf("intra_dc %dx%d failed\n", size, size);<br>>+ return false;<br>>+ }<br>>+ }<br>> }<br>> <br>> // NOTE: always call since this function have check pointer in loop<br>>@@ -279,6 +320,18 @@<br>> REPORT_SPEEDUP(opt.intra_pred_allangs[i], ref.intra_pred_allangs[i],<br>> pixel_out_33_vec, refAbove, refLeft, refAbove, refLeft, bFilter);<br>> }<br>>+ if (opt.intra_pred_new[1][i])<br>>+ {<br>>+ printf("intra_dc_new_%dx%d[f=0]", size, size);<br>>+ REPORT_SPEEDUP(opt.intra_pred_new[1][i], ref.intra_pred_new[1][i],<br>>+ pixel_out_vec, FENC_STRIDE, 
pixel_buff + srcStride, 0, 0);<br>>+ if (size <= 16)<br>>+ {<br>>+ printf("intra_dc_new_%dx%d[f=1]", size, size);<br>>+ REPORT_SPEEDUP(opt.intra_pred_new[1][i], ref.intra_pred_new[1][i],<br>>+ pixel_out_vec, FENC_STRIDE, pixel_buff + srcStride, 0, 1);<br>>+ }<br>>+ }<br>> }<br>> <br>> for (int ii = 2; ii <= 5; ii++)<br>>diff -r f4daa8744d08 -r 70b4e0c84320 source/test/intrapredharness.h<br>>--- a/source/test/intrapredharness.h Mon Jan 12 12:34:37 2015 +0530<br>>+++ b/source/test/intrapredharness.h Tue Dec 16 14:02:19 2014 +0530<br>>@@ -42,6 +42,7 @@<br>> pixel pixel_out_33_vec[OUTPUT_SIZE_33];<br>> <br>> bool check_dc_primitive(intra_pred_t ref, intra_pred_t opt, int width);<br>>+ bool check_dc_primitive(intra_pred_new_t ref, intra_pred_new_t opt, int width);<br>> bool check_planar_primitive(intra_pred_t ref, intra_pred_t opt, int width);<br>> bool check_angular_primitive(const intra_pred_t ref[][NUM_TR_SIZE], const intra_pred_t opt[][NUM_TR_SIZE]);<br>> bool check_allangs_primitive(const intra_allangs_t ref[], const intra_allangs_t opt[]);<br>>_______________________________________________<br>>x265-devel mailing list<br>>x265-devel@videolan.org<br>>https://mailman.videolan.org/listinfo/x265-devel<br></div></div>