Min,

Testbench reports a failure here -

** testbench failure reported for vc11_64_main12::
Testing primitives: AVX2
Testing primitives: BMI2
planecopy_sp_shl failed
x265: asm primitive has failed. Go and fix that Right Now!
return code -1
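
(For reference, the primitive the harness flags, planecopy_sp_shl, is the upShift_16 path this patch rewrites. Below is a minimal C sketch of what the asm output is compared against - assuming the usual shift-left-then-mask semantics implied by the prototype quoted in the patch, and pixel == uint16_t in a main12 build; planecopy_sp_shl_ref is an illustrative name, not the actual harness wrapper:

    // sketch: the expected output the testbench compares the asm result against
    // (assumption: dst[x] = (src[x] << shift) & mask, pixel == uint16_t at 12-bit depth)
    static void planecopy_sp_shl_ref(const uint16_t* src, intptr_t srcStride,
                                     uint16_t* dst, intptr_t dstStride,
                                     int width, int height, int shift, uint16_t mask)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = (uint16_t)((src[x] << shift) & mask);  // up-shift, clamp via mask
            src += srcStride;
            dst += dstStride;
        }
    }

Note that the new tail check added to pixelharness.cpp below also requires the asm not to write past 'width' within a row: the 0xCD-filled tail area must stay untouched.)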

On Thu, Dec 31, 2015 at 5:33 AM, Min Chen <chenm003@163.com> wrote:

# HG changeset patch<br>
# User Min Chen <chenm003@163.com><br>
# Date 1451520182 21600<br>
# Node ID 717cb31ed9931513bb0851f0e6c68af868b5ad45<br>
# Parent 75d1c62d8f0c517dda37ac89f401faa308d60f24<br>
asm: rewrite 16bpp partial pixels process code on upShift and downShift (Issue #223)<br>
---<br>
source/common/x86/pixel-a.asm | 327 ++++++++++-------------------------------<br>
source/test/pixelharness.cpp | 25 +++-<br>
2 files changed, 103 insertions(+), 249 deletions(-)<br>
<br>
diff -r 75d1c62d8f0c -r 717cb31ed993 source/common/x86/pixel-a.asm<br>
--- a/source/common/x86/pixel-a.asm Thu Dec 24 13:58:32 2015 +0530<br>
+++ b/source/common/x86/pixel-a.asm Wed Dec 30 18:03:02 2015 -0600<br>
@@ -8154,92 +8154,57 @@<br>
;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)<br>
;------------------------------------------------------------------------------------------------------------------------<br>
INIT_XMM sse2<br>
-cglobal downShift_16, 7,7,3<br>
- movd m0, r6d ; m0 = shift<br>
+cglobal downShift_16, 4,7,3<br>
+ mov r4d, r4m<br>
+ mov r5d, r5m<br>
+ movd m0, r6m ; m0 = shift<br>
add r1, r1<br>
+<br>
dec r5d<br>
.loopH:<br>
xor r6, r6<br>
+<br>
.loopW:<br>
movu m1, [r0 + r6 * 2]<br>
- movu m2, [r0 + r6 * 2 + 16]<br>
+ movu m2, [r0 + r6 * 2 + mmsize]<br>
psrlw m1, m0<br>
psrlw m2, m0<br>
packuswb m1, m2<br>
movu [r2 + r6], m1<br>
<br>
- add r6, 16<br>
+ add r6, mmsize<br>
cmp r6d, r4d<br>
- jl .loopW<br>
+ jl .loopW<br>
<br>
; move to next row<br>
add r0, r1<br>
add r2, r3<br>
dec r5d<br>
- jnz .loopH<br>
-<br>
-;processing last row of every frame [To handle width which not a multiple of 16]<br>
-<br>
+ jnz .loopH<br>
+<br>
+ ;processing last row of every frame [To handle width which not a multiple of 16]<br>
+ ; r4d must be more than or equal to 16(mmsize)<br>
.loop16:<br>
+ movu m1, [r0 + (r4 - mmsize) * 2]<br>
+ movu m2, [r0 + (r4 - mmsize) * 2 + mmsize]<br>
+ psrlw m1, m0<br>
+ psrlw m2, m0<br>
+ packuswb m1, m2<br>
+ movu [r2 + r4 - mmsize], m1<br>
+<br>
+ sub r4d, mmsize<br>
+ jz .end<br>
+ cmp r4d, mmsize<br>
+ jge .loop16<br>
+<br>
+ ; process partial pixels<br>
movu m1, [r0]<br>
- movu m2, [r0 + 16]<br>
+ movu m2, [r0 + mmsize]<br>
psrlw m1, m0<br>
psrlw m2, m0<br>
packuswb m1, m2<br>
movu [r2], m1<br>
<br>
- add r0, 2 * mmsize<br>
- add r2, mmsize<br>
- sub r4d, 16<br>
- jz .end<br>
- cmp r4d, 15<br>
- jg .loop16<br>
-<br>
- cmp r4d, 8<br>
- jl .process4<br>
- movu m1, [r0]<br>
- psrlw m1, m0<br>
- packuswb m1, m1<br>
- movh [r2], m1<br>
-<br>
- add r0, mmsize<br>
- add r2, 8<br>
- sub r4d, 8<br>
- jz .end<br>
-<br>
-.process4:<br>
- cmp r4d, 4<br>
- jl .process2<br>
- movh m1,[r0]<br>
- psrlw m1, m0<br>
- packuswb m1, m1<br>
- movd [r2], m1<br>
-<br>
- add r0, 8<br>
- add r2, 4<br>
- sub r4d, 4<br>
- jz .end<br>
-<br>
-.process2:<br>
- cmp r4d, 2<br>
- jl .process1<br>
- movd m1, [r0]<br>
- psrlw m1, m0<br>
- packuswb m1, m1<br>
- movd r6, m1<br>
- mov [r2], r6w<br>
-<br>
- add r0, 4<br>
- add r2, 2<br>
- sub r4d, 2<br>
- jz .end<br>
-<br>
-.process1:<br>
- movd m1, [r0]<br>
- psrlw m1, m0<br>
- packuswb m1, m1<br>
- movd r3, m1<br>
- mov [r2], r3b<br>
.end:<br>
RET<br>
<br>
@@ -8248,12 +8213,16 @@<br>
;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)<br>
;-------------------------------------------------------------------------------------------------------------------------------------<br>
INIT_YMM avx2<br>
-cglobal downShift_16, 6,7,3<br>
+cglobal downShift_16, 4,7,3<br>
+ mov r4d, r4m<br>
+ mov r5d, r5m<br>
movd xm0, r6m ; m0 = shift<br>
add r1d, r1d<br>
+<br>
dec r5d<br>
.loopH:<br>
xor r6, r6<br>
+<br>
.loopW:<br>
movu m1, [r0 + r6 * 2 + 0]<br>
movu m2, [r0 + r6 * 2 + 32]<br>
@@ -8265,92 +8234,39 @@<br>
<br>
add r6d, mmsize<br>
cmp r6d, r4d<br>
- jl .loopW<br>
+ jl .loopW<br>
<br>
; move to next row<br>
add r0, r1<br>
add r2, r3<br>
dec r5d<br>
- jnz .loopH<br>
-<br>
-; processing last row of every frame [To handle width which not a multiple of 32]<br>
- mov r6d, r4d<br>
- and r4d, 31<br>
- shr r6d, 5<br>
+ jnz .loopH<br>
+<br>
+ ; processing last row of every frame [To handle width which not a multiple of 32]<br>
<br>
.loop32:<br>
- movu m1, [r0]<br>
- movu m2, [r0 + 32]<br>
+ movu m1, [r0 + (r4 - mmsize) * 2]<br>
+ movu m2, [r0 + (r4 - mmsize) * 2 + mmsize]<br>
psrlw m1, xm0<br>
psrlw m2, xm0<br>
packuswb m1, m2<br>
- vpermq m1, m1, 11011000b<br>
+ vpermq m1, m1, q3120<br>
+ movu [r2 + r4 - mmsize], m1<br>
+<br>
+ sub r4d, mmsize<br>
+ jz .end<br>
+ cmp r4d, mmsize<br>
+ jge .loop32<br>
+<br>
+ ; process partial pixels<br>
+ movu m1, [r0]<br>
+ movu m2, [r0 + mmsize]<br>
+ psrlw m1, xm0<br>
+ psrlw m2, xm0<br>
+ packuswb m1, m2<br>
+ vpermq m1, m1, q3120<br>
movu [r2], m1<br>
<br>
- add r0, 2*mmsize<br>
- add r2, mmsize<br>
- dec r6d<br>
- jnz .loop32<br>
-<br>
- cmp r4d, 16<br>
- jl .process8<br>
- movu m1, [r0]<br>
- psrlw m1, xm0<br>
- packuswb m1, m1<br>
- vpermq m1, m1, 10001000b<br>
- movu [r2], xm1<br>
-<br>
- add r0, mmsize<br>
- add r2, 16<br>
- sub r4d, 16<br>
- jz .end<br>
-<br>
-.process8:<br>
- cmp r4d, 8<br>
- jl .process4<br>
- movu m1, [r0]<br>
- psrlw m1, xm0<br>
- packuswb m1, m1<br>
- movq [r2], xm1<br>
-<br>
- add r0, 16<br>
- add r2, 8<br>
- sub r4d, 8<br>
- jz .end<br>
-<br>
-.process4:<br>
- cmp r4d, 4<br>
- jl .process2<br>
- movq xm1,[r0]<br>
- psrlw m1, xm0<br>
- packuswb m1, m1<br>
- movd [r2], xm1<br>
-<br>
- add r0, 8<br>
- add r2, 4<br>
- sub r4d, 4<br>
- jz .end<br>
-<br>
-.process2:<br>
- cmp r4d, 2<br>
- jl .process1<br>
- movd xm1, [r0]<br>
- psrlw m1, xm0<br>
- packuswb m1, m1<br>
- movd r6d, xm1<br>
- mov [r2], r6w<br>
-<br>
- add r0, 4<br>
- add r2, 2<br>
- sub r4d, 2<br>
- jz .end<br>
-<br>
-.process1:<br>
- movd xm1, [r0]<br>
- psrlw m1, xm0<br>
- packuswb m1, m1<br>
- movd r3d, xm1<br>
- mov [r2], r3b<br>
.end:<br>
RET<br>
<br>
@@ -8487,7 +8403,9 @@<br>
;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)<br>
;------------------------------------------------------------------------------------------------------------------------<br>
INIT_XMM sse2<br>
-cglobal upShift_16, 6,7,4<br>
+cglobal upShift_16, 4,7,4<br>
+ mov r4d, r4m<br>
+ mov r5d, r5m<br>
movd m0, r6m ; m0 = shift<br>
mova m3, [pw_pixel_max]<br>
FIX_STRIDES r1d, r3d<br>
@@ -8515,9 +8433,25 @@<br>
dec r5d<br>
jnz .loopH<br>
<br>
-;processing last row of every frame [To handle width which not a multiple of 16]<br>
-<br>
+ ;processing last row of every frame [To handle width which not a multiple of 16]<br>
+<br>
+ ; WARNING: width(r4d) MUST BE more than or equal to 16(mmsize) in here<br>
.loop16:<br>
+ movu m1, [r0 + (r4 - mmsize) * 2]<br>
+ movu m2, [r0 + (r4 - mmsize) * 2 + mmsize]<br>
+ psllw m1, m0<br>
+ psllw m2, m0<br>
+ pand m1, m3<br>
+ pand m2, m3<br>
+ movu [r2 + (r4 - mmsize) * 2], m1<br>
+ movu [r2 + (r4 - mmsize) * 2 + mmsize], m2<br>
+<br>
+ sub r4d, mmsize<br>
+ jz .end<br>
+ cmp r4d, mmsize<br>
+ jge .loop16<br>
+<br>
+ ; process partial pixels<br>
movu m1, [r0]<br>
movu m2, [r0 + mmsize]<br>
psllw m1, m0<br>
@@ -8527,56 +8461,6 @@<br>
movu [r2], m1<br>
movu [r2 + mmsize], m2<br>
<br>
- add r0, 2 * mmsize<br>
- add r2, 2 * mmsize<br>
- sub r4d, 16<br>
- jz .end<br>
- jg .loop16<br>
-<br>
- cmp r4d, 8<br>
- jl .process4<br>
- movu m1, [r0]<br>
- psrlw m1, m0<br>
- pand m1, m3<br>
- movu [r2], m1<br>
-<br>
- add r0, mmsize<br>
- add r2, mmsize<br>
- sub r4d, 8<br>
- jz .end<br>
-<br>
-.process4:<br>
- cmp r4d, 4<br>
- jl .process2<br>
- movh m1,[r0]<br>
- psllw m1, m0<br>
- pand m1, m3<br>
- movh [r2], m1<br>
-<br>
- add r0, 8<br>
- add r2, 8<br>
- sub r4d, 4<br>
- jz .end<br>
-<br>
-.process2:<br>
- cmp r4d, 2<br>
- jl .process1<br>
- movd m1, [r0]<br>
- psllw m1, m0<br>
- pand m1, m3<br>
- movd [r2], m1<br>
-<br>
- add r0, 4<br>
- add r2, 4<br>
- sub r4d, 2<br>
- jz .end<br>
-<br>
-.process1:<br>
- movd m1, [r0]<br>
- psllw m1, m0<br>
- pand m1, m3<br>
- movd r3, m1<br>
- mov [r2], r3w<br>
.end:<br>
RET<br>
<br>
@@ -8584,9 +8468,10 @@<br>
;-------------------------------------------------------------------------------------------------------------------------------------<br>
;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)<br>
;-------------------------------------------------------------------------------------------------------------------------------------<br>
-; TODO: NO TEST CODE!<br>
INIT_YMM avx2<br>
-cglobal upShift_16, 6,7,4<br>
+cglobal upShift_16, 4,7,4<br>
+ mov r4d, r4m<br>
+ mov r5d, r5m<br>
movd xm0, r6m ; m0 = shift<br>
vbroadcasti128 m3, [pw_pixel_max]<br>
FIX_STRIDES r1d, r3d<br>
@@ -8613,83 +8498,33 @@<br>
dec r5d<br>
jnz .loopH<br>
<br>
-; processing last row of every frame [To handle width which not a multiple of 32]<br>
- mov r6d, r4d<br>
- and r4d, 31<br>
- shr r6d, 5<br>
+ ; processing last row of every frame [To handle width which not a multiple of 32]<br>
<br>
.loop32:<br>
+ movu m1, [r0 + (r4 - mmsize) * 2]<br>
+ movu m2, [r0 + (r4 - mmsize) * 2 + mmsize]<br>
+ psllw m1, xm0<br>
+ psllw m2, xm0<br>
+ pand m1, m3<br>
+ pand m2, m3<br>
+ movu [r2 + (r4 - mmsize) * 2], m1<br>
+ movu [r2 + (r4 - mmsize) * 2 + mmsize], m2<br>
+<br>
+ sub r4d, mmsize<br>
+ jz .end<br>
+ cmp r4d, mmsize<br>
+ jge .loop32<br>
+<br>
+ ; process partial pixels<br>
movu m1, [r0]<br>
- movu m2, [r0 + mmsize]<br>
+ movu m2, [r0]<br>
psllw m1, xm0<br>
psllw m2, xm0<br>
pand m1, m3<br>
pand m2, m3<br>
movu [r2], m1<br>
- movu [r2 + mmsize], m2<br>
-<br>
- add r0, 2*mmsize<br>
- add r2, 2*mmsize<br>
- dec r6d<br>
- jnz .loop32<br>
-<br>
- cmp r4d, 16<br>
- jl .process8<br>
- movu m1, [r0]<br>
- psllw m1, xm0<br>
- pand m1, m3<br>
- movu [r2], m1<br>
-<br>
- add r0, mmsize<br>
- add r2, mmsize<br>
- sub r4d, 16<br>
- jz .end<br>
-<br>
-.process8:<br>
- cmp r4d, 8<br>
- jl .process4<br>
- movu xm1, [r0]<br>
- psllw xm1, xm0<br>
- pand xm1, xm3<br>
- movu [r2], xm1<br>
-<br>
- add r0, 16<br>
- add r2, 16<br>
- sub r4d, 8<br>
- jz .end<br>
-<br>
-.process4:<br>
- cmp r4d, 4<br>
- jl .process2<br>
- movq xm1,[r0]<br>
- psllw xm1, xm0<br>
- pand xm1, xm3<br>
- movq [r2], xm1<br>
-<br>
- add r0, 8<br>
- add r2, 8<br>
- sub r4d, 4<br>
- jz .end<br>
-<br>
-.process2:<br>
- cmp r4d, 2<br>
- jl .process1<br>
- movd xm1, [r0]<br>
- psllw xm1, xm0<br>
- pand xm1, xm3<br>
- movd [r2], xm1<br>
-<br>
- add r0, 4<br>
- add r2, 4<br>
- sub r4d, 2<br>
- jz .end<br>
-<br>
-.process1:<br>
- movd xm1, [r0]<br>
- psllw xm1, xm0<br>
- pand xm1, xm3<br>
- movd r3d, xm1<br>
- mov [r2], r3w<br>
+ movu [r2], m2<br>
+<br>
.end:<br>
RET<br>
<br>
diff -r 75d1c62d8f0c -r 717cb31ed993 source/test/pixelharness.cpp<br>
--- a/source/test/pixelharness.cpp Thu Dec 24 13:58:32 2015 +0530<br>
+++ b/source/test/pixelharness.cpp Wed Dec 30 18:03:02 2015 -0600<br>
@@ -1299,8 +1299,8 @@<br>
<br>
memset(ref_dest, 0xCD, sizeof(ref_dest));<br>
memset(opt_dest, 0xCD, sizeof(opt_dest));<br>
- int width = 32 + rand() % 32;<br>
- int height = 32 + rand() % 32;<br>
+ int width = 32 + (rand() % 32);<br>
+ int height = 32 + (rand() % 32);<br>
intptr_t srcStride = 64;<br>
intptr_t dstStride = width;<br>
int j = 0;<br>
@@ -1308,11 +1308,23 @@<br>
for (int i = 0; i < ITERS; i++)<br>
{<br>
int index = i % TEST_CASES;<br>
+<br>
checked(opt, ushort_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));<br>
ref(ushort_test_buff[index] + j, srcStride, ref_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));<br>
<br>
- if (memcmp(ref_dest, opt_dest, width * height * sizeof(pixel)))<br>
+ if (memcmp(ref_dest, opt_dest, dstStride * height * sizeof(pixel)))<br>
+ {<br>
+ memcpy(opt_dest, ref_dest, sizeof(ref_dest));<br>
+ opt(ushort_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));<br>
return false;<br>
+ }<br>
+<br>
+ // check tail memory area<br>
+ for(int x = width; x < dstStride; x++)<br>
+ {<br>
+ if (opt_dest[(height - 1 * dstStride) + x] != 0xCD)<br>
+ return false;<br>
+ }<br>
<br>
reportfail();<br>
j += INCR;<br>
@@ -1344,6 +1356,13 @@<br>
if (memcmp(ref_dest, opt_dest, sizeof(ref_dest)))<br>
return false;<br>
<br>
+ // check tail memory area<br>
+ for(int x = width; x < dstStride; x++)<br>
+ {<br>
+ if (opt_dest[(height - 1 * dstStride) + x] != 0xCD)<br>
+ return false;<br>
+ }<br>
+<br>
reportfail();<br>
j += INCR;<br>
}<br>
<br>
--
Deepthi Nandakumar
Engineering Manager, x265
Multicoreware, Inc