[x265] [PATCH] asm: rewrite 16bpp partial pixels process code on upShift and downShift (Issue #223)
chen
chenm003 at 163.com
Thu Dec 31 18:06:30 CET 2015
Sorry, it was a merge problem; there are two typos. I have sent the patches again.
At 2015-12-31 13:41:43, "Deepthi Nandakumar" <deepthi at multicorewareinc.com> wrote:
Min,
Testbench reports a failure here -
** testbench failure reported for vc11_64_main12::
Testing primitives: AVX2
Testing primitives: BMI2
planecopy_sp_shl failed
x265: asm primitive has failed. Go and fix that Right Now!
return code -1
On Thu, Dec 31, 2015 at 5:33 AM, Min Chen <chenm003 at 163.com> wrote:
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1451520182 21600
# Node ID 717cb31ed9931513bb0851f0e6c68af868b5ad45
# Parent 75d1c62d8f0c517dda37ac89f401faa308d60f24
asm: rewrite 16bpp partial pixels process code on upShift and downShift (Issue #223)
---
source/common/x86/pixel-a.asm | 327 ++++++++++-------------------------------
source/test/pixelharness.cpp | 25 +++-
2 files changed, 103 insertions(+), 249 deletions(-)
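In the rewritten kernels the last row is consumed from its right edge in full vector-width chunks; any 1..mmsize-1 leftover pixels are then finished with one extra full-width chunk anchored at the start of the row, overlapping pixels that were already processed. The overlapping chunk recomputes the same output values, so the double write is harmless, and no store ever touches dst[width] or beyond; this is also why the code now requires width >= mmsize. A scalar C sketch of the same control flow for the SSE2 downShift_16 case (the function name and fixed 16-pixel chunking are illustrative, not the x265 API):

#include <stdint.h>

/* scalar model of the new tail handling; assumes width >= 16,
   matching the requirement noted in the asm comments */
static void downshift_last_row(const uint16_t *src, uint8_t *dst,
                               int width, int shift)
{
    int w = width;
    do
    {
        for (int i = 0; i < 16; i++)   /* full chunk at [w-16, w) */
        {
            unsigned v = src[w - 16 + i] >> shift;
            dst[w - 16 + i] = (uint8_t)(v > 255 ? 255 : v); /* packuswb saturates */
        }
        w -= 16;
    } while (w >= 16);

    if (w > 0)                         /* 1..15 pixels remain */
    {
        for (int i = 0; i < 16; i++)   /* one overlapping chunk at [0, 16) */
        {
            unsigned v = src[i] >> shift;
            dst[i] = (uint8_t)(v > 255 ? 255 : v);
        }
    }
}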
diff -r 75d1c62d8f0c -r 717cb31ed993 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Thu Dec 24 13:58:32 2015 +0530
+++ b/source/common/x86/pixel-a.asm Wed Dec 30 18:03:02 2015 -0600
@@ -8154,92 +8154,57 @@
;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
;------------------------------------------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal downShift_16, 7,7,3
- movd m0, r6d ; m0 = shift
+cglobal downShift_16, 4,7,3
+ mov r4d, r4m
+ mov r5d, r5m
+ movd m0, r6m ; m0 = shift
add r1, r1
+
dec r5d
.loopH:
xor r6, r6
+
.loopW:
movu m1, [r0 + r6 * 2]
- movu m2, [r0 + r6 * 2 + 16]
+ movu m2, [r0 + r6 * 2 + mmsize]
psrlw m1, m0
psrlw m2, m0
packuswb m1, m2
movu [r2 + r6], m1
- add r6, 16
+ add r6, mmsize
cmp r6d, r4d
- jl .loopW
+ jl .loopW
; move to next row
add r0, r1
add r2, r3
dec r5d
- jnz .loopH
-
-;processing last row of every frame [To handle width which not a multiple of 16]
-
+ jnz .loopH
+
+ ; processing the last row of every frame [to handle width that is not a multiple of 16]
+ ; r4d must be greater than or equal to 16 (mmsize)
.loop16:
+ movu m1, [r0 + (r4 - mmsize) * 2]
+ movu m2, [r0 + (r4 - mmsize) * 2 + mmsize]
+ psrlw m1, m0
+ psrlw m2, m0
+ packuswb m1, m2
+ movu [r2 + r4 - mmsize], m1
+
+ sub r4d, mmsize
+ jz .end
+ cmp r4d, mmsize
+ jge .loop16
+
+ ; process partial pixels
movu m1, [r0]
- movu m2, [r0 + 16]
+ movu m2, [r0 + mmsize]
psrlw m1, m0
psrlw m2, m0
packuswb m1, m2
movu [r2], m1
- add r0, 2 * mmsize
- add r2, mmsize
- sub r4d, 16
- jz .end
- cmp r4d, 15
- jg .loop16
-
- cmp r4d, 8
- jl .process4
- movu m1, [r0]
- psrlw m1, m0
- packuswb m1, m1
- movh [r2], m1
-
- add r0, mmsize
- add r2, 8
- sub r4d, 8
- jz .end
-
-.process4:
- cmp r4d, 4
- jl .process2
- movh m1,[r0]
- psrlw m1, m0
- packuswb m1, m1
- movd [r2], m1
-
- add r0, 8
- add r2, 4
- sub r4d, 4
- jz .end
-
-.process2:
- cmp r4d, 2
- jl .process1
- movd m1, [r0]
- psrlw m1, m0
- packuswb m1, m1
- movd r6, m1
- mov [r2], r6w
-
- add r0, 4
- add r2, 2
- sub r4d, 2
- jz .end
-
-.process1:
- movd m1, [r0]
- psrlw m1, m0
- packuswb m1, m1
- movd r3, m1
- mov [r2], r3b
.end:
RET
@@ -8248,12 +8213,16 @@
;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
;-------------------------------------------------------------------------------------------------------------------------------------
INIT_YMM avx2
-cglobal downShift_16, 6,7,3
+cglobal downShift_16, 4,7,3
+ mov r4d, r4m
+ mov r5d, r5m
movd xm0, r6m ; m0 = shift
add r1d, r1d
+
dec r5d
.loopH:
xor r6, r6
+
.loopW:
movu m1, [r0 + r6 * 2 + 0]
movu m2, [r0 + r6 * 2 + 32]
@@ -8265,92 +8234,39 @@
add r6d, mmsize
cmp r6d, r4d
- jl .loopW
+ jl .loopW
; move to next row
add r0, r1
add r2, r3
dec r5d
- jnz .loopH
-
-; processing last row of every frame [To handle width which not a multiple of 32]
- mov r6d, r4d
- and r4d, 31
- shr r6d, 5
+ jnz .loopH
+
+ ; processing the last row of every frame [to handle width that is not a multiple of 32]
.loop32:
- movu m1, [r0]
- movu m2, [r0 + 32]
+ movu m1, [r0 + (r4 - mmsize) * 2]
+ movu m2, [r0 + (r4 - mmsize) * 2 + mmsize]
psrlw m1, xm0
psrlw m2, xm0
packuswb m1, m2
- vpermq m1, m1, 11011000b
+ vpermq m1, m1, q3120
+ movu [r2 + r4 - mmsize], m1
+
+ sub r4d, mmsize
+ jz .end
+ cmp r4d, mmsize
+ jge .loop32
+
+ ; process partial pixels
+ movu m1, [r0]
+ movu m2, [r0 + mmsize]
+ psrlw m1, xm0
+ psrlw m2, xm0
+ packuswb m1, m2
+ vpermq m1, m1, q3120
movu [r2], m1
- add r0, 2*mmsize
- add r2, mmsize
- dec r6d
- jnz .loop32
-
- cmp r4d, 16
- jl .process8
- movu m1, [r0]
- psrlw m1, xm0
- packuswb m1, m1
- vpermq m1, m1, 10001000b
- movu [r2], xm1
-
- add r0, mmsize
- add r2, 16
- sub r4d, 16
- jz .end
-
-.process8:
- cmp r4d, 8
- jl .process4
- movu m1, [r0]
- psrlw m1, xm0
- packuswb m1, m1
- movq [r2], xm1
-
- add r0, 16
- add r2, 8
- sub r4d, 8
- jz .end
-
-.process4:
- cmp r4d, 4
- jl .process2
- movq xm1,[r0]
- psrlw m1, xm0
- packuswb m1, m1
- movd [r2], xm1
-
- add r0, 8
- add r2, 4
- sub r4d, 4
- jz .end
-
-.process2:
- cmp r4d, 2
- jl .process1
- movd xm1, [r0]
- psrlw m1, xm0
- packuswb m1, m1
- movd r6d, xm1
- mov [r2], r6w
-
- add r0, 4
- add r2, 2
- sub r4d, 2
- jz .end
-
-.process1:
- movd xm1, [r0]
- psrlw m1, xm0
- packuswb m1, m1
- movd r3d, xm1
- mov [r2], r3b
.end:
RET
@@ -8487,7 +8403,9 @@
;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
;------------------------------------------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal upShift_16, 6,7,4
+cglobal upShift_16, 4,7,4
+ mov r4d, r4m
+ mov r5d, r5m
movd m0, r6m ; m0 = shift
mova m3, [pw_pixel_max]
FIX_STRIDES r1d, r3d
@@ -8515,9 +8433,25 @@
dec r5d
jnz .loopH
-;processing last row of every frame [To handle width which not a multiple of 16]
-
+ ; processing the last row of every frame [to handle width that is not a multiple of 16]
+
+ ; WARNING: width (r4d) MUST be greater than or equal to 16 (mmsize) here
.loop16:
+ movu m1, [r0 + (r4 - mmsize) * 2]
+ movu m2, [r0 + (r4 - mmsize) * 2 + mmsize]
+ psllw m1, m0
+ psllw m2, m0
+ pand m1, m3
+ pand m2, m3
+ movu [r2 + (r4 - mmsize) * 2], m1
+ movu [r2 + (r4 - mmsize) * 2 + mmsize], m2
+
+ sub r4d, mmsize
+ jz .end
+ cmp r4d, mmsize
+ jge .loop16
+
+ ; process partial pixels
movu m1, [r0]
movu m2, [r0 + mmsize]
psllw m1, m0
@@ -8527,56 +8461,6 @@
movu [r2], m1
movu [r2 + mmsize], m2
- add r0, 2 * mmsize
- add r2, 2 * mmsize
- sub r4d, 16
- jz .end
- jg .loop16
-
- cmp r4d, 8
- jl .process4
- movu m1, [r0]
- psrlw m1, m0
- pand m1, m3
- movu [r2], m1
-
- add r0, mmsize
- add r2, mmsize
- sub r4d, 8
- jz .end
-
-.process4:
- cmp r4d, 4
- jl .process2
- movh m1,[r0]
- psllw m1, m0
- pand m1, m3
- movh [r2], m1
-
- add r0, 8
- add r2, 8
- sub r4d, 4
- jz .end
-
-.process2:
- cmp r4d, 2
- jl .process1
- movd m1, [r0]
- psllw m1, m0
- pand m1, m3
- movd [r2], m1
-
- add r0, 4
- add r2, 4
- sub r4d, 2
- jz .end
-
-.process1:
- movd m1, [r0]
- psllw m1, m0
- pand m1, m3
- movd r3, m1
- mov [r2], r3w
.end:
RET
@@ -8584,9 +8468,10 @@
;-------------------------------------------------------------------------------------------------------------------------------------
;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
;-------------------------------------------------------------------------------------------------------------------------------------
-; TODO: NO TEST CODE!
INIT_YMM avx2
-cglobal upShift_16, 6,7,4
+cglobal upShift_16, 4,7,4
+ mov r4d, r4m
+ mov r5d, r5m
movd xm0, r6m ; m0 = shift
vbroadcasti128 m3, [pw_pixel_max]
FIX_STRIDES r1d, r3d
@@ -8613,83 +8498,33 @@
dec r5d
jnz .loopH
-; processing last row of every frame [To handle width which not a multiple of 32]
- mov r6d, r4d
- and r4d, 31
- shr r6d, 5
+ ; processing the last row of every frame [to handle width that is not a multiple of 32]
.loop32:
+ movu m1, [r0 + (r4 - mmsize) * 2]
+ movu m2, [r0 + (r4 - mmsize) * 2 + mmsize]
+ psllw m1, xm0
+ psllw m2, xm0
+ pand m1, m3
+ pand m2, m3
+ movu [r2 + (r4 - mmsize) * 2], m1
+ movu [r2 + (r4 - mmsize) * 2 + mmsize], m2
+
+ sub r4d, mmsize
+ jz .end
+ cmp r4d, mmsize
+ jge .loop32
+
+ ; process partial pixels
movu m1, [r0]
movu m2, [r0 + mmsize]
psllw m1, xm0
psllw m2, xm0
pand m1, m3
pand m2, m3
movu [r2], m1
- movu [r2 + mmsize], m2
-
- add r0, 2*mmsize
- add r2, 2*mmsize
- dec r6d
- jnz .loop32
-
- cmp r4d, 16
- jl .process8
- movu m1, [r0]
- psllw m1, xm0
- pand m1, m3
- movu [r2], m1
-
- add r0, mmsize
- add r2, mmsize
- sub r4d, 16
- jz .end
-
-.process8:
- cmp r4d, 8
- jl .process4
- movu xm1, [r0]
- psllw xm1, xm0
- pand xm1, xm3
- movu [r2], xm1
-
- add r0, 16
- add r2, 16
- sub r4d, 8
- jz .end
-
-.process4:
- cmp r4d, 4
- jl .process2
- movq xm1,[r0]
- psllw xm1, xm0
- pand xm1, xm3
- movq [r2], xm1
-
- add r0, 8
- add r2, 8
- sub r4d, 4
- jz .end
-
-.process2:
- cmp r4d, 2
- jl .process1
- movd xm1, [r0]
- psllw xm1, xm0
- pand xm1, xm3
- movd [r2], xm1
-
- add r0, 4
- add r2, 4
- sub r4d, 2
- jz .end
-
-.process1:
- movd xm1, [r0]
- psllw xm1, xm0
- pand xm1, xm3
- movd r3d, xm1
- mov [r2], r3w
+ movu [r2 + mmsize], m2
+
.end:
RET
diff -r 75d1c62d8f0c -r 717cb31ed993 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Thu Dec 24 13:58:32 2015 +0530
+++ b/source/test/pixelharness.cpp Wed Dec 30 18:03:02 2015 -0600
@@ -1299,8 +1299,8 @@
memset(ref_dest, 0xCD, sizeof(ref_dest));
memset(opt_dest, 0xCD, sizeof(opt_dest));
- int width = 32 + rand() % 32;
- int height = 32 + rand() % 32;
+ int width = 32 + (rand() % 32);
+ int height = 32 + (rand() % 32);
intptr_t srcStride = 64;
intptr_t dstStride = width;
int j = 0;
@@ -1308,11 +1308,23 @@
for (int i = 0; i < ITERS; i++)
{
int index = i % TEST_CASES;
+
checked(opt, ushort_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
ref(ushort_test_buff[index] + j, srcStride, ref_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
- if (memcmp(ref_dest, opt_dest, width * height * sizeof(pixel)))
+ if (memcmp(ref_dest, opt_dest, dstStride * height * sizeof(pixel)))
+ {
+ memcpy(opt_dest, ref_dest, sizeof(ref_dest));
+ opt(ushort_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
return false;
+ }
+
+ // check tail memory area
+ for (int x = width; x < dstStride; x++)
+ {
+ if (opt_dest[(height - 1) * dstStride + x] != 0xCD)
+ return false;
+ }
reportfail();
j += INCR;
@@ -1344,6 +1356,13 @@
if (memcmp(ref_dest, opt_dest, sizeof(ref_dest)))
return false;
+ // check tail memory area
+ for (int x = width; x < dstStride; x++)
+ {
+ if (opt_dest[(height - 1) * dstStride + x] != 0xCD)
+ return false;
+ }
+
reportfail();
j += INCR;
}
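The pixelharness additions rely on a guard pattern: both destination buffers are pre-filled with 0xCD, so after the kernels run, every byte at column width or beyond must still hold 0xCD, and a stray store past the requested width (exactly the class of bug this rewrite addresses) trips the check. A self-contained C sketch of the idea (tail_intact is an illustrative name, not the pixelharness API; the harness scans only the last row, while the general form below scans every row):

#include <stdint.h>

/* guard-band check: returns 1 if no byte at or beyond column 'width'
   was disturbed, i.e. the kernel never wrote past the requested width */
static int tail_intact(const uint8_t *dst, intptr_t dstStride,
                       int width, int height)
{
    for (int y = 0; y < height; y++)
        for (intptr_t x = width; x < dstStride; x++)
            if (dst[y * dstStride + x] != 0xCD)
                return 0;
    return 1;
}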
--
Deepthi Nandakumar
Engineering Manager, x265
Multicoreware, Inc