[x265] [PATCH 2 of 2] asm: rewrite 16bpp partial-pixel processing code in upShift and downShift (Issue #223)

Min Chen chenm003 at 163.com
Thu Dec 31 18:07:40 CET 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1451581652 21600
# Node ID 59863bb9bd70f81af21c8cf53e2703adc2723a87
# Parent  9e0fe9704998425e8d014fdfdb3c12f24e6c3cd9
asm: rewrite 16bpp partial-pixel processing code in upShift and downShift (Issue #223)
---
 source/common/x86/pixel-a.asm |  327 ++++++++++-------------------------------
 source/test/pixelharness.cpp  |   25 +++-
 2 files changed, 103 insertions(+), 249 deletions(-)
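
The rewritten last-row handling replaces the old 8/4/2/1-pixel scalar tails with a backward full-vector walk: the row is processed from the end in mmsize-pixel steps and, once fewer than a full vector of pixels remains, one more full vector anchored at the start of the row is reprocessed, so the final store overlaps pixels that were already written instead of running past the row. A rough C model of that control flow (illustrative only, not part of the patch; VEC, process_block and last_row are made-up names, and width must be at least VEC, as the asm comments require):

    #include <stdint.h>

    enum { VEC = 16 };   /* pixels per SSE2 iteration (one mmsize store of output) */

    /* stands in for the movu / psrlw / packuswb / movu sequence of downShift_16 */
    static void process_block(const uint16_t *src, uint8_t *dst, int shift)
    {
        for (int i = 0; i < VEC; i++) {
            unsigned v = src[i] >> shift;
            dst[i] = (uint8_t)(v > 255 ? 255 : v);   /* packuswb saturates */
        }
    }

    /* last row of the frame, width >= VEC */
    static void last_row(const uint16_t *src, uint8_t *dst, int width, int shift)
    {
        int n = width;                                /* r4d in the asm */

        do {                                          /* .loop16: walk backwards */
            process_block(src + n - VEC, dst + n - VEC, shift);
            n -= VEC;
            if (n == 0)
                return;                               /* jz .end */
        } while (n >= VEC);                           /* jge .loop16 */

        /* process partial pixels: fewer than VEC remain at the front of the
         * row, so one more full vector at offset 0 overlaps already-written
         * output and never stores beyond the row */
        process_block(src, dst, shift);
    }

The upShift paths follow the same structure, except that the destination stays 16-bit (psllw plus pand with pw_pixel_max), so both loads and stores are addressed by (r4 - mmsize) * 2.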

diff -r 9e0fe9704998 -r 59863bb9bd70 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Thu Dec 31 10:29:43 2015 -0600
+++ b/source/common/x86/pixel-a.asm	Thu Dec 31 11:07:32 2015 -0600
@@ -8154,92 +8154,57 @@
 ;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
 ;------------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal downShift_16, 7,7,3
-    movd        m0, r6d        ; m0 = shift
+cglobal downShift_16, 4,7,3
+    mov         r4d, r4m
+    mov         r5d, r5m
+    movd        m0, r6m        ; m0 = shift
     add         r1, r1
+
     dec         r5d
 .loopH:
     xor         r6, r6
+
 .loopW:
     movu        m1, [r0 + r6 * 2]
-    movu        m2, [r0 + r6 * 2 + 16]
+    movu        m2, [r0 + r6 * 2 + mmsize]
     psrlw       m1, m0
     psrlw       m2, m0
     packuswb    m1, m2
     movu        [r2 + r6], m1
 
-    add         r6, 16
+    add         r6, mmsize
     cmp         r6d, r4d
-    jl          .loopW
+    jl         .loopW
 
     ; move to next row
     add         r0, r1
     add         r2, r3
     dec         r5d
-    jnz         .loopH
-
-;processing last row of every frame [To handle width which not a multiple of 16]
-
+    jnz        .loopH
+
+    ; processing last row of every frame [To handle width which is not a multiple of 16]
+    ; r4d must be greater than or equal to 16 (mmsize)
 .loop16:
+    movu        m1, [r0 + (r4 - mmsize) * 2]
+    movu        m2, [r0 + (r4 - mmsize) * 2 + mmsize]
+    psrlw       m1, m0
+    psrlw       m2, m0
+    packuswb    m1, m2
+    movu        [r2 + r4 - mmsize], m1
+
+    sub         r4d, mmsize
+    jz         .end
+    cmp         r4d, mmsize
+    jge        .loop16
+
+    ; process partial pixels
     movu        m1, [r0]
-    movu        m2, [r0 + 16]
+    movu        m2, [r0 + mmsize]
     psrlw       m1, m0
     psrlw       m2, m0
     packuswb    m1, m2
     movu        [r2], m1
 
-    add         r0, 2 * mmsize
-    add         r2, mmsize
-    sub         r4d, 16
-    jz          .end
-    cmp         r4d, 15
-    jg          .loop16
-
-    cmp         r4d, 8
-    jl          .process4
-    movu        m1, [r0]
-    psrlw       m1, m0
-    packuswb    m1, m1
-    movh        [r2], m1
-
-    add         r0, mmsize
-    add         r2, 8
-    sub         r4d, 8
-    jz          .end
-
-.process4:
-    cmp         r4d, 4
-    jl          .process2
-    movh        m1,[r0]
-    psrlw       m1, m0
-    packuswb    m1, m1
-    movd        [r2], m1
-
-    add         r0, 8
-    add         r2, 4
-    sub         r4d, 4
-    jz          .end
-
-.process2:
-    cmp         r4d, 2
-    jl          .process1
-    movd        m1, [r0]
-    psrlw       m1, m0
-    packuswb    m1, m1
-    movd        r6, m1
-    mov         [r2], r6w
-
-    add         r0, 4
-    add         r2, 2
-    sub         r4d, 2
-    jz          .end
-
-.process1:
-    movd        m1, [r0]
-    psrlw       m1, m0
-    packuswb    m1, m1
-    movd        r3, m1
-    mov         [r2], r3b
 .end:
     RET
 
@@ -8248,12 +8213,16 @@
 ;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
 ;-------------------------------------------------------------------------------------------------------------------------------------
 INIT_YMM avx2
-cglobal downShift_16, 6,7,3
+cglobal downShift_16, 4,7,3
+    mov         r4d, r4m
+    mov         r5d, r5m
     movd        xm0, r6m        ; m0 = shift
     add         r1d, r1d
+
     dec         r5d
 .loopH:
     xor         r6, r6
+
 .loopW:
     movu        m1, [r0 + r6 * 2 +  0]
     movu        m2, [r0 + r6 * 2 + 32]
@@ -8265,92 +8234,39 @@
 
     add         r6d, mmsize
     cmp         r6d, r4d
-    jl          .loopW
+    jl         .loopW
 
     ; move to next row
     add         r0, r1
     add         r2, r3
     dec         r5d
-    jnz         .loopH
-
-; processing last row of every frame [To handle width which not a multiple of 32]
-    mov         r6d, r4d
-    and         r4d, 31
-    shr         r6d, 5
+    jnz        .loopH
+
+    ; processing last row of every frame [To handle width which is not a multiple of 32]
 
 .loop32:
-    movu        m1, [r0]
-    movu        m2, [r0 + 32]
+    movu        m1, [r0 + (r4 - mmsize) * 2]
+    movu        m2, [r0 + (r4 - mmsize) * 2 + mmsize]
     psrlw       m1, xm0
     psrlw       m2, xm0
     packuswb    m1, m2
-    vpermq      m1, m1, 11011000b
+    vpermq      m1, m1, q3120
+    movu        [r2 + r4 - mmsize], m1
+
+    sub         r4d, mmsize
+    jz         .end
+    cmp         r4d, mmsize
+    jge        .loop32
+
+    ; process partial pixels
+    movu        m1, [r0]
+    movu        m2, [r0 + mmsize]
+    psrlw       m1, xm0
+    psrlw       m2, xm0
+    packuswb    m1, m2
+    vpermq      m1, m1, q3120
     movu        [r2], m1
 
-    add         r0, 2*mmsize
-    add         r2, mmsize
-    dec         r6d
-    jnz         .loop32
-
-    cmp         r4d, 16
-    jl          .process8
-    movu        m1, [r0]
-    psrlw       m1, xm0
-    packuswb    m1, m1
-    vpermq      m1, m1, 10001000b
-    movu        [r2], xm1
-
-    add         r0, mmsize
-    add         r2, 16
-    sub         r4d, 16
-    jz          .end
-
-.process8:
-    cmp         r4d, 8
-    jl          .process4
-    movu        m1, [r0]
-    psrlw       m1, xm0
-    packuswb    m1, m1
-    movq        [r2], xm1
-
-    add         r0, 16
-    add         r2, 8
-    sub         r4d, 8
-    jz          .end
-
-.process4:
-    cmp         r4d, 4
-    jl          .process2
-    movq        xm1,[r0]
-    psrlw       m1, xm0
-    packuswb    m1, m1
-    movd        [r2], xm1
-
-    add         r0, 8
-    add         r2, 4
-    sub         r4d, 4
-    jz          .end
-
-.process2:
-    cmp         r4d, 2
-    jl          .process1
-    movd        xm1, [r0]
-    psrlw       m1, xm0
-    packuswb    m1, m1
-    movd        r6d, xm1
-    mov         [r2], r6w
-
-    add         r0, 4
-    add         r2, 2
-    sub         r4d, 2
-    jz          .end
-
-.process1:
-    movd        xm1, [r0]
-    psrlw       m1, xm0
-    packuswb    m1, m1
-    movd        r3d, xm1
-    mov         [r2], r3b
 .end:
     RET
 
@@ -8487,7 +8403,9 @@
 ;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
 ;------------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal upShift_16, 6,7,4
+cglobal upShift_16, 4,7,4
+    mov         r4d, r4m
+    mov         r5d, r5m
     movd        m0, r6m        ; m0 = shift
     mova        m3, [pw_pixel_max]
     FIX_STRIDES r1d, r3d
@@ -8515,9 +8433,25 @@
     dec         r5d
     jnz        .loopH
 
-;processing last row of every frame [To handle width which not a multiple of 16]
-
+    ; processing last row of every frame [To handle width which is not a multiple of 16]
+
+    ; WARNING: width (r4d) MUST be greater than or equal to 16 (mmsize) here
 .loop16:
+    movu        m1, [r0 + (r4 - mmsize) * 2]
+    movu        m2, [r0 + (r4 - mmsize) * 2 + mmsize]
+    psllw       m1, m0
+    psllw       m2, m0
+    pand        m1, m3
+    pand        m2, m3
+    movu        [r2 + (r4 - mmsize) * 2], m1
+    movu        [r2 + (r4 - mmsize) * 2 + mmsize], m2
+
+    sub         r4d, mmsize
+    jz         .end
+    cmp         r4d, mmsize
+    jge        .loop16
+
+    ; process partial pixels
     movu        m1, [r0]
     movu        m2, [r0 + mmsize]
     psllw       m1, m0
@@ -8527,56 +8461,6 @@
     movu        [r2], m1
     movu        [r2 + mmsize], m2
 
-    add         r0, 2 * mmsize
-    add         r2, 2 * mmsize
-    sub         r4d, 16
-    jz         .end
-    jg         .loop16
-
-    cmp         r4d, 8
-    jl         .process4
-    movu        m1, [r0]
-    psrlw       m1, m0
-    pand        m1, m3
-    movu        [r2], m1
-
-    add         r0, mmsize
-    add         r2, mmsize
-    sub         r4d, 8
-    jz          .end
-
-.process4:
-    cmp         r4d, 4
-    jl         .process2
-    movh        m1,[r0]
-    psllw       m1, m0
-    pand        m1, m3
-    movh        [r2], m1
-
-    add         r0, 8
-    add         r2, 8
-    sub         r4d, 4
-    jz         .end
-
-.process2:
-    cmp         r4d, 2
-    jl         .process1
-    movd        m1, [r0]
-    psllw       m1, m0
-    pand        m1, m3
-    movd        [r2], m1
-
-    add         r0, 4
-    add         r2, 4
-    sub         r4d, 2
-    jz         .end
-
-.process1:
-    movd        m1, [r0]
-    psllw       m1, m0
-    pand        m1, m3
-    movd        r3, m1
-    mov         [r2], r3w
 .end:
     RET
 
@@ -8584,9 +8468,10 @@
 ;-------------------------------------------------------------------------------------------------------------------------------------
 ;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
 ;-------------------------------------------------------------------------------------------------------------------------------------
-; TODO: NO TEST CODE!
 INIT_YMM avx2
-cglobal upShift_16, 6,7,4
+cglobal upShift_16, 4,7,4
+    mov         r4d, r4m
+    mov         r5d, r5m
     movd        xm0, r6m        ; m0 = shift
     vbroadcasti128 m3, [pw_pixel_max]
     FIX_STRIDES r1d, r3d
@@ -8613,12 +8498,24 @@
     dec         r5d
     jnz        .loopH
 
-; processing last row of every frame [To handle width which not a multiple of 32]
-    mov         r6d, r4d
-    and         r4d, 31
-    shr         r6d, 5
+    ; processing last row of every frame [To handle width which is not a multiple of 32]
 
 .loop32:
+    movu        m1, [r0 + (r4 - mmsize) * 2]
+    movu        m2, [r0 + (r4 - mmsize) * 2 + mmsize]
+    psllw       m1, xm0
+    psllw       m2, xm0
+    pand        m1, m3
+    pand        m2, m3
+    movu        [r2 + (r4 - mmsize) * 2], m1
+    movu        [r2 + (r4 - mmsize) * 2 + mmsize], m2
+
+    sub         r4d, mmsize
+    jz         .end
+    cmp         r4d, mmsize
+    jge        .loop32
+
+    ; process partial pixels
     movu        m1, [r0]
     movu        m2, [r0 + mmsize]
     psllw       m1, xm0
@@ -8628,68 +8525,6 @@
     movu        [r2], m1
     movu        [r2 + mmsize], m2
 
-    add         r0, 2*mmsize
-    add         r2, 2*mmsize
-    dec         r6d
-    jnz        .loop32
-
-    cmp         r4d, 16
-    jl         .process8
-    movu        m1, [r0]
-    psllw       m1, xm0
-    pand        m1, m3
-    movu        [r2], m1
-
-    add         r0, mmsize
-    add         r2, mmsize
-    sub         r4d, 16
-    jz         .end
-
-.process8:
-    cmp         r4d, 8
-    jl         .process4
-    movu        xm1, [r0]
-    psllw       xm1, xm0
-    pand        xm1, xm3
-    movu        [r2], xm1
-
-    add         r0, 16
-    add         r2, 16
-    sub         r4d, 8
-    jz         .end
-
-.process4:
-    cmp         r4d, 4
-    jl          .process2
-    movq        xm1,[r0]
-    psllw       xm1, xm0
-    pand        xm1, xm3
-    movq        [r2], xm1
-
-    add         r0, 8
-    add         r2, 8
-    sub         r4d, 4
-    jz         .end
-
-.process2:
-    cmp         r4d, 2
-    jl         .process1
-    movd        xm1, [r0]
-    psllw       xm1, xm0
-    pand        xm1, xm3
-    movd        [r2], xm1
-
-    add         r0, 4
-    add         r2, 4
-    sub         r4d, 2
-    jz         .end
-
-.process1:
-    movd        xm1, [r0]
-    psllw       xm1, xm0
-    pand        xm1, xm3
-    movd        r3d, xm1
-    mov         [r2], r3w
 .end:
     RET
 
diff -r 9e0fe9704998 -r 59863bb9bd70 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Thu Dec 31 10:29:43 2015 -0600
+++ b/source/test/pixelharness.cpp	Thu Dec 31 11:07:32 2015 -0600
@@ -1299,8 +1299,8 @@
 
     memset(ref_dest, 0xCD, sizeof(ref_dest));
     memset(opt_dest, 0xCD, sizeof(opt_dest));
-    int width = 32 + rand() % 32;
-    int height = 32 + rand() % 32;
+    int width = 32 + (rand() % 32);
+    int height = 32 + (rand() % 32);
     intptr_t srcStride = 64;
     intptr_t dstStride = width;
     int j = 0;
@@ -1308,11 +1308,23 @@
     for (int i = 0; i < ITERS; i++)
     {
         int index = i % TEST_CASES;
+
         checked(opt, ushort_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
         ref(ushort_test_buff[index] + j, srcStride, ref_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
 
-        if (memcmp(ref_dest, opt_dest, width * height * sizeof(pixel)))
+        if (memcmp(ref_dest, opt_dest, dstStride * height * sizeof(pixel)))
+        {
+            memcpy(opt_dest, ref_dest, sizeof(ref_dest));
+            opt(ushort_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
             return false;
+        }
+
+        // check tail memory area
+        for (int x = width; x < dstStride; x++)
+        {
+            if (opt_dest[(height - 1) * dstStride + x] != 0xCD)
+                return false;
+        }
 
         reportfail();
         j += INCR;
@@ -1344,6 +1356,13 @@
         if (memcmp(ref_dest, opt_dest, sizeof(ref_dest)))
             return false;
 
+        // check tail memory area
+        for (int x = width; x < dstStride; x++)
+        {
+            if (opt_dest[(height - 1) * dstStride + x] != 0xCD)
+                return false;
+        }
+
         reportfail();
         j += INCR;
     }

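The pixelharness changes are meant to catch exactly the overrun the asm rewrite fixes: the destination buffer is pre-filled with 0xCD, and after the kernel runs, every byte between width and dstStride in the last row must still hold the fill pattern. A minimal sketch of that guard check (tail_is_clean is an illustrative helper, not part of the patch, and pixel is treated as 8-bit here):

    #include <stdint.h>

    /* dst was memset to 0xCD before the kernel ran; any changed byte in the
     * stride padding of the last row means the kernel stored past the row width */
    static int tail_is_clean(const uint8_t *dst, intptr_t dstStride,
                             int width, int height)
    {
        for (intptr_t x = width; x < dstStride; x++)
            if (dst[(height - 1) * dstStride + x] != 0xCD)
                return 0;
        return 1;
    }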