[x265] [PATCH 2 of 2] asm: rewrite and fix bug in weight_sp_sse4 on HIGH_BIT_DEPTH mode
Min Chen
chenm003 at 163.com
Mon Jan 19 11:22:01 CET 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1421662910 -28800
# Node ID b2f64dbe26392dd6bea2badaccf2869bec883392
# Parent a0bb3bb1b076d2ef559ab94bfe81052142d302c3
asm: rewrite and fix bug in weight_sp_sse4 on HIGH_BIT_DEPTH mode
---
source/common/pixel.cpp | 7 ++
source/common/x86/asm-primitives.cpp | 2 +-
source/common/x86/const-a.asm | 1 +
source/common/x86/pixel-util8.asm | 134 +++++++++++++++++-----------------
source/test/pixelharness.cpp | 10 ++-
5 files changed, 81 insertions(+), 73 deletions(-)
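
For reviewers, a minimal C++ sketch of the scalar computation this primitive performs, inferred from the prototype in pixel-util8.asm and the constants the new asm uses (pw_2000 = IF_INTERNAL_OFFS, pw_1023/pw_3fff = pixel max). The real reference is the function touched in pixel.cpp below; this sketch may differ in clip-helper details:

#include <algorithm>
#include <cstdint>

typedef uint16_t pixel;   // HIGH_BIT_DEPTH build assumption

static void weight_sp_ref(const int16_t* src, pixel* dst,
                          intptr_t srcStride, intptr_t dstStride,
                          int width, int height,
                          int w0, int round, int shift, int offset, int bitDepth)
{
    const int internalOffs = 1 << 13;          // IF_INTERNAL_OFFS, pw_2000 in the asm
    const int maxVal = (1 << bitDepth) - 1;    // pw_1023 (10-bit) / pw_3fff (12-bit)

    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
        {
            int v = ((w0 * (src[x] + internalOffs) + round) >> shift) + offset;
            dst[x] = (pixel)std::min(std::max(v, 0), maxVal);
        }
        src += srcStride;
        dst += dstStride;
    }
}
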
diff -r a0bb3bb1b076 -r b2f64dbe2639 source/common/pixel.cpp
--- a/source/common/pixel.cpp Mon Jan 19 18:21:45 2015 +0800
+++ b/source/common/pixel.cpp Mon Jan 19 18:21:50 2015 +0800
@@ -520,6 +520,13 @@
{
int x, y;
+ const int correction = (IF_INTERNAL_PREC - X265_DEPTH);
+
+ X265_CHECK(!((w0 << 6) > 32767), "w0 using more than 16 bits, asm output will mismatch\n");
+ X265_CHECK(!(round > 32767), "round using more than 16 bits, asm output will mismatch\n");
+ X265_CHECK((shift >= correction), "shift must include the factor correction, please update ASM ABI\n");
+ X265_CHECK(!(round & ((1 << correction) - 1)), "round must include the factor correction, please update ASM ABI\n");
+
for (y = 0; y <= height - 1; y++)
{
for (x = 0; x <= width - 1; )
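
The new checks exist because the rewritten asm below packs w0 and round into adjacent 16-bit lanes of one PMADDWD multiplier ("shl r6d, 16 / or r6d, r6m") and expects the caller to have already folded the IF_INTERNAL_PREC correction into round and shift. A hedged sketch of that packing (illustrative helper, not x265 API):

#include <cassert>
#include <cstdint>

// Pack w0 (low word) and round (high word) into one dword, as the asm does.
// Both must fit a signed 16-bit PMADDWD lane, which is what the checks enforce.
static uint32_t pack_w0_round(int w0, int round)
{
    assert((w0 << 6) <= 32767 && round <= 32767);
    return ((uint32_t)(uint16_t)round << 16) | (uint16_t)w0;
}

// PMADDWD on the interleaved pair [v, 1] then yields v * w0 + 1 * round per dword.
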
diff -r a0bb3bb1b076 -r b2f64dbe2639 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jan 19 18:21:45 2015 +0800
+++ b/source/common/x86/asm-primitives.cpp Mon Jan 19 18:21:50 2015 +0800
@@ -925,7 +925,7 @@
p.planecopy_cp = x265_upShift_8_sse4;
// these fail unit tests
p.weight_pp = x265_weight_pp_sse4;
- // p.weight_sp = x265_weight_sp_sse4;
+ p.weight_sp = x265_weight_sp_sse4;
p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_sse4;
#if X86_64
diff -r a0bb3bb1b076 -r b2f64dbe2639 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Mon Jan 19 18:21:45 2015 +0800
+++ b/source/common/x86/const-a.asm Mon Jan 19 18:21:50 2015 +0800
@@ -63,6 +63,7 @@
const pb_128, times 16 db 128
const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
+const pw_0_15, times 2 dw 0, 1, 2, 3, 4, 5, 6, 7
const pw_2, times 8 dw 2
const pw_m2, times 8 dw -2
const pw_4, times 8 dw 4
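
pw_0_15 is the lane-index table the rewritten weight_sp uses to build its partial-width mask: the tail count is broadcast to every word lane and compared against 0..7, giving all-ones in exactly the lanes that still need to be written. A sketch with SSE4.1 intrinsics (illustrative names, not x265 API):

#include <smmintrin.h>   // SSE4.1: _mm_blendv_epi8

// Mask is 0xFFFF for word lanes whose index is below the remaining width.
static __m128i tail_mask(int remainder)                             // remainder = width & 7
{
    const __m128i idx = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);     // pw_0_15
    return _mm_cmpgt_epi16(_mm_set1_epi16((short)remainder), idx);
}

// The .widthLess8 path then merges new pixels over the old destination:
//   __m128i merged = _mm_blendv_epi8(oldDst, weighted, tail_mask(width & 7));
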
diff -r a0bb3bb1b076 -r b2f64dbe2639 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Mon Jan 19 18:21:45 2015 +0800
+++ b/source/common/x86/pixel-util8.asm Mon Jan 19 18:21:50 2015 +0800
@@ -53,6 +53,7 @@
SECTION .text
cextern pw_1
+cextern pw_0_15
cextern pb_1
cextern pw_00ff
cextern pw_1023
@@ -63,7 +64,6 @@
cextern pd_32767
cextern pd_n32768
-
;-----------------------------------------------------------------------------
; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
;-----------------------------------------------------------------------------
@@ -986,84 +986,82 @@
;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
;-------------------------------------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-%if ARCH_X86_64
-cglobal weight_sp, 6, 7+2, 7
- %define tmp_r0 r7
- %define tmp_r1 r8
-%else ; ARCH_X86_64 = 0
-cglobal weight_sp, 6, 7, 7, 0-(2*4)
- %define tmp_r0 [(rsp + 0 * 4)]
- %define tmp_r1 [(rsp + 1 * 4)]
-%endif ; ARCH_X86_64
-
- movd m0, r6m ; m0 = [w0]
-
- movd m1, r7m ; m1 = [round]
- punpcklwd m0, m1
- pshufd m0, m0, 0 ; m0 = [w0 round]
-
- movd m1, r8m ; m1 = [shift]
-
- movd m2, r9m
- pshufd m2, m2, 0 ; m2 =[offset]
-
- mova m3, [pw_1]
- mova m4, [pw_2000]
-
+cglobal weight_sp, 6,7,8
+%if BIT_DEPTH == 10
+ mova m1, [pw_1023]
+%elif BIT_DEPTH == 12
+ mova m1, [pw_3fff]
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+ mova m2, [pw_1]
+ mov r6d, r7m
+ shl r6d, 16
+ or r6d, r6m ; assumes both w0 and round use at most 16 bits each
+ movd m3, r6d
+ pshufd m3, m3, 0 ; m3 = [round w0]
+
+ movd m4, r8m ; m4 = [shift]
+ movd m5, r9m
+ pshufd m5, m5, 0 ; m5 = [offset]
+
+ ; correct row stride
+ add r3d, r3d
add r2d, r2d
+ mov r6d, r4d
+ and r6d, ~(mmsize / SIZEOF_PIXEL - 1)
+ sub r3d, r6d
+ sub r3d, r6d
+ sub r2d, r6d
+ sub r2d, r6d
+
+ ; generate partial width mask (MUST BE IN XMM0)
+ mov r6d, r4d
+ and r6d, (mmsize / SIZEOF_PIXEL - 1)
+ movd m0, r6d
+ pshuflw m0, m0, 0
+ punpcklqdq m0, m0
+ pcmpgtw m0, [pw_0_15]
.loopH:
mov r6d, r4d
- ; save old src and dst
- mov tmp_r0, r0
- mov tmp_r1, r1
.loopW:
- movu m5, [r0]
- paddw m5, m4
-
- punpcklwd m6,m5, m3
- pmaddwd m6, m0
- psrad m6, m1
- paddd m6, m2
-
- punpckhwd m5, m3
- pmaddwd m5, m0
- psrad m5, m1
- paddd m5, m2
-
- packssdw m6, m5
- packuswb m6, m6
-
- sub r6d, 8
- jl .width4
- movh [r1], m6
- je .nextH
- add r0, 16
- add r1, 8
-
- jmp .loopW
-
-.width4:
- cmp r6d, -4
- jl .width2
- movd [r1], m6
- je .nextH
- add r1, 4
- pshufd m6, m6, 1
-
-.width2:
- pextrw [r1], m6, 0
+ movu m6, [r0]
+ paddw m6, [pw_2000]
+
+ punpcklwd m7, m6, m2
+ pmaddwd m7, m3
+ psrad m7, m4
+ paddd m7, m5
+
+ punpckhwd m6, m2
+ pmaddwd m6, m3
+ psrad m6, m4
+ paddd m6, m5
+
+ packusdw m7, m6
+ pminuw m7, m1
+
+ sub r6d, (mmsize / SIZEOF_PIXEL)
+ jl .widthLess8
+ movu [r1], m7
+ lea r0, [r0 + mmsize]
+ lea r1, [r1 + mmsize]
+ je .nextH
+ jmp .loopW
+
+.widthLess8:
+ movu m6, [r1]
+ pblendvb m6, m6, m7, m0
+ movu [r1], m6
.nextH:
- mov r0, tmp_r0
- mov r1, tmp_r1
- lea r0, [r0 + r2]
- lea r1, [r1 + r3]
+ add r0, r2
+ add r1, r3
dec r5d
jnz .loopH
-
RET
;-----------------------------------------------------------------
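
For reference, one 8-pixel iteration of the new inner loop expressed with SSE4.1 intrinsics; this is only a sketch mirroring the paddw/pmaddwd/psrad/packusdw/pminuw sequence above, with illustrative names:

#include <smmintrin.h>   // SSE4.1: _mm_packus_epi32, _mm_min_epu16
#include <cstdint>

static __m128i weight_sp_8(const int16_t* src, int w0, int round,
                           int shift, int offset, int maxVal)
{
    const __m128i ones   = _mm_set1_epi16(1);                       // pw_1
    const __m128i offs   = _mm_set1_epi16(0x2000);                  // pw_2000
    const __m128i wr     = _mm_set1_epi32((int)(((uint32_t)(uint16_t)round << 16) | (uint16_t)w0));
    const __m128i off32  = _mm_set1_epi32(offset);
    const __m128i maxPix = _mm_set1_epi16((short)maxVal);           // pw_1023 / pw_3fff
    const __m128i cnt    = _mm_cvtsi32_si128(shift);                 // psrad by register

    __m128i v  = _mm_add_epi16(_mm_loadu_si128((const __m128i*)src), offs);
    __m128i lo = _mm_madd_epi16(_mm_unpacklo_epi16(v, ones), wr);   // v*w0 + round
    __m128i hi = _mm_madd_epi16(_mm_unpackhi_epi16(v, ones), wr);
    lo = _mm_add_epi32(_mm_sra_epi32(lo, cnt), off32);
    hi = _mm_add_epi32(_mm_sra_epi32(hi, cnt), off32);
    return _mm_min_epu16(_mm_packus_epi32(lo, hi), maxPix);         // clamp to [0, maxVal]
}
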
diff -r a0bb3bb1b076 -r b2f64dbe2639 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Jan 19 18:21:45 2015 +0800
+++ b/source/test/pixelharness.cpp Mon Jan 19 18:21:50 2015 +0800
@@ -222,8 +222,8 @@
bool PixelHarness::check_weightp(weightp_sp_t ref, weightp_sp_t opt)
{
- ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
- ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
+ ALIGN_VAR_16(pixel, ref_dest[64 * (64 + 1)]);
+ ALIGN_VAR_16(pixel, opt_dest[64 * (64 + 1)]);
memset(ref_dest, 0, 64 * 64 * sizeof(pixel));
memset(opt_dest, 0, 64 * 64 * sizeof(pixel));
@@ -236,11 +236,12 @@
int offset = (rand() % 256) - 128;
intptr_t stride = 64;
const int correction = (IF_INTERNAL_PREC - X265_DEPTH);
+
for (int i = 0; i < ITERS; i++)
{
int index = i % TEST_CASES;
- checked(opt, short_test_buff[index] + j, opt_dest, stride, stride, width, height, w0, round << correction, shift + correction, offset);
- ref(short_test_buff[index] + j, ref_dest, stride, stride, width, height, w0, round << correction, shift + correction, offset);
+ checked(opt, short_test_buff[index] + j, opt_dest, stride, stride + 1, width, height, w0, round << correction, shift + correction, offset);
+ ref(short_test_buff[index] + j, ref_dest, stride, stride + 1, width, height, w0, round << correction, shift + correction, offset);
if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
{
@@ -264,6 +265,7 @@
printf("\n");
}
printf("\n");
+ opt(short_test_buff[index] + j, opt_dest, stride, stride + 1, width, height, w0, round << correction, shift + correction, offset);
return false;
}