<div dir="ltr">Thanks, Min. Pushed.<br></div><div class="gmail_extra"><br><div class="gmail_quote">On Mon, Jan 19, 2015 at 3:52 PM, Min Chen <span dir="ltr">&lt;<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>

# User Min Chen &lt;<a href="mailto:chenm003@163.com">chenm003@163.com</a>&gt;<br>

# Date 1421662910 -28800<br>

# Node ID b2f64dbe26392dd6bea2badaccf2869bec883392<br>

# Parent  a0bb3bb1b076d2ef559ab94bfe81052142d302c3<br>

asm: rewrite and fix bug in weight_sp_sse4 on HIGH_BIT_DEPTH mode<br>

---<br>

 source/common/pixel.cpp              |    7 ++<br>

 source/common/x86/asm-primitives.cpp |    2 +-<br>

 source/common/x86/const-a.asm        |    1 +<br>

 source/common/x86/pixel-util8.asm    |  134 +++++++++++++++++-----------------<br>

 source/test/pixelharness.cpp         |   10 ++-<br>

 5 files changed, 81 insertions(+), 73 deletions(-)<br>

<br>

diff -r a0bb3bb1b076 -r b2f64dbe2639 source/common/pixel.cpp<br>

--- a/source/common/pixel.cpp   Mon Jan 19 18:21:45 2015 +0800<br>

+++ b/source/common/pixel.cpp   Mon Jan 19 18:21:50 2015 +0800<br>

@@ -520,6 +520,13 @@<br>

 {<br>

     int x, y;<br>

<br>

+    const int correction = (IF_INTERNAL_PREC - X265_DEPTH);<br>

+<br>

+    X265_CHECK(!((w0 << 6) > 32767), "w0 using more than 16 bits, asm output will mismatch\n");<br>

+    X265_CHECK(!(round > 32767), "round using more than 16 bits, asm output will mismatch\n");<br>

+    X265_CHECK((shift >= correction), "shift must be include factor correction, please update ASM ABI\n");<br>

+    X265_CHECK(!(round & ((1 << correction) - 1)), "round must be include factor correction, please update ASM ABI\n");<br>

+<br>

     for (y = 0; y <= height - 1; y++)<br>

     {<br>

         for (x = 0; x <= width - 1; )<br>

diff -r a0bb3bb1b076 -r b2f64dbe2639 source/common/x86/asm-primitives.cpp<br>

--- a/source/common/x86/asm-primitives.cpp      Mon Jan 19 18:21:45 2015 +0800<br>

+++ b/source/common/x86/asm-primitives.cpp      Mon Jan 19 18:21:50 2015 +0800<br>

@@ -925,7 +925,7 @@<br>

         p.planecopy_cp = x265_upShift_8_sse4;<br>

         // these fail unit tests<br>

         p.weight_pp = x265_weight_pp_sse4;<br>

-        // p.weight_sp = x265_weight_sp_sse4;<br>

+        p.weight_sp = x265_weight_sp_sse4;<br>

<br>

         p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_sse4;<br>

 #if X86_64<br>

diff -r a0bb3bb1b076 -r b2f64dbe2639 source/common/x86/const-a.asm<br>

--- a/source/common/x86/const-a.asm     Mon Jan 19 18:21:45 2015 +0800<br>

+++ b/source/common/x86/const-a.asm     Mon Jan 19 18:21:50 2015 +0800<br>

@@ -63,6 +63,7 @@<br>

 const pb_128,      times 16 db 128<br>

 const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6<br>

<br>

+const pw_0_15,     times 2 dw 0, 1, 2, 3, 4, 5, 6, 7<br>

 const pw_2,        times 8 dw 2<br>

 const pw_m2,       times 8 dw -2<br>

 const pw_4,        times 8 dw 4<br>

diff -r a0bb3bb1b076 -r b2f64dbe2639 source/common/x86/pixel-util8.asm<br>

--- a/source/common/x86/pixel-util8.asm Mon Jan 19 18:21:45 2015 +0800<br>

+++ b/source/common/x86/pixel-util8.asm Mon Jan 19 18:21:50 2015 +0800<br>

@@ -53,6 +53,7 @@<br>

 SECTION .text<br>

<br>

 cextern pw_1<br>

+cextern pw_0_15<br>

 cextern pb_1<br>

 cextern pw_00ff<br>

 cextern pw_1023<br>

@@ -63,7 +64,6 @@<br>

 cextern pd_32767<br>

 cextern pd_n32768<br>

<br>

-<br>

 ;-----------------------------------------------------------------------------<br>

 ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)<br>

 ;-----------------------------------------------------------------------------<br>

@@ -986,84 +986,82 @@<br>

 ;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)<br>

 ;-------------------------------------------------------------------------------------------------------------------------------------------------<br>

 INIT_XMM sse4<br>

-%if ARCH_X86_64<br>

-cglobal weight_sp, 6, 7+2, 7<br>

-    %define tmp_r0      r7<br>

-    %define tmp_r1      r8<br>

-%else ; ARCH_X86_64 = 0<br>

-cglobal weight_sp, 6, 7, 7, 0-(2*4)<br>

-    %define tmp_r0      [(rsp + 0 * 4)]<br>

-    %define tmp_r1      [(rsp + 1 * 4)]<br>

-%endif ; ARCH_X86_64<br>

-<br>

-    movd        m0, r6m         ; m0 = [w0]<br>

-<br>

-    movd        m1, r7m         ; m1 = [round]<br>

-    punpcklwd   m0, m1<br>

-    pshufd      m0, m0, 0       ; m0 = [w0 round]<br>

-<br>

-    movd        m1, r8m         ; m1 = [shift]<br>

-<br>

-    movd        m2, r9m<br>

-    pshufd      m2, m2, 0       ; m2 =[offset]<br>

-<br>

-    mova        m3, [pw_1]<br>

-    mova        m4, [pw_2000]<br>

-<br>

+cglobal weight_sp, 6,7,8<br>

+%if BIT_DEPTH == 10<br>

+    mova        m1, [pw_1023]<br>

+%elif BIT_DEPTH == 12<br>

+    mova        m1, [pw_3fff]<br>

+%else<br>

+  %error Unsupported BIT_DEPTH!<br>

+%endif<br>

+    mova        m2, [pw_1]<br>

+    mov         r6d, r7m<br>

+    shl         r6d, 16<br>

+    or          r6d, r6m    ; assuming both (w0) and round are using maximum of 16 bits each.<br>

+    movd        m3, r6d<br>

+    pshufd      m3, m3, 0   ; m3 = [round w0]<br>

+<br>

+    movd        m4, r8m     ; m4 = [shift]<br>

+    movd        m5, r9m<br>

+    pshufd      m5, m5, 0   ; m5 = [offset]<br>

+<br>

+    ; correct row stride<br>

+    add         r3d, r3d<br>

     add         r2d, r2d<br>

+    mov         r6d, r4d<br>

+    and         r6d, ~(mmsize / SIZEOF_PIXEL - 1)<br>

+    sub         r3d, r6d<br>

+    sub         r3d, r6d<br>

+    sub         r2d, r6d<br>

+    sub         r2d, r6d<br>

+<br>

+    ; generate partial width mask (MUST BE IN XMM0)<br>

+    mov         r6d, r4d<br>

+    and         r6d, (mmsize / SIZEOF_PIXEL - 1)<br>

+    movd        m0, r6d<br>

+    pshuflw     m0, m0, 0<br>

+    punpcklqdq  m0, m0<br>

+    pcmpgtw     m0, [pw_0_15]<br>

<br>

 .loopH:<br>

     mov         r6d, r4d<br>

<br>

-    ; save old src and dst<br>

-    mov         tmp_r0, r0<br>

-    mov         tmp_r1, r1<br>

 .loopW:<br>

-    movu        m5, [r0]<br>

-    paddw       m5, m4<br>

-<br>

-    punpcklwd   m6,m5, m3<br>

-    pmaddwd     m6, m0<br>

-    psrad       m6, m1<br>

-    paddd       m6, m2<br>

-<br>

-    punpckhwd   m5, m3<br>

-    pmaddwd     m5, m0<br>

-    psrad       m5, m1<br>

-    paddd       m5, m2<br>

-<br>

-    packssdw    m6, m5<br>

-    packuswb    m6, m6<br>

-<br>

-    sub         r6d, 8<br>

-    jl          .width4<br>

-    movh        [r1], m6<br>

-    je          .nextH<br>

-    add         r0, 16<br>

-    add         r1, 8<br>

-<br>

-    jmp         .loopW<br>

-<br>

-.width4:<br>

-    cmp         r6d, -4<br>

-    jl          .width2<br>

-    movd        [r1], m6<br>

-    je          .nextH<br>

-    add         r1, 4<br>

-    pshufd      m6, m6, 1<br>

-<br>

-.width2:<br>

-    pextrw      [r1], m6, 0<br>

+    movu        m6, [r0]<br>

+    paddw       m6, [pw_2000]<br>

+<br>

+    punpcklwd   m7, m6, m2<br>

+    pmaddwd     m7, m3<br>

+    psrad       m7, m4<br>

+    paddd       m7, m5<br>

+<br>

+    punpckhwd   m6, m2<br>

+    pmaddwd     m6, m3<br>

+    psrad       m6, m4<br>

+    paddd       m6, m5<br>

+<br>

+    packusdw    m7, m6<br>

+    pminuw      m7, m1<br>

+<br>

+    sub         r6d, (mmsize / SIZEOF_PIXEL)<br>

+    jl         .widthLess8<br>

+    movu        [r1], m7<br>

+    lea         r0, [r0 + mmsize]<br>

+    lea         r1, [r1 + mmsize]<br>

+    je         .nextH<br>

+    jmp        .loopW<br>

+<br>

+.widthLess8:<br>

+    movu        m6, [r1]<br>

+    pblendvb    m6, m6, m7, m0<br>

+    movu        [r1], m6<br>

<br>

 .nextH:<br>

-    mov         r0, tmp_r0<br>

-    mov         r1, tmp_r1<br>

-    lea         r0, [r0 + r2]<br>

-    lea         r1, [r1 + r3]<br>

+    add         r0, r2<br>

+    add         r1, r3<br>

<br>

     dec         r5d<br>

     jnz         .loopH<br>

-<br>

     RET<br>

<br>

 ;-----------------------------------------------------------------<br>

diff -r a0bb3bb1b076 -r b2f64dbe2639 source/test/pixelharness.cpp<br>

--- a/source/test/pixelharness.cpp      Mon Jan 19 18:21:45 2015 +0800<br>

+++ b/source/test/pixelharness.cpp      Mon Jan 19 18:21:50 2015 +0800<br>

@@ -222,8 +222,8 @@<br>

<br>

 bool PixelHarness::check_weightp(weightp_sp_t ref, weightp_sp_t opt)<br>

 {<br>

-    ALIGN_VAR_16(pixel, ref_dest[64 * 64]);<br>

-    ALIGN_VAR_16(pixel, opt_dest[64 * 64]);<br>

+    ALIGN_VAR_16(pixel, ref_dest[64 * (64 + 1)]);<br>

+    ALIGN_VAR_16(pixel, opt_dest[64 * (64 + 1)]);<br>

<br>

     memset(ref_dest, 0, 64 * 64 * sizeof(pixel));<br>

     memset(opt_dest, 0, 64 * 64 * sizeof(pixel));<br>

@@ -236,11 +236,12 @@<br>

     int offset = (rand() % 256) - 128;<br>

     intptr_t stride = 64;<br>

     const int correction = (IF_INTERNAL_PREC - X265_DEPTH);<br>

+<br>

     for (int i = 0; i < ITERS; i++)<br>

     {<br>

         int index = i % TEST_CASES;<br>

-        checked(opt, short_test_buff[index] + j, opt_dest, stride, stride, width, height, w0, round << correction, shift + correction, offset);<br>

-        ref(short_test_buff[index] + j, ref_dest, stride, stride, width, height, w0, round << correction, shift + correction, offset);<br>

+        checked(opt, short_test_buff[index] + j, opt_dest, stride, stride + 1, width, height, w0, round << correction, shift + correction, offset);<br>

+        ref(short_test_buff[index] + j, ref_dest, stride, stride + 1, width, height, w0, round << correction, shift + correction, offset);<br>

<br>

         if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))<br>

         {<br>

@@ -264,6 +265,7 @@<br>

                 printf("\n");<br>

             }<br>

             printf("\n");<br>

+            opt(short_test_buff[index] + j, opt_dest, stride, stride + 1, width, height, w0, round << correction, shift + correction, offset);<br>

             return false;<br>

         }<br>

<br>

<br>

_______________________________________________<br>

x265-devel mailing list<br>

<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>

<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>

</blockquote></div><br></div>