[x265] [PATCH] asm: add pixel restoration part in saoCuOrgE2 primitive

Divya Manivannan divya at multicorewareinc.com
Fri Apr 24 10:49:32 CEST 2015


# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1429864855 -19800
#      Fri Apr 24 14:10:55 2015 +0530
# Node ID cfc321e81396f4ad93b1bb92f786d5d233acadf8
# Parent  deea3a0293187e142884b9aa2a719468f1ce5be6
asm: add pixel restoration part in saoCuOrgE2 primitive

diff -r deea3a029318 -r cfc321e81396 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Fri Apr 24 13:47:58 2015 +0530
+++ b/source/common/x86/loopfilter.asm	Fri Apr 24 14:10:55 2015 +0530
@@ -406,60 +406,66 @@
 ; void saoCuOrgE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int lcuWidth, intptr_t stride)
 ;======================================================================================================================================================
 INIT_XMM sse4
-cglobal saoCuOrgE2, 5, 7, 8, rec, bufft, buff1, offsetEo, lcuWidth
-
-    mov         r6,    16
+cglobal saoCuOrgE2, 5, 6, 8, rec, bufft, buff1, offsetEo, lcuWidth
+    mov         r4d,   r4m                     ; r4 = lcuWidth; the 32-bit write clears the upper half before r4 is used as an index
     mov         r5d,   r5m
     pxor        m0,    m0                      ; m0 = 0
     mova        m6,    [pb_2]                  ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
     mova        m7,    [pb_128]
-    shr         r4d,   4
-    inc         r1q
+    inc         r1                             ; r1 = bufft + 1
+    movh        m5,    [r0 + r4]               ; low half of m5  = 8 bytes at rec[lcuWidth], saved for the restore below
+    movhps      m5,    [r1 + r4]               ; high half of m5 = 8 bytes at bufft[lcuWidth + 1], saved likewise
 
-    .loop
-         movu        m1,    [r0]                    ; m1 = rec[x]
-         movu        m2,    [r0 + r5 + 1]           ; m2 = rec[x + stride + 1]
-         pxor        m3,    m1,    m7
-         pxor        m4,    m2,    m7
-         pcmpgtb     m2,    m3,    m4
-         pcmpgtb     m4,    m3
-         pand        m2,    [pb_1]
-         por         m2,    m4
-         movu        m3,    [r2]                    ; m3 = buff1
+.loop:
+    movu        m1,    [r0]                    ; m1 = rec[x]
+    movu        m2,    [r0 + r5 + 1]           ; m2 = rec[x + stride + 1]
+    pxor        m3,    m1,    m7               ; flip the top bit so the signed pcmpgtb orders unsigned bytes
+    pxor        m4,    m2,    m7
+    pcmpgtb     m2,    m3,    m4               ; m2 = 0xFF where rec[x] > rec[x + stride + 1]
+    pcmpgtb     m4,    m3                      ; m4 = 0xFF where rec[x] < rec[x + stride + 1]
+    pand        m2,    [pb_1]                  ; 0xFF -> 1 (presumably pb_1 is sixteen bytes of 1)
+    por         m2,    m4                      ; m2 = sign of (rec[x] - rec[x + stride + 1]): 1, 0 or -1
+    movu        m3,    [r2]                    ; m3 = buff1
 
-         paddb       m3,    m2
-         paddb       m3,    m6                      ; m3 = edgeType
+    paddb       m3,    m2
+    paddb       m3,    m6                      ; m3 = edgeType
 
-         movu        m4,    [r3]                    ; m4 = offsetEo
-         pshufb      m4,    m3
+    movu        m4,    [r3]                    ; m4 = offsetEo
+    pshufb      m4,    m3                      ; m4 = offsetEo[edgeType], one byte per pixel
 
-         psubb       m3,    m0,    m2
-         movu        [r1],  m3
+    psubb       m3,    m0,    m2               ; m3 = 0 - sign
+    movu        [r1],  m3                      ; bufft[x + 1] = negated sign
 
-         pmovzxbw    m2,    m1
-         punpckhbw   m1,    m0
-         pmovsxbw    m3,    m4
-         punpckhbw   m4,    m4
-         psraw       m4,    8
+    pmovzxbw    m2,    m1                      ; low 8 pixels zero-extended to words
+    punpckhbw   m1,    m0                      ; high 8 pixels zero-extended to words
+    pmovsxbw    m3,    m4                      ; low 8 offsets sign-extended to words
+    punpckhbw   m4,    m4
+    psraw       m4,    8                       ; high 8 offsets sign-extended to words
 
-         paddw       m2,    m3
-         paddw       m1,    m4
-         packuswb    m2,    m1
-         movu        [r0],  m2
+    paddw       m2,    m3
+    paddw       m1,    m4
+    packuswb    m2,    m1                      ; back to bytes with unsigned saturation
+    movu        [r0],  m2
 
-         add         r0,    r6
-         add         r1,    r6
-         add         r2,    r6
-         dec         r4d
-         jnz         .loop
+    add         r0,    16
+    add         r1,    16
+    add         r2,    16
+    sub         r4,    16                      ; r4 = columns still to process
+    jg          .loop                          ; signed test: a partial last vector leaves r4 <= 0
+
+    movh        [r0 + r4], m5                  ; r0 + r4 == original rec + lcuWidth; put the saved 8 bytes back
+    movhps      [r1 + r4], m5                  ; same for bufft; NOTE(review): only 8 bytes are restored, confirm lcuWidth % 16 is 0 or >= 8
     RET
 
 INIT_YMM avx2
-cglobal saoCuOrgE2, 5, 6, 6, rec, bufft, buff1, offsetEo, lcuWidth
+cglobal saoCuOrgE2, 5, 6, 7, rec, bufft, buff1, offsetEo, lcuWidth
+    mov            r4d,   r4m
     mov            r5d,   r5m
     pxor           xm0,   xm0                     ; xm0 = 0
     mova           xm5,   [pb_128]
     inc            r1
+    movq           xm6,   [r0 + r4]
+    movhps         xm6,   [r1 + r4]
 
     movu           xm1,   [r0]                    ; xm1 = rec[x]
     movu           xm2,   [r0 + r5 + 1]           ; xm2 = rec[x + stride + 1]
@@ -487,17 +493,21 @@
     vextracti128   xm3,   m2,    1
     packuswb       xm2,   xm3
     movu           [r0],  xm2
+
+    movq           [r0 + r4], xm6
+    movhps         [r1 + r4], xm6
     RET
 
 INIT_YMM avx2
 cglobal saoCuOrgE2_32, 5, 6, 8, rec, bufft, buff1, offsetEo, lcuWidth
+    mov             r4d,   r4m
     mov             r5d,   r5m
     pxor            m0,    m0                      ; m0 = 0
-    mova            m6,    [pb_2]                  ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
     vbroadcasti128  m7,    [pb_128]
     vbroadcasti128  m5,    [r3]                    ; m5 = offsetEo
-    shr             r4d,   5
     inc             r1
+    movq            xm6,   [r0 + r4]
+    movhps          xm6,   [r1 + r4]
 
 .loop:
     movu            m1,    [r0]                    ; m1 = rec[x]
@@ -511,7 +521,7 @@
     movu            m3,    [r2]                    ; m3 = buff1
 
     paddb           m3,    m2
-    paddb           m3,    m6                      ; m3 = edgeType
+    paddb           m3,    [pb_2]                  ; m3 = edgeType
 
     pshufb          m4,    m5,    m3
 
@@ -534,8 +544,11 @@
     add             r0,    32
     add             r1,    32
     add             r2,    32
-    dec             r4d
-    jnz             .loop
+    sub             r4,    32
+    jg              .loop
+
+    movq            [r0 + r4], xm6
+    movhps          [r1 + r4], xm6
     RET
 
 ;=======================================================================================================
diff -r deea3a029318 -r cfc321e81396 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp	Fri Apr 24 13:47:58 2015 +0530
+++ b/source/encoder/sao.cpp	Fri Apr 24 14:10:55 2015 +0530
@@ -437,23 +437,8 @@
             for (y = startY; y < endY; y++)
             {
                 int8_t iSignDown2 = signOf(rec[stride + startX] - tmpL[y]);
-                pixel firstPxl = rec[0];  // copy first Pxl
-                pixel lastPxl = rec[ctuWidth - 1];
-                int8_t one = upBufft[1];
-                int8_t two = upBufft[endX + 1];
 
-                primitives.saoCuOrgE2[ctuWidth > 16](rec, upBufft, upBuff1, m_offsetEo, ctuWidth, stride);
-                if (startX)
-                {
-                    rec[0] = firstPxl;
-                    upBufft[1] = one;
-                }
-
-                if (rpelx == picWidth)
-                {
-                    rec[ctuWidth - 1] = lastPxl;
-                    upBufft[endX + 1] = two;
-                }
+                primitives.saoCuOrgE2[endX > 16](rec + startX, upBufft + startX, upBuff1 + startX, m_offsetEo, endX - startX, stride);
 
                 upBufft[startX] = iSignDown2;
 
diff -r deea3a029318 -r cfc321e81396 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Fri Apr 24 13:47:58 2015 +0530
+++ b/source/test/pixelharness.cpp	Fri Apr 24 14:10:55 2015 +0530
@@ -966,7 +966,7 @@
         {
             for (int i = 0; i < ITERS; i++)
             {
-                int width = 16 * (1 << (id * (rand() % 2 + 1)));
+                int width = 16 * (1 << (id * (rand() % 2 + 1))) - (rand() % 2);
                 int stride = width + 1;
 
                 ref[width > 16](ref_dest, psbuf1 + j, psbuf2 + j, psbuf3 + j, width, stride);


More information about the x265-devel mailing list