[x265] [PATCH] asm: saoCuOrgE0 avx2 code: 756c->629c

Thu Apr 2 12:51:40 CEST 2015

# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1427970584 -19800
#      Thu Apr 02 15:59:44 2015 +0530
# Node ID 8f37dd7ec27deebbd44308df49c1ff5d040a3af3
# Parent  6f19f7a1ed620bdaef9ad4b63114da50d1dd5a15
asm: saoCuOrgE0 avx2 code: 756c->629c

diff -r 6f19f7a1ed62 -r 8f37dd7ec27d source/common/x86/asm-primitives.cpp

--- a/source/common/x86/asm-primitives.cpp	Thu Apr 02 10:42:41 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Apr 02 15:59:44 2015 +0530
@@ -1448,6 +1448,7 @@
     if (cpuMask & X265_CPU_AVX2)
     {
         p.scale2D_64to32 = x265_scale2D_64to32_avx2;
+        p.saoCuOrgE0 = x265_saoCuOrgE0_avx2;
 
         p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
         p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2;
diff -r 6f19f7a1ed62 -r 8f37dd7ec27d source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Thu Apr 02 10:42:41 2015 +0530
+++ b/source/common/x86/loopfilter.asm	Thu Apr 02 15:59:44 2015 +0530
@@ -128,6 +128,82 @@
     jnz        .loopH
     RET
 
+;==================================================================================================
+; void saoCuOrgE0(pixel *rec, int8_t *offsetEo, int lcuWidth, int8_t *signLeft, intptr_t stride)
+;
+; AVX2 version. Filters two rows per call: row 0 at rec (low 128-bit lane) and row 1 at
+; rec + stride (high 128-bit lane), 16 pixels of each row per loop iteration. Per pixel:
+;     signRight = sign(rec[x] - rec[x + 1])
+;     edgeType  = signRight + signLeft + 2
+;     rec[x]    = saturate to [0, 255] of (rec[x] + offsetEo[edgeType])
+;     signLeft  = -signRight                       (carried over to pixel x + 1)
+; signLeft[0] and signLeft[1] supply the initial left-neighbour sign of row 0 and row 1.
+;
+; lcuWidth is consumed in steps of 16 and is expected to be a positive multiple of 16; any
+; other value is rounded up, i.e. the last iteration also rewrites pixels past lcuWidth.
+; NOTE(review): pb_1 and pb_2 are used as 32-byte memory operands below - confirm that both
+; constants are defined with at least 32 bytes.
+;==================================================================================================
+INIT_YMM avx2
+cglobal saoCuOrgE0, 5, 6, 7, rec, offsetEo, lcuWidth, signLeft, stride
+
+    ; r4 = stride is already loaded at full register width by the cglobal prologue (5 args).
+    ; It is deliberately not reloaded with "mov r4d, r4m": a dword move zero-extends and would
+    ; truncate the intptr_t stride on x86-64.
+
+    ; m0 = -signLeft[row] in byte 0 of each lane (lane 0 = row 0, lane 1 = row 1), rest zero
+    movzx               r5d,        byte [r3]
+    neg                 r5b
+    movd                xm0,        r5d
+    movzx               r5d,        byte [r3 + 1]
+    neg                 r5b
+    movd                xm1,        r5d
+    vinserti128         m0,         m0,        xm1,           1
+    vbroadcasti128      m4,         [pb_128]                   ; m4 = [80]
+    vbroadcasti128      m6,         [r1]                       ; m6 = offsetEo (same table in both lanes)
+
+.loop:
+    movu                xm5,        [r0]                       ; xm5 = rec[x], row 0
+    movu                xm2,        [r0 + 1]                   ; xm2 = rec[x + 1], row 0
+    vinserti128         m5,         m5,        [r0 + r4],     1 ; high lane = rec[x], row 1
+    vinserti128         m2,         m2,        [r0 + r4 + 1], 1 ; high lane = rec[x + 1], row 1
+
+    ; Flip the top bit of every pixel so the signed byte compare orders unsigned pixels correctly.
+    pxor                m1,         m5,        m4
+    pxor                m3,         m2,        m4
+    pcmpgtb             m2,         m1,        m3              ; FF where rec[x] > rec[x + 1]
+    pcmpgtb             m3,         m1                         ; FF (-1) where rec[x + 1] > rec[x]
+    pand                m2,         [pb_1]                     ; FF -> +1
+    por                 m2,         m3                         ; m2 = signRight (+1, 0 or -1)
+
+    ; Byte i of each lane = signRight[i - 1]; byte 0 takes the value carried in m0.
+    pslldq              m3,         m2,        1
+    por                 m3,         m0
+
+    psignb              m3,         m4                         ; m4 bytes are negative -> negate: m3 = signLeft
+    pxor                m0,         m0
+    palignr             m0,         m2,        15              ; m0 byte 0 = signRight[15], carry for next block
+    paddb               m2,         m3
+    paddb               m2,         [pb_2]                     ; m2 = uiEdgeType
+    pshufb              m3,         m6,        m2              ; m3 = offsetEo[uiEdgeType]
+    pmovzxbw            m2,         xm5                        ; rec row 0 as 16 words
+    vextracti128        xm5,        m5,        1
+    pmovzxbw            m5,         xm5                        ; rec row 1 as 16 words
+    pmovsxbw            m1,         xm3                        ; offsetEo row 0, sign-extended
+    vextracti128        xm3,        m3,        1
+    pmovsxbw            m3,         xm3                        ; offsetEo row 1, sign-extended
+    paddw               m2,         m1
+    paddw               m5,         m3
+    packuswb            m2,         m5                         ; saturate to bytes; rows interleaved per lane
+    vpermq              m2,         m2,        11011000b       ; qwords 0,2,1,3: low lane = row 0, high = row 1
+    movu                [r0],       xm2
+    vextracti128        [r0 + r4],  m2,        1
+
+    add                 r0q,        16
+    sub                 r2d,        16
+    jg                  .loop                                  ; signed test: also stops if width is not a multiple of 16
+    RET
+
 ;==================================================================================================
 ; void saoCuOrgE1(pixel *pRec, int8_t *m_iUpBuff1, int8_t *m_iOffsetEo, Int iStride, Int iLcuWidth)
 ;==================================================================================================
diff -r 6f19f7a1ed62 -r 8f37dd7ec27d source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Thu Apr 02 10:42:41 2015 +0530
+++ b/source/common/x86/loopfilter.h	Thu Apr 02 15:59:44 2015 +0530
@@ -26,6 +26,7 @@
 #define X265_LOOPFILTER_H
 
 void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride);
+void x265_saoCuOrgE0_avx2(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride);
 void x265_saoCuOrgE1_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
 void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
 void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);