[x265] [PATCH] asm: saoCuOrgE1_2Rows avx2 code: 657c->525c

Wed Apr 8 11:45:51 CEST 2015

# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1428485849 -19800
#      Wed Apr 08 15:07:29 2015 +0530
# Node ID 5d215b2e0a64e748c2d4f2e1cb54ece6c0eb257d
# Parent  125a429989d35695971e1a1a8cff20c0b2649074
asm: saoCuOrgE1_2Rows avx2 code: 657c->525c

diff -r 125a429989d3 -r 5d215b2e0a64 source/common/x86/asm-primitives.cpp

--- a/source/common/x86/asm-primitives.cpp	Wed Apr 08 14:59:06 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Apr 08 15:07:29 2015 +0530
@@ -1558,6 +1558,7 @@
         p.scale2D_64to32 = x265_scale2D_64to32_avx2;
         p.saoCuOrgE0 = x265_saoCuOrgE0_avx2;
         p.saoCuOrgE1 = x265_saoCuOrgE1_avx2;
+        p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_avx2;
 
         p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
         p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2;
diff -r 125a429989d3 -r 5d215b2e0a64 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Wed Apr 08 14:59:06 2015 +0530
+++ b/source/common/x86/loopfilter.asm	Wed Apr 08 15:07:29 2015 +0530
@@ -351,6 +351,55 @@
     jnz         .loop
     RET
 
+INIT_YMM avx2                                               ; SAO edge offset against the row below: filters two 16-pixel-wide rows per loop pass
+cglobal saoCuOrgE1_2Rows, 4, 5, 7, pRec, m_iUpBuff1, m_iOffsetEo, iStride, iLcuWidth
+    ; r0 = pRec, r1 = m_iUpBuff1, r2 = m_iOffsetEo, r3 = iStride (loaded by cglobal at native width so the intptr_t stride is not truncated)
+    mov             r4d,        r4m                          ; r4d = iLcuWidth
+    pxor            m0,         m0                           ; m0 = 0
+    vbroadcasti128  m5,         [pb_128]                     ; m5 = bias for unsigned->signed compare (pb_128 presumably holds 0x80 in every byte)
+    vbroadcasti128  m6,         [r2]                         ; m6 = first 16 bytes of m_iOffsetEo in both lanes (pshufb lookup table)
+    shr             r4d,        4                            ; r4d = count of 16-pixel columns; must be >= 1 (0 would wrap at the dec below)
+.loop:
+    movu            xm1,        [r0]                         ; xm1 = row 0: pRec[x]
+    movu            xm2,        [r0 + r3]                    ; xm2 = row 1: pRec[x + iStride]
+    vinserti128     m1,         m1,       xm2,            1  ; m1 = { row 0 | row 1 }  (pixels being filtered)
+    vinserti128     m2,         m2,       [r0 + r3 * 2],  1  ; m2 = { row 1 | row 2 }  (row directly below each filtered row; row 2 is only read)
+
+    pxor            m3,         m1,       m5                 ; m3 = current rows, biased so pcmpgtb orders them as unsigned
+    pxor            m4,         m2,       m5                 ; m4 = rows below, biased the same way
+    pcmpgtb         m2,         m3,       m4                 ; m2 = 0xFF where current > below
+    pcmpgtb         m4,         m3                           ; m4 = 0xFF where below > current
+    pand            m2,         [pb_1]                       ; m2 = +1 where current > below
+    por             m2,         m4                           ; m2 = sign(current - below): +1, 0 or -1 (0xFF)
+
+    movu            xm3,        [r1]                         ; xm3 = m_iUpBuff1[x] (sign against the row above row 0)
+    psubb           m4,         m0,       m2                 ; m4 = -sign(current - below) = "up" sign for the following row
+    vinserti128     m3,         m3,       xm4,            1  ; m3 = { m_iUpBuff1 | up sign of row 1 derived from row 0 }
+    paddb           m3,         m2                           ; m3 = up sign + down sign
+    paddb           m3,         [pb_2]                       ; m3 = edge type index (+2 makes it non-negative)
+    pshufb          m2,         m6,       m3                 ; m2 = offset looked up in the m_iOffsetEo table per pixel
+    vextracti128    [r1],       m4,       1                  ; m_iUpBuff1[x] = up sign for the row after row 1 (input to the next call)
+
+    pmovzxbw        m4,         xm1                          ; m4 = row 0 pixels widened to 16-bit
+    vextracti128    xm3,        m1,       1
+    pmovzxbw        m3,         xm3                          ; m3 = row 1 pixels widened to 16-bit
+    pmovsxbw        m1,         xm2                          ; m1 = row 0 offsets sign-extended to 16-bit
+    vextracti128    xm2,        m2,       1
+    pmovsxbw        m2,         xm2                          ; m2 = row 1 offsets sign-extended to 16-bit
+
+    paddw           m4,         m1                           ; row 0 pixel + offset
+    paddw           m3,         m2                           ; row 1 pixel + offset
+    packuswb        m4,         m3                           ; saturate to 0..255; packs per 128-bit lane, so rows come out interleaved by qword
+    vpermq          m4,         m4,       11011000b          ; reorder qwords 0,2,1,3 -> low lane = row 0, high lane = row 1
+    movu            [r0],       xm4                          ; store filtered row 0
+    vextracti128    [r0 + r3],  m4,       1                  ; store filtered row 1
+
+    add             r0,         16                           ; advance to the next 16 pixels
+    add             r1,         16                           ; advance m_iUpBuff1 in step
+    dec             r4d
+    jnz             .loop
+    RET
+
 ;======================================================================================================================================================
 ; void saoCuOrgE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int lcuWidth, intptr_t stride)
 ;======================================================================================================================================================
diff -r 125a429989d3 -r 5d215b2e0a64 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Wed Apr 08 14:59:06 2015 +0530
+++ b/source/common/x86/loopfilter.h	Wed Apr 08 15:07:29 2015 +0530
@@ -30,6 +30,7 @@
 void x265_saoCuOrgE1_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
 void x265_saoCuOrgE1_avx2(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
 void x265_saoCuOrgE1_2Rows_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
+void x265_saoCuOrgE1_2Rows_avx2(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
 void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
 void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
 void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);