[x265] [PATCH 3 of 6] asm: 10bpp AVX2 code for saoCuOrgE1_2Rows, improved 900c->614c over SSE

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Thu Jun 25 10:23:51 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1435213462 -19800
#      Thu Jun 25 11:54:22 2015 +0530
# Node ID f43aa44673dcd8e96581c938cf22ad4bbb7657e3
# Parent  31da07b7198ca730bae37577d5053a3337477f7b
asm: 10bpp AVX2 code for saoCuOrgE1_2Rows, improved 900c->614c over SSE

diff -r 31da07b7198c -r f43aa44673dc source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Jun 25 11:49:07 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Jun 25 11:54:22 2015 +0530
@@ -1286,6 +1286,7 @@
     {
         p.saoCuOrgE0 = PFX(saoCuOrgE0_avx2);
         p.saoCuOrgE1 = PFX(saoCuOrgE1_avx2);
+        p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_avx2);
 
         p.cu[BLOCK_16x16].intra_pred[2]     = PFX(intra_pred_ang16_2_avx2);
         p.cu[BLOCK_16x16].intra_pred[3]     = PFX(intra_pred_ang16_3_avx2);
diff -r 31da07b7198c -r f43aa44673dc source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Thu Jun 25 11:49:07 2015 +0530
+++ b/source/common/x86/loopfilter.asm	Thu Jun 25 11:54:22 2015 +0530
@@ -728,6 +728,62 @@
 %endif
 
 INIT_YMM avx2
+%if HIGH_BIT_DEPTH
+cglobal saoCuOrgE1_2Rows, 4,5,8             ; 16-bit-sample AVX2 path: vertical edge-offset on two rows; r0=pRec, r1=m_iUpBuff1, r2=m_iOffsetEo, r3=iStride, r4=iLcuWidth (names per the non-HIGH_BIT_DEPTH variant)
+    add             r3d, r3d                ; r3 = 2 * iStride (presumably sample stride -> byte stride for 2-byte samples)
+    mov             r4d, r4m                ; r4d = 5th argument (iLcuWidth), not auto-loaded by cglobal
+    mova            m4, [pw_1023]           ; m4 = upper clamp bound applied by pminsw below
+    vbroadcasti128  m6, [r2]                ; m6 = m_iOffsetEo (same 16 bytes in both 128-bit lanes, used as pshufb table)
+    shr             r4d, 4                  ; loop count = width / 16 (each iteration covers 16 words = 32 bytes per row)
+.loop
+    movu            m7, [r0]                ; m7 = 16 samples of row 0
+    movu            m5, [r0 + r3]           ; m5 = 16 samples of row 1
+    movu            m1, [r0 + r3 * 2]       ; m1 = 16 samples of row 2 (only compared against, never written)
+
+    pcmpgtw         m2, m7, m5              ; m2 = word mask: row0 > row1
+    pcmpgtw         m3, m5, m7              ; m3 = word mask: row1 > row0
+    pcmpgtw         m0, m5, m1              ; m0 = word mask: row1 > row2
+    pcmpgtw         m1, m5                  ; m1 = word mask: row2 > row1
+
+    packsswb        m2, m0                  ; narrow "greater" masks to bytes (per-lane interleave of row0/row1 results)
+    packsswb        m3, m1                  ; narrow "less" masks to bytes (same interleave)
+    vpermq          m2, m2, 11011000b       ; swap middle qwords: low lane = row0 results, high lane = row1 results
+    vpermq          m3, m3, 11011000b       ; same lane fix-up for the "less" masks
+
+    pand            m2, [pb_1]              ; 0xFF -> masked with pb_1 (presumably +1 where current row > row below)
+    por             m2, m3                  ; OR in 0xFF (-1) where row below is greater: m2 = per-byte sign(cur - below)
+
+    movu            xm3, [r1]               ; m3 = m_iUpBuff1
+    pxor            m0, m0                  ; m0 = 0 (negation source and lower clamp bound)
+    psubb           m1, m0, m2              ; m1 = -m2 (negated signs)
+    vinserti128     m3, m3, xm1, 1          ; high lane of m3 = negated row0 sign, i.e. the "up" sign for row 1
+    vextracti128    [r1], m1, 1             ; write negated row1-vs-row2 sign back to m_iUpBuff1
+
+    paddb           m3, m2                  ; up sign + down sign, for both rows
+    paddb           m3, [pb_2]              ; add pb_2 bias (presumably maps -2..2 to table index 0..4)
+
+    pshufb          m1, m6, m3              ; m1 = m_iOffsetEo[index] per byte
+    pmovsxbw        m3, xm1                 ; m3 = row 0 offsets sign-extended to words
+    vextracti128    xm1, m1, 1              ; xm1 = row 1 offsets (bytes)
+    pmovsxbw        m1, xm1                 ; m1 = row 1 offsets sign-extended to words
+
+    paddw           m7, m3                  ; row 0 += offset
+    paddw           m5, m1                  ; row 1 += offset
+
+    pmaxsw          m7, m0                  ; clamp row 0 below at 0
+    pmaxsw          m5, m0                  ; clamp row 1 below at 0
+    pminsw          m7, m4                  ; clamp row 0 above at m4
+    pminsw          m5, m4                  ; clamp row 1 above at m4
+
+    movu            [r0], m7                ; store updated row 0
+    movu            [r0 + r3],  m5          ; store updated row 1
+
+    add             r0, 32                  ; advance 16 samples (32 bytes) along the rows
+    add             r1, 16                  ; advance 16 entries in the sign buffer
+    dec             r4d                     ; one 16-sample column group done
+    jnz             .loop                   ; repeat until width / 16 groups are processed
+    RET
+%else ; HIGH_BIT_DEPTH
 cglobal saoCuOrgE1_2Rows, 3, 5, 7, pRec, m_iUpBuff1, m_iOffsetEo, iStride, iLcuWidth
     mov             r3d,        r3m
     mov             r4d,        r4m
@@ -775,6 +831,7 @@
     dec             r4d
     jnz             .loop
     RET
+%endif
 
 ;======================================================================================================================================================
 ; void saoCuOrgE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int lcuWidth, intptr_t stride)


More information about the x265-devel mailing list