[x265] [PATCH] asm: saoCuOrgE3 avx2 code for width>16: improve 508c->427c

Divya Manivannan divya at multicorewareinc.com
Wed Apr 22 09:01:09 CEST 2015


# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1429685709 -19800
#      Wed Apr 22 12:25:09 2015 +0530
# Node ID 4c123bec3fe04af65e3c875ff11b27f1f333f9be
# Parent  584211b333ac9640d81423b3f60a18956425e27c
asm: saoCuOrgE3 avx2 code for width>16: improve 508c->427c

diff -r 584211b333ac -r 4c123bec3fe0 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Apr 22 11:59:36 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Apr 22 12:25:09 2015 +0530
@@ -1729,6 +1729,7 @@
         p.saoCuOrgE1 = x265_saoCuOrgE1_avx2;
         p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_avx2;
         p.saoCuOrgE3[0] = x265_saoCuOrgE3_avx2;
+        p.saoCuOrgE3[1] = x265_saoCuOrgE3_32_avx2;
         p.saoCuOrgB0 = x265_saoCuOrgB0_avx2;
 
         p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
diff -r 584211b333ac -r 4c123bec3fe0 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Wed Apr 22 11:59:36 2015 +0530
+++ b/source/common/x86/loopfilter.asm	Wed Apr 22 12:25:09 2015 +0530
@@ -582,6 +582,72 @@
     movhps          [r1 + r5 - 1], xm7
     RET
 
+INIT_YMM avx2                                    ; SAO edge-offset (E3) row filter, 32 columns per iteration, using ymm registers (m# / xm#)
+cglobal saoCuOrgE3_32, 3, 6, 8                   ; r0 = rec, r1 = upBuff1, r2 = m_offsetEo (x86inc: presumably 3 args preloaded, 6 GPRs, 8 vector regs)
+    mov             r3d,  r3m                    ; r3 = stride (NOTE(review): only the low 32 bits are loaded although the prototype says intptr_t)
+    mov             r4d,  r4m                    ; r4 = startX
+    mov             r5d,  r5m                    ; r5 = endX
+
+    ; the loop always stores full 32-byte vectors and may write past endX, so save 8 bytes at rec[endX] and at upBuff1[endX - 1]
+    movq            xm7,  [r0 + r5]              ; low  half of xm7 = 8 bytes at rec[endX]
+    movhps          xm7,  [r1 + r5 - 1]          ; high half of xm7 = 8 bytes at upBuff1[endX - 1]
+
+    ; advance both pointers to column startX+1; r5 becomes the number of columns left to process
+    inc             r4d                          ; r4 = startX + 1
+    add             r0,   r4                     ; rec     += startX + 1
+    add             r1,   r4                     ; upBuff1 += startX + 1
+    sub             r5d,  r4d                    ; r5 = endX - (startX + 1)
+    pxor            m0,   m0                      ; m0 = 0
+    mova            m6,   [pb_2]                  ; m6 = 2 in every byte (aligned 32-byte load; assumes pb_2, defined elsewhere, is 32 bytes of 2 - TODO confirm)
+    vbroadcasti128  m5,   [r2]                    ; m5 = 16 bytes at m_iOffsetEo copied into both 128-bit lanes (pshufb lookup table)
+
+.loop:                                           ; one iteration filters 32 consecutive columns
+    movu            m1,   [r0]                    ; m1 = pRec[x]
+    movu            m2,   [r0 + r3]               ; m2 = pRec[x + iStride]
+
+    psubusb         m3,   m2,   m1               ; m3 = saturating (below - cur): 0 where cur >= below
+    psubusb         m4,   m1,   m2               ; m4 = saturating (cur - below): 0 where cur <= below
+    pcmpeqb         m3,   m0                     ; m3 = 0xFF where cur >= below
+    pcmpeqb         m4,   m0                     ; m4 = 0xFF where cur <= below
+    pcmpeqb         m2,   m1                     ; m2 = 0xFF where cur == below
+
+    pabsb           m3,   m3                     ; 0xFF (-1) becomes 1: m3 = 1 where cur >= below
+    por             m4,   m3                     ; m4 = -1 where cur <= below, +1 where cur > below
+    pandn           m2,   m4                      ; clear the equal lanes: m2 = iSignDown = sign(cur - below)
+
+    movu            m3,   [r1]                    ; m3 = m_iUpBuff1
+
+    paddb           m3,   m2                     ; m3 = upBuff1[x] + iSignDown
+    paddb           m3,   m6                      ; m3 = uiEdgeType = upBuff1[x] + iSignDown + 2
+
+    pshufb          m4,   m5,   m3               ; m4 = m_iOffsetEo[uiEdgeType], byte-wise table lookup
+
+    psubb           m3,   m0,   m2               ; m3 = -iSignDown
+    movu            [r1 - 1],   m3               ; upBuff1[x - 1] = -iSignDown (written one column to the left)
+
+    pmovzxbw        m2,   xm1                    ; m2 = pixels  0..15 zero-extended to words
+    vextracti128    xm1,  m1,   1                ; xm1 = pixels 16..31
+    pmovzxbw        m1,   xm1                    ; m1 = pixels 16..31 zero-extended to words
+    pmovsxbw        m3,   xm4                    ; m3 = offsets  0..15 sign-extended to words
+    vextracti128    xm4,  m4,   1                ; xm4 = offsets 16..31
+    pmovsxbw        m4,   xm4                    ; m4 = offsets 16..31 sign-extended to words
+
+    paddw           m2,   m3                     ; pixels  0..15 + offset
+    paddw           m1,   m4                     ; pixels 16..31 + offset
+    packuswb        m2,   m1                     ; saturate to 0..255; packs per 128-bit lane, so qwords come out as 0-7, 16-23, 8-15, 24-31
+    vpermq          m2,   m2,   11011000b        ; select qwords 0,2,1,3 to put the columns back in order
+    movu            [r0], m2                     ; store 32 filtered pixels
+
+    add             r0,   32                     ; rec     += 32
+    add             r1,   32                     ; upBuff1 += 32
+    sub             r5,   32                     ; 32 fewer columns left; r5 may go negative (64-bit) and is used as an index below
+    jg             .loop                         ; continue while columns remain
+
+    ; r0 + r5 is again rec + endX: put back the bytes saved before the loop (only 8 are restored, so the overrun must stay within 8 bytes - presumably ensured by the caller; the original note here said up to 2)
+    movq            [r0 + r5],     xm7           ; rec[endX .. endX + 7] = saved bytes
+    movhps          [r1 + r5 - 1], xm7           ; upBuff1[endX - 1 .. endX + 6] = saved bytes
+    RET
+
 ;=====================================================================================
 ; void saoCuOrgB0(pixel* rec, const pixel* offset, int lcuWidth, int lcuHeight, int stride)
 ;=====================================================================================
diff -r 584211b333ac -r 4c123bec3fe0 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Wed Apr 22 11:59:36 2015 +0530
+++ b/source/common/x86/loopfilter.h	Wed Apr 22 12:25:09 2015 +0530
@@ -34,6 +34,7 @@
 void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
 void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
 void x265_saoCuOrgE3_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
+void x265_saoCuOrgE3_32_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
 void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 void x265_saoCuOrgB0_avx2(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);


More information about the x265-devel mailing list