[x265] [PATCH] asm: saoCuOrgE3 avx2 code: 502c->473c

Divya Manivannan divya at multicorewareinc.com
Wed Apr 15 11:33:20 CEST 2015


# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1429088096 -19800
#      Wed Apr 15 14:24:56 2015 +0530
# Node ID 55c9c34fb48998f21e96ce3a5767e49638ad092d
# Parent  737edf5ac0088867cbd9d6d0d85958d594ee6c05
asm: saoCuOrgE3 avx2 code: 502c->473c

diff -r 737edf5ac008 -r 55c9c34fb489 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Apr 15 10:58:54 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Apr 15 14:24:56 2015 +0530
@@ -1596,6 +1596,7 @@
         p.saoCuOrgE0 = x265_saoCuOrgE0_avx2;
         p.saoCuOrgE1 = x265_saoCuOrgE1_avx2;
         p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_avx2;
+        p.saoCuOrgE3 = x265_saoCuOrgE3_avx2;
         p.saoCuOrgB0 = x265_saoCuOrgB0_avx2;
 
         p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
diff -r 737edf5ac008 -r 55c9c34fb489 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Wed Apr 15 10:58:54 2015 +0530
+++ b/source/common/x86/loopfilter.asm	Wed Apr 15 14:24:56 2015 +0530
@@ -520,6 +520,68 @@
     movhps          [r1 + r5 - 1], m7
     RET
 
+INIT_YMM avx2
+cglobal saoCuOrgE3, 3, 6, 8                      ; r0 = rec, r1 = upBuff1, r2 = m_offsetEo (per the x265_saoCuOrgE3_avx2 prototype)
+    mov             r3d,  r3m                     ; r3 = stride
+    mov             r4d,  r4m                     ; r4 = startX
+    mov             r5d,  r5m                     ; r5 = endX
+
+    ; save latest 2 pixels for case startX=1 or left_endX=15
+    movq            xm7,  [r0 + r5]               ; low  8 bytes of xm7 = rec[endX ...]; the 16-byte stores below may run past endX
+    movhps          xm7,  [r1 + r5 - 1]           ; high 8 bytes of xm7 = upBuff1[endX - 1 ...]
+
+    ; move to startX+1
+    inc             r4d                           ; r4 = startX + 1
+    add             r0,   r4                      ; r0 = &rec[startX + 1]
+    add             r1,   r4                      ; r1 = &upBuff1[startX + 1]
+    sub             r5d,  r4d                     ; r5 = endX - (startX + 1) = number of bytes left to process
+    pxor            xm0,  xm0                     ; xm0 = 0
+    mova            xm6,  [pb_2]                  ; xm6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
+    movu            xm5,  [r2]                    ; xm5 = m_iOffsetEo
+
+.loop:
+    movu            xm1,  [r0]                    ; xm1 = pRec[x]
+    movu            xm2,  [r0 + r3]               ; xm2 = pRec[x + iStride]
+
+    psubusb         xm3,  xm2,  xm1               ; xm3 = saturating (below - cur): 0 where below <= cur
+    psubusb         xm4,  xm1,  xm2               ; xm4 = saturating (cur - below): 0 where cur <= below
+    pcmpeqb         xm3,  xm0                     ; xm3 = 0xFF where below <= cur
+    pcmpeqb         xm4,  xm0                     ; xm4 = 0xFF where cur <= below
+    pcmpeqb         xm2,  xm1                     ; xm2 = 0xFF where cur == below
+
+    pabsb           xm3,  xm3                     ; 0xFF (-1) -> 1, so xm3 = 1 where below <= cur
+    por             xm4,  xm3                     ; xm4 = 1 where cur > below, -1 where cur <= below
+    pandn           xm2,  xm4                     ; xm2 = iSignDown
+
+    movu            xm3,  [r1]                    ; xm3 = m_iUpBuff1
+
+    paddb           xm3,  xm2                     ; xm3 = m_iUpBuff1 + iSignDown
+    paddb           xm3,  xm6                     ; xm3 = uiEdgeType
+
+    pshufb          xm4,  xm5,  xm3               ; xm4 = m_iOffsetEo[uiEdgeType], per-byte table lookup
+
+    psubb           xm3,  xm0,  xm2               ; xm3 = -iSignDown
+    movu            [r1 - 1],   xm3               ; stored one byte to the left: upBuff1[x - 1] = -iSignDown
+
+    pmovzxbw        m2,   xm1                     ; m2 = pixels zero-extended to 16-bit words
+    pmovsxbw        m3,   xm4                     ; m3 = offsets sign-extended to 16-bit words
+
+    paddw           m2,   m3                      ; m2 = pixel + offset (16-bit, no wrap)
+    vextracti128    xm3,  m2,   1                 ; xm3 = upper 8 words of m2
+    packuswb        xm2,  xm3                     ; saturate back to 16 unsigned bytes
+    movu            [r0], xm2                     ; write back all 16 bytes, even if fewer than 16 remained
+
+    add             r0,   16                      ; advance rec pointer by one vector
+    add             r1,   16                      ; advance upBuff1 pointer by one vector
+
+    sub             r5,   16                      ; 16 more bytes handled
+    jg             .loop                          ; loop while bytes remain (r5 > 0)
+
+    ; restore last pixels (up to 2)
+    movq            [r0 + r5],     xm7            ; r5 <= 0 here, so r0 + r5 is back at &rec[endX]
+    movhps          [r1 + r5 - 1], xm7            ; likewise &upBuff1[endX - 1]
+    RET
+
 ;=====================================================================================
 ; void saoCuOrgB0(pixel* rec, const pixel* offset, int lcuWidth, int lcuHeight, int stride)
 ;=====================================================================================
diff -r 737edf5ac008 -r 55c9c34fb489 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Wed Apr 15 10:58:54 2015 +0530
+++ b/source/common/x86/loopfilter.h	Wed Apr 15 14:24:56 2015 +0530
@@ -33,6 +33,7 @@
 void x265_saoCuOrgE1_2Rows_avx2(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
 void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
 void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
+void x265_saoCuOrgE3_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
 void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 void x265_saoCuOrgB0_avx2(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);


More information about the x265-devel mailing list