[x265] [PATCH] asm: saoCuOrgE1 avx2 code: 403c->331c

Divya Manivannan divya at multicorewareinc.com
Wed Apr 8 11:32:37 CEST 2015


# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1428485346 -19800
#      Wed Apr 08 14:59:06 2015 +0530
# Node ID 125a429989d35695971e1a1a8cff20c0b2649074
# Parent  7044924d68147152533fe1502df2c75a3512befb
asm: saoCuOrgE1 avx2 code: 403c->331c

diff -r 7044924d6814 -r 125a429989d3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Apr 07 17:11:42 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Apr 08 14:59:06 2015 +0530
@@ -1557,6 +1557,7 @@
         p.dst4x4 = x265_dst4_avx2;
         p.scale2D_64to32 = x265_scale2D_64to32_avx2;
         p.saoCuOrgE0 = x265_saoCuOrgE0_avx2;
+        p.saoCuOrgE1 = x265_saoCuOrgE1_avx2;
 
         p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
         p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2;
diff -r 7044924d6814 -r 125a429989d3 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Tue Apr 07 17:11:42 2015 +0530
+++ b/source/common/x86/loopfilter.asm	Wed Apr 08 14:59:06 2015 +0530
@@ -233,6 +233,49 @@
     jnz         .loop
     RET
 
+;========================================================================================================
+; void saoCuOrgE1(pixel *pRec, int8_t *m_iUpBuff1, int8_t *m_iOffsetEo, intptr_t iStride, int iLcuWidth)
+;========================================================================================================
+INIT_YMM avx2
+cglobal saoCuOrgE1, 4, 5, 8, pRec, m_iUpBuff1, m_iOffsetEo, iStride, iLcuWidth
+    mov           r4d,    r4m                     ; r4d = iLcuWidth; r3 (iStride) is loaded at native width by cglobal
+    movu          xm0,    [r2]                    ; xm0 = m_iOffsetEo (16-byte lookup table for pshufb)
+    mova          xm6,    [pb_2]                  ; xm6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
+    mova          xm7,    [pb_128]                ; xm7 = bias that maps unsigned bytes onto signed order
+    shr           r4d,    4                       ; r4d = number of 16-byte blocks; must be >= 1 (dec/jnz loop)
+.loop:
+    movu          xm1,    [r0]                    ; xm1 = pRec[x]
+    movu          xm2,    [r0 + r3]               ; xm2 = pRec[x + iStride]
+    ; xm2 = signDown = sign(pRec[x] - pRec[x + iStride]) per byte: +1, 0 or -1 (pcmpgtb is a signed compare)
+    pxor          xm3,    xm1,    xm7
+    pxor          xm4,    xm2,    xm7
+    pcmpgtb       xm2,    xm3,    xm4             ; xm2 = -1 where pRec[x] > pRec[x + iStride]
+    pcmpgtb       xm4,    xm3                     ; xm4 = -1 where pRec[x] < pRec[x + iStride]
+    pand          xm2,    [pb_1]                  ; -1 -> +1
+    por           xm2,    xm4
+    ; xm3 = edge type = m_iUpBuff1[x] + signDown + 2
+    movu          xm3,    [r1]                    ; xm3 = m_iUpBuff1
+    paddb         xm3,    xm2
+    paddb         xm3,    xm6
+    ; xm5 = m_iOffsetEo[edge type]; then store -signDown back to m_iUpBuff1[x]
+    pshufb        xm5,    xm0,    xm3
+    pxor          xm4,    xm4
+    psubb         xm3,    xm4,    xm2
+    movu          [r1],   xm3
+    ; widen to 16 bit (pixels zero-extended, offsets sign-extended), add, pack back with unsigned saturation
+    pmovzxbw      m2,     xm1
+    pmovsxbw      m3,     xm5
+    paddw         m2,     m3
+    vextracti128  xm3,    m2,     1
+    packuswb      xm2,    xm3
+    movu          [r0],   xm2
+    ; advance to the next 16 bytes of pRec and m_iUpBuff1
+    add           r0,     16
+    add           r1,     16
+    dec           r4d
+    jnz           .loop
+    RET
+
 ;========================================================================================================
 ; void saoCuOrgE1_2Rows(pixel *pRec, int8_t *m_iUpBuff1, int8_t *m_iOffsetEo, Int iStride, Int iLcuWidth)
 ;========================================================================================================
diff -r 7044924d6814 -r 125a429989d3 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Tue Apr 07 17:11:42 2015 +0530
+++ b/source/common/x86/loopfilter.h	Wed Apr 08 14:59:06 2015 +0530
@@ -28,6 +28,7 @@
 void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride);
 void x265_saoCuOrgE0_avx2(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride);
 void x265_saoCuOrgE1_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
+void x265_saoCuOrgE1_avx2(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
 void x265_saoCuOrgE1_2Rows_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
 void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
 void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);


More information about the x265-devel mailing list