[x265] [PATCH 1 of 6] asm: 10bpp AVX2 code for saoCuOrgE0, improved 974c->690c over SSE

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Thu Jun 25 10:23:49 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1435212794 -19800
#      Thu Jun 25 11:43:14 2015 +0530
# Node ID faec09e1ab60531924f2d919d4f283fa91bfec81
# Parent  b1af4c36f48a4500a4912373ebcda9a5540b5c15
asm: 10bpp AVX2 code for saoCuOrgE0, improved 974c->690c over SSE

diff -r b1af4c36f48a -r faec09e1ab60 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Jun 24 10:36:15 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Thu Jun 25 11:43:14 2015 +0530
@@ -1284,6 +1284,8 @@
     }
     if (cpuMask & X265_CPU_AVX2)
     {
+        p.saoCuOrgE0 = PFX(saoCuOrgE0_avx2);
+
         p.cu[BLOCK_16x16].intra_pred[2]     = PFX(intra_pred_ang16_2_avx2);
         p.cu[BLOCK_16x16].intra_pred[3]     = PFX(intra_pred_ang16_3_avx2);
         p.cu[BLOCK_16x16].intra_pred[4]     = PFX(intra_pred_ang16_4_avx2);
diff -r b1af4c36f48a -r faec09e1ab60 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Wed Jun 24 10:36:15 2015 -0500
+++ b/source/common/x86/const-a.asm	Thu Jun 25 11:43:14 2015 +0530
@@ -41,7 +41,7 @@
 const pb_16,                times 32 db 16
 const pb_32,                times 32 db 32
 const pb_64,                times 32 db 64
-const pb_128,               times 16 db 128
+const pb_128,               times 32 db 128
 const pb_a1,                times 16 db 0xa1
 
 const pb_01,                times  8 db   0,   1
diff -r b1af4c36f48a -r faec09e1ab60 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Wed Jun 24 10:36:15 2015 -0500
+++ b/source/common/x86/loopfilter.asm	Thu Jun 25 11:43:14 2015 +0530
@@ -235,6 +235,67 @@
 %endif
 
 INIT_YMM avx2
+%if HIGH_BIT_DEPTH
+cglobal saoCuOrgE0, 4,4,9                      ; args as named by the non-HBD variant: r0=rec, r1=offsetEo, r2=lcuWidth, r3=signLeft, r4m=stride
+    vbroadcasti128  m6, [r1]                    ; m6 = 16-byte offset table replicated in both lanes (pshufb LUT)
+    movzx           r1d, byte [r3]              ; first signLeft byte (used for the row at r0)
+    neg             r1b                         ; stored negated: psignb in the loop negates it back
+    movd            xm0, r1d
+    movzx           r1d, byte [r3 + 1]          ; second signLeft byte (used for the row at r0 + stride)
+    neg             r1b
+    movd            xm1, r1d
+    vinserti128     m0, m0, xm1, 1              ; m0 = per-lane left-edge carry in byte 0 of each lane
+    mova            m5, [pw_1023]               ; m5 = upper clamp bound
+    mov             r1d, r4m                    ; dword move: r4m is a 32-bit register on UNIX64, so a 64-bit dest does not assemble
+    add             r1d, r1d                    ; r1 = stride * 2 (samples are words: right neighbour is at +2)
+    shr             r2d, 4                      ; r2 = iteration count, 16 samples per row per iteration
+
+.loop:                                          ; two rows per pass; NOTE(review): r2 must be non-zero on entry (dec/jnz)
+    movu            m7, [r0]                    ; m7 = 16 samples of first row
+    movu            m8, [r0 + r1]               ; m8 = 16 samples of second row
+    movu            m2, [r0 + 2]                ; right neighbours, first row
+    movu            m1, [r0 + r1 + 2]           ; right neighbours, second row
+
+    pcmpgtw         m3, m7, m2                  ; -1 where cur > right (first row)
+    pcmpgtw         m2, m7                      ; -1 where right > cur (first row)
+    pcmpgtw         m4, m8, m1                  ; -1 where cur > right (second row)
+    pcmpgtw         m1, m8                      ; -1 where right > cur (second row)
+
+    packsswb        m3, m4                      ; word masks -> byte masks, interleaved per 128-bit lane
+    packsswb        m2, m1
+    vpermq          m3, m3, 11011000b           ; qword order 0,2,1,3: lane 0 = first row, lane 1 = second row
+    vpermq          m2, m2, 11011000b
+
+    pand            m3, [pb_1]                  ; +1 where cur > right
+    por             m3, m2                      ; m3 = sign(cur - right): +1, 0 or -1 per byte
+
+    pslldq          m2, m3, 1                   ; per-lane shift by one byte: position i gets sign(left - cur)
+    por             m2, m0                      ; byte 0 of each lane comes from the carried (negated) left sign
+
+    psignb          m2, [pb_128]                ; negate every byte: m2 = signLeft
+    pxor            m0, m0
+    palignr         m0, m3, 15                  ; carry byte 15 of each lane of m3 into byte 0 for the next pass
+    paddb           m3, m2
+    paddb           m3, [pb_2]                  ; m3 = uiEdgeType = signRight + signLeft + 2
+    pshufb          m2, m6, m3                  ; look up the offset byte for each edge type
+    pmovsxbw        m3, xm2                     ; offsetEo for first row, sign-extended to words
+    vextracti128    xm2, m2, 1
+    pmovsxbw        m2, xm2                     ; offsetEo for second row, sign-extended to words
+    pxor            m4, m4                      ; m4 = 0, lower clamp bound
+    paddw           m7, m3
+    paddw           m8, m2
+    pmaxsw          m7, m4                      ; clamp both rows to [0, m5]
+    pmaxsw          m8, m4
+    pminsw          m7, m5
+    pminsw          m8, m5
+    movu            [r0], m7
+    movu            [r0 + r1], m8
+
+    add             r0q, 32                     ; advance 16 word samples
+    dec             r2d
+    jnz             .loop
+    RET
+%else ; HIGH_BIT_DEPTH
 cglobal saoCuOrgE0, 5, 5, 7, rec, offsetEo, lcuWidth, signLeft, stride
 
     mov                 r4d,        r4m
@@ -287,6 +348,7 @@
     sub                 r2d,        16
     jnz                 .loop
     RET
+%endif
 
 ;==================================================================================================
 ; void saoCuOrgE1(pixel *pRec, int8_t *m_iUpBuff1, int8_t *m_iOffsetEo, Int iStride, Int iLcuWidth)


More information about the x265-devel mailing list