[x265] [PATCH 6 of 6] asm: 10bpp AVX2 code for saoCuOrgB0, improved 23127c->15595c over SSE

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Thu Jun 25 10:23:54 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1435219949 -19800
#      Thu Jun 25 13:42:29 2015 +0530
# Node ID f1ff5636cba3e2b714ceed86261362a53e8c6aca
# Parent  85d5582eedd40e4227131bff366235e6dc2b361a
asm: 10bpp AVX2 code for saoCuOrgB0, improved 23127c->15595c over SSE

diff -r 85d5582eedd4 -r f1ff5636cba3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Jun 25 12:11:45 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Jun 25 13:42:29 2015 +0530
@@ -1291,6 +1291,7 @@
         p.saoCuOrgE2[1] = PFX(saoCuOrgE2_32_avx2);
         p.saoCuOrgE3[0] = PFX(saoCuOrgE3_avx2);
         p.saoCuOrgE3[1] = PFX(saoCuOrgE3_32_avx2);
+        p.saoCuOrgB0 = PFX(saoCuOrgB0_avx2);
 
         p.cu[BLOCK_16x16].intra_pred[2]     = PFX(intra_pred_ang16_2_avx2);
         p.cu[BLOCK_16x16].intra_pred[3]     = PFX(intra_pred_ang16_3_avx2);
diff -r 85d5582eedd4 -r f1ff5636cba3 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Thu Jun 25 12:11:45 2015 +0530
+++ b/source/common/x86/loopfilter.asm	Thu Jun 25 13:42:29 2015 +0530
@@ -1643,6 +1643,89 @@
 %endif
 
 INIT_YMM avx2
+%if HIGH_BIT_DEPTH
+cglobal saoCuOrgB0, 5,7,8
+    ; Adds a per-band signed byte offset to 16-bit pixels in place, clamped to [0, pw_1023].
+    ; r0 = pixel rows, r1 = 32 int8 offsets, r2d = width, r3d = height, r4 = stride (in pixels).
+    ; band = pixel >> 5. Rows are processed in pairs, 16 pixels per row per step; an odd last
+    ; row is handled by .loopW1. Width is consumed in whole groups of 16 and must be >= 16.
+    vbroadcasti128  m3, [r1]            ; m3 = offset[0..15] in both lanes
+    vbroadcasti128  m4, [r1 + 16]       ; m4 = offset[16..31] in both lanes
+    add             r4d, r4d            ; r4 = stride in bytes
+    lea             r1, [r4 * 2]
+    sub             r1d, r2d
+    sub             r1d, r2d            ; r1 = 2 * stride bytes - width bytes (step to next row pair)
+    shr             r2d, 4              ; r2d = 16-pixel groups per row
+    mova            m7, [pw_1023]       ; m7 = upper clamp bound
+    mov             r6d, r3d            ; keep height for the odd-row test
+    shr             r3d, 1              ; r3d = number of row pairs
+    jz              .oddRow             ; fewer than 2 rows: the pair loop must not run (counter would wrap)
+.loopH:
+    mov             r5d, r2d
+.loopW:
+    movu            m2, [r0]            ; m2 = 16 pixels of row 0
+    movu            m5, [r0 + r4]       ; m5 = 16 pixels of row 1
+    psrlw           m0, m2, 5
+    psrlw           m6, m5, 5
+    packuswb        m0, m6              ; per lane: 8 row-0 indexes, then 8 row-1 indexes
+    vpermq          m0, m0, 11011000b   ; low lane = row 0 indexes, high lane = row 1 indexes
+    pand            m0, [pb_31]         ; m0 = [index]
+    pshufb          m6, m3, m0          ; lookup in offset[0..15] (uses low 4 index bits)
+    pshufb          m1, m4, m0          ; lookup in offset[16..31]
+    pcmpgtb         m0, [pb_15]         ; m0 = [mask], set where index > 15
+    pblendvb        m6, m6, m1, m0      ; NOTE: don't use the 3-parameter style, the x264 macro has a bug!
+
+    pmovsxbw        m0, xm6             ; row 0 offsets widened to words
+    vextracti128    xm6, m6, 1
+    pmovsxbw        m6, xm6             ; row 1 offsets widened to words
+
+    paddw           m2, m0
+    paddw           m5, m6
+    pxor            m1, m1
+    pmaxsw          m2, m1              ; clamp low at 0
+    pmaxsw          m5, m1
+    pminsw          m2, m7              ; clamp high at m7
+    pminsw          m5, m7
+
+    movu            [r0], m2
+    movu            [r0 + r4], m5
+
+    add             r0, 32
+    dec             r5d
+    jnz             .loopW
+
+    add             r0, r1              ; advance to the next row pair
+    dec             r3d
+    jnz             .loopH
+
+.oddRow:
+    test            r6b, 1
+    jz              .end                ; even height: nothing left to do
+    xor             r1, r1              ; r1 = byte offset within the last row
+.loopW1:
+    movu            m2, [r0 + r1]
+    psrlw           m0, m2, 5
+    packuswb        m0, m0
+    vpermq          m0, m0, 10001000b   ; low lane = all 16 indexes
+    pand            m0, [pb_31]         ; m0 = [index]
+    pshufb          m6, m3, m0
+    pshufb          m1, m4, m0
+    pcmpgtb         m0, [pb_15]         ; m0 = [mask]
+    pblendvb        m6, m6, m1, m0      ; NOTE: don't use the 3-parameter style, the x264 macro has a bug!
+    pmovsxbw        m0, xm6             ; offset
+
+    paddw           m2, m0
+    pxor            m0, m0
+    pmaxsw          m2, m0
+    pminsw          m2, m7
+
+    movu            [r0 + r1], m2
+    add             r1d, 32
+    dec             r2d
+    jnz             .loopW1
+.end:
+    RET
+%else ; HIGH_BIT_DEPTH
 cglobal saoCuOrgB0, 4, 7, 8
 
     mov             r3d,        r3m
@@ -1717,6 +1800,7 @@
     jnz             .loopW1
 .end
     RET
+%endif
 
 ;============================================================================================================
 ; void calSign(int8_t *dst, const Pixel *src1, const Pixel *src2, const int width)


More information about the x265-devel mailing list