[x265] [PATCH 4 of 4] asm: intra_filter32x32 sse4 code, improved 4050c->652c over C code

dnyaneshwar at multicorewareinc.com
Fri Jun 26 15:22:52 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1435323958 -19800
#      Fri Jun 26 18:35:58 2015 +0530
# Node ID e04bde60af516f6f016e3e6f37d5d64e97e589f3
# Parent  1995a55f1320a029fb423f23cbfd24555c258d09
asm: intra_filter32x32 sse4 code, improved 4050c->652c over C code
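
For reference, here is a minimal C sketch of the (1,2,1)/4 reference-sample
smoothing that the SSE4 routine below vectorizes, written against the buffer
layout implied by the asm (samples[0] = top-left, samples[1..2N] = top row,
samples[2N+1..4N] = left column; samples[2N] and samples[4N] are written back
unfiltered at the end).  Names and types here are illustrative only, not the
exact C primitive in intrapred.cpp:

    #include <stdint.h>

    /* illustrative sketch only; tuSize = 32 for this primitive */
    static void intra_filter_ref(const uint8_t* samples, uint8_t* filtered, int tuSize)
    {
        const int topLast  = 2 * tuSize;    /* last top sample, kept unfiltered  */
        const int leftLast = 4 * tuSize;    /* last left sample, kept unfiltered */

        /* top-left corner: its two neighbours are top[0] and left[0] */
        filtered[0] = (samples[1] + 2 * samples[0] + samples[topLast + 1] + 2) >> 2;

        /* top row, indices 1 .. 2N-1 */
        for (int i = 1; i < topLast; i++)
            filtered[i] = (samples[i - 1] + 2 * samples[i] + samples[i + 1] + 2) >> 2;
        filtered[topLast] = samples[topLast];

        /* left column, indices 2N+1 .. 4N-1; left[0]'s upper neighbour is the corner */
        filtered[topLast + 1] = (samples[0] + 2 * samples[topLast + 1] + samples[topLast + 2] + 2) >> 2;
        for (int i = topLast + 2; i < leftLast; i++)
            filtered[i] = (samples[i - 1] + 2 * samples[i] + samples[i + 1] + 2) >> 2;
        filtered[leftLast] = samples[leftLast];
    }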

diff -r 1995a55f1320 -r e04bde60af51 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Jun 26 18:32:00 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Jun 26 18:35:58 2015 +0530
@@ -2456,6 +2456,7 @@
         p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4);
         p.cu[BLOCK_8x8].intra_filter = PFX(intra_filter_8x8_sse4);
         p.cu[BLOCK_16x16].intra_filter = PFX(intra_filter_16x16_sse4);
+        p.cu[BLOCK_32x32].intra_filter = PFX(intra_filter_32x32_sse4);
 
         ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
         ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
diff -r 1995a55f1320 -r e04bde60af51 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Fri Jun 26 18:32:00 2015 +0530
+++ b/source/common/x86/intrapred8.asm	Fri Jun 26 18:35:58 2015 +0530
@@ -18485,3 +18485,208 @@
     mov             [r1 + 32], r2b                  ; topLast
     mov             [r1 + 64], r3b                  ; LeftLast
     RET
+
+INIT_XMM sse4
+cglobal intra_filter_32x32, 2,4,6
+    mov             r2b, byte [r0 +  64]            ; topLast
+    mov             r3b, byte [r0 + 128]            ; LeftLast
+
+    ; filtering top
+    ; 0 to 15
+    pmovzxbw        m0, [r0 +  0]
+    pmovzxbw        m1, [r0 +  8]
+    pmovzxbw        m2, [r0 + 16]
+
+    pshufb          m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] samples[i - 1]
+    palignr         m5, m1, m0, 2
+    pinsrb          m5, [r0 + 65], 0                ; [8 7 6 5 4 3 2 9] samples[i + 1]
+
+    palignr         m3, m1, m0, 14
+    psllw           m0, 1
+    paddw           m4, m5
+    paddw           m0, m4
+    paddw           m0, [pw_2]
+    psrlw           m0, 2
+
+    palignr         m4, m2, m1, 2
+    psllw           m5, m1, 1
+    paddw           m4, m3
+    paddw           m5, m4
+    paddw           m5, [pw_2]
+    psrlw           m5, 2
+    packuswb        m0, m5
+    movu            [r1], m0
+
+    ; 16 to 31
+    pmovzxbw        m0, [r0 + 24]
+    pmovzxbw        m5, [r0 + 32]
+
+    palignr         m3, m2, m1, 14
+    palignr         m4, m0, m2, 2
+
+    psllw           m1, m2, 1
+    paddw           m3, m4
+    paddw           m1, m3
+    paddw           m1, [pw_2]
+    psrlw           m1, 2
+
+    palignr         m3, m0, m2, 14
+    palignr         m4, m5, m0, 2
+
+    psllw           m2, m0, 1
+    paddw           m4, m3
+    paddw           m2, m4
+    paddw           m2, [pw_2]
+    psrlw           m2, 2
+    packuswb        m1, m2
+    movu            [r1 + 16], m1
+
+    ; 32 to 47
+    pmovzxbw        m1, [r0 + 40]
+    pmovzxbw        m2, [r0 + 48]
+
+    palignr         m3, m5, m0, 14
+    palignr         m4, m1, m5, 2
+
+    psllw           m0, m5, 1
+    paddw           m3, m4
+    paddw           m0, m3
+    paddw           m0, [pw_2]
+    psrlw           m0, 2
+
+    palignr         m3, m1, m5, 14
+    palignr         m4, m2, m1, 2
+
+    psllw           m5, m1, 1
+    paddw           m4, m3
+    paddw           m5, m4
+    paddw           m5, [pw_2]
+    psrlw           m5, 2
+    packuswb        m0, m5
+    movu            [r1 + 32], m0
+
+    ; 48 to 63
+    pmovzxbw        m0, [r0 + 56]
+    pmovzxbw        m5, [r0 + 64]
+
+    palignr         m3, m2, m1, 14
+    palignr         m4, m0, m2, 2
+
+    psllw           m1, m2, 1
+    paddw           m3, m4
+    paddw           m1, m3
+    paddw           m1, [pw_2]
+    psrlw           m1, 2
+
+    palignr         m3, m0, m2, 14
+    palignr         m4, m5, m0, 2
+
+    psllw           m0, 1
+    paddw           m4, m3
+    paddw           m0, m4
+    paddw           m0, [pw_2]
+    psrlw           m0, 2
+    packuswb        m1, m0
+    movu            [r1 + 48], m1
+
+    ; filtering left
+    ; 64 to 79
+    pmovzxbw        m1, [r0 + 72]
+    pmovzxbw        m2, [r0 + 80]
+
+    palignr         m4, m5, m5, 14
+    pinsrb          m4, [r0], 2
+    palignr         m0, m1, m5, 2
+
+    psllw           m3, m5, 1
+    paddw           m4, m0
+    paddw           m3, m4
+    paddw           m3, [pw_2]
+    psrlw           m3, 2
+
+    palignr         m0, m1, m5, 14
+    palignr         m4, m2, m1, 2
+
+    psllw           m5, m1, 1
+    paddw           m4, m0
+    paddw           m5, m4
+    paddw           m5, [pw_2]
+    psrlw           m5, 2
+    packuswb        m3, m5
+    movu            [r1 + 64], m3
+
+    ; 80 to 95
+    pmovzxbw        m5, [r0 + 88]
+    pmovzxbw        m0, [r0 + 96]
+
+    palignr         m3, m2, m1, 14
+    palignr         m4, m5, m2, 2
+
+    psllw           m1, m2, 1
+    paddw           m3, m4
+    paddw           m1, m3
+    paddw           m1, [pw_2]
+    psrlw           m1, 2
+
+    palignr         m3, m5, m2, 14
+    palignr         m4, m0, m5, 2
+
+    psllw           m2, m5, 1
+    paddw           m4, m3
+    paddw           m2, m4
+    paddw           m2, [pw_2]
+    psrlw           m2, 2
+    packuswb        m1, m2
+    movu            [r1 + 80], m1
+
+    ; 96 to 111
+    pmovzxbw        m1, [r0 + 104]
+    pmovzxbw        m2, [r0 + 112]
+
+    palignr         m3, m0, m5, 14
+    palignr         m4, m1, m0, 2
+
+    psllw           m5, m0, 1
+    paddw           m3, m4
+    paddw           m5, m3
+    paddw           m5, [pw_2]
+    psrlw           m5, 2
+
+    palignr         m3, m1, m0, 14
+    palignr         m4, m2, m1, 2
+
+    psllw           m0, m1, 1
+    paddw           m4, m3
+    paddw           m0, m4
+    paddw           m0, [pw_2]
+    psrlw           m0, 2
+    packuswb        m5, m0
+    movu            [r1 + 96], m5
+
+    ; 112 to 127
+    pmovzxbw        m5, [r0 + 120]
+    pmovzxbw        m0, [r0 + 128]
+
+    palignr         m3, m2, m1, 14
+    palignr         m4, m5, m2, 2
+
+    psllw           m1, m2, 1
+    paddw           m3, m4
+    paddw           m1, m3
+    paddw           m1, [pw_2]
+    psrlw           m1, 2
+
+    palignr         m3, m5, m2, 14
+    palignr         m4, m0, m5, 2
+
+    psllw           m5, 1
+    paddw           m4, m3
+    paddw           m5, m4
+    paddw           m5, [pw_2]
+    psrlw           m5, 2
+    packuswb        m1, m5
+    movu            [r1 + 112], m1
+
+    mov             [r1 +  64], r2b                 ; topLast
+    mov             [r1 + 128], r3b                 ; LeftLast
+    RET
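
The asm-primitives.cpp hunk only registers the new symbol in the SSE4.1 block
of the primitive table; callers keep going through the function pointer.  A
hypothetical call site, assuming the usual intra_filter primitive signature
(const pixel*, pixel*) and an 8-bit build where pixel is uint8_t:

    /* hypothetical caller sketch -- buffer names are made up; note that the
     * asm's final pmovzxbw loads 8 bytes starting at samples[128], so the
     * neighbour buffer needs slack beyond the 4*32+1 samples actually used */
    uint8_t samples[4 * 32 + 1 + 7];
    uint8_t filtered[4 * 32 + 1 + 7];
    /* ... fill samples[] with the reconstructed neighbour pixels ... */
    primitives.cu[BLOCK_32x32].intra_filter(samples, filtered);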

