[x265] [PATCH] asm: intra_filter 10bpp sse4 code

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Mon Jun 29 14:37:25 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1435578547 -19800
#      Mon Jun 29 17:19:07 2015 +0530
# Node ID 60832369ebb4e1014b4080b27a0401f97af93958
# Parent  9feee64efa440c25f016d15ae982789e5393a77e
asm: intra_filter 10bpp sse4 code

Performance improvement over C code (columns: speedup, asm cost, C cost):
intra_filter_32x32 7.46x    525.64          3922.56
intra_filter_16x16 6.53x    289.11          1886.86
intra_filter_8x8   5.60x    170.75          956.81
intra_filter_4x4   3.05x    121.20          369.74

diff -r 9feee64efa44 -r 60832369ebb4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Jun 26 15:29:51 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Jun 29 17:19:07 2015 +0530
@@ -1120,6 +1120,11 @@
         ALL_LUMA_PU(satd, pixel_satd, sse4);
         ASSIGN_SA8D(sse4);
 
+        p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4);
+        p.cu[BLOCK_8x8].intra_filter = PFX(intra_filter_8x8_sse4);
+        p.cu[BLOCK_16x16].intra_filter = PFX(intra_filter_16x16_sse4);
+        p.cu[BLOCK_32x32].intra_filter = PFX(intra_filter_32x32_sse4);
+
         ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
         ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
         INTRA_ANG_SSE4_COMMON(sse4);
diff -r 9feee64efa44 -r 60832369ebb4 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Fri Jun 26 15:29:51 2015 +0530
+++ b/source/common/x86/intrapred16.asm	Mon Jun 29 17:19:07 2015 +0530
@@ -75,6 +75,9 @@
 const pw_ang16_13,                  db 14, 15,  8,  9,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
 const pw_ang16_16,                  db  0,  0,  0,  0,  0,  0, 10, 11,  8,  9,  6,  7,  2,  3,  0,  1
 
+intra_filter4_shuf0:                db  2,  3,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10 ,11, 12, 13  ; pshufb mask: out word 0 = in word 1, out words 1..7 = in words 0..6
+intra_filter4_shuf1:                db 14, 15,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10 ,11, 12, 13  ; pshufb mask: out word 0 = in word 7, out words 1..7 = in words 0..6
+
 ;; (blkSize - 1 - x)
 pw_planar4_0:                       dw  3,  2,  1,  0,  3,  2,  1,  0
 
@@ -21634,3 +21637,413 @@
     dec    r4
     jnz    .loop
     RET
+
+;-----------------------------------------------------------------------------------
+; void intra_filter_NxN(const pixel* references, pixel* filtered)
+;-----------------------------------------------------------------------------------
+INIT_XMM sse4                                       ; 4x4: out[i] = (s[i-1] + 2*s[i] + s[i+1] + 2) >> 2, s = 16-bit samples at r0
+cglobal intra_filter_4x4, 2,4,5                     ; r0 = references (read), r1 = filtered (written); r2/r3 and m0-m4 are scratch
+    mov             r2w, word [r0 + 16]             ; topLast = s[8], stored unfiltered at the end
+    mov             r3w, word [r0 + 32]             ; leftLast = s[16], stored unfiltered at the end
+
+    ; filtering top: out[0..7]; lane 0 (top-left) takes s[1] and s[9] as its two neighbours
+    movu            m0, [r0 +  0]                   ; m0 = s[0..7]
+    movu            m1, [r0 + 16]                   ; m1 = s[8..15]
+    movu            m2, [r0 + 32]                   ; m2 = s[16..23]; 16-byte load, but only s[16] reaches a lane that is kept
+
+    pshufb          m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] samples[i - 1]
+    palignr         m3, m1, m0, 4                   ; m3 = s[2..9]
+    pshufb          m3, [intra_filter4_shuf1]       ; [8 7 6 5 4 3 2 9] samples[i + 1]
+
+    psllw           m0, 1                           ; 2 * s[i]
+    paddw           m4, m3                          ; s[i - 1] + s[i + 1]
+    paddw           m0, m4
+    paddw           m0, [pw_2]                      ; rounding term; pw_2 is defined outside this hunk (presumably 8 words of 2)
+    psrlw           m0, 2                           ; m0 = out[0..7]
+
+    ; filtering left: out[9..15]; lane 0 of this vector is replaced by topLast below
+    palignr         m4, m1, m1, 14                  ; rotate m1 up one word: [14 13 12 11 10 9 8 15]
+    pinsrw          m4, [r0], 1                     ; lane 1 <- s[0]: [14 13 12 11 10 9 0 15] samples[i - 1]
+    palignr         m3, m2, m1, 4                   ; m3 = s[10..17]
+    pshufb          m3, [intra_filter4_shuf1]       ; [16 15 14 13 12 11 10 17] samples[i + 1]
+
+    psllw           m1, 1                           ; 2 * s[i]
+    paddw           m4, m3                          ; s[i - 1] + s[i + 1]
+    paddw           m1, m4
+    paddw           m1, [pw_2]
+    psrlw           m1, 2                           ; m1 = out[8..15] (lane 0 is a don't-care value)
+
+    movu            [r1], m0
+    movu            [r1 + 16], m1
+    mov             [r1 + 16], r2w                  ; out[8] = topLast, overwriting lane 0 of the store above
+    mov             [r1 + 32], r3w                  ; out[16] = leftLast
+    RET
+
+INIT_XMM sse4                                       ; 8x8: out[i] = (s[i-1] + 2*s[i] + s[i+1] + 2) >> 2, s = 16-bit samples at r0
+cglobal intra_filter_8x8, 2,4,6                     ; r0 = references (read), r1 = filtered (written); r2/r3 and m0-m5 are scratch
+    mov             r2w, word [r0 + 32]             ; topLast = s[16], stored unfiltered at the end
+    mov             r3w, word [r0 + 64]             ; leftLast = s[32], stored unfiltered at the end
+
+    ; filtering top: out[0..15]; lane 0 (top-left) takes s[1] and s[17] as its two neighbours
+    movu            m0, [r0]                        ; m0 = s[0..7]
+    movu            m1, [r0 + 16]                   ; m1 = s[8..15]
+    movu            m2, [r0 + 32]                   ; m2 = s[16..23]
+
+    pshufb          m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] samples[i - 1]
+    palignr         m5, m1, m0, 2                   ; m5 = s[1..8]
+    pinsrw          m5, [r0 + 34], 0                ; lane 0 <- s[17]: [8 7 6 5 4 3 2 17] samples[i + 1]
+
+    palignr         m3, m1, m0, 14                  ; m3 = s[7..14], samples[i - 1] for m1; taken before m0 is overwritten
+    psllw           m0, 1                           ; 2 * s[i]
+    paddw           m4, m5                          ; s[i - 1] + s[i + 1]
+    paddw           m0, m4
+    paddw           m0, [pw_2]                      ; rounding term; pw_2 is defined outside this hunk (presumably 8 words of 2)
+    psrlw           m0, 2                           ; m0 = out[0..7]
+
+    palignr         m4, m2, m1, 2                   ; m4 = s[9..16], samples[i + 1] for m1
+    psllw           m1, 1
+    paddw           m4, m3
+    paddw           m1, m4
+    paddw           m1, [pw_2]
+    psrlw           m1, 2                           ; m1 = out[8..15]
+    movu            [r1], m0
+    movu            [r1 + 16], m1
+
+    ; filtering left: out[17..31]; lane 0 of the first vector is replaced by topLast below
+    movu            m1, [r0 + 48]                   ; m1 = s[24..31]
+    movu            m0, [r0 + 64]                   ; m0 = s[32..39]; 16-byte load, only s[32] is consumed
+
+    palignr         m4, m2, m2, 14                  ; rotate m2 up one word: [22 21 20 19 18 17 16 23]
+    pinsrw          m4, [r0], 1                     ; lane 1 <- s[0]: [22 21 20 19 18 17 0 23] samples[i - 1]
+    palignr         m5, m1, m2, 2                   ; m5 = s[17..24], samples[i + 1] for m2
+
+    palignr         m3, m1, m2, 14                  ; m3 = s[23..30], samples[i - 1] for m1
+    palignr         m0, m1, 2                       ; m0 = s[25..32], samples[i + 1] for m1
+
+    psllw           m2, 1
+    paddw           m4, m5
+    paddw           m2, m4
+    paddw           m2, [pw_2]
+    psrlw           m2, 2                           ; m2 = out[16..23] (lane 0 is a don't-care value)
+
+    psllw           m1, 1
+    paddw           m0, m3
+    paddw           m1, m0
+    paddw           m1, [pw_2]
+    psrlw           m1, 2                           ; m1 = out[24..31]
+
+    movu            [r1 + 32], m2
+    movu            [r1 + 48], m1
+    mov             [r1 + 32], r2w                  ; out[16] = topLast, overwriting lane 0 of the store above
+    mov             [r1 + 64], r3w                  ; out[32] = leftLast
+    RET
+
+INIT_XMM sse4                                       ; 16x16: out[i] = (s[i-1] + 2*s[i] + s[i+1] + 2) >> 2, s = 16-bit samples at r0
+cglobal intra_filter_16x16, 2,4,6                   ; r0 = references (read), r1 = filtered (written); r2/r3 and m0-m5 are scratch
+    mov             r2w, word [r0 +  64]            ; topLast = s[32], stored unfiltered at the end
+    mov             r3w, word [r0 + 128]            ; leftLast = s[64], stored unfiltered at the end
+
+    ; filtering top: out[0..31]; lane 0 (top-left) takes s[1] and s[33] as its two neighbours
+    movu            m0, [r0]                        ; m0 = s[0..7]
+    movu            m1, [r0 + 16]                   ; m1 = s[8..15]
+    movu            m2, [r0 + 32]                   ; m2 = s[16..23]
+
+    pshufb          m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] samples[i - 1]
+    palignr         m5, m1, m0, 2                   ; m5 = s[1..8]
+    pinsrw          m5, [r0 + 66], 0                ; lane 0 <- s[33]: [8 7 6 5 4 3 2 33] samples[i + 1]
+
+    palignr         m3, m1, m0, 14                  ; m3 = s[7..14], samples[i - 1] for m1; taken before m0 is overwritten
+    psllw           m0, 1                           ; 2 * s[i]
+    paddw           m4, m5                          ; s[i - 1] + s[i + 1]
+    paddw           m0, m4
+    paddw           m0, [pw_2]                      ; rounding term; pw_2 is defined outside this hunk (presumably 8 words of 2)
+    psrlw           m0, 2                           ; m0 = out[0..7]
+
+    palignr         m4, m2, m1, 2                   ; m4 = s[9..16], samples[i + 1] for m1
+    psllw           m5, m1, 1                       ; result goes to m5 so raw m1 stays available for the next stage
+    paddw           m4, m3
+    paddw           m5, m4
+    paddw           m5, [pw_2]
+    psrlw           m5, 2                           ; m5 = out[8..15]
+    movu            [r1], m0
+    movu            [r1 + 16], m5
+
+    movu            m0, [r0 + 48]                   ; m0 = s[24..31]
+    movu            m5, [r0 + 64]                   ; m5 = s[32..39]
+
+    palignr         m3, m2, m1, 14                  ; m3 = s[15..22], samples[i - 1] for m2
+    palignr         m4, m0, m2, 2                   ; m4 = s[17..24], samples[i + 1] for m2
+
+    psllw           m1, m2, 1
+    paddw           m3, m4
+    paddw           m1, m3
+    paddw           m1, [pw_2]
+    psrlw           m1, 2                           ; m1 = out[16..23]
+
+    palignr         m3, m0, m2, 14                  ; m3 = s[23..30], samples[i - 1] for m0
+    palignr         m4, m5, m0, 2                   ; m4 = s[25..32], samples[i + 1] for m0
+
+    psllw           m0, 1
+    paddw           m4, m3
+    paddw           m0, m4
+    paddw           m0, [pw_2]
+    psrlw           m0, 2                           ; m0 = out[24..31]
+    movu            [r1 + 32], m1
+    movu            [r1 + 48], m0
+
+    ; filtering left: out[33..63]; lane 0 of the first vector is replaced by topLast below
+    movu            m1, [r0 + 80]                   ; m1 = s[40..47]
+    movu            m2, [r0 + 96]                   ; m2 = s[48..55]
+
+    palignr         m4, m5, m5, 14                  ; rotate m5 up one word: [38 37 36 35 34 33 32 39]
+    pinsrw          m4, [r0], 1                     ; lane 1 <- s[0]: [38 37 36 35 34 33 0 39] samples[i - 1]
+    palignr         m0, m1, m5, 2                   ; m0 = s[33..40], samples[i + 1] for m5
+
+    psllw           m3, m5, 1                       ; result goes to m3 so raw m5 stays available for the next stage
+    paddw           m4, m0
+    paddw           m3, m4
+    paddw           m3, [pw_2]
+    psrlw           m3, 2                           ; m3 = out[32..39] (lane 0 is a don't-care value)
+
+    palignr         m0, m1, m5, 14                  ; m0 = s[39..46], samples[i - 1] for m1
+    palignr         m4, m2, m1, 2                   ; m4 = s[41..48], samples[i + 1] for m1
+
+    psllw           m5, m1, 1
+    paddw           m4, m0
+    paddw           m5, m4
+    paddw           m5, [pw_2]
+    psrlw           m5, 2                           ; m5 = out[40..47]
+    movu            [r1 + 64], m3
+    movu            [r1 + 80], m5
+
+    movu            m5, [r0 + 112]                  ; m5 = s[56..63]
+    movu            m0, [r0 + 128]                  ; m0 = s[64..71]; 16-byte load, only s[64] is consumed
+
+    palignr         m3, m2, m1, 14                  ; m3 = s[47..54], samples[i - 1] for m2
+    palignr         m4, m5, m2, 2                   ; m4 = s[49..56], samples[i + 1] for m2
+
+    psllw           m1, m2, 1
+    paddw           m3, m4
+    paddw           m1, m3
+    paddw           m1, [pw_2]
+    psrlw           m1, 2                           ; m1 = out[48..55]
+
+    palignr         m3, m5, m2, 14                  ; m3 = s[55..62], samples[i - 1] for m5
+    palignr         m4, m0, m5, 2                   ; m4 = s[57..64], samples[i + 1] for m5
+
+    psllw           m5, 1
+    paddw           m4, m3
+    paddw           m5, m4
+    paddw           m5, [pw_2]
+    psrlw           m5, 2                           ; m5 = out[56..63]
+    movu            [r1 +  96], m1
+    movu            [r1 + 112], m5
+
+    mov             [r1 +  64], r2w                 ; out[32] = topLast, overwriting lane 0 of the earlier store
+    mov             [r1 + 128], r3w                 ; out[64] = leftLast
+    RET
+
+INIT_XMM sse4                                       ; 32x32: out[i] = (s[i-1] + 2*s[i] + s[i+1] + 2) >> 2, s = 16-bit samples at r0
+cglobal intra_filter_32x32, 2,4,6                   ; r0 = references (read), r1 = filtered (written); r2/r3 and m0-m5 are scratch
+    mov             r2w, word [r0 + 128]            ; topLast = s[64], stored unfiltered at the end
+    mov             r3w, word [r0 + 256]            ; leftLast = s[128], stored unfiltered at the end
+
+    ; filtering top: out[0..63]; lane 0 (top-left) takes s[1] and s[65] as its two neighbours
+    ; 0 to 15
+    movu            m0, [r0 +  0]                   ; m0 = s[0..7]
+    movu            m1, [r0 + 16]                   ; m1 = s[8..15]
+    movu            m2, [r0 + 32]                   ; m2 = s[16..23]
+
+    pshufb          m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] samples[i - 1]
+    palignr         m5, m1, m0, 2                   ; m5 = s[1..8]
+    pinsrw          m5, [r0 + 130], 0               ; lane 0 <- s[65]: [8 7 6 5 4 3 2 65] samples[i + 1]
+
+    palignr         m3, m1, m0, 14                  ; m3 = s[7..14], samples[i - 1] for m1; taken before m0 is overwritten
+    psllw           m0, 1                           ; 2 * s[i]
+    paddw           m4, m5                          ; s[i - 1] + s[i + 1]
+    paddw           m0, m4
+    paddw           m0, [pw_2]                      ; rounding term; pw_2 is defined outside this hunk (presumably 8 words of 2)
+    psrlw           m0, 2                           ; m0 = out[0..7]
+
+    palignr         m4, m2, m1, 2                   ; m4 = s[9..16], samples[i + 1] for m1
+    psllw           m5, m1, 1                       ; result goes to m5 so raw m1 stays available for the next stage
+    paddw           m4, m3
+    paddw           m5, m4
+    paddw           m5, [pw_2]
+    psrlw           m5, 2                           ; m5 = out[8..15]
+    movu            [r1], m0
+    movu            [r1 + 16], m5
+
+    ; 16 to 31
+    movu            m0, [r0 + 48]                   ; m0 = s[24..31]
+    movu            m5, [r0 + 64]                   ; m5 = s[32..39]
+
+    palignr         m3, m2, m1, 14                  ; m3 = s[15..22], samples[i - 1] for m2
+    palignr         m4, m0, m2, 2                   ; m4 = s[17..24], samples[i + 1] for m2
+
+    psllw           m1, m2, 1
+    paddw           m3, m4
+    paddw           m1, m3
+    paddw           m1, [pw_2]
+    psrlw           m1, 2                           ; m1 = out[16..23]
+
+    palignr         m3, m0, m2, 14                  ; m3 = s[23..30], samples[i - 1] for m0
+    palignr         m4, m5, m0, 2                   ; m4 = s[25..32], samples[i + 1] for m0
+
+    psllw           m2, m0, 1                       ; result goes to m2 so raw m0 stays available for the next stage
+    paddw           m4, m3
+    paddw           m2, m4
+    paddw           m2, [pw_2]
+    psrlw           m2, 2                           ; m2 = out[24..31]
+    movu            [r1 + 32], m1
+    movu            [r1 + 48], m2
+
+    ; 32 to 47
+    movu            m1, [r0 + 80]                   ; m1 = s[40..47]
+    movu            m2, [r0 + 96]                   ; m2 = s[48..55]
+
+    palignr         m3, m5, m0, 14                  ; m3 = s[31..38], samples[i - 1] for m5 (m0 still holds raw s[24..31])
+    palignr         m4, m1, m5, 2                   ; m4 = s[33..40], samples[i + 1] for m5
+
+    psllw           m0, m5, 1
+    paddw           m3, m4
+    paddw           m0, m3
+    paddw           m0, [pw_2]
+    psrlw           m0, 2                           ; m0 = out[32..39]
+
+    palignr         m3, m1, m5, 14                  ; m3 = s[39..46], samples[i - 1] for m1
+    palignr         m4, m2, m1, 2                   ; m4 = s[41..48], samples[i + 1] for m1
+
+    psllw           m5, m1, 1
+    paddw           m4, m3
+    paddw           m5, m4
+    paddw           m5, [pw_2]
+    psrlw           m5, 2                           ; m5 = out[40..47]
+    movu            [r1 + 64], m0
+    movu            [r1 + 80], m5
+
+    ; 48 to 63
+    movu            m0, [r0 + 112]                  ; m0 = s[56..63]
+    movu            m5, [r0 + 128]                  ; m5 = s[64..71], first vector of the left samples
+
+    palignr         m3, m2, m1, 14                  ; m3 = s[47..54], samples[i - 1] for m2
+    palignr         m4, m0, m2, 2                   ; m4 = s[49..56], samples[i + 1] for m2
+
+    psllw           m1, m2, 1
+    paddw           m3, m4
+    paddw           m1, m3
+    paddw           m1, [pw_2]
+    psrlw           m1, 2                           ; m1 = out[48..55]
+
+    palignr         m3, m0, m2, 14                  ; m3 = s[55..62], samples[i - 1] for m0
+    palignr         m4, m5, m0, 2                   ; m4 = s[57..64], samples[i + 1] for m0
+
+    psllw           m0, 1
+    paddw           m4, m3
+    paddw           m0, m4
+    paddw           m0, [pw_2]
+    psrlw           m0, 2                           ; m0 = out[56..63]
+    movu            [r1 +  96], m1
+    movu            [r1 + 112], m0
+
+    ; filtering left: out[65..127]; lane 0 of the first vector is replaced by topLast below
+    ; 64 to 79
+    movu            m1, [r0 + 144]                  ; m1 = s[72..79]
+    movu            m2, [r0 + 160]                  ; m2 = s[80..87]
+
+    palignr         m4, m5, m5, 14                  ; rotate m5 up one word: [70 69 68 67 66 65 64 71]
+    pinsrw          m4, [r0], 1                     ; lane 1 <- s[0]: [70 69 68 67 66 65 0 71] samples[i - 1]
+    palignr         m0, m1, m5, 2                   ; m0 = s[65..72], samples[i + 1] for m5
+
+    psllw           m3, m5, 1                       ; result goes to m3 so raw m5 stays available for the next stage
+    paddw           m4, m0
+    paddw           m3, m4
+    paddw           m3, [pw_2]
+    psrlw           m3, 2                           ; m3 = out[64..71] (lane 0 is a don't-care value)
+
+    palignr         m0, m1, m5, 14                  ; m0 = s[71..78], samples[i - 1] for m1
+    palignr         m4, m2, m1, 2                   ; m4 = s[73..80], samples[i + 1] for m1
+
+    psllw           m5, m1, 1
+    paddw           m4, m0
+    paddw           m5, m4
+    paddw           m5, [pw_2]
+    psrlw           m5, 2                           ; m5 = out[72..79]
+    movu            [r1 + 128], m3
+    movu            [r1 + 144], m5
+
+    ; 80 to 95
+    movu            m5, [r0 + 176]                  ; m5 = s[88..95]
+    movu            m0, [r0 + 192]                  ; m0 = s[96..103]
+
+    palignr         m3, m2, m1, 14                  ; m3 = s[79..86], samples[i - 1] for m2
+    palignr         m4, m5, m2, 2                   ; m4 = s[81..88], samples[i + 1] for m2
+
+    psllw           m1, m2, 1
+    paddw           m3, m4
+    paddw           m1, m3
+    paddw           m1, [pw_2]
+    psrlw           m1, 2                           ; m1 = out[80..87]
+
+    palignr         m3, m5, m2, 14                  ; m3 = s[87..94], samples[i - 1] for m5
+    palignr         m4, m0, m5, 2                   ; m4 = s[89..96], samples[i + 1] for m5
+
+    psllw           m2, m5, 1                       ; result goes to m2 so raw m5 stays available for the next stage
+    paddw           m4, m3
+    paddw           m2, m4
+    paddw           m2, [pw_2]
+    psrlw           m2, 2                           ; m2 = out[88..95]
+    movu            [r1 + 160], m1
+    movu            [r1 + 176], m2
+
+    ; 96 to 111
+    movu            m1, [r0 + 208]                  ; m1 = s[104..111]
+    movu            m2, [r0 + 224]                  ; m2 = s[112..119]
+
+    palignr         m3, m0, m5, 14                  ; m3 = s[95..102], samples[i - 1] for m0 (m5 still holds raw s[88..95])
+    palignr         m4, m1, m0, 2                   ; m4 = s[97..104], samples[i + 1] for m0
+
+    psllw           m5, m0, 1
+    paddw           m3, m4
+    paddw           m5, m3
+    paddw           m5, [pw_2]
+    psrlw           m5, 2                           ; m5 = out[96..103]
+
+    palignr         m3, m1, m0, 14                  ; m3 = s[103..110], samples[i - 1] for m1
+    palignr         m4, m2, m1, 2                   ; m4 = s[105..112], samples[i + 1] for m1
+
+    psllw           m0, m1, 1
+    paddw           m4, m3
+    paddw           m0, m4
+    paddw           m0, [pw_2]
+    psrlw           m0, 2                           ; m0 = out[104..111]
+    movu            [r1 + 192], m5
+    movu            [r1 + 208], m0
+
+    ; 112 to 127
+    movu            m5, [r0 + 240]                  ; m5 = s[120..127]
+    movu            m0, [r0 + 256]                  ; m0 = s[128..135]; 16-byte load, only s[128] is consumed
+
+    palignr         m3, m2, m1, 14                  ; m3 = s[111..118], samples[i - 1] for m2
+    palignr         m4, m5, m2, 2                   ; m4 = s[113..120], samples[i + 1] for m2
+
+    psllw           m1, m2, 1
+    paddw           m3, m4
+    paddw           m1, m3
+    paddw           m1, [pw_2]
+    psrlw           m1, 2                           ; m1 = out[112..119]
+
+    palignr         m3, m5, m2, 14                  ; m3 = s[119..126], samples[i - 1] for m5
+    palignr         m4, m0, m5, 2                   ; m4 = s[121..128], samples[i + 1] for m5
+
+    psllw           m5, 1
+    paddw           m4, m3
+    paddw           m5, m4
+    paddw           m5, [pw_2]
+    psrlw           m5, 2                           ; m5 = out[120..127]
+    movu            [r1 + 224], m1
+    movu            [r1 + 240], m5
+
+    mov             [r1 + 128], r2w                 ; out[64] = topLast, overwriting lane 0 of the earlier store
+    mov             [r1 + 256], r3w                 ; out[128] = leftLast
+    RET


More information about the x265-devel mailing list