[x265] [PATCH] asm: intra_filter4x4 avx2 code, improved 8bit: 141c->118c, 10bit: 121c->88c

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Tue Jun 30 14:02:14 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1435663360 -19800
#      Tue Jun 30 16:52:40 2015 +0530
# Node ID 9340454d3b551f57ba9ce6a3f77fade041975e62
# Parent  b1301944894051b9641006797e4d6253b277f3e4
asm: intra_filter4x4 avx2 code, improved 8bit: 141c->118c, 10bit: 121c->88c

diff -r b13019448940 -r 9340454d3b55 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Jun 29 17:19:07 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Jun 30 16:52:40 2015 +0530
@@ -1290,6 +1290,8 @@
     }
     if (cpuMask & X265_CPU_AVX2)
     {
+        p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
+
         p.saoCuOrgE0 = PFX(saoCuOrgE0_avx2);
         p.saoCuOrgE1 = PFX(saoCuOrgE1_avx2);
         p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_avx2);
@@ -2619,6 +2621,8 @@
 #if X86_64
     if (cpuMask & X265_CPU_AVX2)
     {
+        p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
+
         p.planecopy_sp = PFX(downShift_16_avx2);
 
         p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx2);
diff -r b13019448940 -r 9340454d3b55 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Mon Jun 29 17:19:07 2015 +0530
+++ b/source/common/x86/intrapred16.asm	Tue Jun 30 16:52:40 2015 +0530
@@ -77,6 +77,7 @@
 
 intra_filter4_shuf0:                db  2,  3,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10 ,11, 12, 13
 intra_filter4_shuf1:                db 14, 15,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10 ,11, 12, 13
+intra_filter4_shuf2:        times 2 db  4,  5,  0,  1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
 
 ;; (blkSize - 1 - x)
 pw_planar4_0:                       dw  3,  2,  1,  0,  3,  2,  1,  0
@@ -22047,3 +22048,29 @@
     mov             [r1 + 128], r2w                 ; topLast
     mov             [r1 + 256], r3w                 ; LeftLast
     RET
+
+INIT_YMM avx2
+cglobal intra_filter_4x4, 2,4,4                 ; r0 = source samples (read), r1 = filtered output (written); r2/r3 scratch
+    mov             r2w, word [r0 + 16]         ; topLast:  word 8 of the source, stored back unfiltered below
+    mov             r3w, word [r0 + 32]         ; LeftLast: word 16 of the source, stored back unfiltered below
+
+    ; (prev + 2 * cur + next + 2) >> 2 on words 0..15 in one pass: low lane = words 0..7, high lane = words 8..15
+    movu            m0, [r0]                    ; [7 6 5 4 3 2 1 0] [15 14 13 12 11 10 9 8] samples[i]
+    vpbroadcastw    m2, xm0                     ; word 0 replicated into every word of both lanes
+    movu            m1, [r0 + 16]               ; words 8..23; NOTE(review): reads source bytes 16..47 - confirm the buffer is readable that far
+
+    palignr         m3, m0, m2, 14              ; [6 5 4 3 2 1 0 0] [14 13 12 11 10 9 8 0]
+    pshufb          m3, [intra_filter4_shuf2]   ; [6 5 4 3 2 1 0 1] [14 13 12 11 10 9 0 9] samples[i - 1]
+    palignr         m1, m0, 4                   ; [9 8 7 6 5 4 3 2] [17 16 15 14 13 12 11 10]
+    palignr         m1, m1, 14                  ; [8 7 6 5 4 3 2 9] [16 15 14 13 12 11 10 17] samples[i + 1] (rotate each lane left by one word)
+
+    psllw           m0, 1                       ; 2 * samples[i]
+    paddw           m3, m1                      ; samples[i - 1] + samples[i + 1]
+    paddw           m0, m3                      ; samples[i - 1] + 2 * samples[i] + samples[i + 1]
+    paddw           m0, [pw_2]                  ; + 2 (rounding)
+    psrlw           m0, 2                       ; >> 2
+
+    movu            [r1], m0                    ; store filtered words 0..15; word 8 is a don't-care value, overwritten next
+    mov             [r1 + 16], r2w              ; topLast:  output word 8 = unfiltered source word 8
+    mov             [r1 + 32], r3w              ; LeftLast: output word 16 = unfiltered source word 16
+    RET
diff -r b13019448940 -r 9340454d3b55 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Mon Jun 29 17:19:07 2015 +0530
+++ b/source/common/x86/intrapred8.asm	Tue Jun 30 16:52:40 2015 +0530
@@ -30,8 +30,9 @@
 intra_pred_shuff_0_8:    times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
 intra_pred_shuff_15_0:   times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
-intra_filter4_shuf0:  db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13
-intra_filter4_shuf1:  db 14,15,0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13
+intra_filter4_shuf0:  times 2 db  2,  3,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
+intra_filter4_shuf1:  times 2 db 14, 15,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
+intra_filter4_shuf2:  times 2 db  4,  5,  0,  1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
 
 pb_0_8        times 8 db  0,  8
 pb_unpackbw1  times 2 db  1,  8,  2,  8,  3,  8,  4,  8
@@ -18690,3 +18691,32 @@
     mov             [r1 +  64], r2b                 ; topLast
     mov             [r1 + 128], r3b                 ; LeftLast
     RET
+
+INIT_YMM avx2
+cglobal intra_filter_4x4, 2,4,4                 ; r0 = source samples (read), r1 = filtered output (written); r2/r3 scratch
+    mov             r2b, byte [r0 +  8]         ; topLast:  byte 8 of the source, stored back unfiltered below
+    mov             r3b, byte [r0 + 16]         ; LeftLast: byte 16 of the source, stored back unfiltered below
+
+    ; (prev + 2 * cur + next + 2) >> 2 on samples 0..15 in one pass: low lane = samples 0..7, high lane = samples 8..15
+    pmovzxbw        m0, [r0]                    ; [7 6 5 4 3 2 1 0] [15 14 13 12 11 10 9 8] samples[i], bytes zero-extended to words
+    vpbroadcastw    m2, xm0                     ; sample 0 replicated into every word of both lanes
+    pmovzxbw        m1, [r0 + 8]                ; samples 8..23 as words; NOTE(review): reads source bytes 8..23 - confirm the buffer is readable that far
+
+    palignr         m3, m0, m2, 14              ; [6 5 4 3 2 1 0 0] [14 13 12 11 10 9 8 0]
+    pshufb          m3, [intra_filter4_shuf2]   ; [6 5 4 3 2 1 0 1] [14 13 12 11 10 9 0 9] samples[i - 1]
+    palignr         m1, m0, 4                   ; [9 8 7 6 5 4 3 2] [17 16 15 14 13 12 11 10]
+    palignr         m1, m1, 14                  ; [8 7 6 5 4 3 2 9] [16 15 14 13 12 11 10 17] samples[i + 1] (rotate each lane left by one word)
+
+    psllw           m0, 1                       ; 2 * samples[i]
+    paddw           m3, m1                      ; samples[i - 1] + samples[i + 1]
+    paddw           m0, m3                      ; samples[i - 1] + 2 * samples[i] + samples[i + 1]
+    paddw           m0, [pw_2]                  ; + 2 (rounding)
+    psrlw           m0, 2                       ; >> 2; result fits in a byte because every input was a zero-extended byte
+
+    packuswb        m0, m0                      ; words -> bytes per lane; each lane holds its 8 result bytes in both qwords
+    vpermq          m0, m0, 10001000b           ; qword 0 = low-lane results (0..7), qword 1 = high-lane results (8..15)
+
+    movu            [r1], xm0                   ; store filtered bytes 0..15; byte 8 is a don't-care value, overwritten next
+    mov             [r1 +  8], r2b              ; topLast:  output byte 8 = unfiltered source byte 8
+    mov             [r1 + 16], r3b              ; LeftLast: output byte 16 = unfiltered source byte 16
+    RET


More information about the x265-devel mailing list