[x265-commits] [x265] asm: intra_filter4x4 avx2 code, improved 8bit: 141c->118c...

Dnyaneshwar G dnyaneshwar at multicorewareinc.com
Thu Jul 2 17:55:48 CEST 2015


details:   http://hg.videolan.org/x265/rev/1e41b3bcd911
branches:  
changeset: 10743:1e41b3bcd911
user:      Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date:      Tue Jun 30 16:52:40 2015 +0530
description:
asm: intra_filter4x4 avx2 code, improved 8bit: 141c->118c, 10bit: 121c->88c
Subject: [x265] asm: pixelavg_pp[8xN] sse2 code for 10bpp

details:   http://hg.videolan.org/x265/rev/86ab0e0e5c1a
branches:  
changeset: 10744:86ab0e0e5c1a
user:      Rajesh Paulraj<rajesh at multicorewareinc.com>
date:      Wed Jul 01 12:56:19 2015 +0530
description:
asm: pixelavg_pp[8xN] sse2 code for 10bpp

     avg_pp[  8x4]  5.12x    125.34          641.61
     avg_pp[  8x8]  5.95x    202.53          1205.34
     avg_pp[ 8x16]  6.94x    334.54          2322.57
     avg_pp[ 8x32]  8.15x    589.39          4806.23
Subject: [x265] testbench: fix bug in plane_copy_cp, detect outside bound now

details:   http://hg.videolan.org/x265/rev/027ae320032e
branches:  
changeset: 10745:027ae320032e
user:      Min Chen <chenm003 at 163.com>
date:      Wed Jul 01 17:05:48 2015 -0700
description:
testbench: fix bug in plane_copy_cp, detect outside bound now
Subject: [x265] asm: fix buffer overwrite bug in upShift_8_avx2

details:   http://hg.videolan.org/x265/rev/76a314f91799
branches:  
changeset: 10746:76a314f91799
user:      Min Chen <chenm003 at 163.com>
date:      Wed Jul 01 17:05:52 2015 -0700
description:
asm: fix buffer overwrite bug in upShift_8_avx2

diffstat:

 source/common/x86/asm-primitives.cpp |    4 +
 source/common/x86/intrapred16.asm    |   27 ++++++++
 source/common/x86/intrapred8.asm     |   34 ++++++++++-
 source/common/x86/mc-a.asm           |   86 ++++++++++++++++++++++++++-
 source/common/x86/pixel-a.asm        |  107 ++++++++++++----------------------
 source/test/pixelharness.cpp         |    6 +-
 6 files changed, 184 insertions(+), 80 deletions(-)

diffs (truncated from 383 to 300 lines):

diff -r 68d089360477 -r 76a314f91799 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Jul 01 14:51:36 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Jul 01 17:05:52 2015 -0700
@@ -1290,6 +1290,8 @@ void setupAssemblyPrimitives(EncoderPrim
     }
     if (cpuMask & X265_CPU_AVX2)
     {
+        p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
+
         p.saoCuOrgE0 = PFX(saoCuOrgE0_avx2);
         p.saoCuOrgE1 = PFX(saoCuOrgE1_avx2);
         p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_avx2);
@@ -2619,6 +2621,8 @@ void setupAssemblyPrimitives(EncoderPrim
 #if X86_64
     if (cpuMask & X265_CPU_AVX2)
     {
+        p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
+
         p.planecopy_sp = PFX(downShift_16_avx2);
 
         p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx2);
diff -r 68d089360477 -r 76a314f91799 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Wed Jul 01 14:51:36 2015 +0530
+++ b/source/common/x86/intrapred16.asm	Wed Jul 01 17:05:52 2015 -0700
@@ -77,6 +77,7 @@ const pw_ang16_16,                  db  
 
 intra_filter4_shuf0:                db  2,  3,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10 ,11, 12, 13
 intra_filter4_shuf1:                db 14, 15,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10 ,11, 12, 13
+intra_filter4_shuf2:        times 2 db  4,  5,  0,  1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
 
 ;; (blkSize - 1 - x)
 pw_planar4_0:                       dw  3,  2,  1,  0,  3,  2,  1,  0
@@ -22047,3 +22048,29 @@ cglobal intra_filter_32x32, 2,4,6
     mov             [r1 + 128], r2w                 ; topLast
     mov             [r1 + 256], r3w                 ; LeftLast
     RET
+
+INIT_YMM avx2
+cglobal intra_filter_4x4, 2,4,4                 ; r0 = unfiltered samples (read), r1 = filtered samples (written)
+    mov             r2w, word [r0 + 16]         ; topLast: sample 8, copied through unfiltered
+    mov             r3w, word [r0 + 32]         ; LeftLast: sample 16, copied through unfiltered
+
+    ; filter samples 0..15 in one pass: out[i] = (prev + 2 * s[i] + next + rounding) >> 2
+    movu            m0, [r0]                    ; samples [7..0] [15..8]
+    vpbroadcastw    m2, xm0                     ; sample 0 in every word
+    movu            m1, [r0 + 16]               ; samples [15..8] [23..16]
+
+    palignr         m3, m0, m2, 14              ; [6 5 4 3 2 1 0 0] [14 13 12 11 10 9 8 0]
+    pshufb          m3, [intra_filter4_shuf2]   ; [6 5 4 3 2 1 0 1] [14 13 12 11 10 9 0 9] samples[i - 1]
+    palignr         m1, m0, 4                   ; [9 8 7 6 5 4 3 2] [17 16 15 14 13 12 11 10]
+    palignr         m1, m1, 14                  ; [8 7 6 5 4 3 2 9] [16 15 14 13 12 11 10 17] samples[i + 1]
+
+    psllw           m0, 1                       ; 2 * samples[i]
+    paddw           m3, m1                      ; samples[i - 1] + samples[i + 1]
+    paddw           m0, m3
+    paddw           m0, [pw_2]                  ; rounding term (pw_2 is defined elsewhere; presumably words of 2)
+    psrlw           m0, 2
+
+    movu            [r1], m0                    ; words 0..15; word 8 is replaced by the store below
+    mov             [r1 + 16], r2w              ; topLast
+    mov             [r1 + 32], r3w              ; LeftLast
+    RET
diff -r 68d089360477 -r 76a314f91799 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Wed Jul 01 14:51:36 2015 +0530
+++ b/source/common/x86/intrapred8.asm	Wed Jul 01 17:05:52 2015 -0700
@@ -30,8 +30,9 @@ SECTION_RODATA 32
 intra_pred_shuff_0_8:    times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
 intra_pred_shuff_15_0:   times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
-intra_filter4_shuf0:  db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13
-intra_filter4_shuf1:  db 14,15,0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13
+intra_filter4_shuf0:  times 2 db  2,  3,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
+intra_filter4_shuf1:  times 2 db 14, 15,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
+intra_filter4_shuf2:  times 2 db  4,  5,  0,  1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
 
 pb_0_8        times 8 db  0,  8
 pb_unpackbw1  times 2 db  1,  8,  2,  8,  3,  8,  4,  8
@@ -18690,3 +18691,32 @@ cglobal intra_filter_32x32, 2,4,6
     mov             [r1 +  64], r2b                 ; topLast
     mov             [r1 + 128], r3b                 ; LeftLast
     RET
+
+INIT_YMM avx2
+cglobal intra_filter_4x4, 2,4,4                 ; r0 = unfiltered samples (read), r1 = filtered samples (written)
+    mov             r2b, byte [r0 +  8]         ; topLast: sample 8, copied through unfiltered
+    mov             r3b, byte [r0 + 16]         ; LeftLast: sample 16, copied through unfiltered
+
+    ; filter samples 0..15 in one pass: out[i] = (prev + 2 * s[i] + next + rounding) >> 2
+    pmovzxbw        m0, [r0]                    ; bytes widened to words: samples [7..0] [15..8]
+    vpbroadcastw    m2, xm0                     ; sample 0 in every word
+    pmovzxbw        m1, [r0 + 8]                ; samples [15..8] [23..16]
+
+    palignr         m3, m0, m2, 14              ; [6 5 4 3 2 1 0 0] [14 13 12 11 10 9 8 0]
+    pshufb          m3, [intra_filter4_shuf2]   ; [6 5 4 3 2 1 0 1] [14 13 12 11 10 9 0 9] samples[i - 1]
+    palignr         m1, m0, 4                   ; [9 8 7 6 5 4 3 2] [17 16 15 14 13 12 11 10]
+    palignr         m1, m1, 14                  ; [8 7 6 5 4 3 2 9] [16 15 14 13 12 11 10 17] samples[i + 1]
+
+    psllw           m0, 1                       ; 2 * samples[i]
+    paddw           m3, m1                      ; samples[i - 1] + samples[i + 1]
+    paddw           m0, m3
+    paddw           m0, [pw_2]                  ; rounding term (pw_2 is defined elsewhere; presumably words of 2)
+    psrlw           m0, 2
+
+    packuswb        m0, m0                      ; words -> bytes; each lane now holds its 8 results twice
+    vpermq          m0, m0, 10001000b           ; move qwords 0 and 2 into the low 128 bits: bytes 0..15
+
+    movu            [r1], xm0                   ; bytes 0..15; byte 8 is replaced by the store below
+    mov             [r1 +  8], r2b              ; topLast
+    mov             [r1 + 16], r3b              ; LeftLast
+    RET
diff -r 68d089360477 -r 76a314f91799 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Wed Jul 01 14:51:36 2015 +0530
+++ b/source/common/x86/mc-a.asm	Wed Jul 01 17:05:52 2015 -0700
@@ -4009,6 +4009,87 @@ cglobal pixel_avg_w%1
     AVG_END
 %endmacro
 
+%macro  pixel_avg_W8 0
+    movu    m0, [r2]                ; row 0 of first source
+    movu    m1, [r4]                ; row 0 of second source
+    pavgw   m0, m1                  ; rounded unsigned word average
+    movu    [r0], m0                ; row 0 of destination
+    movu    m2, [r2 + r3]           ; row 1 (r3/r5/r1 = byte strides of src0/src1/dst)
+    movu    m3, [r4 + r5]
+    pavgw   m2, m3
+    movu    [r0 + r1], m2
+
+    movu    m0, [r2 + r3 * 2]       ; row 2
+    movu    m1, [r4 + r5 * 2]
+    pavgw   m0, m1
+    movu    [r0 + r1 * 2], m0
+    movu    m2, [r2 + r6]           ; row 3: callers set r6/r7/r8 = 3 * r3/r5/r1
+    movu    m3, [r4 + r7]
+    pavgw   m2, m3
+    movu    [r0 + r8], m2
+
+    lea     r0, [r0 + 4 * r1]       ; advance all three pointers by 4 rows
+    lea     r2, [r2 + 4 * r3]
+    lea     r4, [r4 + 4 * r5]
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------------------------
+;void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
+;-------------------------------------------------------------------------------------------------------------------------------
+%if HIGH_BIT_DEPTH
+%if ARCH_X86_64
+INIT_XMM sse2
+cglobal pixel_avg_8x4, 6,9,4        ; r0 = dst, r1 = dst stride, r2 = src0, r3 = src0 stride, r4 = src1, r5 = src1 stride
+    add     r1d, r1d                ; double each stride (this path is built only when HIGH_BIT_DEPTH is set)
+    add     r3d, r3d
+    add     r5d, r5d
+    lea     r6, [r3 * 3]            ; r6 = 3 * src0 stride, row-3 offset used by pixel_avg_W8
+    lea     r7, [r5 * 3]            ; r7 = 3 * src1 stride
+    lea     r8, [r1 * 3]            ; r8 = 3 * dst stride
+    pixel_avg_W8                    ; one expansion averages 4 rows
+    RET
+
+cglobal pixel_avg_8x8, 6,9,4        ; r0 = dst, r1 = dst stride, r2 = src0, r3 = src0 stride, r4 = src1, r5 = src1 stride
+    add     r1d, r1d                ; double each stride (this path is built only when HIGH_BIT_DEPTH is set)
+    add     r3d, r3d
+    add     r5d, r5d
+    lea     r6, [r3 * 3]            ; r6 = 3 * src0 stride, row-3 offset used by pixel_avg_W8
+    lea     r7, [r5 * 3]            ; r7 = 3 * src1 stride
+    lea     r8, [r1 * 3]            ; r8 = 3 * dst stride
+    pixel_avg_W8                    ; rows 0..3; the macro advances r0/r2/r4 by 4 rows
+    pixel_avg_W8                    ; rows 4..7
+    RET
+
+cglobal pixel_avg_8x16, 6,10,4      ; r0 = dst, r1 = dst stride, r2 = src0, r3 = src0 stride, r4 = src1, r5 = src1 stride
+    add     r1d, r1d                ; double each stride (this path is built only when HIGH_BIT_DEPTH is set)
+    add     r3d, r3d
+    add     r5d, r5d
+    lea     r6, [r3 * 3]            ; r6 = 3 * src0 stride, row-3 offset used by pixel_avg_W8
+    lea     r7, [r5 * 3]            ; r7 = 3 * src1 stride
+    lea     r8, [r1 * 3]            ; r8 = 3 * dst stride
+    mov     r9d, 4                  ; 4 iterations x 4 rows per expansion = 16 rows
+.loop:
+    pixel_avg_W8                    ; averages 4 rows and advances r0/r2/r4
+    dec     r9d
+    jnz     .loop
+    RET
+
+cglobal pixel_avg_8x32, 6,10,4      ; r0 = dst, r1 = dst stride, r2 = src0, r3 = src0 stride, r4 = src1, r5 = src1 stride
+    add     r1d, r1d                ; double each stride (this path is built only when HIGH_BIT_DEPTH is set)
+    add     r3d, r3d
+    add     r5d, r5d
+    lea     r6, [r3 * 3]            ; r6 = 3 * src0 stride, row-3 offset used by pixel_avg_W8
+    lea     r7, [r5 * 3]            ; r7 = 3 * src1 stride
+    lea     r8, [r1 * 3]            ; r8 = 3 * dst stride
+    mov     r9d, 8                  ; 8 iterations x 4 rows per expansion = 32 rows
+.loop:
+    pixel_avg_W8                    ; averages 4 rows and advances r0/r2/r4
+    dec     r9d
+    jnz     .loop
+    RET
+%endif
+%endif
+
 %if HIGH_BIT_DEPTH
 
 INIT_MMX mmx2
@@ -4060,11 +4141,6 @@ AVGH  4, 8
 AVGH  4, 4
 AVGH  4, 2
 
-AVG_FUNC 8, movdqu, movdqa
-AVGH  8, 32
-AVGH  8, 16
-AVGH  8,  8
-AVGH  8,  4
 
 AVG_FUNC 16, movdqu, movdqa
 AVGH  16, 64
diff -r 68d089360477 -r 76a314f91799 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Wed Jul 01 14:51:36 2015 +0530
+++ b/source/common/x86/pixel-a.asm	Wed Jul 01 17:05:52 2015 -0700
@@ -69,6 +69,7 @@ cextern pd_1
 cextern popcnt_table
 cextern pd_2
 cextern hmul_16p
+cextern pb_movemask
 
 ;=============================================================================
 ; SATD
@@ -7299,10 +7300,9 @@ cglobal downShift_16, 6,7,3
 ;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
 ;---------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal upShift_8, 7,7,3
-
-    movd        m2, r6d        ; m0 = shift
-    add         r3, r3
+cglobal upShift_8, 6,7,3
+    movd        m2, r6m        ; m0 = shift
+    add         r3d, r3d
     dec         r5d
 
 .loopH:
@@ -7393,88 +7393,55 @@ cglobal upShift_8, 7,7,3
 ;---------------------------------------------------------------------------------------------------------------------
 %if ARCH_X86_64
 INIT_YMM avx2
-cglobal upShift_8, 7,8,3
-    movd        xm2, r6d
-    add         r3, r3
+cglobal upShift_8, 6,7,4
+    movd        xm2, r6m
+    add         r3d, r3d
+    dec         r5d
 
 .loopH:
-    xor         r7, r7
-    mov         r6d, r4d
+    xor         r6, r6
 .loopW:
-    pmovzxbw    m0,[r0 + r7]
-    pmovzxbw    m1,[r0 + r7 + 16]
+    pmovzxbw    m0,[r0 + r6]
+    pmovzxbw    m1,[r0 + r6 + mmsize/2]
     psllw       m0, xm2
     psllw       m1, xm2
-    movu        [r2 + r7 * 2], m0
-    movu        [r2 + r7 * 2 + 32], m1
-
-    add         r7d, 32
-    sub         r6d, 32
-    jg          .loopW
+    movu        [r2 + r6 * 2], m0
+    movu        [r2 + r6 * 2 + mmsize], m1
+
+    add         r6d, mmsize
+    cmp         r6d, r4d
+    jl         .loopW
 
     ; move to next row
     add         r0, r1
     add         r2, r3
     dec         r5d
-    jnz         .loopH
-
-;processing last row of every frame [To handle width which not a multiple of 16]
-
-.loop16:
+    jg         .loopH
+
+    ; processing last row of every frame [To handle width which not a multiple of 32]
+    lea         r3, [pb_movemask + 16]
+    mov         r5d, 15
+    and         r5d, r4d
+    sub         r3, r5
+    pmovsxbw    m3, [r3]
+
+    ; NOTE: Width MUST BE more than or equal to 16
+    shr         r4d, 4
+.loopW2:
     pmovzxbw    m0,[r0]
     psllw       m0, xm2
     movu        [r2], m0
-
-    add         r0, mmsize


More information about the x265-commits mailing list