[vlc-devel] [PATCH 14/25] packetizer: add startcode_FindAnnexB AVX2 asm

Victorien Le Couviour--Tuffet victorien.lecouviour.tuffet at gmail.com
Tue Apr 14 12:40:25 CEST 2020


---
 modules/packetizer/startcode.asm      | 103 ++++++++++++++++++++++++++
 modules/packetizer/startcode_helper.h |   9 +++
 test/modules/packetizer/helpers.c     |  10 +++
 3 files changed, 122 insertions(+)

diff --git a/modules/packetizer/startcode.asm b/modules/packetizer/startcode.asm
index cac5eea116..a74463dc1b 100644
--- a/modules/packetizer/startcode.asm
+++ b/modules/packetizer/startcode.asm
@@ -171,3 +171,106 @@ DEFINE_ARGS ptr, _, size, tmp
     add            r3d, r4d
     lea            rax, [ptrq+r3q]
     RET
+
+%if ARCH_X86_64
+; startcode_FindAnnexB(ptr, end): address of the first 00 00 01 found in [ptr, end), else NULL
+INIT_YMM avx2
+cglobal startcode_FindAnnexB, 2, 7, 5, ptr, end, size
+%define base r6q-shuf_pad2dw
+    lea            r6q, [shuf_pad2dw]
+    vbroadcasti128  m0, [base+shuf_pad2dw]     ; shuffle mask, same 16 bytes in both lanes
+    vbroadcasti128  m1, [base+pd_0x00010000]   ; dword pattern every candidate is compared to
+    mov          sizeq, endq
+    sub          sizeq, ptrq                   ; size = end - ptr, kept 64-bit everywhere below
+    cmp          sizeq, 28
+    jl .xmm                                    ; one 256-bit pass loads 28 bytes
+
+.loop:
+    movu           xm2, [ptrq+ 0]              ; low lane:  bytes  0..15
+    vinserti128     m2, [ptrq+12], 1           ; high lane: bytes 12..27
+    psrldq          m3, m2, 1                  ; view starting 1 byte later (per lane)
+    psrldq          m4, m2, 2                  ; view starting 2 bytes later (per lane)
+    pshufb          m2, m0                     ; presumably pads 3-byte groups to dwords - confirm in shuf_pad2dw
+    pshufb          m3, m0
+    pshufb          m4, m0
+    pcmpeqd         m2, m1
+    pcmpeqd         m3, m1
+    pcmpeqd         m4, m1
+    movmskps       r3d, m2                     ; one bit per matching dword
+    movmskps       r4d, m3
+    movmskps       r5d, m4
+    tzcnt          r3d, r3d                    ; index of first matching dword, 32 if none
+    tzcnt          r4d, r4d
+    tzcnt          r5d, r5d
+    cmp            r4d, r3d
+    cmovle         r3d, r4d                    ; r3d = min(r3d, r4d)
+    setle          r4b                         ; r4d = 1 if the 1-byte view won (or tied)
+    cmp            r5d, r3d
+    cmovle         r3d, r5d                    ; r3d = min over the three views
+    setle          r5b                         ; r5d = 1 if the 2-byte view won (or tied)
+    test           r3d, 32
+    jz .found                                  ; bit 5 clear: at least one view matched
+    add           ptrq, 24                     ; next window (.found yields offsets 0..23)
+    sub          sizeq, 24
+    cmp          sizeq, 28
+    jge .loop
+
+INIT_XMM avx2
+.xmm:
+    cmp          sizeq, 16                     ; fewer than 28 bytes left: try one 128-bit pass
+    jl .end
+    movu            m2, [ptrq]
+    psrldq          m3, m2, 1
+    psrldq          m4, m2, 2
+    pshufb          m2, m0
+    pshufb          m3, m0
+    pshufb          m4, m0
+    pcmpeqd         m2, m1
+    pcmpeqd         m3, m1
+    pcmpeqd         m4, m1
+    movmskps       r3d, m2
+    movmskps       r4d, m3
+    movmskps       r5d, m4
+    tzcnt          r3d, r3d
+    tzcnt          r4d, r4d
+    tzcnt          r5d, r5d
+    cmp            r4d, r3d
+    cmovle         r3d, r4d
+    setle          r4b
+    cmp            r5d, r3d
+    cmovle         r3d, r5d
+    setle          r5b
+    test           r3d, 32
+    jnz .end                                   ; nothing here: scalar loop rescans from ptr
+INIT_YMM avx2 ; ymm regs were written above; RET only emits vzeroupper when mmsize is 32
+.found:
+    lea            r3d, [r3d*3]                ; dword index -> byte offset, 3 bytes per index
+    mov            r6d, r4d
+    xor            r4d, r5d
+    and            r4d, r6d                    ; r4d = r4 & ~r5
+    shl            r5d, 1
+    or             r4d, r5d                    ; byte shift of the winning view: 0, 1 or 2
+    add            r3d, r4d
+    lea            rax, [ptrq+r3q]
+    RET
+
+DEFINE_ARGS ptr, _, size, tmp
+.end:
+    sub          sizeq, 3                      ; index of the last valid 3-byte position
+    jl .ret_null                               ; fewer than 3 bytes left
+.end_loop:
+    xor           tmpd, tmpd
+    test   word [ptrq], 0xFFFF                 ; ZF when ptr[0] == ptr[1] == 0
+    cmovz         tmpw, [ptrq+1]               ; then tmp = ptr[1] | ptr[2] << 8
+    xor           tmpd, 0x0100                 ; zero only for the bytes 00 00 01
+    jz .ret
+    inc           ptrq
+    dec          sizeq
+    jge .end_loop
+.ret_null:
+    xor           ptrq, ptrq
+.ret:
+    mov            rax, ptrq
+    RET
+
+%endif
diff --git a/modules/packetizer/startcode_helper.h b/modules/packetizer/startcode_helper.h
index 73425e9970..d325aca7d7 100644
--- a/modules/packetizer/startcode_helper.h
+++ b/modules/packetizer/startcode_helper.h
@@ -72,12 +72,21 @@ uint8_t const *vlcpriv_startcode_FindAnnexB_sse2(uint8_t const *ptr,
                                                  uint8_t const *end);
 uint8_t const *vlcpriv_startcode_FindAnnexB_ssse3(uint8_t const *ptr,
                                                   uint8_t const *end);
+# ifdef __x86_64__ /* the AVX2 routine is only assembled when ARCH_X86_64 is set */
+uint8_t const *vlcpriv_startcode_FindAnnexB_avx2(uint8_t const *ptr,
+                                                 uint8_t const *end);
+# endif /* __x86_64__ */
 #endif
 
 static inline block_startcode_helper_t
 startcode_FindAnnexB_helper(void)
 {
 #if defined(__i386__) || defined(__x86_64__)
+# ifdef __x86_64__
+    if (vlc_CPU_AVX2())
+        return vlcpriv_startcode_FindAnnexB_avx2;
+    else
+# endif
     if (vlc_CPU_SSSE3())
         return vlcpriv_startcode_FindAnnexB_ssse3;
     else if (vlc_CPU_SSE2())
diff --git a/test/modules/packetizer/helpers.c b/test/modules/packetizer/helpers.c
index 64e982c2d5..ebca21ae22 100644
--- a/test/modules/packetizer/helpers.c
+++ b/test/modules/packetizer/helpers.c
@@ -94,6 +94,16 @@ static int run_annexb_sets( const uint8_t *p_set, const uint8_t *p_end,
         if( i_ret != 0 )
             return i_ret;
     }
+# ifdef __x86_64__
+    if (vlc_CPU_AVX2())
+    {
+        printf("checking AVX2 asm:\n");
+        i_ret = check_set( p_set, p_end, p_results, i_results, i_results_offset,
+                           vlcpriv_startcode_FindAnnexB_avx2 );
+        if( i_ret != 0 )
+            return i_ret;
+    }
+# endif
 #endif
 
     return 0;
-- 
2.24.1



More information about the vlc-devel mailing list