[x265] [PATCH] arm: Implement interp_8tap_vert_pp_4xn_neon and interp_8tap_vert_pp_8xn_neon

Ramya Sriraman ramya at multicorewareinc.com
Tue Mar 15 14:05:34 CET 2016


Thanks Min,
I referred to ffmpeg's NEON version of the filter and tried a similar
design, starting with 8x8. Right now, the filter works for coefficient
indices 0 and 1 only. Is this what you had in mind for better
performance?
---------------------------------------------------------------------------------------------------------------------------------------------------------------------
// qpel_filter_1: 8-tap vertical luma filter for coefficient index 1,
// taps {-1, 4, -10, 58, 17, -5, 1, 0} applied to rows a..h.
//
// In:       d0..d6 = rows a..g, eight u8 pixels each
//           (row h / d7 has tap 0 and is not read)
// Out:      q9  = 32-bit filter sums for pixels 0-3 (not yet rounded/shifted)
//           q10 = 32-bit filter sums for pixels 4-7
// Clobbers: q1, q2 (so input rows d2..d5 are destroyed), q8, q11-q15
//
// In the comments below, x0 / x1 mean the low / high four pixels of row x
// (they are NOT the NEON registers d0 / d1).
.macro qpel_filter_1
        vmov.i16        d16, #58
        vmovl.u8        q11, d3         // widen row d to 16 bit
        vmull.s16       q9, d22, d16    // 58 * d0
        vmull.s16       q10, d23, d16   // 58 * d1

        vmov.i16        d17, #10
        vmovl.u8        q13, d2         // widen row c
        vmull.s16       q11, d26, d17   // 10 * c0
        vmull.s16       q12, d27, d17   // 10 * c1

        vmov.i16        d16, #17
        vmovl.u8        q15, d4         // widen row e
        vmull.s16       q13, d30, d16   // 17 * e0
        vmull.s16       q14, d31, d16   // 17 * e1

        vmov.i16        d17, #5
        vmovl.u8        q1, d5          // widen row f (overwrites rows c and d, already consumed)
        vmull.s16       q15, d2, d17    //  5 * f0
        vmull.s16       q8, d3, d17     //  5 * f1 (q8 overlaps the constant in d17; last use of it)

        vsub.s32        q9, q11         // 58 * d0 - 10 * c0
        vsub.s32        q10, q12        // 58 * d1 - 10 * c1

        vmovl.u8        q1, d1          // widen row b
        vshll.s16       q11, d2, #2     // 4 * b0
        vshll.s16       q12, d3, #2     // 4 * b1

        vadd.s32        q9, q13         // 58 * d0 - 10 * c0 + 17 * e0
        vadd.s32        q10, q14        // 58 * d1 - 10 * c1 + 17 * e1

        vmovl.u8        q1, d0          // widen row a
        vmovl.u8        q2, d6          // widen row g (overwrites rows e and f, already consumed)
        vsubl.s16       q13, d4, d2     // g0 - a0
        vsubl.s16       q14, d5, d3     // g1 - a1

        vadd.s32        q9, q11         // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0
        vadd.s32        q10, q12        // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1
        vsub.s32        q13, q15        // g0 - a0 - 5 * f0
        vsub.s32        q14, q8         // g1 - a1 - 5 * f1
        vadd.s32        q9, q13         // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0
        vadd.s32        q10, q14        // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1
.endm
// qpel_filter_2: 8-tap vertical luma filter for coefficient index 2,
// taps {-1, 4, -11, 40, 40, -11, 4, -1} applied to rows a..h.
//
// In:       d0..d7 = rows a..h, eight u8 pixels each (left intact)
// Out:      q9 / q10 = 32-bit filter sums for pixels 0-3 / 4-7
// Clobbers: q8, q11-q14
//
// The taps are symmetric, so mirrored rows are summed first. With 8-bit
// pixels every intermediate stays within [-6120, 22440], so the sum is
// built in 16-bit lanes and only widened to 32 bit at the end.
.macro qpel_filter_2
        vaddl.u8        q11, d3, d4     // d + e
        vaddl.u8        q12, d2, d5     // c + f
        vaddl.u8        q13, d1, d6     // b + g
        vaddl.u8        q14, d0, d7     // a + h
        vmov.i16        q8, #40
        vmul.i16        q11, q11, q8    // 40 * (d + e)
        vmov.i16        q8, #11
        vmls.i16        q11, q12, q8    // ... - 11 * (c + f)
        vshl.i16        q13, q13, #2    // 4 * (b + g)
        vadd.i16        q11, q11, q13   // ... + 4 * (b + g)
        vsub.i16        q11, q11, q14   // ... - (a + h)
        vmovl.s16       q9, d22         // widen pixels 0-3
        vmovl.s16       q10, d23        // widen pixels 4-7
.endm

// qpel_filter_3: 8-tap vertical luma filter for coefficient index 3,
// taps {0, 1, -5, 17, 58, -10, 4, -1} applied to rows a..h
// (the mirror image of coefficient index 1).
//
// In:       d1..d7 = rows b..h, eight u8 pixels each (left intact;
//           row a / d0 has tap 0 and is not read)
// Out:      q9 / q10 = 32-bit filter sums for pixels 0-3 / 4-7
// Clobbers: q8, q11, q12
//
// With 8-bit pixels every intermediate stays within [-4080, 20400], so
// the sum is built in 16-bit lanes and only widened to 32 bit at the end.
.macro qpel_filter_3
        vmovl.u8        q11, d4         // e
        vmov.i16        q8, #58
        vmul.i16        q11, q11, q8    // 58 * e
        vmovl.u8        q12, d3         // d
        vmov.i16        q8, #17
        vmla.i16        q11, q12, q8    // ... + 17 * d
        vmovl.u8        q12, d5         // f
        vmov.i16        q8, #10
        vmls.i16        q11, q12, q8    // ... - 10 * f
        vmovl.u8        q12, d2         // c
        vmov.i16        q8, #5
        vmls.i16        q11, q12, q8    // ... - 5 * c
        vshll.u8        q12, d6, #2     // 4 * g
        vadd.i16        q11, q11, q12   // ... + 4 * g
        vsubl.u8        q12, d1, d7     // b - h
        vadd.i16        q11, q11, q12   // ... + b - h
        vmovl.s16       q9, d22         // widen pixels 0-3
        vmovl.s16       q10, d23        // widen pixels 4-7
.endm

// LUMA_VPP_8xN h: emits x265_interp_8tap_vert_pp_8x<h>_neon, an 8-tap
// vertical luma interpolation for a block 8 pixels wide and <h> rows high.
//
// Register use, as read from the code below:
//   r0       = src; rewound by 3 rows on entry so the 8-row window starts at row a
//   r1       = source stride in bytes
//   r2       = dst
//   r3       = destination stride in bytes
//   [sp]     = coefficient index (first stack argument), kept in r5
//   r4       = rows remaining, r6 = running load pointer
// Presumably matches x265's
//   interp_8tap_vert_pp(const pixel*, intptr_t, pixel*, intptr_t, int coeffIdx)
// -- confirm against primitives.h.
//
// Each filter case leaves 32-bit sums in q9/q10 and branches to the shared
// tail at 9:, which rounds, shifts right by 6, saturates to u8 and stores.
// Any index other than 0, 2 or 3 is treated as index 1 (as in the first draft).
//
// NOTE(review): all eight rows are reloaded for every output row; a sliding
// window that keeps seven rows in registers would cut loads roughly 8x.
.macro LUMA_VPP_8xN h
function x265_interp_8tap_vert_pp_8x\h\()_neon
    push            {r4, r5, r6, r7}
    ldr             r5, [sp, #4 * 4]        // coeffIdx sits just above the 4 saved registers
    add             r4, r1, r1, lsl #1      // r4 = 3 * srcStride
    sub             r0, r4                  // src -= 3 * srcStride (row a of the first window)
    mov             r4, #\h                 // row counter
.loop_8x\h:
    mov             r6, r0
    pld             [r6]
    vld1.u8         d0, [r6], r1            // row a
    pld             [r6]
    vld1.u8         d1, [r6], r1            // row b
    pld             [r6]
    vld1.u8         d2, [r6], r1            // row c
    pld             [r6]
    vld1.u8         d3, [r6], r1            // row d
    pld             [r6]
    vld1.u8         d4, [r6], r1            // row e
    pld             [r6]
    vld1.u8         d5, [r6], r1            // row f
    pld             [r6]
    vld1.u8         d6, [r6], r1            // row g
    pld             [r6]
    vld1.u8         d7, [r6], r1            // row h

    cmp             r5, #0
    beq             0f
    cmp             r5, #2
    beq             2f
    cmp             r5, #3
    beq             3f
1:
    qpel_filter_1
    b               9f                      // must not fall into the other cases
2:
    qpel_filter_2
    b               9f
3:
    qpel_filter_3
    b               9f
0:
    vmov.i16        d17, #64                // index 0: single tap of 64 on row d
    vmovl.u8        q11, d3
    vmull.s16       q9, d22, d17            // 64 * d0
    vmull.s16       q10, d23, d17           // 64 * d1
9:
    vqrshrun.s32    d0, q9, #6              // (sum + 32) >> 6, saturated to u16
    vqrshrun.s32    d1, q10, #6
    vqmovn.u16      d0, q0                  // saturate to u8
    vst1.u8         d0, [r2], r3            // store 8 pixels, advance dst by dstStride

    add             r0, r1                  // slide the window down one row
    subs            r4, #1
    bne             .loop_8x\h

    pop             {r4, r5, r6, r7}
    bx              lr
endfunc
.endm

LUMA_VPP_8xN 8



Thank you
Regards
Ramya

On Mon, Mar 14, 2016 at 5:43 PM, chen <chenm003 at 163.com> wrote:

>
>
> At 2016-03-14 19:09:19,ramya at multicorewareinc.com wrote:
> ># HG changeset patch
> ># User Ramya Sriraman<ramya at multicorewareinc.com>
> ># Date 1457681682 -19800
> >#      Fri Mar 11 13:04:42 2016 +0530
> ># Node ID c1d2fa2ca49d4027252bd52176ecbe2db0d0eddd
> ># Parent  0af38750a71aab5fe790993365aaaa3e209a7d5c
> >arm: Implement interp_8tap_vert_pp_4xn_neon and interp_8tap_vert_pp_8xn_neon
> >
> >diff -r 0af38750a71a -r c1d2fa2ca49d source/common/arm/asm-primitives.cpp
> >--- a/source/common/arm/asm-primitives.cpp	Thu Mar 10 21:43:35 2016 +0530
> >+++ b/source/common/arm/asm-primitives.cpp	Fri Mar 11 13:04:42 2016 +0530
> >@@ -290,6 +290,14 @@
> >         // planecopy
> >         p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
> >
> >+        // interpolation filters
> >+        p.pu[LUMA_4x4].luma_vpp = PFX(interp_8tap_vert_pp_4x4_neon);
> >+        p.pu[LUMA_4x8].luma_vpp = PFX(interp_8tap_vert_pp_4x8_neon);
> >+        p.pu[LUMA_4x16].luma_vpp = PFX(interp_8tap_vert_pp_4x16_neon);
> >+        p.pu[LUMA_8x4].luma_vpp = PFX(interp_8tap_vert_pp_8x4_neon);
> >+        p.pu[LUMA_8x8].luma_vpp = PFX(interp_8tap_vert_pp_8x8_neon);
> >+        p.pu[LUMA_8x16].luma_vpp = PFX(interp_8tap_vert_pp_8x16_neon);
> >+        p.pu[LUMA_8x32].luma_vpp = PFX(interp_8tap_vert_pp_8x32_neon);
> >     }
> >     if (cpuMask & X265_CPU_ARMV6)
> >     {
> >diff -r 0af38750a71a -r c1d2fa2ca49d source/common/arm/ipfilter8.S
> >--- a/source/common/arm/ipfilter8.S	Thu Mar 10 21:43:35 2016 +0530
> >+++ b/source/common/arm/ipfilter8.S	Fri Mar 11 13:04:42 2016 +0530
> >@@ -27,6 +27,8 @@
> >
> > .align 4
> >
> >+g_lumaFilter:
> >+.word 0,0,0,0,0,0,64,64,0,0,0,0,0,0,0,0,-1,-1,4,4,-10,-10,58,58,17,17,-5,-5,1,1,0,0,-1,-1,4,4,-11,-11,40,40,40,40,-11,-11,4,4,-1,-1,0,0,1,1,-5,-5,17,17,58,58,-10,-10,4,4,-1,-1
> > .text
> >
> > // filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
> >@@ -692,3 +694,207 @@
> >     bgt         .loop_filterP2S_48x64
> >     bx          lr
> > endfunc
> >+
> >+.macro LUMA_VPP_4xN h
> >+function x265_interp_8tap_vert_pp_4x\h\()_neon
> >+    push            {r4, r5, r6}
> >+    ldr             r4, [sp, #4 * 3]
> >+    mov             r5, r4, lsl #6
> >+    mov             r4, #3
> >+    mul             r4, r1, r4
> How about below:
> mov r4, r1, lsl #2
> sub r4, r1
>
> The code is correct; there is just a performance problem. We could reference ffmpeg's hevcdsp_qpel_neon.S
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20160315/4b16d822/attachment-0001.html>


More information about the x265-devel mailing list