<div dir="ltr"><div>Thanks min,<br></div>I referred to ffmpeg's NEON version of the filter and tried something similar in design for 8x8 for a start. Right now, the filter works for coefficient index 0 and 1 only. Is this what you had in mind for better performance ? <br>---------------------------------------------------------------------------------------------------------------------------------------------------------------------<br>.macro qpel_filter_1<br>        vmov.i16        d16, #58<br>        vmovl.u8        q11, d3<br>        vmull.s16       q9, d22, d16   // 58 * d0<br>        vmull.s16       q10, d23, d16   // 58 * d1<br><br>        vmov.i16        d17, #10<br>        vmovl.u8        q13, d2<br>        vmull.s16       q11, d26, d17   // 10 * c0<br>        vmull.s16       q12, d27, d17   // 10 * c1<br><br>        vmov.i16        d16, #17<br>        vmovl.u8        q15, d4<br>        vmull.s16       q13, d30, d16   // 17 * e0<br>        vmull.s16       q14, d31, d16   // 17 * e1<br><br>        vmov.i16        d17, #5<br>        vmovl.u8        q1, d5<br>        vmull.s16       q15, d2, d17  //  5 * f0<br>        vmull.s16       q8, d3, d17  //  5 * f1<br><br>        vsub.s32        q9, q11       // 58 * d0 - 10 * c0<br>        vsub.s32        q10, q12       // 58 * d1 - 10 * c1<br><br>        vmovl.u8        q1, d1<br>        vshll.s16       q11, d2, #2    // 4 * b0<br>        vshll.s16       q12, d3, #2    // 4 * b1<br><br>        vadd.s32        q9, q13       // 58 * d0 - 10 * c0 + 17 * e0<br>        vadd.s32        q10, q14       // 58 * d1 - 10 * c1 + 17 * e1<br><br>        vmovl.u8        q1, d0<br>        vmovl.u8        q2, d6<br>        vsubl.s16       q13, d4, d2   // g0 - a0<br>        vsubl.s16       q14, d5, d3   // g1 - a1<br><br>        vadd.s32    q9, q11       // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0<br>        vadd.s32   q10, q12       // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1<br>        vsub.s32   q13, q15       // g0 - a0 - 5 * f0<br>    
    vsub.s32   q14, q8        // g1 - a1 - 5 * f1<br>        vadd.s32    q9, q13       // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0<br>        vadd.s32   q10, q14       // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1<br>.endm<br>.macro qpel_filter_2<br> .endm<br>.macro qpel_filter_3<br>.endm<br>.macro LUMA_VPP_8xN h<br>function x265_interp_8tap_vert_pp_8x\h\()_neon<br>    push            {r4, r5, r6, r7}<br>    ldr             r5, [sp, #4 * 4]<br>    mov             r4, #3<br>    mul             r4, r1, r4<br>    sub             r0, r4<br>    mov             r4, #\h<br>.loop_8x\h:<br>    mov             r6,r0<br>    pld [r6]<br>    vld1.u8         d0, [r6], r1<br>    pld [r6]<br>    vld1.u8         d1, [r6], r1<br>    pld [r6]<br>    vld1.u8         d2, [r6], r1<br>    pld [r6]<br>    vld1.u8         d3, [r6], r1<br>    pld [r6]<br>    vld1.u8         d4, [r6], r1<br>    pld [r6]<br>    vld1.u8         d5, [r6], r1<br>    pld [r6]<br>    vld1.u8         d6, [r6], r1<br>    pld [r6]<br>    vld1.u8         d7, [r6], r1<br><br>    veor.u8         q9, q9<br>    veor.u8         q10, q10<br><br>    cmp             r5,#0<br>    beq              0f<br>    cmp             r5,#1<br>    beq              1f<br>    cmp             r5,#2<br>    beq              2f<br>    cmp             r5,#3<br>    beq              3f<br>1:<br>    qpel_filter_1<br>2:<br>    qpel_filter_2<br>3:<br>    qpel_filter_3<br>0:<br>    vmov.i16         d17, #64<br>    vmovl.u8        q11, d3<br>    vmull.s16       q9, d22, d17   // 64*d0<br>    vmull.s16       q10, d23, d17   // 64*d1<br><br>    mov             r12,#32<br>    vdup.32         q8, r12<br>    vadd.s32        q9, q8<br>    vqshrun.s32     d0, q9, #6<br>    vadd.s32        q10, q8<br>    vqshrun.s32     d1, q10, #6<br>    vqmovn.u16      d0, q0<br>    vst1.u8         d0, [r2], r3<br><br>    add             r0, r1<br>    subs            r4, #1<br>    bne             .loop_8x\h<br><br>    pop             {r4, r5, r6, 
r7}<br>    bx              lr<br>endfunc<br>.endm<br><br>LUMA_VPP_8xN 8<br><br></div><div class="gmail_extra"><br clear="all"><div><div class="gmail_signature"><div dir="ltr"><div><div dir="ltr"><div><div><span style="color:rgb(56,118,29)"><br></span></div><div><span style="color:rgb(56,118,29)">Thank you<br></span></div><span style="color:rgb(56,118,29)">Regards<br></span></div><span style="color:rgb(56,118,29)">Ramya</span><br></div></div></div></div></div>
<br><div class="gmail_quote">On Mon, Mar 14, 2016 at 5:43 PM, chen <span dir="ltr"><<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><br><pre><div><div class="h5"><br>At 2016-03-14 19:09:19,<a href="mailto:ramya@multicorewareinc.com" target="_blank">ramya@multicorewareinc.com</a> wrote:
># HG changeset patch
># User Ramya Sriraman<<a href="mailto:ramya@multicorewareinc.com" target="_blank">ramya@multicorewareinc.com</a>>
># Date 1457681682 -19800
>#      Fri Mar 11 13:04:42 2016 +0530
># Node ID c1d2fa2ca49d4027252bd52176ecbe2db0d0eddd
># Parent  0af38750a71aab5fe790993365aaaa3e209a7d5c
>arm: Implement interp_8tap_vert_pp_4xn_neon and interp_8tap_vert_pp_8xn_neon
>
>diff -r 0af38750a71a -r c1d2fa2ca49d source/common/arm/asm-primitives.cpp
>--- a/source/common/arm/asm-primitives.cpp Thu Mar 10 21:43:35 2016 +0530
>+++ b/source/common/arm/asm-primitives.cpp Fri Mar 11 13:04:42 2016 +0530
>@@ -290,6 +290,14 @@
>         // planecopy
>         p.planecopy_cp = PFX(pixel_planecopy_cp_neon);

>+        // interpolation filters
>+        p.pu[LUMA_4x4].luma_vpp = PFX(interp_8tap_vert_pp_4x4_neon);
>+        p.pu[LUMA_4x8].luma_vpp = PFX(interp_8tap_vert_pp_4x8_neon);
>+        p.pu[LUMA_4x16].luma_vpp = PFX(interp_8tap_vert_pp_4x16_neon);
>+        p.pu[LUMA_8x4].luma_vpp = PFX(interp_8tap_vert_pp_8x4_neon);
>+        p.pu[LUMA_8x8].luma_vpp = PFX(interp_8tap_vert_pp_8x8_neon);
>+        p.pu[LUMA_8x16].luma_vpp = PFX(interp_8tap_vert_pp_8x16_neon);
>+        p.pu[LUMA_8x32].luma_vpp = PFX(interp_8tap_vert_pp_8x32_neon);
>     }
>     if (cpuMask & X265_CPU_ARMV6)
>     {
>diff -r 0af38750a71a -r c1d2fa2ca49d source/common/arm/ipfilter8.S
>--- a/source/common/arm/ipfilter8.S        Thu Mar 10 21:43:35 2016 +0530
>+++ b/source/common/arm/ipfilter8.S        Fri Mar 11 13:04:42 2016 +0530
>@@ -27,6 +27,8 @@

> .align 4

>+g_lumaFilter:
>+.word 0,0,0,0,0,0,64,64,0,0,0,0,0,0,0,0,-1,-1,4,4,-10,-10,58,58,17,17,-5,-5,1,1,0,0,-1,-1,4,4,-11,-11,40,40,40,40,-11,-11,4,4,-1,-1,0,0,1,1,-5,-5,17,17,58,58,-10,-10,4,4,-1,-1 
> .text

> // filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
>@@ -692,3 +694,207 @@
>     bgt         .loop_filterP2S_48x64
>     bx          lr
> endfunc
>+
>+.macro LUMA_VPP_4xN h
>+function x265_interp_8tap_vert_pp_4x\h\()_neon
>+    push            {r4, r5, r6}
>+    ldr             r4, [sp, #4 * 3]
>+    mov             r5, r4, lsl #6
>+    mov             r4, #3
>+    mul             r4, r1, r4<br></div></div>How about the following instead, which computes 3 * r1 with a shift and a subtract rather than a multiply:<br>mov r4, r1, lsl #2<br>sub r4, r1<br>
<br>The code is correct; the only problem is performance. We could use ffmpeg's hevcdsp_qpel_neon.S as a reference.
</pre></div><br>_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" rel="noreferrer" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
<br></blockquote></div><br></div>