<div dir="ltr">On average, the speedup is 1.5x to 3x, depending on the block size.<br>This is the console output after running the testbench for all the vertical interpolation filter patches. <br><br>Primitive--------------speedup------cpu cycles-----<br><br>luma_vpp[  4x4]        1.20x      2.88          3.45    <br>luma_vps[  4x4]        1.25x      2.82          3.52    <br>luma_vsp[  4x4]        1.29x      2.79          3.60   <br><br>luma_vpp[  8x8]        2.02x      3.67          7.42    <br>luma_vps[  8x8]        2.01x      3.69          7.42    <br>luma_vsp[  8x8]        2.05x      3.63          7.43    <br><br>luma_vpp[16x16]        2.70x      8.81          23.83   <br>luma_vps[16x16]        2.68x      8.81          23.57   <br>luma_vsp[16x16]        2.80x      8.53          23.87  <br><br>luma_vpp[32x32]        2.95x      28.97         85.51   <br>luma_vps[32x32]        2.94x      28.86         84.85   <br>luma_vsp[32x32]        3.11x      27.74         86.27   <br><br>luma_vpp[64x64]        3.01x      109.40        328.88  <br>luma_vps[64x64]        2.99x      110.63        330.28  <br>luma_vsp[64x64]        3.08x      108.43        333.58  <br><br>luma_vpp[  8x4]        1.76x      2.77          4.87    <br>luma_vps[  8x4]        1.71x      2.79          4.76    <br>luma_vsp[  8x4]        1.73x      2.76          4.77    <br>  <br>luma_vpp[  4x8]        1.19x      3.85          4.58    <br>luma_vps[  4x8]        1.34x      3.67          4.93    <br>luma_vsp[  4x8]        1.38x      3.59          4.95    <br>  <br>luma_vpp[ 16x8]        2.40x      5.40          12.95   <br>luma_vps[ 16x8]        2.38x      5.40          12.85   <br>luma_vsp[ 16x8]        2.43x      5.29          12.87   <br><br>luma_vpp[ 8x16]        2.33x      5.50          12.81   <br>luma_vps[ 8x16]        2.24x      5.70          12.79   <br>luma_vsp[ 8x16]        2.35x      5.46          12.84   <br> <br>luma_vpp[32x16]        2.82x      15.47         43.56   <br>luma_vps[32x16]        2.82x      15.44    
     43.46   <br>luma_vsp[32x16]        2.96x      14.90         44.10   <br>   <br>luma_vpp[16x32]        2.91x      15.67         45.63   <br>luma_vps[16x32]        2.92x      15.56         45.42   <br>luma_vsp[16x32]        3.00x      15.10         45.34   <br> <br>luma_vpp[64x32]        2.97x      55.51         165.10  <br>luma_vps[64x32]        2.96x      55.54         164.41  <br>luma_vsp[64x32]        3.15x      52.93         166.78  <br><br>luma_vpp[32x64]        3.02x      56.05         169.12  <br>luma_vps[32x64]        3.00x      56.30         168.71  <br>luma_vsp[32x64]        3.11x      55.16         171.32  <br><br>luma_vpp[16x12]        2.61x      7.08          18.51   <br>luma_vps[16x12]        2.58x      7.08          18.28   <br>luma_vsp[16x12]        2.68x      6.89          18.45   <br><br>luma_vpp[12x16]        2.05x      8.68          17.75   <br>luma_vps[12x16]        2.13x      8.39          17.86   <br>luma_vsp[12x16]        2.10x      8.49          17.86   <br><br>luma_vpp[ 16x4]        2.07x      3.62          7.49    <br>luma_vps[ 16x4]        2.05x      3.59          7.38    <br>luma_vsp[ 16x4]        2.14x      3.53          7.56    <br>  <br>luma_vpp[ 4x16]        1.35x      5.81          7.85    <br>luma_vps[ 4x16]        1.41x      5.50          7.78    <br>luma_vsp[ 4x16]        1.49x      5.32          7.90    <br>  <br>luma_vpp[32x24]        2.92x      22.19         64.76   <br>luma_vps[32x24]        2.91x      22.10         64.22   <br>luma_vsp[32x24]        3.05x      21.24         64.68   <br><br>luma_vpp[24x32]        2.94x      22.31         65.56   <br>luma_vps[24x32]        2.91x      22.36         65.07   <br>luma_vsp[24x32]        3.07x      21.51         66.02   <br>   <br>luma_vpp[ 32x8]        2.62x      8.75          22.87   <br>luma_vps[ 32x8]        2.63x      8.64          22.75   <br>luma_vsp[ 32x8]        2.74x      8.36          22.89   <br><br>luma_vpp[ 8x32]        2.59x      8.98          23.27   
<br>luma_vps[ 8x32]        2.53x      9.34          23.60   <br>luma_vsp[ 8x32]        2.62x      8.97          23.54   <br><br>luma_vpp[64x48]        2.99x      82.53         247.04  <br>luma_vps[64x48]        2.99x      82.54         246.76  <br>luma_vsp[64x48]        3.12x      79.86         249.17  <br>  <br>luma_vpp[48x64]        3.01x      82.74         248.97  <br>luma_vps[48x64]        3.00x      82.95         248.84  <br>luma_vsp[48x64]        3.14x      80.15         251.90  <br> <br>luma_vpp[64x16]        2.91x      28.75         83.63   <br>luma_vps[64x16]        2.91x      28.77         83.57   <br>luma_vsp[64x16]        3.08x      27.31         84.02   <br>  <br>luma_vpp[16x64]        3.04x      29.37         89.35   <br>luma_vps[16x64]        3.04x      29.14         88.66   <br>luma_vsp[16x64]        3.17x      28.21         89.31   <br><br></div><div class="gmail_extra"><br clear="all"><div><div class="gmail_signature"><div dir="ltr"><div><div dir="ltr"><div><div><span style="color:rgb(56,118,29)"><br></span></div><div><span style="color:rgb(56,118,29)">Thank you<br></span></div><span style="color:rgb(56,118,29)">Regards<br></span></div><span style="color:rgb(56,118,29)">Ramya</span><br></div></div></div></div></div>
<br><div class="gmail_quote">On Tue, Mar 22, 2016 at 7:21 PM, Pradeep Ramachandran <span dir="ltr"><<a href="mailto:pradeep@multicorewareinc.com" target="_blank">pradeep@multicorewareinc.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div dir="ltr">What is the improvement in cycles that we see from testbench from this patch?</div><div class="gmail_extra"><br clear="all"><div><div><div dir="ltr"><div><div dir="ltr"><div><div dir="ltr"><div dir="ltr"><div dir="ltr"><div dir="ltr"><div dir="ltr">Pradeep Ramachandran, PhD<div>Solution Architect at <a href="http://www.multicorewareinc.com/" style="font-size:12.8px" target="_blank">www.multicorewareinc.com/</a></div><div>Visiting Professor at<a href="http://www.cse.iitm.ac.in" target="_blank"> www.cse.iitm.ac.in</a>/</div><div><a href="http://pradeeprama.info/" style="font-size:12.8px" target="_blank">pradeeprama.info/</a><br></div><div><span style="font-size:12.8px">Ph:   +91 99627 82018</span><br></div></div></div></div></div></div></div></div></div></div></div></div>
<br><div class="gmail_quote"><div><div class="h5">On Tue, Mar 22, 2016 at 6:57 PM,  <span dir="ltr"><<a href="mailto:ramya@multicorewareinc.com" target="_blank">ramya@multicorewareinc.com</a>></span> wrote:<br></div></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div><div class="h5"># HG changeset patch<br>
# User Ramya Sriraman<<a href="mailto:ramya@multicorewareinc.com" target="_blank">ramya@multicorewareinc.com</a>><br>
# Date 1458652316 -19800<br>
#      Tue Mar 22 18:41:56 2016 +0530<br>
# Node ID fd95ed60b242adffbeb0991609271c8a15040ff9<br>
# Parent  a9014e51d47ee5cdfe381d02526b1c94082cd4bf<br>
arm: Implement interp_8tap_vert_ps_NxN NEON<br>
<br>
diff -r a9014e51d47e -r fd95ed60b242 source/common/arm/asm-primitives.cpp<br>
--- a/source/common/arm/asm-primitives.cpp      Tue Mar 22 11:10:43 2016 +0530<br>
+++ b/source/common/arm/asm-primitives.cpp      Tue Mar 22 18:41:56 2016 +0530<br>
@@ -354,6 +354,32 @@<br>
         p.pu[LUMA_24x32].luma_vsp   = PFX(interp_8tap_vert_sp_24x32_neon);<br>
         p.pu[LUMA_48x64].luma_vsp   = PFX(interp_8tap_vert_sp_48x64_neon);<br>
         p.pu[LUMA_12x16].luma_vsp   = PFX(interp_8tap_vert_sp_12x16_neon);<br>
+<br>
+        p.pu[LUMA_4x4].luma_vps     = PFX(interp_8tap_vert_ps_4x4_neon);<br>
+        p.pu[LUMA_4x8].luma_vps     = PFX(interp_8tap_vert_ps_4x8_neon);<br>
+        p.pu[LUMA_4x16].luma_vps    = PFX(interp_8tap_vert_ps_4x16_neon);<br>
+        p.pu[LUMA_8x4].luma_vps     = PFX(interp_8tap_vert_ps_8x4_neon);<br>
+        p.pu[LUMA_8x8].luma_vps     = PFX(interp_8tap_vert_ps_8x8_neon);<br>
+        p.pu[LUMA_8x16].luma_vps    = PFX(interp_8tap_vert_ps_8x16_neon);<br>
+        p.pu[LUMA_8x32].luma_vps    = PFX(interp_8tap_vert_ps_8x32_neon);<br>
+        p.pu[LUMA_16x4].luma_vps    = PFX(interp_8tap_vert_ps_16x4_neon);<br>
+        p.pu[LUMA_16x8].luma_vps    = PFX(interp_8tap_vert_ps_16x8_neon);<br>
+        p.pu[LUMA_16x16].luma_vps   = PFX(interp_8tap_vert_ps_16x16_neon);<br>
+        p.pu[LUMA_16x32].luma_vps   = PFX(interp_8tap_vert_ps_16x32_neon);<br>
+        p.pu[LUMA_16x64].luma_vps   = PFX(interp_8tap_vert_ps_16x64_neon);<br>
+        p.pu[LUMA_16x12].luma_vps   = PFX(interp_8tap_vert_ps_16x12_neon);<br>
+        p.pu[LUMA_32x8].luma_vps    = PFX(interp_8tap_vert_ps_32x8_neon);<br>
+        p.pu[LUMA_32x16].luma_vps   = PFX(interp_8tap_vert_ps_32x16_neon);<br>
+        p.pu[LUMA_32x32].luma_vps   = PFX(interp_8tap_vert_ps_32x32_neon);<br>
+        p.pu[LUMA_32x64].luma_vps   = PFX(interp_8tap_vert_ps_32x64_neon);<br>
+        p.pu[LUMA_32x24].luma_vps   = PFX(interp_8tap_vert_ps_32x24_neon);<br>
+        p.pu[LUMA_64x16].luma_vps   = PFX(interp_8tap_vert_ps_64x16_neon);<br>
+        p.pu[LUMA_64x32].luma_vps   = PFX(interp_8tap_vert_ps_64x32_neon);<br>
+        p.pu[LUMA_64x64].luma_vps   = PFX(interp_8tap_vert_ps_64x64_neon);<br>
+        p.pu[LUMA_64x48].luma_vps   = PFX(interp_8tap_vert_ps_64x48_neon);<br>
+        p.pu[LUMA_24x32].luma_vps   = PFX(interp_8tap_vert_ps_24x32_neon);<br>
+        p.pu[LUMA_48x64].luma_vps   = PFX(interp_8tap_vert_ps_48x64_neon);<br>
+        p.pu[LUMA_12x16].luma_vps   = PFX(interp_8tap_vert_ps_12x16_neon);<br>
     }<br>
     if (cpuMask & X265_CPU_ARMV6)<br>
     {<br>
diff -r a9014e51d47e -r fd95ed60b242 source/common/arm/ipfilter8.S<br>
--- a/source/common/arm/ipfilter8.S     Tue Mar 22 11:10:43 2016 +0530<br>
+++ b/source/common/arm/ipfilter8.S     Tue Mar 22 18:41:56 2016 +0530<br>
@@ -698,7 +698,7 @@<br>
     bgt         .loop_filterP2S_48x64<br>
     bx          lr<br>
 endfunc<br>
-<br>
+//**************luma_vpp************<br>
 .macro LUMA_VPP_4xN h<br>
 function x265_interp_8tap_vert_pp_4x\h\()_neon<br>
     push           {r4, r5, r6}<br>
@@ -1606,4 +1606,333 @@<br>
     pop             {r4, r5, r6, r7}<br>
     bx              lr<br>
 endfunc<br>
+//**************luma_vps*****************<br>
+.macro LUMA_VPS_4xN h<br>
+function x265_interp_8tap_vert_ps_4x\h\()_neon<br>
+    push           {r4, r5, r6}<br>
+    ldr             r4, [sp, #4 * 3]<br>
+    lsl             r3, #1<br>
+    mov             r5, r4, lsl #6<br>
+    mov             r4, r1, lsl #2<br>
+    sub             r4, r1<br>
+    sub             r0, r4<br>
<br>
+    mov             r4, #8192<br>
+    vdup.32         q8, r4<br>
+    mov             r4, #\h<br>
+<br>
+.loop_vps_4x\h:<br>
+    movrel          r12, g_lumaFilter<br>
+    add             r12, r5<br>
+    mov             r6, r0<br>
+<br>
+    pld [r6]<br>
+    vld1.u32        d0[0], [r6], r1<br>
+    pld [r6]<br>
+    vld1.u32        d0[1], [r6], r1<br>
+    pld [r6]<br>
+    vld1.u32        d1[0], [r6], r1<br>
+    pld [r6]<br>
+    vld1.u32        d1[1], [r6], r1<br>
+    pld [r6]<br>
+    vld1.u32        d2[0], [r6], r1<br>
+    pld [r6]<br>
+    vld1.u32        d2[1], [r6], r1<br>
+    pld [r6]<br>
+    vld1.u32        d3[0], [r6], r1<br>
+    pld [r6]<br>
+    vld1.u32        d3[1], [r6], r1<br>
+<br>
+    veor.u8         q9, q9<br>
+<br>
+    vmovl.u8        q11, d0<br>
+    vmovl.u16       q12, d22<br>
+    vmovl.u16       q13, d23<br>
+    vld1.s32        d20, [r12]!<br>
+    vmov.s32        d21, d20<br>
+    vmla.s32        q9, q12, q10<br>
+    vld1.s32        d20, [r12]!<br>
+    vmov.s32        d21, d20<br>
+    vmla.s32        q9, q13, q10<br>
+<br>
+    vmovl.u8        q11, d1<br>
+    vmovl.u16       q12, d22<br>
+    vmovl.u16       q13, d23<br>
+    vld1.s32        d20, [r12]!<br>
+    vmov.s32        d21, d20<br>
+    vmla.s32        q9, q12, q10<br>
+    vld1.s32        d20, [r12]!<br>
+    vmov.s32        d21, d20<br>
+    vmla.s32        q9, q13, q10<br>
+<br>
+    vmovl.u8        q11, d2<br>
+    vmovl.u16       q12, d22<br>
+    vmovl.u16       q13, d23<br>
+    vld1.s32        d20, [r12]!<br>
+    vmov.s32        d21, d20<br>
+    vmla.s32        q9, q12, q10<br>
+    vld1.s32        d20, [r12]!<br>
+    vmov.s32        d21, d20<br>
+    vmla.s32        q9, q13, q10<br>
+<br>
+    vmovl.u8        q11, d3<br>
+    vmovl.u16       q12, d22<br>
+    vmovl.u16       q13, d23<br>
+    vld1.s32        d20, [r12]!<br>
+    vmov.s32        d21, d20<br>
+    vmla.s32        q9, q12, q10<br>
+    vld1.s32        d20, [r12]!<br>
+    vmov.s32        d21, d20<br>
+    vmla.s32        q9, q13, q10<br>
+<br>
+    vsub.s32        q9, q8<br>
+    vqmovn.s32      d0, q9<br>
+    vst1.u16        d0, [r2], r3<br>
+<br>
+    add             r0, r1<br>
+    subs            r4, #1<br>
+    bne             .loop_vps_4x\h<br>
+<br>
+    pop             {r4, r5, r6}<br>
+    bx              lr<br>
+    .ltorg<br>
+endfunc<br>
+.endm<br>
+<br>
+LUMA_VPS_4xN 4<br>
+LUMA_VPS_4xN 8<br>
+LUMA_VPS_4xN 16<br>
+<br>
+<br>
+.macro FILTER_VPS a b filterv<br>
+<br>
+.loop_ps_\filterv\()_\a\()x\b:<br>
+<br>
+    mov             r7, r2<br>
+    mov             r6, r0<br>
+    eor             r8, r8<br>
+<br>
+.loop_ps_w8_\filterv\()_\a\()x\b:<br>
+<br>
+    add             r6, r0, r8<br>
+<br>
+    pld [r6]<br>
+    vld1.u8         d0, [r6], r1<br>
+    pld [r6]<br>
+    vld1.u8         d1, [r6], r1<br>
+    pld [r6]<br>
+    vld1.u8         d2, [r6], r1<br>
+    pld [r6]<br>
+    vld1.u8         d3, [r6], r1<br>
+    pld [r6]<br>
+    vld1.u8         d4, [r6], r1<br>
+    pld [r6]<br>
+    vld1.u8         d5, [r6], r1<br>
+    pld [r6]<br>
+    vld1.u8         d6, [r6], r1<br>
+    pld [r6]<br>
+    vld1.u8         d7, [r6], r1<br>
+<br>
+    veor.u8         q9, q9<br>
+    veor.u8         q10, q10<br>
+<br>
+   \filterv<br>
+<br>
+    mov             r12,#8192<br>
+    vdup.32         q8, r12<br>
+    vsub.s32        q9, q8<br>
+    vqmovn.s32      d0, q9<br>
+    vsub.s32        q10, q8<br>
+    vqmovn.s32      d1, q10<br>
+    vst1.u16         {q0}, [r7]!<br>
+<br>
+    add             r8, #8<br>
+    cmp             r8, #\a<br>
+    blt             .loop_ps_w8_\filterv\()_\a\()x\b<br>
+<br>
+    add             r0, r1<br>
+    add             r2, r3<br>
+    subs            r4, #1<br>
+    bne             .loop_ps_\filterv\()_\a\()x\b<br>
+<br>
+.endm<br>
+<br>
+.macro LUMA_VPS  w h<br>
+function x265_interp_8tap_vert_ps_\w\()x\h\()_neon<br>
+<br>
+    push            {r4, r5, r6, r7, r8}<br>
+    ldr             r5, [sp, #4 * 5]<br>
+    lsl             r3, #1<br>
+    mov             r4, r1, lsl #2<br>
+    sub             r4, r1<br>
+    sub             r0, r4<br>
+    mov             r4, #\h<br>
+<br>
+    cmp             r5, #0<br>
+    beq              0f<br>
+    cmp             r5, #1<br>
+    beq              1f<br>
+    cmp             r5, #2<br>
+    beq              2f<br>
+    cmp             r5, #3<br>
+    beq              3f<br>
+0:<br>
+    FILTER_VPS  \w \h qpel_filter_0_32b<br>
+    b            5f<br>
+1:<br>
+    FILTER_VPS  \w \h qpel_filter_1_32b<br>
+    b            5f<br>
+2:<br>
+    FILTER_VPS  \w \h qpel_filter_2_32b<br>
+    b            5f<br>
+3:<br>
+    FILTER_VPS  \w \h qpel_filter_3_32b<br>
+    b            5f<br>
+5:<br>
+    pop             {r4, r5, r6, r7, r8}<br>
+    bx              lr<br>
+endfunc<br>
+.endm<br>
+<br>
+LUMA_VPS 8 4<br>
+LUMA_VPS 8 8<br>
+LUMA_VPS 8 16<br>
+LUMA_VPS 8 32<br>
+LUMA_VPS 16 4<br>
+LUMA_VPS 16 8<br>
+LUMA_VPS 16 16<br>
+LUMA_VPS 16 32<br>
+LUMA_VPS 16 64<br>
+LUMA_VPS 16 12<br>
+LUMA_VPS 32 8<br>
+LUMA_VPS 32 16<br>
+LUMA_VPS 32 32<br>
+LUMA_VPS 32 64<br>
+LUMA_VPS 32 24<br>
+LUMA_VPS 64 16<br>
+LUMA_VPS 64 32<br>
+LUMA_VPS 64 64<br>
+LUMA_VPS 64 48<br>
+LUMA_VPS 24 32<br>
+LUMA_VPS 48 64<br>
+<br>
+function x265_interp_8tap_vert_ps_12x16_neon<br>
+    push            {r4, r5, r6, r7}<br>
+    lsl             r3, #1<br>
+    ldr             r5, [sp, #4 * 4]<br>
+    mov             r4, r1, lsl #2<br>
+    sub             r4, r1<br>
+    sub             r0, r4<br>
+<br>
+    mov             r4, #16<br>
+.loop_vps_12x16:<br>
+<br>
+    mov             r6, r0<br>
+    mov             r7, r2<br>
+<br>
+    pld [r6]<br>
+    vld1.u8         d0, [r6], r1<br>
+    pld [r6]<br>
+    vld1.u8         d1, [r6], r1<br>
+    pld [r6]<br>
+    vld1.u8         d2, [r6], r1<br>
+    pld [r6]<br>
+    vld1.u8         d3, [r6], r1<br>
+    pld [r6]<br>
+    vld1.u8         d4, [r6], r1<br>
+    pld [r6]<br>
+    vld1.u8         d5, [r6], r1<br>
+    pld [r6]<br>
+    vld1.u8         d6, [r6], r1<br>
+    pld [r6]<br>
+    vld1.u8         d7, [r6], r1<br>
+<br>
+    veor.u8         q9, q9<br>
+    veor.u8         q10, q10<br>
+<br>
+    cmp             r5,#0<br>
+    beq              0f<br>
+    cmp             r5,#1<br>
+    beq              1f<br>
+    cmp             r5,#2<br>
+    beq              2f<br>
+    cmp             r5,#3<br>
+    beq              3f<br>
+0:<br>
+    qpel_filter_0_32b<br>
+    b            5f<br>
+1:<br>
+    qpel_filter_1_32b<br>
+    b            5f<br>
+2:<br>
+    qpel_filter_2_32b<br>
+    b            5f<br>
+3:<br>
+    qpel_filter_3_32b<br>
+    b            5f<br>
+5:<br>
+    mov             r12,#8192<br>
+    vdup.32         q8, r12<br>
+    vsub.s32        q9, q8<br>
+    vqmovn.s32      d0, q9<br>
+    vsub.s32        q10, q8<br>
+    vqmovn.s32      d1, q10<br>
+    vst1.u8         {q0}, [r7]!<br>
+<br>
+    add             r6, r0, #8<br>
+<br>
+    pld [r6]<br>
+    vld1.u8         d0, [r6], r1<br>
+    pld [r6]<br>
+    vld1.u8         d1, [r6], r1<br>
+    pld [r6]<br>
+    vld1.u8         d2, [r6], r1<br>
+    pld [r6]<br>
+    vld1.u8         d3, [r6], r1<br>
+    pld [r6]<br>
+    vld1.u8         d4, [r6], r1<br>
+    pld [r6]<br>
+    vld1.u8         d5, [r6], r1<br>
+    pld [r6]<br>
+    vld1.u8         d6, [r6], r1<br>
+    pld [r6]<br>
+    vld1.u8         d7, [r6], r1<br>
+<br>
+    veor.u8         q9, q9<br>
+    veor.u8         q10, q10<br>
+<br>
+    cmp             r5,#0<br>
+    beq              0f<br>
+    cmp             r5,#1<br>
+    beq              1f<br>
+    cmp             r5,#2<br>
+    beq              2f<br>
+    cmp             r5,#3<br>
+    beq              3f<br>
+0:<br>
+    qpel_filter_0_32b<br>
+    b            5f<br>
+1:<br>
+    qpel_filter_1_32b<br>
+    b            5f<br>
+2:<br>
+    qpel_filter_2_32b<br>
+    b            5f<br>
+3:<br>
+    qpel_filter_3_32b<br>
+    b            5f<br>
+5:<br>
+    mov             r12,#8192<br>
+    vdup.32         q8, r12<br>
+    vsub.s32        q9, q8<br>
+    vqmovn.s32      d0, q9<br>
+    vst1.u8         d0, [r7]!<br>
+<br>
+    add             r0, r1<br>
+    add             r2, r3<br>
+    subs            r4, #1<br>
+    bne             .loop_vps_12x16<br>
+<br>
+    pop             {r4, r5, r6, r7}<br>
+    bx              lr<br>
+endfunc<br>
diff -r a9014e51d47e -r fd95ed60b242 source/common/arm/ipfilter8.h<br>
--- a/source/common/arm/ipfilter8.h     Tue Mar 22 11:10:43 2016 +0530<br>
+++ b/source/common/arm/ipfilter8.h     Tue Mar 22 18:41:56 2016 +0530<br>
@@ -102,4 +102,30 @@<br>
 void x265_interp_8tap_vert_sp_24x32_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);<br>
 void x265_interp_8tap_vert_sp_48x64_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);<br>
 void x265_interp_8tap_vert_sp_12x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);<br>
+<br>
+void x265_interp_8tap_vert_ps_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
+void x265_interp_8tap_vert_ps_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);<br>
 #endif // ifndef X265_IPFILTER8_ARM_H<br></div></div>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org" target="_blank">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" rel="noreferrer" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br></div>
<br>_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" rel="noreferrer" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
<br></blockquote></div><br></div>