<div dir="ltr">This patch doesn't apply.</div><div class="gmail_extra"><br><div class="gmail_quote">On Tue, Mar 1, 2016 at 5:46 PM, <span dir="ltr"><<a href="mailto:dnyaneshwar@multicorewareinc.com" target="_blank">dnyaneshwar@multicorewareinc.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Dnyaneshwar G <<a href="mailto:dnyaneshwar@multicorewareinc.com">dnyaneshwar@multicorewareinc.com</a>><br>
# Date 1456831820 -19800<br>
# Tue Mar 01 17:00:20 2016 +0530<br>
# Node ID 61e51faf9e7ee1c8056ac2f66cf51da104bfa106<br>
# Parent 79c00b9bc2b81afef2e41526fc3c390528f3174c<br>
arm: Implement filterPixelToShort ARM NEON asm<br>
<br>
diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/CMakeLists.txt<br>
--- a/source/common/CMakeLists.txt Tue Mar 01 12:18:18 2016 +0530<br>
+++ b/source/common/CMakeLists.txt Tue Mar 01 17:00:20 2016 +0530<br>
@@ -89,7 +89,7 @@<br>
set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)<br>
<br>
# add ARM assembly/intrinsic files here<br>
- set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S)<br>
+ set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S)<br>
set(VEC_PRIMITIVES)<br>
<br>
set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")<br>
diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/arm/asm-primitives.cpp<br>
--- a/source/common/arm/asm-primitives.cpp Tue Mar 01 12:18:18 2016 +0530<br>
+++ b/source/common/arm/asm-primitives.cpp Tue Mar 01 17:00:20 2016 +0530<br>
@@ -33,6 +33,7 @@<br>
#include "blockcopy8.h"<br>
#include "pixel.h"<br>
#include "pixel-util.h"<br>
+#include "ipfilter8.h"<br>
}<br>
<br>
namespace X265_NS {<br>
@@ -42,6 +43,33 @@<br>
{<br>
if (cpuMask & X265_CPU_NEON)<br>
{<br>
+ // filterPixelToShort<br>
+ p.pu[LUMA_4x4].convert_p2s = PFX(filterPixelToShort_4x4_neon);<br>
+ p.pu[LUMA_4x8].convert_p2s = PFX(filterPixelToShort_4x8_neon);<br>
+ p.pu[LUMA_4x16].convert_p2s = PFX(filterPixelToShort_4x16_neon);<br>
+ p.pu[LUMA_8x4].convert_p2s = PFX(filterPixelToShort_8x4_neon);<br>
+ p.pu[LUMA_8x8].convert_p2s = PFX(filterPixelToShort_8x8_neon);<br>
+ p.pu[LUMA_8x16].convert_p2s = PFX(filterPixelToShort_8x16_neon);<br>
+ p.pu[LUMA_8x32].convert_p2s = PFX(filterPixelToShort_8x32_neon);<br>
+ p.pu[LUMA_12x16].convert_p2s = PFX(filterPixelToShort_12x16_neon);<br>
+ p.pu[LUMA_16x4].convert_p2s = PFX(filterPixelToShort_16x4_neon);<br>
+ p.pu[LUMA_16x8].convert_p2s = PFX(filterPixelToShort_16x8_neon);<br>
+ p.pu[LUMA_16x12].convert_p2s = PFX(filterPixelToShort_16x12_neon);<br>
+ p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_neon);<br>
+ p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_neon);<br>
+ p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_neon);<br>
+ p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_neon);<br>
+ p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_neon);<br>
+ p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_neon);<br>
+ p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_neon);<br>
+ p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_neon);<br>
+ p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_neon);<br>
+ p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_neon);<br>
+ p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_neon);<br>
+ p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_neon);<br>
+ p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_neon);<br>
+ p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_neon);<br>
+<br>
// Block_fill<br>
p.cu[BLOCK_4x4].blockfill_s = PFX(blockfill_s_4x4_neon);<br>
p.cu[BLOCK_8x8].blockfill_s = PFX(blockfill_s_8x8_neon);<br>
diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/arm/ipfilter8.S<br>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000<br>
+++ b/source/common/arm/ipfilter8.S Tue Mar 01 17:00:20 2016 +0530<br>
@@ -0,0 +1,694 @@<br>
+/*****************************************************************************<br>
+ * Copyright (C) 2016 x265 project<br>
+ *<br>
+ * Authors: Dnyaneshwar G <<a href="mailto:dnyaneshwar@multicorewareinc.com">dnyaneshwar@multicorewareinc.com</a>><br>
+ *<br>
+ * This program is free software; you can redistribute it and/or modify<br>
+ * it under the terms of the GNU General Public License as published by<br>
+ * the Free Software Foundation; either version 2 of the License, or<br>
+ * (at your option) any later version.<br>
+ *<br>
+ * This program is distributed in the hope that it will be useful,<br>
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>
+ * GNU General Public License for more details.<br>
+ *<br>
+ * You should have received a copy of the GNU General Public License<br>
+ * along with this program; if not, write to the Free Software<br>
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>
+ *<br>
+ * This program is also available under a commercial proprietary license.<br>
+ * For more information, contact us at license @ x265.com.<br>
+ *****************************************************************************/<br>
+<br>
+#include "asm.S"<br>
+<br>
+.section .rodata<br>
+<br>
+.align 4<br>
+<br>
+.text<br>
+<br>
+// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)<br>
+function x265_filterPixelToShort_4x4_neon<br>
+ add r3, r3<br>
+ vmov.u16 q8, #64<br>
+ vmov.u16 q9, #8192<br>
+ vneg.s16 q9, q9<br>
+.rept 2<br>
+ vld1.u8 {d0}, [r0], r1<br>
+ vld1.u8 {d2}, [r0], r1<br>
+ vmovl.u8 q0, d0<br>
+ vmovl.u8 q1, d2<br>
+ vmov q2, q9<br>
+ vmov q3, q9<br>
+ vmla.s16 q2, q0, q8<br>
+ vmla.s16 q3, q1, q8<br>
+ vst1.16 {d4}, [r2], r3<br>
+ vst1.16 {d6}, [r2], r3<br>
+.endr<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_4x8_neon<br>
+ add r3, r3<br>
+ vmov.u16 q8, #64<br>
+ vmov.u16 q9, #8192<br>
+ vneg.s16 q9, q9<br>
+.rept 4<br>
+ vld1.u8 {d0}, [r0], r1<br>
+ vld1.u8 {d2}, [r0], r1<br>
+ vmovl.u8 q0, d0<br>
+ vmovl.u8 q1, d2<br>
+ vmov q2, q9<br>
+ vmov q3, q9<br>
+ vmla.s16 q2, q0, q8<br>
+ vmla.s16 q3, q1, q8<br>
+ vst1.16 {d4}, [r2], r3<br>
+ vst1.16 {d6}, [r2], r3<br>
+.endr<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_4x16_neon<br>
+ add r3, r3<br>
+ vmov.u16 q8, #64<br>
+ vmov.u16 q9, #8192<br>
+ vneg.s16 q9, q9<br>
+.rept 8<br>
+ vld1.u8 {d0}, [r0], r1<br>
+ vld1.u8 {d2}, [r0], r1<br>
+ vmovl.u8 q0, d0<br>
+ vmovl.u8 q1, d2<br>
+ vmov q2, q9<br>
+ vmov q3, q9<br>
+ vmla.s16 q2, q0, q8<br>
+ vmla.s16 q3, q1, q8<br>
+ vst1.16 {d4}, [r2], r3<br>
+ vst1.16 {d6}, [r2], r3<br>
+.endr<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_8x4_neon<br>
+ add r3, r3<br>
+ vmov.u16 q8, #64<br>
+ vmov.u16 q9, #8192<br>
+ vneg.s16 q9, q9<br>
+.rept 2<br>
+ vld1.u8 {d0}, [r0], r1<br>
+ vld1.u8 {d2}, [r0], r1<br>
+ vmovl.u8 q0, d0<br>
+ vmovl.u8 q1, d2<br>
+ vmov q2, q9<br>
+ vmov q3, q9<br>
+ vmla.s16 q2, q0, q8<br>
+ vmla.s16 q3, q1, q8<br>
+ vst1.16 {q2}, [r2], r3<br>
+ vst1.16 {q3}, [r2], r3<br>
+.endr<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_8x8_neon<br>
+ add r3, r3<br>
+ vmov.u16 q8, #64<br>
+ vmov.u16 q9, #8192<br>
+ vneg.s16 q9, q9<br>
+.rept 4<br>
+ vld1.u8 {d0}, [r0], r1<br>
+ vld1.u8 {d2}, [r0], r1<br>
+ vmovl.u8 q0, d0<br>
+ vmovl.u8 q1, d2<br>
+ vmov q2, q9<br>
+ vmov q3, q9<br>
+ vmla.s16 q2, q0, q8<br>
+ vmla.s16 q3, q1, q8<br>
+ vst1.16 {q2}, [r2], r3<br>
+ vst1.16 {q3}, [r2], r3<br>
+.endr<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_8x16_neon<br>
+ add r3, r3<br>
+ vmov.u16 q8, #64<br>
+ vmov.u16 q9, #8192<br>
+ vneg.s16 q9, q9<br>
+.rept 8<br>
+ vld1.u8 {d0}, [r0], r1<br>
+ vld1.u8 {d2}, [r0], r1<br>
+ vmovl.u8 q0, d0<br>
+ vmovl.u8 q1, d2<br>
+ vmov q2, q9<br>
+ vmov q3, q9<br>
+ vmla.s16 q2, q0, q8<br>
+ vmla.s16 q3, q1, q8<br>
+ vst1.16 {q2}, [r2], r3<br>
+ vst1.16 {q3}, [r2], r3<br>
+.endr<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_8x32_neon<br>
+ add r3, r3<br>
+ vmov.u16 q8, #64<br>
+ vmov.u16 q9, #8192<br>
+ vneg.s16 q9, q9<br>
+.rept 16<br>
+ vld1.u8 {d0}, [r0], r1<br>
+ vld1.u8 {d2}, [r0], r1<br>
+ vmovl.u8 q0, d0<br>
+ vmovl.u8 q1, d2<br>
+ vmov q2, q9<br>
+ vmov q3, q9<br>
+ vmla.s16 q2, q0, q8<br>
+ vmla.s16 q3, q1, q8<br>
+ vst1.16 {q2}, [r2], r3<br>
+ vst1.16 {q3}, [r2], r3<br>
+.endr<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_12x16_neon<br>
+ add r3, r3<br>
+ vmov.u16 q8, #64<br>
+ vmov.u16 q9, #8192<br>
+ vneg.s16 q9, q9<br>
+.rept 16<br>
+ vld1.u8 {d2-d3}, [r0], r1<br>
+ vmovl.u8 q0, d2<br>
+ vmovl.u8 q1, d3<br>
+ vmov q2, q9<br>
+ vmov q3, q9<br>
+ vmla.s16 q2, q0, q8<br>
+ vmla.s16 q3, q1, q8<br>
+ vst1.16 {d4, d5, d6}, [r2], r3<br>
+.endr<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_16x4_neon<br>
+ add r3, r3<br>
+ vmov.u16 q8, #64<br>
+ vmov.u16 q9, #8192<br>
+ vneg.s16 q9, q9<br>
+.rept 4<br>
+ vld1.u8 {d2-d3}, [r0], r1<br>
+ vmovl.u8 q0, d2<br>
+ vmovl.u8 q1, d3<br>
+ vmov q2, q9<br>
+ vmov q3, q9<br>
+ vmla.s16 q2, q0, q8<br>
+ vmla.s16 q3, q1, q8<br>
+ vst1.16 {q2-q3}, [r2], r3<br>
+.endr<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_16x8_neon<br>
+ add r3, r3<br>
+ vmov.u16 q8, #64<br>
+ vmov.u16 q9, #8192<br>
+ vneg.s16 q9, q9<br>
+.rept 8<br>
+ vld1.u8 {d2-d3}, [r0], r1<br>
+ vmovl.u8 q0, d2<br>
+ vmovl.u8 q1, d3<br>
+ vmov q2, q9<br>
+ vmov q3, q9<br>
+ vmla.s16 q2, q0, q8<br>
+ vmla.s16 q3, q1, q8<br>
+ vst1.16 {q2-q3}, [r2], r3<br>
+.endr<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_16x12_neon<br>
+ add r3, r3<br>
+ vmov.u16 q8, #64<br>
+ vmov.u16 q9, #8192<br>
+ vneg.s16 q9, q9<br>
+.rept 12<br>
+ vld1.u8 {d2-d3}, [r0], r1<br>
+ vmovl.u8 q0, d2<br>
+ vmovl.u8 q1, d3<br>
+ vmov q2, q9<br>
+ vmov q3, q9<br>
+ vmla.s16 q2, q0, q8<br>
+ vmla.s16 q3, q1, q8<br>
+ vst1.16 {q2-q3}, [r2], r3<br>
+.endr<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_16x16_neon<br>
+ add r3, r3<br>
+ vmov.u16 q8, #64<br>
+ vmov.u16 q9, #8192<br>
+ vneg.s16 q9, q9<br>
+.rept 16<br>
+ vld1.u8 {d2-d3}, [r0], r1<br>
+ vmovl.u8 q0, d2<br>
+ vmovl.u8 q1, d3<br>
+ vmov q2, q9<br>
+ vmov q3, q9<br>
+ vmla.s16 q2, q0, q8<br>
+ vmla.s16 q3, q1, q8<br>
+ vst1.16 {q2-q3}, [r2], r3<br>
+.endr<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_16x32_neon<br>
+ add r3, r3<br>
+ vmov.u16 q8, #64<br>
+ vmov.u16 q9, #8192<br>
+ vneg.s16 q9, q9<br>
+.rept 32<br>
+ vld1.u8 {d2-d3}, [r0], r1<br>
+ vmovl.u8 q0, d2<br>
+ vmovl.u8 q1, d3<br>
+ vmov q2, q9<br>
+ vmov q3, q9<br>
+ vmla.s16 q2, q0, q8<br>
+ vmla.s16 q3, q1, q8<br>
+ vst1.16 {q2-q3}, [r2], r3<br>
+.endr<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_16x64_neon<br>
+ add r3, r3<br>
+ vmov.u16 q8, #64<br>
+ vmov.u16 q9, #8192<br>
+ vneg.s16 q9, q9<br>
+.rept 64<br>
+ vld1.u8 {d2-d3}, [r0], r1<br>
+ vmovl.u8 q0, d2<br>
+ vmovl.u8 q1, d3<br>
+ vmov q2, q9<br>
+ vmov q3, q9<br>
+ vmla.s16 q2, q0, q8<br>
+ vmla.s16 q3, q1, q8<br>
+ vst1.16 {q2-q3}, [r2], r3<br>
+.endr<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_24x32_neon<br>
+ add r3, r3<br>
+ sub r3, #32<br>
+ vmov.u16 q0, #64<br>
+ vmov.u16 q1, #8192<br>
+ vneg.s16 q1, q1<br>
+.rept 32<br>
+ vld1.u8 {d18, d19, d20}, [r0], r1<br>
+ vmovl.u8 q8, d18<br>
+ vmovl.u8 q9, d19<br>
+ vmovl.u8 q11, d20<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q8, q0<br>
+ vmla.s16 q3, q9, q0<br>
+ vst1.16 {q2-q3}, [r2]!<br>
+ vmov q2, q1<br>
+ vmla.s16 q2, q11, q0<br>
+ vst1.16 {q2}, [r2], r3<br>
+.endr<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_32x8_neon<br>
+ add r3, r3<br>
+ sub r3, #32<br>
+ vmov.u16 q0, #64<br>
+ vmov.u16 q1, #8192<br>
+ vneg.s16 q1, q1<br>
+.rept 8<br>
+ vld1.u8 {q9-q10}, [r0], r1<br>
+ vmovl.u8 q8, d18<br>
+ vmovl.u8 q9, d19<br>
+ vmovl.u8 q11, d20<br>
+ vmovl.u8 q10, d21<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q8, q0<br>
+ vmla.s16 q3, q9, q0<br>
+ vst1.16 {q2-q3}, [r2]!<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q11, q0<br>
+ vmla.s16 q3, q10, q0<br>
+ vst1.16 {q2-q3}, [r2], r3<br>
+.endr<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_32x16_neon<br>
+ add r3, r3<br>
+ sub r3, #32<br>
+ vmov.u16 q0, #64<br>
+ vmov.u16 q1, #8192<br>
+ vneg.s16 q1, q1<br>
+ mov r12, #8<br>
+.loop_filterP2S_32x16:<br>
+ subs r12, #1<br>
+.rept 2<br>
+ vld1.u8 {q9-q10}, [r0], r1<br>
+ vmovl.u8 q8, d18<br>
+ vmovl.u8 q9, d19<br>
+ vmovl.u8 q11, d20<br>
+ vmovl.u8 q10, d21<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q8, q0<br>
+ vmla.s16 q3, q9, q0<br>
+ vst1.16 {q2-q3}, [r2]!<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q11, q0<br>
+ vmla.s16 q3, q10, q0<br>
+ vst1.16 {q2-q3}, [r2], r3<br>
+.endr<br>
+ bgt .loop_filterP2S_32x16<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_32x24_neon<br>
+ add r3, r3<br>
+ sub r3, #32<br>
+ vmov.u16 q0, #64<br>
+ vmov.u16 q1, #8192<br>
+ vneg.s16 q1, q1<br>
+ mov r12, #12<br>
+.loop_filterP2S_32x24:<br>
+ subs r12, #1<br>
+.rept 2<br>
+ vld1.u8 {q9-q10}, [r0], r1<br>
+ vmovl.u8 q8, d18<br>
+ vmovl.u8 q9, d19<br>
+ vmovl.u8 q11, d20<br>
+ vmovl.u8 q10, d21<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q8, q0<br>
+ vmla.s16 q3, q9, q0<br>
+ vst1.16 {q2-q3}, [r2]!<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q11, q0<br>
+ vmla.s16 q3, q10, q0<br>
+ vst1.16 {q2-q3}, [r2], r3<br>
+.endr<br>
+ bgt .loop_filterP2S_32x24<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_32x32_neon<br>
+ add r3, r3<br>
+ sub r3, #32<br>
+ vmov.u16 q0, #64<br>
+ vmov.u16 q1, #8192<br>
+ vneg.s16 q1, q1<br>
+ mov r12, #16<br>
+.loop_filterP2S_32x32:<br>
+ subs r12, #1<br>
+.rept 2<br>
+ vld1.u8 {q9-q10}, [r0], r1<br>
+ vmovl.u8 q8, d18<br>
+ vmovl.u8 q9, d19<br>
+ vmovl.u8 q11, d20<br>
+ vmovl.u8 q10, d21<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q8, q0<br>
+ vmla.s16 q3, q9, q0<br>
+ vst1.16 {q2-q3}, [r2]!<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q11, q0<br>
+ vmla.s16 q3, q10, q0<br>
+ vst1.16 {q2-q3}, [r2], r3<br>
+.endr<br>
+ bgt .loop_filterP2S_32x32<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_32x64_neon<br>
+ add r3, r3<br>
+ sub r3, #32<br>
+ vmov.u16 q0, #64<br>
+ vmov.u16 q1, #8192<br>
+ vneg.s16 q1, q1<br>
+ mov r12, #32<br>
+.loop_filterP2S_32x64:<br>
+ subs r12, #1<br>
+.rept 2<br>
+ vld1.u8 {q9-q10}, [r0], r1<br>
+ vmovl.u8 q8, d18<br>
+ vmovl.u8 q9, d19<br>
+ vmovl.u8 q11, d20<br>
+ vmovl.u8 q10, d21<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q8, q0<br>
+ vmla.s16 q3, q9, q0<br>
+ vst1.16 {q2-q3}, [r2]!<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q11, q0<br>
+ vmla.s16 q3, q10, q0<br>
+ vst1.16 {q2-q3}, [r2], r3<br>
+.endr<br>
+ bgt .loop_filterP2S_32x64<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_64x16_neon<br>
+ add r3, r3<br>
+ sub r1, #32<br>
+ sub r3, #96<br>
+ vmov.u16 q0, #64<br>
+ vmov.u16 q1, #8192<br>
+ vneg.s16 q1, q1<br>
+ mov r12, #8<br>
+.loop_filterP2S_64x16:<br>
+ subs r12, #1<br>
+.rept 2<br>
+ vld1.u8 {q9-q10}, [r0]!<br>
+ vmovl.u8 q8, d18<br>
+ vmovl.u8 q9, d19<br>
+ vmovl.u8 q11, d20<br>
+ vmovl.u8 q10, d21<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q8, q0<br>
+ vmla.s16 q3, q9, q0<br>
+ vst1.16 {q2-q3}, [r2]!<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q11, q0<br>
+ vmla.s16 q3, q10, q0<br>
+ vst1.16 {q2-q3}, [r2]!<br>
+<br>
+ vld1.u8 {q9-q10}, [r0], r1<br>
+ vmovl.u8 q8, d18<br>
+ vmovl.u8 q9, d19<br>
+ vmovl.u8 q11, d20<br>
+ vmovl.u8 q10, d21<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q8, q0<br>
+ vmla.s16 q3, q9, q0<br>
+ vst1.16 {q2-q3}, [r2]!<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q11, q0<br>
+ vmla.s16 q3, q10, q0<br>
+ vst1.16 {q2-q3}, [r2], r3<br>
+.endr<br>
+ bgt .loop_filterP2S_64x16<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_64x32_neon<br>
+ add r3, r3<br>
+ sub r1, #32<br>
+ sub r3, #96<br>
+ vmov.u16 q0, #64<br>
+ vmov.u16 q1, #8192<br>
+ vneg.s16 q1, q1<br>
+ mov r12, #16<br>
+.loop_filterP2S_64x32:<br>
+ subs r12, #1<br>
+.rept 2<br>
+ vld1.u8 {q9-q10}, [r0]!<br>
+ vmovl.u8 q8, d18<br>
+ vmovl.u8 q9, d19<br>
+ vmovl.u8 q11, d20<br>
+ vmovl.u8 q10, d21<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q8, q0<br>
+ vmla.s16 q3, q9, q0<br>
+ vst1.16 {q2-q3}, [r2]!<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q11, q0<br>
+ vmla.s16 q3, q10, q0<br>
+ vst1.16 {q2-q3}, [r2]!<br>
+<br>
+ vld1.u8 {q9-q10}, [r0], r1<br>
+ vmovl.u8 q8, d18<br>
+ vmovl.u8 q9, d19<br>
+ vmovl.u8 q11, d20<br>
+ vmovl.u8 q10, d21<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q8, q0<br>
+ vmla.s16 q3, q9, q0<br>
+ vst1.16 {q2-q3}, [r2]!<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q11, q0<br>
+ vmla.s16 q3, q10, q0<br>
+ vst1.16 {q2-q3}, [r2], r3<br>
+.endr<br>
+ bgt .loop_filterP2S_64x32<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_64x48_neon<br>
+ add r3, r3<br>
+ sub r1, #32<br>
+ sub r3, #96<br>
+ vmov.u16 q0, #64<br>
+ vmov.u16 q1, #8192<br>
+ vneg.s16 q1, q1<br>
+ mov r12, #24<br>
+.loop_filterP2S_64x48:<br>
+ subs r12, #1<br>
+.rept 2<br>
+ vld1.u8 {q9-q10}, [r0]!<br>
+ vmovl.u8 q8, d18<br>
+ vmovl.u8 q9, d19<br>
+ vmovl.u8 q11, d20<br>
+ vmovl.u8 q10, d21<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q8, q0<br>
+ vmla.s16 q3, q9, q0<br>
+ vst1.16 {q2-q3}, [r2]!<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q11, q0<br>
+ vmla.s16 q3, q10, q0<br>
+ vst1.16 {q2-q3}, [r2]!<br>
+<br>
+ vld1.u8 {q9-q10}, [r0], r1<br>
+ vmovl.u8 q8, d18<br>
+ vmovl.u8 q9, d19<br>
+ vmovl.u8 q11, d20<br>
+ vmovl.u8 q10, d21<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q8, q0<br>
+ vmla.s16 q3, q9, q0<br>
+ vst1.16 {q2-q3}, [r2]!<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q11, q0<br>
+ vmla.s16 q3, q10, q0<br>
+ vst1.16 {q2-q3}, [r2], r3<br>
+.endr<br>
+ bgt .loop_filterP2S_64x48<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_64x64_neon<br>
+ add r3, r3<br>
+ sub r1, #32<br>
+ sub r3, #96<br>
+ vmov.u16 q0, #64<br>
+ vmov.u16 q1, #8192<br>
+ vneg.s16 q1, q1<br>
+ mov r12, #32<br>
+.loop_filterP2S_64x64:<br>
+ subs r12, #1<br>
+.rept 2<br>
+ vld1.u8 {q9-q10}, [r0]!<br>
+ vmovl.u8 q8, d18<br>
+ vmovl.u8 q9, d19<br>
+ vmovl.u8 q11, d20<br>
+ vmovl.u8 q10, d21<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q8, q0<br>
+ vmla.s16 q3, q9, q0<br>
+ vst1.16 {q2-q3}, [r2]!<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q11, q0<br>
+ vmla.s16 q3, q10, q0<br>
+ vst1.16 {q2-q3}, [r2]!<br>
+<br>
+ vld1.u8 {q9-q10}, [r0], r1<br>
+ vmovl.u8 q8, d18<br>
+ vmovl.u8 q9, d19<br>
+ vmovl.u8 q11, d20<br>
+ vmovl.u8 q10, d21<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q8, q0<br>
+ vmla.s16 q3, q9, q0<br>
+ vst1.16 {q2-q3}, [r2]!<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q11, q0<br>
+ vmla.s16 q3, q10, q0<br>
+ vst1.16 {q2-q3}, [r2], r3<br>
+.endr<br>
+ bgt .loop_filterP2S_64x64<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_filterPixelToShort_48x64_neon<br>
+ add r3, r3<br>
+ sub r1, #32<br>
+ sub r3, #64<br>
+ vmov.u16 q0, #64<br>
+ vmov.u16 q1, #8192<br>
+ vneg.s16 q1, q1<br>
+ mov r12, #32<br>
+.loop_filterP2S_48x64:<br>
+ subs r12, #1<br>
+.rept 2<br>
+ vld1.u8 {q9-q10}, [r0]!<br>
+ vmovl.u8 q8, d18<br>
+ vmovl.u8 q9, d19<br>
+ vmovl.u8 q11, d20<br>
+ vmovl.u8 q10, d21<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q8, q0<br>
+ vmla.s16 q3, q9, q0<br>
+ vst1.16 {q2-q3}, [r2]!<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q11, q0<br>
+ vmla.s16 q3, q10, q0<br>
+ vst1.16 {q2-q3}, [r2]!<br>
+<br>
+ vld1.u8 {q9}, [r0], r1<br>
+ vmovl.u8 q8, d18<br>
+ vmovl.u8 q9, d19<br>
+ vmov q2, q1<br>
+ vmov q3, q1<br>
+ vmla.s16 q2, q8, q0<br>
+ vmla.s16 q3, q9, q0<br>
+ vst1.16 {q2-q3}, [r2], r3<br>
+.endr<br>
+ bgt .loop_filterP2S_48x64<br>
+ bx lr<br>
+endfunc<br>
diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/arm/ipfilter8.h<br>
--- a/source/common/arm/ipfilter8.h Tue Mar 01 12:18:18 2016 +0530<br>
+++ b/source/common/arm/ipfilter8.h Tue Mar 01 17:00:20 2016 +0530<br>
@@ -25,4 +25,30 @@<br>
#ifndef X265_IPFILTER8_ARM_H<br>
#define X265_IPFILTER8_ARM_H<br>
<br>
+void x265_filterPixelToShort_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+void x265_filterPixelToShort_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);<br>
+<br>
#endif // ifndef X265_IPFILTER8_ARM_H<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" rel="noreferrer" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br><br clear="all"><div><br></div>-- <br><div class="gmail_signature"><div dir="ltr"><div><div>Deepthi Nandakumar<br></div>Engineering Manager, x265<br></div>Multicoreware, Inc<br></div></div>
</div>