[x265] [PATCH] arm: Implement blockcopy_sp_neon ARM NEON
radhakrishnan at multicorewareinc.com
radhakrishnan at multicorewareinc.com
Mon Feb 29 10:09:08 CET 2016
# HG changeset patch
# User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
# Date 1456478806 -19800
# Fri Feb 26 14:56:46 2016 +0530
# Node ID e7fe951785981cfe16b85d96e0f179acd946eaa6
# Parent 3465dfa53f9fb5294a36f4112613913d95f5b481
arm: Implement blockcopy_sp_neon ARM NEON
diff -r 3465dfa53f9f -r e7fe95178598 source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt Thu Feb 25 15:15:07 2016 +0530
+++ b/source/common/CMakeLists.txt Fri Feb 26 14:56:46 2016 +0530
@@ -89,7 +89,7 @@
set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
# add ARM assembly/intrinsic files here
- set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S)
+ set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S)
set(VEC_PRIMITIVES)
set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
diff -r 3465dfa53f9f -r e7fe95178598 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp Thu Feb 25 15:15:07 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp Fri Feb 26 14:56:46 2016 +0530
@@ -42,6 +42,13 @@
{
if (cpuMask & X265_CPU_NEON)
{
+ // Blockcopy_sp
+ p.cu[BLOCK_4x4].copy_sp = PFX(blockcopy_sp_4x4_neon);
+ p.cu[BLOCK_8x8].copy_sp = PFX(blockcopy_sp_8x8_neon);
+ p.cu[BLOCK_16x16].copy_sp = PFX(blockcopy_sp_16x16_neon);
+ p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_neon);
+ p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_neon);
+
// pixel_add_ps
p.cu[BLOCK_4x4].add_ps = PFX(pixel_add_ps_4x4_neon);
p.cu[BLOCK_8x8].add_ps = PFX(pixel_add_ps_8x8_neon);
diff -r 3465dfa53f9f -r e7fe95178598 source/common/arm/blockcopy8.S
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/arm/blockcopy8.S Fri Feb 26 14:56:46 2016 +0530
@@ -0,0 +1,136 @@
+/*****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
+ *
+ * r0 - a
+ * r1 - stridea
+ * r2 - b
+ * r3 - strideb */
+function x265_blockcopy_sp_4x4_neon @ copy a 4x4 block of int16 coeffs into 8-bit pixels
+ lsl r3, #1 @ strideb counts int16 elements; convert to a byte stride
+.rept 2 @ 2 iterations x 2 rows each = 4 rows
+ vld1.u16 {q0}, [r2], r3 @ load 16 bytes (8 int16); only the first 4 are a row. NOTE(review): 8-byte over-read per row — assumes padded source buffers, confirm
+ vld1.u16 {q1}, [r2], r3 @ next source row
+ vmovn.u16 d0, q0 @ narrow 16-bit values to 8-bit (low halves)
+ vmovn.u16 d1, q1
+ vst1.u32 {d0[0]}, [r0], r1 @ store exactly 4 pixels (one row), advance by stridea
+ vst1.u32 {d1[0]}, [r0], r1
+.endr
+ bx lr
+endfunc
+
+function x265_blockcopy_sp_8x8_neon @ copy an 8x8 block of int16 coeffs into 8-bit pixels
+ lsl r3, #1 @ strideb counts int16 elements; convert to a byte stride
+.rept 4 @ 4 iterations x 2 rows each = 8 rows
+ vld1.u16 {q0}, [r2], r3 @ load one full row: 8 int16 = 16 bytes
+ vld1.u16 {q1}, [r2], r3 @ next source row
+ vmovn.u16 d0, q0 @ narrow 16-bit values to 8-bit
+ vmovn.u16 d1, q1
+ vst1.u8 {d0}, [r0], r1 @ store 8 pixels (one row), advance by stridea
+ vst1.u8 {d1}, [r0], r1
+.endr
+ bx lr
+endfunc
+
+function x265_blockcopy_sp_16x16_neon @ copy a 16x16 block of int16 coeffs into 8-bit pixels
+ lsl r3, #1 @ strideb counts int16 elements; convert to a byte stride
+.rept 8 @ 8 iterations x 2 rows each = 16 rows
+ vld1.u16 {q0, q1}, [r2], r3 @ one full row: 16 int16 = 32 bytes
+ vld1.u16 {q2, q3}, [r2], r3 @ next source row
+ vmovn.u16 d0, q0 @ row 0 low 8 pixels (d0 is the low half of q0 — safe, q0 already read)
+ vmovn.u16 d1, q1 @ row 0 high 8 pixels; writing d1 clobbers q0 but not q1
+ vmovn.u16 d2, q2 @ row 1 low 8; d2 aliases q1's low half, q1 was consumed above
+ vmovn.u16 d3, q3 @ row 1 high 8
+ vst1.u8 {q0}, [r0], r1 @ q0 = d0:d1 = 16 narrowed pixels of row 0
+ vst1.u8 {q1}, [r0], r1 @ q1 = d2:d3 = 16 narrowed pixels of row 1
+.endr
+ bx lr
+endfunc
+
+function x265_blockcopy_sp_32x32_neon @ copy a 32x32 block of int16 coeffs into 8-bit pixels
+ mov r12, #4 @ outer loop count: 4 x (4 rept x 2 rows) = 32 rows
+ lsl r3, #1 @ strideb in int16 elements -> byte stride
+ sub r3, #32 @ first half-row load post-increments by 32 via '!', so pre-subtract it
+loop_csp32:
+ subs r12, #1
+.rept 4 @ each rept iteration handles 2 full rows (64 source bytes each)
+ vld1.u16 {q0, q1}, [r2]! @ row 0, first 16 int16; advances r2 by 32
+ vld1.u16 {q2, q3}, [r2], r3 @ row 0, last 16 int16; net advance = full byte stride
+ vld1.u16 {q8, q9}, [r2]! @ row 1, first half
+ vld1.u16 {q10, q11}, [r2], r3 @ row 1, second half
+
+ vmovn.u16 d0, q0 @ row 0 pixels 0-7
+ vmovn.u16 d1, q1 @ pixels 8-15; d1 aliases q0 (already read)
+ vmovn.u16 d2, q2 @ pixels 16-23; d2 aliases q1 (already narrowed into d1)
+ vmovn.u16 d3, q3 @ pixels 24-31
+
+ vmovn.u16 d4, q8 @ row 1 pixels 0-7; d4/d5 alias q2 (already consumed)
+ vmovn.u16 d5, q9
+ vmovn.u16 d6, q10 @ d6/d7 alias q3 (already consumed)
+ vmovn.u16 d7, q11
+
+ vst1.u8 {q0, q1}, [r0], r1 @ 32 pixels of row 0 in one store, advance by stridea
+ vst1.u8 {q2, q3}, [r0], r1 @ 32 pixels of row 1
+.endr
+ bne loop_csp32
+ bx lr
+endfunc
+
+function x265_blockcopy_sp_64x64_neon @ copy a 64x64 block of int16 coeffs into 8-bit pixels
+ mov r12, #16 @ outer loop count: 16 x (4 rept x 1 row) = 64 rows
+ lsl r3, #1 @ strideb in int16 elements -> byte stride
+ sub r3, #96 @ three '!' loads below advance r2 by 96; pre-subtract for net = stride
+ sub r1, #32 @ first half-row store advances r0 by 32 via '!'; pre-subtract likewise
+loop_csp64:
+ subs r12, #1
+.rept 4 @ each rept iteration handles 1 full row: 128 source bytes -> 64 pixels
+ vld1.u16 {q0, q1}, [r2]! @ int16 0-15 of the row
+ vld1.u16 {q2, q3}, [r2]! @ int16 16-31
+ vld1.u16 {q8, q9}, [r2]! @ int16 32-47
+ vld1.u16 {q10, q11}, [r2], r3 @ int16 48-63; net row advance = full byte stride
+
+ vmovn.u16 d0, q0 @ pixels 0-7
+ vmovn.u16 d1, q1 @ pixels 8-15; d1 aliases q0 (already read)
+ vmovn.u16 d2, q2 @ pixels 16-23; d2 aliases q1 (already narrowed)
+ vmovn.u16 d3, q3 @ pixels 24-31
+
+ vmovn.u16 d4, q8 @ pixels 32-39; d4/d5 alias q2 (already consumed)
+ vmovn.u16 d5, q9
+ vmovn.u16 d6, q10 @ pixels 40-47 onward; d6/d7 alias q3 (already consumed)
+ vmovn.u16 d7, q11
+
+ vst1.u8 {q0, q1}, [r0]! @ first 32 pixels of the row
+ vst1.u8 {q2, q3}, [r0], r1 @ last 32 pixels; net advance = stridea
+.endr
+ bne loop_csp64
+ bx lr
+endfunc
diff -r 3465dfa53f9f -r e7fe95178598 source/common/arm/blockcopy8.h
--- a/source/common/arm/blockcopy8.h Thu Feb 25 15:15:07 2016 +0530
+++ b/source/common/arm/blockcopy8.h Fri Feb 26 14:56:46 2016 +0530
@@ -57,4 +57,9 @@
void x265_cpy2Dto1D_shr_16x16_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
void x265_cpy2Dto1D_shr_32x32_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_blockcopy_sp_4x4_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_sp_8x8_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_sp_16x16_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_sp_32x32_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_sp_64x64_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
#endif // ifndef X265_I386_PIXEL_ARM_H
More information about the x265-devel
mailing list