[x265] [PATCH] arm: Implement blockcopy_sp_neon ARM NEON

Mon Feb 29 10:09:08 CET 2016

# HG changeset patch
# User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
# Date 1456478806 -19800
#      Fri Feb 26 14:56:46 2016 +0530
# Node ID e7fe951785981cfe16b85d96e0f179acd946eaa6
# Parent  3465dfa53f9fb5294a36f4112613913d95f5b481
arm: Implement blockcopy_sp_neon ARM NEON

diff -r 3465dfa53f9f -r e7fe95178598 source/common/CMakeLists.txt

--- a/source/common/CMakeLists.txt	Thu Feb 25 15:15:07 2016 +0530
+++ b/source/common/CMakeLists.txt	Fri Feb 26 14:56:46 2016 +0530
@@ -89,7 +89,7 @@
     set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
 
     # add ARM assembly/intrinsic files here
-    set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S)
+    set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S)
     set(VEC_PRIMITIVES)
 
     set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
diff -r 3465dfa53f9f -r e7fe95178598 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp	Thu Feb 25 15:15:07 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp	Fri Feb 26 14:56:46 2016 +0530
@@ -42,6 +42,13 @@
 {
     if (cpuMask & X265_CPU_NEON)
     {
+        // Blockcopy_sp
+        p.cu[BLOCK_4x4].copy_sp   = PFX(blockcopy_sp_4x4_neon);
+        p.cu[BLOCK_8x8].copy_sp   = PFX(blockcopy_sp_8x8_neon);
+        p.cu[BLOCK_16x16].copy_sp = PFX(blockcopy_sp_16x16_neon);
+        p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_neon);
+        p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_neon);
+
         // pixel_add_ps
         p.cu[BLOCK_4x4].add_ps   = PFX(pixel_add_ps_4x4_neon);
         p.cu[BLOCK_8x8].add_ps   = PFX(pixel_add_ps_8x8_neon);
diff -r 3465dfa53f9f -r e7fe95178598 source/common/arm/blockcopy8.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/arm/blockcopy8.S	Fri Feb 26 14:56:46 2016 +0530
@@ -0,0 +1,136 @@
+/*****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata                            /* opened, but nothing is emitted into it in this file */
+
+.align 4                                    /* GNU as on ARM: power of two, i.e. 16-byte alignment */
+
+.text                                       /* everything below is assembled into .text */
+
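+/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
+ * Copies a square block of int16_t values from b into 8-bit pixels at a (low byte kept).
+ * r0   - a        destination, written one byte per pixel
+ * r1   - stridea  destination stride, applied directly as a byte offset
+ * r2   - b        source, read as 16-bit elements
+ * r3   - strideb  source stride in int16_t elements (doubled to bytes on entry) */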
+function x265_blockcopy_sp_4x4_neon
+    lsl             r3, #1                  /* strideb: int16 elements -> bytes */
+.rept 2                                     /* two rows per pass, four rows in total */
+    /* A row is 4 x int16 = 8 bytes: load d registers, a q load overreads 8 bytes. */
+    vld1.u16        {d0}, [r2], r3          /* row n   -> low half of q0 */
+    vld1.u16        {d1}, [r2], r3          /* row n+1 -> high half of q0 */
+    vmovn.u16       d0, q0                  /* d0 = row n (bytes 0-3), row n+1 (bytes 4-7) */
+    vst1.u32        {d0[0]}, [r0], r1       /* 4 pixels of row n */
+    vst1.u32        {d0[1]}, [r0], r1       /* 4 pixels of row n+1 */
+.endr
+    bx              lr
+endfunc
+
+function x265_blockcopy_sp_8x8_neon
+    lsl             r3, r3, #1              /* strideb: int16 elements -> bytes */
+.rept 4                                     /* two rows per pass, eight rows in total */
+    vld1.16         {d0, d1}, [r2], r3      /* row n:   8 x int16 in q0 */
+    vld1.16         {d2, d3}, [r2], r3      /* row n+1: 8 x int16 in q1 */
+    vmovn.i16       d0, q0                  /* keep the low byte of each element */
+    vmovn.i16       d1, q1
+    vst1.8          {d0}, [r0], r1          /* 8 pixels, then advance by stridea */
+    vst1.8          {d1}, [r0], r1
+.endr
+    bx              lr
+endfunc
+
+function x265_blockcopy_sp_16x16_neon
+    lsl             r3, r3, #1              /* strideb: int16 elements -> bytes */
+.rept 8                                     /* two rows per pass, sixteen rows in total */
+    vld1.16         {d0-d3}, [r2], r3       /* row n:   16 x int16 in q0-q1 */
+    vld1.16         {d4-d7}, [r2], r3       /* row n+1: 16 x int16 in q2-q3 */
+    vmovn.i16       d0, q0                  /* row n packed into q0 (d0, d1) */
+    vmovn.i16       d1, q1
+    vmovn.i16       d2, q2                  /* row n+1 packed into q1 (d2, d3) */
+    vmovn.i16       d3, q3
+    vst1.8          {d0, d1}, [r0], r1      /* 16 pixels, then advance by stridea */
+    vst1.8          {d2, d3}, [r0], r1
+.endr
+    bx              lr
+endfunc
+
+function x265_blockcopy_sp_32x32_neon
+    lsl             r3, r3, #1              /* strideb: int16 elements -> bytes */
+    sub             r3, r3, #32             /* the first load of a row already advances 32 bytes */
+    mov             r12, #4                 /* 4 passes x 8 rows = 32 rows */
+.Lloop_csp32:
+.rept 4                                     /* two rows per repetition */
+    vld1.16         {d0-d3}, [r2]!          /* row n,   elements 0-15 */
+    vld1.16         {d4-d7}, [r2], r3       /* row n,   elements 16-31 */
+    vld1.16         {d16-d19}, [r2]!        /* row n+1, elements 0-15 */
+    vld1.16         {d20-d23}, [r2], r3     /* row n+1, elements 16-31 */
+
+    /* Row n: q0-q3 narrowed to 32 bytes in q0-q1. */
+    vmovn.i16       d0, q0
+    vmovn.i16       d1, q1
+    vmovn.i16       d2, q2
+    vmovn.i16       d3, q3
+    /* Row n+1: q8-q11 narrowed to 32 bytes in q2-q3. */
+    vmovn.i16       d4, q8
+    vmovn.i16       d5, q9
+    vmovn.i16       d6, q10
+    vmovn.i16       d7, q11
+    vst1.8          {d0-d3}, [r0], r1       /* 32 pixels of row n */
+    vst1.8          {d4-d7}, [r0], r1       /* 32 pixels of row n+1 */
+.endr
+    subs            r12, r12, #1            /* count the pass down right before the branch */
+    bne             .Lloop_csp32
+    bx              lr
+endfunc
+
+function x265_blockcopy_sp_64x64_neon
+    lsl             r3, r3, #1              /* strideb: int16 elements -> bytes */
+    sub             r3, r3, #96             /* three 32-byte loads per row write back first */
+    sub             r1, r1, #32             /* the first store of a row already advances 32 bytes */
+    mov             r12, #16                /* 16 passes x 4 rows = 64 rows */
+.Lloop_csp64:
+.rept 4                                     /* one row per repetition */
+    vld1.16         {d0-d3}, [r2]!          /* elements 0-15 */
+    vld1.16         {d4-d7}, [r2]!          /* elements 16-31 */
+    vld1.16         {d16-d19}, [r2]!        /* elements 32-47 */
+    vld1.16         {d20-d23}, [r2], r3     /* elements 48-63, then on to the next row */
+
+    /* Elements 0-31: q0-q3 narrowed to 32 bytes in q0-q1. */
+    vmovn.i16       d0, q0
+    vmovn.i16       d1, q1
+    vmovn.i16       d2, q2
+    vmovn.i16       d3, q3
+    /* Elements 32-63: q8-q11 narrowed to 32 bytes in q2-q3. */
+    vmovn.i16       d4, q8
+    vmovn.i16       d5, q9
+    vmovn.i16       d6, q10
+    vmovn.i16       d7, q11
+    vst1.8          {d0-d3}, [r0]!          /* pixels 0-31 */
+    vst1.8          {d4-d7}, [r0], r1       /* pixels 32-63, then on to the next row */
+.endr
+    subs            r12, r12, #1            /* count the pass down right before the branch */
+    bne             .Lloop_csp64
+    bx              lr
+endfunc
diff -r 3465dfa53f9f -r e7fe95178598 source/common/arm/blockcopy8.h
--- a/source/common/arm/blockcopy8.h	Thu Feb 25 15:15:07 2016 +0530
+++ b/source/common/arm/blockcopy8.h	Fri Feb 26 14:56:46 2016 +0530
@@ -57,4 +57,9 @@
 void x265_cpy2Dto1D_shr_16x16_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 void x265_cpy2Dto1D_shr_32x32_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 
+void x265_blockcopy_sp_4x4_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_sp_8x8_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_sp_16x16_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_sp_32x32_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_sp_64x64_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
 #endif // ifndef X265_I386_PIXEL_ARM_H