[x265] [PATCH] arm: Implement blockcopy_ss_neon ARM NEON

Mon Feb 29 10:15:36 CET 2016

# HG changeset patch
# User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
# Date 1456484036 -19800
#      Fri Feb 26 16:23:56 2016 +0530
# Node ID 3363b15c06e563cb119085facfd190757cae3945
# Parent  cec0be1e488fdae48c37e49b9d7cb3e52944917b
arm: Implement blockcopy_ss_neon ARM NEON

diff -r cec0be1e488f -r 3363b15c06e5 source/common/arm/asm-primitives.cpp

--- a/source/common/arm/asm-primitives.cpp	Fri Feb 26 15:30:16 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp	Fri Feb 26 16:23:56 2016 +0530
@@ -42,12 +42,12 @@
 {
     if (cpuMask & X265_CPU_NEON)
     {
-        // Blockcopy_sp
-        p.cu[BLOCK_4x4].copy_sp   = PFX(blockcopy_sp_4x4_neon);
-        p.cu[BLOCK_8x8].copy_sp   = PFX(blockcopy_sp_8x8_neon);
-        p.cu[BLOCK_16x16].copy_sp = PFX(blockcopy_sp_16x16_neon);
-        p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_neon);
-        p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_neon);
+        // Blockcopy_ss
+        p.cu[BLOCK_4x4].copy_ss   = PFX(blockcopy_ss_4x4_neon);
+        p.cu[BLOCK_8x8].copy_ss   = PFX(blockcopy_ss_8x8_neon);
+        p.cu[BLOCK_16x16].copy_ss = PFX(blockcopy_ss_16x16_neon);
+        p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_neon);
+        p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_neon);
 
         // Blockcopy_ps
         p.cu[BLOCK_4x4].copy_ps   = PFX(blockcopy_ps_4x4_neon);
@@ -56,6 +56,13 @@
         p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_neon);
         p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_neon);
 
+        // Blockcopy_sp
+        p.cu[BLOCK_4x4].copy_sp   = PFX(blockcopy_sp_4x4_neon);
+        p.cu[BLOCK_8x8].copy_sp   = PFX(blockcopy_sp_8x8_neon);
+        p.cu[BLOCK_16x16].copy_sp = PFX(blockcopy_sp_16x16_neon);
+        p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_neon);
+        p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_neon);
+
         // pixel_add_ps
         p.cu[BLOCK_4x4].add_ps   = PFX(pixel_add_ps_4x4_neon);
         p.cu[BLOCK_8x8].add_ps   = PFX(pixel_add_ps_8x8_neon);
diff -r cec0be1e488f -r 3363b15c06e5 source/common/arm/blockcopy8.S
--- a/source/common/arm/blockcopy8.S	Fri Feb 26 15:30:16 2016 +0530
+++ b/source/common/arm/blockcopy8.S	Fri Feb 26 16:23:56 2016 +0530
@@ -233,3 +233,81 @@
     bne             loop_cps64
     bx              lr
 endfunc
+
+// void x265_blockcopy_ss(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
+function x265_blockcopy_ss_4x4_neon         // copy 4 rows of four 16-bit values from [r2] to [r0]
+    lsl             r1, #1                  // r1 (stridea in the prototype comment) doubled: 16-bit elements to bytes
+    lsl             r3, #1                  // r3 (strideb in the prototype comment) doubled: 16-bit elements to bytes
+.rept 2                                     // 2 unrolled passes, 2 rows each = 4 rows
+    vld1.u16        {d0}, [r2], r3          // load one row (8 bytes) from the source, then r2 += r3
+    vld1.u16        {d1}, [r2], r3          // load the following row
+    vst1.u16        {d0}, [r0], r1          // store the first row to the destination, then r0 += r1
+    vst1.u16        {d1}, [r0], r1          // store the second row
+.endr
+    bx              lr                      // return; only r0-r3 and d0-d1 were written
+endfunc
+
+function x265_blockcopy_ss_8x8_neon         // copy 8 rows of eight 16-bit values from [r2] to [r0]
+    lsl             r1, #1                  // destination stride: 16-bit elements to bytes
+    lsl             r3, #1                  // source stride: 16-bit elements to bytes
+.rept 4                                     // 4 unrolled passes, 2 rows each = 8 rows
+    vld1.u16        {q0}, [r2], r3          // one row = 16 bytes = one Q register; r2 += r3
+    vld1.u16        {q1}, [r2], r3          // following row
+    vst1.u16        {q0}, [r0], r1          // write the first row; r0 += r1
+    vst1.u16        {q1}, [r0], r1          // write the second row
+.endr
+    bx              lr                      // return
+endfunc
+
+function x265_blockcopy_ss_16x16_neon       // copy 16 rows of sixteen 16-bit values from [r2] to [r0]
+    lsl             r1, #1                  // destination stride: 16-bit elements to bytes
+    lsl             r3, #1                  // source stride: 16-bit elements to bytes
+.rept 8                                     // 8 unrolled passes, 2 rows each = 16 rows
+    vld1.u16        {q0, q1}, [r2], r3      // one row = 32 bytes = two Q registers; r2 += r3
+    vld1.u16        {q2, q3}, [r2], r3      // following row
+    vst1.u16        {q0, q1}, [r0], r1      // write the first row; r0 += r1
+    vst1.u16        {q2, q3}, [r0], r1      // write the second row
+.endr
+    bx              lr                      // return
+endfunc
+
+function x265_blockcopy_ss_32x32_neon       // copy 32 rows of thirty-two 16-bit values from [r2] to [r0]
+    lsl             r1, #1                  // destination stride: 16-bit elements to bytes
+    lsl             r3, #1                  // source stride: 16-bit elements to bytes
+    mov             r12, #4                 // outer loop count: 4 iterations x 8 unrolled rows = 32 rows
+    sub             r1, #32                 // the first store of a row already advances r0 by 32 bytes
+    sub             r3, #32                 // the first load of a row already advances r2 by 32 bytes
+loop_css32:
+    subs            r12, #1                 // sets the flags tested by the bne below; no flag-setting instruction runs in between
+.rept 8                                     // one 64-byte row per pass
+    vld1.u16        {q0, q1}, [r2]!         // first 32 bytes of the row; write-back moves r2 past them
+    vld1.u16        {q2, q3}, [r2], r3      // last 32 bytes; r2 += r3 puts it at the start of the next row
+    vst1.u16        {q0, q1}, [r0]!         // first 32 bytes out; write-back moves r0 past them
+    vst1.u16        {q2, q3}, [r0], r1      // last 32 bytes out; r0 += r1 puts it at the start of the next row
+.endr
+    bne             loop_css32              // repeat until r12 reaches zero
+    bx              lr                      // return
+endfunc
+
+function x265_blockcopy_ss_64x64_neon       // copy 64 rows of sixty-four 16-bit values from [r2] to [r0]
+    lsl             r1, #1                  // destination stride: 16-bit elements to bytes
+    lsl             r3, #1                  // source stride: 16-bit elements to bytes
+    mov             r12, #8                 // outer loop count: 8 iterations x 8 unrolled rows = 64 rows
+    sub             r1, #96                 // three write-back stores per row already advance r0 by 3 x 32 bytes
+    sub             r3, #96                 // three write-back loads per row already advance r2 by 3 x 32 bytes
+loop_css64:
+    subs            r12, #1                 // sets the flags tested by the bne below; no flag-setting instruction runs in between
+.rept 8                                     // one 128-byte row per pass
+    vld1.u16        {q0, q1}, [r2]!         // bytes 0-31 of the row
+    vld1.u16        {q2, q3}, [r2]!         // bytes 32-63
+    vld1.u16        {q8, q9}, [r2]!         // bytes 64-95; NOTE(review): presumably skips q4-q7 because they are callee-saved, confirm against the ABI
+    vld1.u16        {q10, q11}, [r2], r3    // bytes 96-127; r2 += r3 puts it at the start of the next row
+
+    vst1.u16        {q0, q1}, [r0]!         // bytes 0-31 out
+    vst1.u16        {q2, q3}, [r0]!         // bytes 32-63 out
+    vst1.u16        {q8, q9}, [r0]!         // bytes 64-95 out
+    vst1.u16        {q10, q11}, [r0], r1    // bytes 96-127 out; r0 += r1 puts it at the start of the next row
+.endr
+    bne             loop_css64              // repeat until r12 reaches zero
+    bx              lr                      // return
+endfunc
diff -r cec0be1e488f -r 3363b15c06e5 source/common/arm/blockcopy8.h
--- a/source/common/arm/blockcopy8.h	Fri Feb 26 15:30:16 2016 +0530
+++ b/source/common/arm/blockcopy8.h	Fri Feb 26 16:23:56 2016 +0530
@@ -68,4 +68,10 @@
 void x265_blockcopy_ps_16x16_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
 void x265_blockcopy_ps_32x32_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
 void x265_blockcopy_ps_64x64_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+
+void x265_blockcopy_ss_4x4_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_ss_8x8_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_ss_16x16_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_ss_32x32_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_ss_64x64_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
 #endif // ifndef X265_I386_PIXEL_ARM_H