[x265] [PATCH] arm: Implement blockcopy_ps_neon ARM NEON

radhakrishnan at multicorewareinc.com radhakrishnan at multicorewareinc.com
Mon Feb 29 10:10:40 CET 2016


# HG changeset patch
# User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
# Date 1456480816 -19800
#      Fri Feb 26 15:30:16 2016 +0530
# Node ID cec0be1e488fdae48c37e49b9d7cb3e52944917b
# Parent  e7fe951785981cfe16b85d96e0f179acd946eaa6
arm: Implement blockcopy_ps_neon ARM NEON

diff -r e7fe95178598 -r cec0be1e488f source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp	Fri Feb 26 14:56:46 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp	Fri Feb 26 15:30:16 2016 +0530
@@ -49,6 +49,13 @@
         p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_neon);
         p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_neon);
 
+        // Blockcopy_ps
+        p.cu[BLOCK_4x4].copy_ps   = PFX(blockcopy_ps_4x4_neon);
+        p.cu[BLOCK_8x8].copy_ps   = PFX(blockcopy_ps_8x8_neon);
+        p.cu[BLOCK_16x16].copy_ps = PFX(blockcopy_ps_16x16_neon);
+        p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_neon);
+        p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_neon);
+
         // pixel_add_ps
         p.cu[BLOCK_4x4].add_ps   = PFX(pixel_add_ps_4x4_neon);
         p.cu[BLOCK_8x8].add_ps   = PFX(pixel_add_ps_8x8_neon);
diff -r e7fe95178598 -r cec0be1e488f source/common/arm/blockcopy8.S
--- a/source/common/arm/blockcopy8.S	Fri Feb 26 14:56:46 2016 +0530
+++ b/source/common/arm/blockcopy8.S	Fri Feb 26 15:30:16 2016 +0530
@@ -134,3 +134,102 @@
     bne             loop_csp64
     bx              lr
 endfunc
+
+// void blockcopy_ps(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
+function x265_blockcopy_ps_4x4_neon                 // 4x4: widen 8-bit pixels read via r2 (step r3) to 16-bit values stored via r0 (step r1)
+    lsl             r1, #1                          // double r1: destination step in bytes for 2-byte elements
+.rept 2                                             // body handles 2 rows; 2 repeats = 4 rows
+    vld1.32         {d0[0]}, [r2], r3               // load exactly 4 source bytes into lane 0 (no read past the 4-pixel row); r2 += r3
+    vld1.32         {d1[0]}, [r2], r3               // same for the following row
+    vmovl.u8        q1, d0                          // zero-extend u8 -> u16; only the low half (d2) holds the 4 loaded pixels
+    vmovl.u8        q2, d1                          // low half (d4) holds the second row
+    vst1.u16        {d2}, [r0], r1                  // store 4 x 16-bit (8 bytes), then r0 += r1
+    vst1.u16        {d4}, [r0], r1
+.endr
+    bx              lr
+endfunc
+
+function x265_blockcopy_ps_8x8_neon                 // 8x8: widen 8-bit pixels read via r2 (step r3) to 16-bit values stored via r0 (step r1)
+    lsl             r1, #1                          // double r1: destination step in bytes for 2-byte elements
+.rept 4                                             // body handles 2 rows; 4 repeats = 8 rows
+    vld1.u8         {d0}, [r2], r3                  // load 8 bytes (one row), then r2 += r3
+    vld1.u8         {d1}, [r2], r3                  // load the following row
+    vmovl.u8        q1, d0                          // zero-extend 8 x u8 -> 8 x u16
+    vmovl.u8        q2, d1
+    vst1.u16        {q1}, [r0], r1                  // store 16 bytes (one widened row), then r0 += r1
+    vst1.u16        {q2}, [r0], r1
+.endr
+    bx              lr
+endfunc
+
+function x265_blockcopy_ps_16x16_neon               // 16x16: widen 8-bit pixels read via r2 (step r3) to 16-bit values stored via r0 (step r1)
+    lsl             r1, #1                          // double r1: destination step in bytes for 2-byte elements
+.rept 8                                             // body handles 2 rows; 8 repeats = 16 rows
+    vld1.u8         {q0}, [r2], r3                  // load 16 bytes (one row), then r2 += r3
+    vld1.u8         {q1}, [r2], r3                  // load the following row
+    vmovl.u8        q8, d0                          // d0/d1 are the two halves of q0: first row -> q8, q9
+    vmovl.u8        q9, d1
+    vmovl.u8        q10, d2                         // d2/d3 are the two halves of q1: second row -> q10, q11
+    vmovl.u8        q11, d3
+    vst1.u16        {q8, q9}, [r0], r1              // store 32 bytes (one widened row), then r0 += r1
+    vst1.u16        {q10, q11}, [r0], r1
+.endr
+    bx              lr
+endfunc
+
+function x265_blockcopy_ps_32x32_neon               // 32x32: widen 8-bit pixels read via r2 (step r3) to 16-bit values stored via r0
+    lsl             r1, #1                          // double r1: destination step in bytes for 2-byte elements
+    sub             r1, #32                         // the first store of each row already advances r0 by 32 bytes via writeback
+    mov             r12, #4                         // loop counter: 4 passes x 4 repeats x 2 rows = 32 rows
+loop_cps32:
+    subs            r12, #1                         // decrement early; the resulting flags are tested by the bne after the unrolled body
+.rept 4
+    vld1.u8         {q0, q1}, [r2], r3              // load 32 bytes (one row), then r2 += r3
+    vld1.u8         {q2, q3}, [r2], r3              // load the following row
+    vmovl.u8        q8, d0                          // first row: d0..d3 -> q8..q11 (u8 -> u16)
+    vmovl.u8        q9, d1
+    vmovl.u8        q10, d2
+    vmovl.u8        q11, d3
+
+    vmovl.u8        q12, d4                         // second row: d4..d7 -> q12..q15
+    vmovl.u8        q13, d5
+    vmovl.u8        q14, d6
+    vmovl.u8        q15, d7
+
+    vst1.u16        {q8, q9}, [r0]!                 // first 32 bytes of the row, r0 += 32
+    vst1.u16        {q10, q11}, [r0], r1            // last 32 bytes, then r0 += r1 (remaining distance to the next row)
+    vst1.u16        {q12, q13}, [r0]!
+    vst1.u16        {q14, q15}, [r0], r1
+.endr
+    bne             loop_cps32                      // repeat until r12 reaches zero
+    bx              lr
+endfunc
+
+function x265_blockcopy_ps_64x64_neon               // 64x64: widen 8-bit pixels read via r2 to 16-bit values stored via r0
+    lsl             r1, #1                          // double r1: destination step in bytes for 2-byte elements
+    sub             r1, #96                         // three writeback stores per row already advance r0 by 3 x 32 bytes
+    sub             r3, #32                         // the first load of each row already advances r2 by 32 bytes
+    mov             r12, #16                        // loop counter: 16 passes x 4 repeats x 1 row = 64 rows
+loop_cps64:
+    subs            r12, #1                         // decrement early; the resulting flags are tested by the bne after the unrolled body
+.rept 4
+    vld1.u8         {q0, q1}, [r2]!                 // first 32 bytes of the row, r2 += 32
+    vld1.u8         {q2, q3}, [r2], r3              // last 32 bytes, then r2 += r3 (remaining distance to the next row)
+    vmovl.u8        q8, d0                          // first half of the row: d0..d3 -> q8..q11 (u8 -> u16)
+    vmovl.u8        q9, d1
+    vmovl.u8        q10, d2
+    vmovl.u8        q11, d3
+
+    vmovl.u8        q12, d4                         // second half of the row: d4..d7 -> q12..q15
+    vmovl.u8        q13, d5
+    vmovl.u8        q14, d6
+    vmovl.u8        q15, d7
+
+    vst1.u16        {q8, q9}, [r0]!                 // 4 x 32 bytes = one widened row of 64 values
+    vst1.u16        {q10, q11}, [r0]!
+    vst1.u16        {q12, q13}, [r0]!
+    vst1.u16        {q14, q15}, [r0], r1            // final store steps r0 by r1 to the start of the next row
+.endr
+    bne             loop_cps64                      // repeat until r12 reaches zero
+    bx              lr
+endfunc
diff -r e7fe95178598 -r cec0be1e488f source/common/arm/blockcopy8.h
--- a/source/common/arm/blockcopy8.h	Fri Feb 26 14:56:46 2016 +0530
+++ b/source/common/arm/blockcopy8.h	Fri Feb 26 15:30:16 2016 +0530
@@ -62,4 +62,10 @@
 void x265_blockcopy_sp_16x16_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
 void x265_blockcopy_sp_32x32_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
 void x265_blockcopy_sp_64x64_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+
+void x265_blockcopy_ps_4x4_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_ps_8x8_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_ps_16x16_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_ps_32x32_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_ps_64x64_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
 #endif // ifndef X265_I386_PIXEL_ARM_H


More information about the x265-devel mailing list