[x265] [PATCH] arm: Implement blockfill_s_neon ARM NEON

radhakrishnan at multicorewareinc.com radhakrishnan at multicorewareinc.com
Tue Mar 1 09:50:49 CET 2016


# HG changeset patch
# User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
# Date 1456814898 -19800
#      Tue Mar 01 12:18:18 2016 +0530
# Node ID 79c00b9bc2b81afef2e41526fc3c390528f3174c
# Parent  291beccb67606494a9a144ca2cc4411ab3e21e50
arm: Implement blockfill_s_neon ARM NEON

diff -r 291beccb6760 -r 79c00b9bc2b8 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp	Fri Feb 26 16:23:56 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp	Tue Mar 01 12:18:18 2016 +0530
@@ -42,6 +42,12 @@
 {
     if (cpuMask & X265_CPU_NEON)
     {
+        // Block_fill
+        p.cu[BLOCK_4x4].blockfill_s   = PFX(blockfill_s_4x4_neon);
+        p.cu[BLOCK_8x8].blockfill_s   = PFX(blockfill_s_8x8_neon);
+        p.cu[BLOCK_16x16].blockfill_s = PFX(blockfill_s_16x16_neon);
+        p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_neon);
+
         // Blockcopy_ss
         p.cu[BLOCK_4x4].copy_ss   = PFX(blockcopy_ss_4x4_neon);
         p.cu[BLOCK_8x8].copy_ss   = PFX(blockcopy_ss_8x8_neon);
diff -r 291beccb6760 -r 79c00b9bc2b8 source/common/arm/blockcopy8.S
--- a/source/common/arm/blockcopy8.S	Fri Feb 26 16:23:56 2016 +0530
+++ b/source/common/arm/blockcopy8.S	Tue Mar 01 12:18:18 2016 +0530
@@ -311,3 +311,52 @@
     bne             loop_css64
     bx              lr
 endfunc
+
+// void x265_blockfill_s_NxN_neon(int16_t* dst, intptr_t dstride, int16_t val), N = 4, 8, 16, 32
+function x265_blockfill_s_4x4_neon
+    // Fill a 4x4 block of int16_t with val.
+    // r0 = dst, r1 = dstride (in int16_t elements), r2 = val
+    lsl             r1, #1              // stride in bytes
+    vdup.u16        d0, r2              // one 4-lane row of val, built once
+.rept 4
+    vst1.16         {d0}, [r0], r1      // store a row, step to the next
+.endr
+    bx              lr
+endfunc
+
+function x265_blockfill_s_8x8_neon
+    // Fill an 8x8 block of int16_t with val.
+    // r0 = dst, r1 = dstride (in int16_t elements), r2 = val
+    lsl             r1, #1              // stride in bytes
+    vdup.u16        q0, r2              // one 8-lane row of val, built once
+.rept 8
+    vst1.16         {q0}, [r0], r1      // store a row, step to the next
+.endr
+    bx              lr
+endfunc
+
+function x265_blockfill_s_16x16_neon
+    lsl             r1, #1              // r1 = dstride in bytes (int16_t elements on entry)
+    vdup.u16        q0, r2              // q0:q1 = one 16-lane row of val (r2),
+    vdup.u16        q1, r2              // built once instead of once per row
+.rept 16
+    vst1.16         {q0, q1}, [r0], r1  // store 32 bytes at dst (r0), step to the next row
+.endr
+    bx              lr
+endfunc
+
+function x265_blockfill_s_32x32_neon
+    lsl             r1, #1              // r1 = dstride in bytes (int16_t elements on entry)
+    sub             r1, #32             // first store of each row already advances r0 by 32
+    vdup.u16        q0, r2              // q0:q1 = 16 lanes of val (half a row), built once
+    vdup.u16        q1, r2
+    mov             r12, #4             // 4 iterations x 8 unrolled rows = 32 rows
+loop_fill32:
+    subs            r12, #1             // the NEON stores below leave the flags intact for bne
+.rept 8
+    vst1.16         {q0, q1}, [r0]!     // first half of the row (32 bytes)
+    vst1.16         {q0, q1}, [r0], r1  // second half, then step to the next row
+.endr
+    bne             loop_fill32
+    bx              lr
+endfunc
diff -r 291beccb6760 -r 79c00b9bc2b8 source/common/arm/blockcopy8.h
--- a/source/common/arm/blockcopy8.h	Fri Feb 26 16:23:56 2016 +0530
+++ b/source/common/arm/blockcopy8.h	Tue Mar 01 12:18:18 2016 +0530
@@ -74,4 +74,9 @@
 void x265_blockcopy_ss_16x16_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
 void x265_blockcopy_ss_32x32_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
 void x265_blockcopy_ss_64x64_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+// Fill an NxN block of int16_t at dst with val; dstride is the row stride in int16_t elements.
+void x265_blockfill_s_4x4_neon(int16_t* dst, intptr_t dstride, int16_t val);
+void x265_blockfill_s_8x8_neon(int16_t* dst, intptr_t dstride, int16_t val);
+void x265_blockfill_s_16x16_neon(int16_t* dst, intptr_t dstride, int16_t val);
+void x265_blockfill_s_32x32_neon(int16_t* dst, intptr_t dstride, int16_t val);
 #endif // ifndef X265_I386_PIXEL_ARM_H


More information about the x265-devel mailing list