[x265] [PATCH] arm: Implement add_ps chroma ARM NEON
radhakrishnan at multicorewareinc.com
radhakrishnan at multicorewareinc.com
Wed Apr 20 13:15:53 CEST 2016
# HG changeset patch
# User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
# Date 1461146985 -19800
# Wed Apr 20 15:39:45 2016 +0530
# Node ID d5b5c334f9f74f79804d17352515ee89707f8bd1
# Parent eed7e06770463bb86c28dade1f0e965215028064
arm: Implement add_ps chroma ARM NEON
diff -r eed7e0677046 -r d5b5c334f9f7 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp Wed Apr 20 15:18:13 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp Wed Apr 20 15:39:45 2016 +0530
@@ -420,6 +420,16 @@
p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_neon);
p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_neon);
+ // chroma add_ps
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps = PFX(pixel_add_ps_4x4_neon);
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps = PFX(pixel_add_ps_8x8_neon);
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps = PFX(pixel_add_ps_16x16_neon);
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_neon);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps = PFX(pixel_add_ps_4x8_neon);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps = PFX(pixel_add_ps_8x16_neon);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps = PFX(pixel_add_ps_16x32_neon);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_neon);
+
// cpy2Dto1D_shr
p.cu[BLOCK_4x4].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_4x4_neon);
p.cu[BLOCK_8x8].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_8x8_neon);
diff -r eed7e0677046 -r d5b5c334f9f7 source/common/arm/pixel-util.S
--- a/source/common/arm/pixel-util.S Wed Apr 20 15:18:13 2016 +0530
+++ b/source/common/arm/pixel-util.S Wed Apr 20 15:39:45 2016 +0530
@@ -580,15 +580,16 @@
bx lr
endfunc
-function x265_pixel_add_ps_16x16_neon
+.macro pixel_add_ps_16xN_neon h i
+function x265_pixel_add_ps_16x\h\()_neon
push {r4, r5}
ldr r4, [sp, #8]
ldr r12, [sp, #12]
lsl r12, #1
vmov.u16 q10, #255
veor.u16 q11, q11
- mov r5, #2
-loop_addps16:
+ mov r5, #\i
+loop_addps_16x\h\():
subs r5, #1
.rept 4
vld1.u8 {q0}, [r2], r4
@@ -613,23 +614,28 @@
vst1.8 {d4, d5}, [r0], r1
vst1.8 {d0, d1}, [r0], r1
.endr
- bne loop_addps16
+ bne loop_addps_16x\h
pop {r4, r5}
bx lr
endfunc
+.endm
- function x265_pixel_add_ps_32x32_neon
+pixel_add_ps_16xN_neon 16 2
+pixel_add_ps_16xN_neon 32 4
+
+.macro pixel_add_ps_32xN_neon h i
+ function x265_pixel_add_ps_32x\h\()_neon
push {r4, r5}
ldr r4, [sp, #8]
ldr r12, [sp, #12]
lsl r12, #1
vmov.u16 q10, #255
veor.u16 q11, q11
- mov r5, #4
+ mov r5, #\i
sub r12, #32
-loop_addps32:
+loop_addps_32x\h\():
subs r5, #1
-.rept 8
+.rept 4
vld1.u8 {q0, q1}, [r2], r4
vld1.s16 {q8, q9}, [r3]!
vld1.s16 {q12, q13}, [r3], r12
@@ -650,10 +656,14 @@
vqmovun.s16 d3, q15
vst1.8 {q0, q1}, [r0], r1
.endr
- bne loop_addps32
+ bne loop_addps_32x\h
pop {r4, r5}
bx lr
endfunc
+.endm
+
+pixel_add_ps_32xN_neon 32 8
+pixel_add_ps_32xN_neon 64 16
function x265_pixel_add_ps_64x64_neon
push {r4, r5}
@@ -715,6 +725,63 @@
bx lr
endfunc
+// Chroma add_ps
+function x265_pixel_add_ps_4x8_neon // 4-wide, 8-row add_ps: a[] = saturate_u8(b0[] + b1[]); r0=a, r1=dstride, r2=b0, r3=b1 (names per the pixel.h prototype)
+ push {r4} // r4 is used as a scratch register below, so preserve the caller value
+ ldr r4, [sp, #4] // first stack argument (sstride0 per the prototype); offset 4 skips the word just pushed
+ ldr r12, [sp, #8] // second stack argument (sstride1 per the prototype)
+ lsl r12, #1 // double r12: it post-increments r3, which is read as 16-bit elements
+ vmov.u16 q10, #255 // NOTE(review): q10 is not read anywhere else in this function
+ veor.u16 q11, q11 // NOTE(review): q11 is not read anywhere else in this function
+ veor.u16 d3, d3 // clear the upper half of q1; only d2 is loaded from memory below
+ veor.u16 d5, d5 // clear the upper half of q2; only d4 is loaded from memory below
+.rept 4 // each repetition handles two rows, so 4 repetitions cover 8 rows
+ vld1.u8 {d0}, [r2], r4 // row n of b0: loads 8 bytes although only 4 are stored; NOTE(review): confirm callers tolerate the 4-byte over-read
+ vld1.u8 {d1}, [r2], r4 // row n+1 of b0, same 8-byte load
+ vld1.s16 {d2}, [r3], r12 // row n of b1: four 16-bit values into the low half of q1
+ vld1.s16 {d4}, [r3], r12 // row n+1 of b1: four 16-bit values into the low half of q2
+ vmovl.u8 q8, d0 // widen the row n bytes to unsigned 16-bit lanes
+ vmovl.u8 q9, d1 // widen the row n+1 bytes to unsigned 16-bit lanes
+ vadd.s16 q1, q1, q8 // row n sum; lanes 4-7 (d3) pick up extra data but are never stored
+ vadd.s16 q2, q2, q9 // row n+1 sum; lanes 4-7 (d5) likewise never stored
+ vqmovun.s16 d0, q1 // saturating narrow of signed 16-bit sums to unsigned 8-bit
+ vqmovun.s16 d1, q2 // same narrowing for row n+1
+ vst1.32 {d0[0]}, [r0], r1 // store only the low 32 bits (4 result bytes) of row n, then advance r0 by r1
+ vst1.32 {d1[0]}, [r0], r1 // store the 4 result bytes of row n+1
+.endr
+ pop {r4} // restore the caller value of r4
+ bx lr // return
+endfunc
+
+function x265_pixel_add_ps_8x16_neon // 8-wide, 16-row add_ps: a[] = saturate_u8(b0[] + b1[]); r0=a, r1=dstride, r2=b0, r3=b1 (names per the pixel.h prototype)
+ push {r4, r5} // r4 and r5 are used as scratch registers below, so preserve the caller values
+ ldr r4, [sp, #8] // first stack argument (sstride0 per the prototype); offset 8 skips the two words just pushed
+ ldr r12, [sp, #12] // second stack argument (sstride1 per the prototype)
+ lsl r12, #1 // double r12: it post-increments r3, which is read as 16-bit elements
+ vmov.u16 q10, #255 // NOTE(review): q10 is not read anywhere else in this function
+ veor.u16 q11, q11 // NOTE(review): q11 is not read anywhere else in this function
+ mov r5, #2 // loop counter: 2 passes of 8 rows each
+loop_add_8x16: // top of the 8-row pass
+ subs r5, #1 // decrement up front; the flags set here are consumed by the bne after the unrolled body
+.rept 4 // each repetition handles two rows, so one pass covers 8 rows
+ vld1.u8 {d0}, [r2], r4 // row n of b0: 8 bytes, then advance r2 by r4
+ vld1.u8 {d1}, [r2], r4 // row n+1 of b0
+ vld1.s16 {q8}, [r3], r12 // row n of b1: eight 16-bit values, then advance r3 by r12
+ vld1.s16 {q9}, [r3], r12 // row n+1 of b1
+ vmovl.u8 q1, d0 // widen the row n bytes to unsigned 16-bit lanes
+ vmovl.u8 q2, d1 // widen the row n+1 bytes to unsigned 16-bit lanes
+ vadd.s16 q1, q1, q8 // row n: widened b0 values plus b1 values
+ vadd.s16 q2, q2, q9 // row n+1: widened b0 values plus b1 values
+ vqmovun.s16 d0, q1 // saturating narrow of signed 16-bit sums to unsigned 8-bit
+ vqmovun.s16 d1, q2 // same narrowing for row n+1
+ vst1.8 {d0}, [r0], r1 // store the 8 result bytes of row n, then advance r0 by r1
+ vst1.8 {d1}, [r0], r1 // store the 8 result bytes of row n+1
+.endr
+ bne loop_add_8x16 // run another pass while the counter decremented by subs is non-zero
+ pop {r4, r5} // restore the caller values of r4 and r5
+ bx lr // return
+endfunc
+
// void scale1D_128to64(pixel *dst, const pixel *src)
function x265_scale1D_128to64_neon
mov r12, #32
diff -r eed7e0677046 -r d5b5c334f9f7 source/common/arm/pixel.h
--- a/source/common/arm/pixel.h Wed Apr 20 15:18:13 2016 +0530
+++ b/source/common/arm/pixel.h Wed Apr 20 15:39:45 2016 +0530
@@ -167,6 +167,10 @@
void x265_pixel_add_ps_16x16_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
void x265_pixel_add_ps_32x32_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
void x265_pixel_add_ps_64x64_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
+void x265_pixel_add_ps_4x8_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
+void x265_pixel_add_ps_8x16_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
+void x265_pixel_add_ps_16x32_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
+void x265_pixel_add_ps_32x64_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
void x265_pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
#endif // ifndef X265_I386_PIXEL_ARM_H
More information about the x265-devel
mailing list