[x265] [PATCH] arm: Implement sub_ps chroma ARM NEON
radhakrishnan at multicorewareinc.com
radhakrishnan at multicorewareinc.com
Wed Apr 20 13:15:36 CEST 2016
# HG changeset patch
# User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
# Date 1461145693 -19800
# Wed Apr 20 15:18:13 2016 +0530
# Node ID eed7e06770463bb86c28dade1f0e965215028064
# Parent a28ba6131b58829d04ffc04b9ac2c67bf850eee4
arm: Implement sub_ps chroma ARM NEON
diff -r a28ba6131b58 -r eed7e0677046 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp Tue Apr 19 12:12:00 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp Wed Apr 20 15:18:13 2016 +0530
@@ -446,6 +446,16 @@
p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_neon);
p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_neon);
+ // chroma sub_ps: the I420 entries (square NxN blocks) point at the square NxN NEON kernels; the I422 entries (N wide, 2N tall) point at the dedicated Nx2N kernels
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sub_ps = PFX(pixel_sub_ps_4x4_neon);
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sub_ps = PFX(pixel_sub_ps_8x8_neon);
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sub_ps = PFX(pixel_sub_ps_16x16_neon);
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_neon);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sub_ps = PFX(pixel_sub_ps_4x8_neon);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sub_ps = PFX(pixel_sub_ps_8x16_neon);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sub_ps = PFX(pixel_sub_ps_16x32_neon);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_neon);
+
// calc_Residual
p.cu[BLOCK_4x4].calcresidual = PFX(getResidual4_neon);
p.cu[BLOCK_8x8].calcresidual = PFX(getResidual8_neon);
diff -r a28ba6131b58 -r eed7e0677046 source/common/arm/pixel-util.S
--- a/source/common/arm/pixel-util.S Tue Apr 19 12:12:00 2016 +0530
+++ b/source/common/arm/pixel-util.S Wed Apr 20 15:18:13 2016 +0530
@@ -439,6 +439,94 @@
bx lr
endfunc
+// chroma sub_ps: void x265_pixel_sub_ps_WxH_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
+function x265_pixel_sub_ps_4x8_neon  // 4 wide, 8 rows: a = b0 - b1 with byte inputs widened to 16 bits; r0 = a, r1 = dstride, r2 = b0, r3 = b1
+ push {r4}  // r4 is used as scratch below; the stack arguments are now 4 bytes further from sp
+ lsl r1, r1, #1  // r1 = dstride * 2: row step of a in bytes (16-bit elements)
+ ldr r4, [sp, #4]  // r4 = sstride0 (first stack argument, just above the saved r4)
+ ldr r12, [sp, #8]  // r12 = sstride1 (second stack argument)
+.rept 4  // 4 unrolled copies x 2 rows each = 8 rows
+ vld1.u8 {d0}, [r2], r4  // row n of b0: loads 8 bytes although only 4 results are stored; r2 += sstride0
+ vld1.u8 {d1}, [r3], r12  // row n of b1 (also 8 bytes); r3 += sstride1
+ vld1.u8 {d2}, [r2], r4  // row n+1 of b0
+ vld1.u8 {d3}, [r3], r12  // row n+1 of b1
+ vsubl.u8 q2, d0, d1  // q2 (d4:d5) = d0 - d1, each byte lane widened to a 16-bit lane
+ vsubl.u8 q3, d2, d3  // q3 (d6:d7) = d2 - d3
+ vst1.s16 {d4}, [r0], r1  // store only the low 4 results (d4) for row n; r0 += dstride * 2
+ vst1.s16 {d6}, [r0], r1  // store only the low 4 results (d6) for row n+1
+.endr
+ pop {r4}  // restore the caller's r4
+ bx lr
+endfunc
+
+function x265_pixel_sub_ps_8x16_neon  // 8 wide, 16 rows: a = b0 - b1 with byte inputs widened to 16 bits; r0 = a, r1 = dstride, r2 = b0, r3 = b1
+ push {r4}  // r4 is used as scratch below; the stack arguments are now 4 bytes further from sp
+ lsl r1, r1, #1  // r1 = dstride * 2: row step of a in bytes (16-bit elements)
+ ldr r4, [sp, #4]  // r4 = sstride0 (first stack argument, just above the saved r4)
+ ldr r12, [sp, #8]  // r12 = sstride1 (second stack argument)
+.rept 8  // 8 unrolled copies x 2 rows each = 16 rows
+ vld1.u8 {d0}, [r2], r4  // row n of b0 (8 bytes); r2 += sstride0
+ vld1.u8 {d1}, [r3], r12  // row n of b1 (8 bytes); r3 += sstride1
+ vld1.u8 {d2}, [r2], r4  // row n+1 of b0
+ vld1.u8 {d3}, [r3], r12  // row n+1 of b1
+ vsubl.u8 q2, d0, d1  // q2 = d0 - d1, each byte lane widened to a 16-bit lane
+ vsubl.u8 q3, d2, d3  // q3 = d2 - d3
+ vst1.s16 {q2}, [r0], r1  // store all 8 results (16 bytes) for row n; r0 += dstride * 2
+ vst1.s16 {q3}, [r0], r1  // store all 8 results for row n+1
+.endr
+ pop {r4}  // restore the caller's r4
+ bx lr
+endfunc
+
+function x265_pixel_sub_ps_16x32_neon  // 16 wide, 32 rows: a = b0 - b1 with byte inputs widened to 16 bits; r0 = a, r1 = dstride, r2 = b0, r3 = b1
+ push {r4, r5}  // two scratch registers saved, so the stack arguments are now 8 bytes further from sp
+ lsl r1, r1, #1  // r1 = dstride * 2: row step of a in bytes (16-bit elements)
+ ldr r4, [sp, #8]  // r4 = sstride0 (first stack argument, above the saved r4 and r5)
+ ldr r12, [sp, #12]  // r12 = sstride1 (second stack argument)
+ mov r5, #4  // loop counter: 4 iterations x 4 unrolled copies x 2 rows = 32 rows
+loop_sub_16x32:
+ subs r5, r5, #1  // decrement early; the flags set here are consumed by the bne at the loop bottom (nothing in between writes them)
+.rept 4  // each copy handles 2 rows
+ vld1.u8 {q0}, [r2], r4  // row n of b0 (16 bytes in d0:d1); r2 += sstride0
+ vld1.u8 {q1}, [r3], r12  // row n of b1 (16 bytes in d2:d3); r3 += sstride1
+ vld1.u8 {q2}, [r2], r4  // row n+1 of b0 (d4:d5)
+ vld1.u8 {q3}, [r3], r12  // row n+1 of b1 (d6:d7)
+ vsubl.u8 q8, d0, d2  // row n, columns 0-7
+ vsubl.u8 q9, d1, d3  // row n, columns 8-15
+ vsubl.u8 q10, d4, d6  // row n+1, columns 0-7
+ vsubl.u8 q11, d5, d7  // row n+1, columns 8-15
+ vst1.s16 {q8, q9}, [r0], r1  // store 16 results (32 bytes) for row n; r0 += dstride * 2
+ vst1.s16 {q10, q11}, [r0], r1  // store 16 results for row n+1
+.endr
+ bne loop_sub_16x32  // repeat until r5 reaches zero
+ pop {r4, r5}  // restore the caller's r4 and r5
+ bx lr
+endfunc
+
+function x265_pixel_sub_ps_32x64_neon  // 32 wide, 64 rows: a = b0 - b1 with byte inputs widened to 16 bits; r0 = a, r1 = dstride, r2 = b0, r3 = b1
+ push {r4, r5}  // two scratch registers saved, so the stack arguments are now 8 bytes further from sp
+ lsl r1, r1, #1  // r1 = dstride * 2: row step of a in bytes (16-bit elements)
+ ldr r4, [sp, #8]  // r4 = sstride0 (first stack argument, above the saved r4 and r5)
+ ldr r12, [sp, #12]  // r12 = sstride1 (second stack argument)
+ sub r1, #32  // the first store of each row already advances r0 by 32 bytes via writeback, so the second adds only the remainder
+ mov r5, #16  // loop counter: 16 iterations x 4 unrolled copies x 1 row = 64 rows
+loop_sub_32x64:
+ subs r5, r5, #1  // decrement early; the flags set here are consumed by the bne at the loop bottom (nothing in between writes them)
+.rept 4  // each copy handles 1 row
+ vld1.u8 {q0, q1}, [r2], r4  // one row of b0 (32 bytes in d0-d3); r2 += sstride0
+ vld1.u8 {q2, q3}, [r3], r12  // one row of b1 (32 bytes in d4-d7); r3 += sstride1
+ vsubl.u8 q8, d0, d4  // columns 0-7
+ vsubl.u8 q9, d1, d5  // columns 8-15
+ vsubl.u8 q10, d2, d6  // columns 16-23
+ vsubl.u8 q11, d3, d7  // columns 24-31
+ vst1.s16 {q8, q9}, [r0]!  // store results for columns 0-15 (32 bytes); r0 += 32
+ vst1.s16 {q10, q11}, [r0], r1  // store results for columns 16-31; r0 += dstride * 2 - 32, landing on the next row
+.endr
+ bne loop_sub_32x64  // repeat until r5 reaches zero
+ pop {r4, r5}  // restore the caller's r4 and r5
+ bx lr
+endfunc
+
// void x265_pixel_add_ps_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
function x265_pixel_add_ps_4x4_neon
push {r4}
diff -r a28ba6131b58 -r eed7e0677046 source/common/arm/pixel.h
--- a/source/common/arm/pixel.h Tue Apr 19 12:12:00 2016 +0530
+++ b/source/common/arm/pixel.h Wed Apr 20 15:18:13 2016 +0530
@@ -157,6 +157,10 @@
void x265_pixel_sub_ps_16x16_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
void x265_pixel_sub_ps_32x32_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
void x265_pixel_sub_ps_64x64_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
+void x265_pixel_sub_ps_4x8_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); // 4x8 block, a = b0 - b1
+void x265_pixel_sub_ps_8x16_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); // 8x16 block, a = b0 - b1
+void x265_pixel_sub_ps_16x32_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); // 16x32 block, a = b0 - b1
+void x265_pixel_sub_ps_32x64_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); // 32x64 block, a = b0 - b1
void x265_pixel_add_ps_4x4_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
void x265_pixel_add_ps_8x8_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
More information about the x265-devel
mailing list