<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><pre>Just a small piece of advice.</pre><pre>Register allocation rules:</pre><pre>1. R0-R3 may be used freely.</pre><pre>2. If you need one more register, R12 is free.</pre><pre>3. If you need yet another, R14 (LR) may be used after pushing it to the stack.</pre><pre><span style="line-height: 1.7;">4. After that, use R4-R11 with stack save/restore.</span>
</pre><pre><br></pre><pre>Your patch uses more registers than necessary because it does not apply the rules above.</pre><pre><br>At 2016-04-20 19:15:36,radhakrishnan@multicorewareinc.com wrote:
># HG changeset patch
># User Radhakrishnan VR <radhakrishnan@multicorewareinc.com>
># Date 1461145693 -19800
># Wed Apr 20 15:18:13 2016 +0530
># Node ID eed7e06770463bb86c28dade1f0e965215028064
># Parent a28ba6131b58829d04ffc04b9ac2c67bf850eee4
>arm: Implement sub_ps chroma ARM NEON
>
>diff -r a28ba6131b58 -r eed7e0677046 source/common/arm/asm-primitives.cpp
>--- a/source/common/arm/asm-primitives.cpp Tue Apr 19 12:12:00 2016 +0530
>+++ b/source/common/arm/asm-primitives.cpp Wed Apr 20 15:18:13 2016 +0530
>@@ -446,6 +446,16 @@
> p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_neon);
> p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_neon);
>
>+ // chroma sub_ps
>+ p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sub_ps = PFX(pixel_sub_ps_4x4_neon);
>+ p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sub_ps = PFX(pixel_sub_ps_8x8_neon);
>+ p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sub_ps = PFX(pixel_sub_ps_16x16_neon);
>+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_neon);
>+ p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sub_ps = PFX(pixel_sub_ps_4x8_neon);
>+ p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sub_ps = PFX(pixel_sub_ps_8x16_neon);
>+ p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sub_ps = PFX(pixel_sub_ps_16x32_neon);
>+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_neon);
>+
> // calc_Residual
> p.cu[BLOCK_4x4].calcresidual = PFX(getResidual4_neon);
> p.cu[BLOCK_8x8].calcresidual = PFX(getResidual8_neon);
>diff -r a28ba6131b58 -r eed7e0677046 source/common/arm/pixel-util.S
>--- a/source/common/arm/pixel-util.S Tue Apr 19 12:12:00 2016 +0530
>+++ b/source/common/arm/pixel-util.S Wed Apr 20 15:18:13 2016 +0530
>@@ -439,6 +439,94 @@
> bx lr
> endfunc
>
>+// chroma sub_ps
>+function x265_pixel_sub_ps_4x8_neon
>+ push {r4}
>+ lsl r1, r1, #1
>+ ldr r4, [sp, #4]
>+ ldr r12, [sp, #8]
>+.rept 4
>+ vld1.u8 {d0}, [r2], r4
>+ vld1.u8 {d1}, [r3], r12
>+ vld1.u8 {d2}, [r2], r4
>+ vld1.u8 {d3}, [r3], r12
>+ vsubl.u8 q2, d0, d1
>+ vsubl.u8 q3, d2, d3
>+ vst1.s16 {d4}, [r0], r1
>+ vst1.s16 {d6}, [r0], r1
>+.endr
>+ pop {r4}
>+ bx lr
>+endfunc
>+
>+function x265_pixel_sub_ps_8x16_neon
>+ push {r4}
>+ lsl r1, r1, #1
>+ ldr r4, [sp, #4]
>+ ldr r12, [sp, #8]
>+.rept 8
>+ vld1.u8 {d0}, [r2], r4
>+ vld1.u8 {d1}, [r3], r12
>+ vld1.u8 {d2}, [r2], r4
>+ vld1.u8 {d3}, [r3], r12
>+ vsubl.u8 q2, d0, d1
>+ vsubl.u8 q3, d2, d3
>+ vst1.s16 {q2}, [r0], r1
>+ vst1.s16 {q3}, [r0], r1
>+.endr
>+ pop {r4}
>+ bx lr
>+endfunc
>+
>+function x265_pixel_sub_ps_16x32_neon
>+ push {r4, r5}
>+ lsl r1, r1, #1
>+ ldr r4, [sp, #8]
>+ ldr r12, [sp, #12]
>+ mov r5, #4
>+loop_sub_16x32:
>+ subs r5, r5, #1
>+.rept 4
>+ vld1.u8 {q0}, [r2], r4
>+ vld1.u8 {q1}, [r3], r12
>+ vld1.u8 {q2}, [r2], r4
>+ vld1.u8 {q3}, [r3], r12
>+ vsubl.u8 q8, d0, d2
>+ vsubl.u8 q9, d1, d3
>+ vsubl.u8 q10, d4, d6
>+ vsubl.u8 q11, d5, d7
>+ vst1.s16 {q8, q9}, [r0], r1
>+ vst1.s16 {q10, q11}, [r0], r1
>+.endr
>+ bne loop_sub_16x32
>+ pop {r4, r5}
>+ bx lr
>+endfunc
>+
>+function x265_pixel_sub_ps_32x64_neon
>+ push {r4, r5}
>+ lsl r1, r1, #1
>+ ldr r4, [sp, #8]
>+ ldr r12, [sp, #12]
>+ sub r1, #32
>+ mov r5, #16
>+loop_sub_32x64:
>+ subs r5, r5, #1
>+.rept 4
>+ vld1.u8 {q0, q1}, [r2], r4
>+ vld1.u8 {q2, q3}, [r3], r12
>+ vsubl.u8 q8, d0, d4
>+ vsubl.u8 q9, d1, d5
>+ vsubl.u8 q10, d2, d6
>+ vsubl.u8 q11, d3, d7
>+ vst1.s16 {q8, q9}, [r0]!
>+ vst1.s16 {q10, q11}, [r0], r1
>+.endr
>+ bne loop_sub_32x64
>+ pop {r4, r5}
>+ bx lr
>+endfunc
>+
> // void x265_pixel_add_ps_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
> function x265_pixel_add_ps_4x4_neon
> push {r4}
>diff -r a28ba6131b58 -r eed7e0677046 source/common/arm/pixel.h
>--- a/source/common/arm/pixel.h Tue Apr 19 12:12:00 2016 +0530
>+++ b/source/common/arm/pixel.h Wed Apr 20 15:18:13 2016 +0530
>@@ -157,6 +157,10 @@
> void x265_pixel_sub_ps_16x16_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
> void x265_pixel_sub_ps_32x32_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
> void x265_pixel_sub_ps_64x64_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
>+void x265_pixel_sub_ps_4x8_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
>+void x265_pixel_sub_ps_8x16_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
>+void x265_pixel_sub_ps_16x32_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
>+void x265_pixel_sub_ps_32x64_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
>
> void x265_pixel_add_ps_4x4_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
> void x265_pixel_add_ps_8x8_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
>_______________________________________________
>x265-devel mailing list
>x265-devel@videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
</pre></div>