<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><pre>Just a small piece of advice on a useful trick.</pre><pre>Register allocation rules:</pre><pre>1. R0-R3 may be used freely.</pre><pre>2. If you need one more register, R12 is free.</pre><pre>3. If you need yet another one, R14 (LR) may be used after pushing it to the stack.</pre><pre><span style="line-height: 1.7;">4. After that, use R4-R11 with stack save/restore.</span>
</pre><pre><br></pre><pre>Your patch uses more registers than necessary because it does not apply the rules above.</pre><pre><br>At 2016-04-20 19:15:36, radhakrishnan@multicorewareinc.com wrote:
># HG changeset patch
># User Radhakrishnan VR <radhakrishnan@multicorewareinc.com>
># Date 1461145693 -19800
>#      Wed Apr 20 15:18:13 2016 +0530
># Node ID eed7e06770463bb86c28dade1f0e965215028064
># Parent  a28ba6131b58829d04ffc04b9ac2c67bf850eee4
>arm: Implement sub_ps chroma ARM NEON
>
>diff -r a28ba6131b58 -r eed7e0677046 source/common/arm/asm-primitives.cpp
>--- a/source/common/arm/asm-primitives.cpp     Tue Apr 19 12:12:00 2016 +0530
>+++ b/source/common/arm/asm-primitives.cpp     Wed Apr 20 15:18:13 2016 +0530
>@@ -446,6 +446,16 @@
>         p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_neon);
>         p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_neon);

>+        // chroma sub_ps
>+        p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sub_ps   = PFX(pixel_sub_ps_4x4_neon);
>+        p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sub_ps   = PFX(pixel_sub_ps_8x8_neon);
>+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sub_ps = PFX(pixel_sub_ps_16x16_neon);
>+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_neon);
>+        p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sub_ps   = PFX(pixel_sub_ps_4x8_neon);
>+        p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sub_ps  = PFX(pixel_sub_ps_8x16_neon);
>+        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sub_ps = PFX(pixel_sub_ps_16x32_neon);
>+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_neon);
>+
>         // calc_Residual
>         p.cu[BLOCK_4x4].calcresidual   = PFX(getResidual4_neon);
>         p.cu[BLOCK_8x8].calcresidual   = PFX(getResidual8_neon);
>diff -r a28ba6131b58 -r eed7e0677046 source/common/arm/pixel-util.S
>--- a/source/common/arm/pixel-util.S   Tue Apr 19 12:12:00 2016 +0530
>+++ b/source/common/arm/pixel-util.S   Wed Apr 20 15:18:13 2016 +0530
>@@ -439,6 +439,94 @@
>     bx              lr
> endfunc

>+// chroma sub_ps
>+function x265_pixel_sub_ps_4x8_neon
>+    push            {r4}
>+    lsl             r1, r1, #1
>+    ldr             r4, [sp, #4]
>+    ldr             r12, [sp, #8]
>+.rept 4
>+    vld1.u8         {d0}, [r2], r4
>+    vld1.u8         {d1}, [r3], r12
>+    vld1.u8         {d2}, [r2], r4
>+    vld1.u8         {d3}, [r3], r12
>+    vsubl.u8        q2, d0, d1
>+    vsubl.u8        q3, d2, d3
>+    vst1.s16        {d4}, [r0], r1
>+    vst1.s16        {d6}, [r0], r1
>+.endr
>+    pop             {r4}
>+    bx              lr
>+endfunc
>+
>+function x265_pixel_sub_ps_8x16_neon
>+    push            {r4}
>+    lsl             r1, r1, #1
>+    ldr             r4, [sp, #4]
>+    ldr             r12, [sp, #8]
>+.rept 8
>+    vld1.u8         {d0}, [r2], r4
>+    vld1.u8         {d1}, [r3], r12
>+    vld1.u8         {d2}, [r2], r4
>+    vld1.u8         {d3}, [r3], r12
>+    vsubl.u8        q2, d0, d1
>+    vsubl.u8        q3, d2, d3
>+    vst1.s16        {q2}, [r0], r1
>+    vst1.s16        {q3}, [r0], r1
>+.endr
>+    pop             {r4}
>+    bx              lr
>+endfunc
>+
>+function x265_pixel_sub_ps_16x32_neon
>+    push            {r4, r5}
>+    lsl             r1, r1, #1
>+    ldr             r4, [sp, #8]
>+    ldr             r12, [sp, #12]
>+    mov             r5, #4
>+loop_sub_16x32:
>+    subs            r5, r5, #1
>+.rept 4
>+    vld1.u8         {q0}, [r2], r4
>+    vld1.u8         {q1}, [r3], r12
>+    vld1.u8         {q2}, [r2], r4
>+    vld1.u8         {q3}, [r3], r12
>+    vsubl.u8        q8, d0, d2
>+    vsubl.u8        q9, d1, d3
>+    vsubl.u8        q10, d4, d6
>+    vsubl.u8        q11, d5, d7
>+    vst1.s16        {q8, q9}, [r0], r1
>+    vst1.s16        {q10, q11}, [r0], r1
>+.endr
>+    bne             loop_sub_16x32
>+    pop             {r4, r5}
>+    bx              lr
>+endfunc
>+
>+function x265_pixel_sub_ps_32x64_neon
>+    push            {r4, r5}
>+    lsl             r1, r1, #1
>+    ldr             r4, [sp, #8]
>+    ldr             r12, [sp, #12]
>+    sub             r1, #32
>+    mov             r5, #16
>+loop_sub_32x64:
>+    subs            r5, r5, #1
>+.rept 4
>+    vld1.u8         {q0, q1}, [r2], r4
>+    vld1.u8         {q2, q3}, [r3], r12
>+    vsubl.u8        q8, d0, d4
>+    vsubl.u8        q9, d1, d5
>+    vsubl.u8        q10, d2, d6
>+    vsubl.u8        q11, d3, d7
>+    vst1.s16        {q8, q9}, [r0]!
>+    vst1.s16        {q10, q11}, [r0], r1
>+.endr
>+    bne             loop_sub_32x64
>+    pop             {r4, r5}
>+    bx              lr
>+endfunc
>+
> // void x265_pixel_add_ps_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
> function x265_pixel_add_ps_4x4_neon
>     push            {r4}
>diff -r a28ba6131b58 -r eed7e0677046 source/common/arm/pixel.h
>--- a/source/common/arm/pixel.h        Tue Apr 19 12:12:00 2016 +0530
>+++ b/source/common/arm/pixel.h        Wed Apr 20 15:18:13 2016 +0530
>@@ -157,6 +157,10 @@
> void x265_pixel_sub_ps_16x16_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
> void x265_pixel_sub_ps_32x32_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
> void x265_pixel_sub_ps_64x64_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
>+void x265_pixel_sub_ps_4x8_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
>+void x265_pixel_sub_ps_8x16_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
>+void x265_pixel_sub_ps_16x32_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
>+void x265_pixel_sub_ps_32x64_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);

> void x265_pixel_add_ps_4x4_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
> void x265_pixel_add_ps_8x8_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
>_______________________________________________
>x265-devel mailing list
>x265-devel@videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
</pre></div>