[x265] [PATCH] arm: Implement sub_ps chroma ARM NEON

chen chenm003 at 163.com
Wed Apr 20 18:15:07 CEST 2016


Just a small piece of advice. Register allocation rule:
1. R0-R3 may be used freely.
2. If you need one more, R12 is free.
3. If you need yet another, R14 (LR) may be used after pushing it to the stack.
4. After that, use R4-R11 with stack save/restore.

Your patch uses more registers than necessary because it does not apply the above rule.

At 2016-04-20 19:15:36,radhakrishnan at multicorewareinc.com wrote:
># HG changeset patch
># User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
># Date 1461145693 -19800
>#      Wed Apr 20 15:18:13 2016 +0530
># Node ID eed7e06770463bb86c28dade1f0e965215028064
># Parent  a28ba6131b58829d04ffc04b9ac2c67bf850eee4
>arm: Implement sub_ps chroma ARM NEON
>
>diff -r a28ba6131b58 -r eed7e0677046 source/common/arm/asm-primitives.cpp
>--- a/source/common/arm/asm-primitives.cpp	Tue Apr 19 12:12:00 2016 +0530
>+++ b/source/common/arm/asm-primitives.cpp	Wed Apr 20 15:18:13 2016 +0530
>@@ -446,6 +446,16 @@
>         p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_neon);
>         p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_neon);
> 
>+        // chroma sub_ps
>+        p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sub_ps   = PFX(pixel_sub_ps_4x4_neon);
>+        p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sub_ps   = PFX(pixel_sub_ps_8x8_neon);
>+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sub_ps = PFX(pixel_sub_ps_16x16_neon);
>+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_neon);
>+        p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sub_ps   = PFX(pixel_sub_ps_4x8_neon);
>+        p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sub_ps  = PFX(pixel_sub_ps_8x16_neon);
>+        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sub_ps = PFX(pixel_sub_ps_16x32_neon);
>+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_neon);
>+
>         // calc_Residual
>         p.cu[BLOCK_4x4].calcresidual   = PFX(getResidual4_neon);
>         p.cu[BLOCK_8x8].calcresidual   = PFX(getResidual8_neon);
>diff -r a28ba6131b58 -r eed7e0677046 source/common/arm/pixel-util.S
>--- a/source/common/arm/pixel-util.S	Tue Apr 19 12:12:00 2016 +0530
>+++ b/source/common/arm/pixel-util.S	Wed Apr 20 15:18:13 2016 +0530
>@@ -439,6 +439,94 @@
>     bx              lr
> endfunc
> 
>+// chroma sub_ps
>+function x265_pixel_sub_ps_4x8_neon             // a = b0 - b1 for a 4-wide, 8-row block: u8 in, s16 out; r0=a, r1=dstride, r2=b0, r3=b1
>+    push            {r4}                        // r4 must be preserved for the caller; it holds sstride0 below
>+    lsl             r1, r1, #1                  // r1 = destination stride in bytes (2 bytes per int16_t)
>+    ldr             r4, [sp, #4]                // r4 = sstride0: first stack argument, now 4 bytes above the saved r4
>+    ldr             r12, [sp, #8]               // r12 = sstride1: second stack argument
>+.rept 4                                         // 4 unrolled iterations x 2 rows each = 8 rows
>+    vld1.32         {d0[]}, [r2], r4            // load exactly the row's 4 pixels of b0 (no read past the row); r2 += sstride0
>+    vld1.32         {d1[]}, [r3], r12           // load exactly the row's 4 pixels of b1; r3 += sstride1
>+    vld1.32         {d2[]}, [r2], r4            // next row of b0
>+    vld1.32         {d3[]}, [r3], r12           // next row of b1
>+    vsubl.u8        q2, d0, d1                  // widening subtract: q2 = b0 - b1 as s16 (first row)
>+    vsubl.u8        q3, d2, d3                  // widening subtract: q3 = b0 - b1 as s16 (second row)
>+    vst1.s16        {d4}, [r0], r1              // d4 = low half of q2 = the 4 results of the first row; r0 += dstride
>+    vst1.s16        {d6}, [r0], r1              // d6 = low half of q3 = the 4 results of the second row; r0 += dstride
>+.endr
>+    pop             {r4}                        // restore the caller's r4
>+    bx              lr
>+endfunc
>+
>+function x265_pixel_sub_ps_8x16_neon            // a = b0 - b1 for an 8-wide, 16-row block: u8 in, s16 out; r0=a, r1=dstride, r2=b0, r3=b1
>+    push            {r4}                        // r4 must be preserved for the caller; it holds sstride0 below
>+    lsl             r1, r1, #1                  // r1 = destination stride in bytes (2 bytes per int16_t)
>+    ldr             r4, [sp, #4]                // r4 = sstride0: first stack argument, now 4 bytes above the saved r4
>+    ldr             r12, [sp, #8]               // r12 = sstride1: second stack argument
>+.rept 8                                         // 8 unrolled iterations x 2 rows each = 16 rows
>+    vld1.u8         {d0}, [r2], r4              // 8 pixels of b0; r2 += sstride0
>+    vld1.u8         {d1}, [r3], r12             // 8 pixels of b1; r3 += sstride1
>+    vld1.u8         {d2}, [r2], r4              // next row of b0
>+    vld1.u8         {d3}, [r3], r12             // next row of b1
>+    vsubl.u8        q2, d0, d1                  // widening subtract: q2 = b0 - b1 as 8 x s16 (first row)
>+    vsubl.u8        q3, d2, d3                  // widening subtract: q3 = b0 - b1 as 8 x s16 (second row)
>+    vst1.s16        {q2}, [r0], r1              // store 8 results of the first row; r0 += dstride
>+    vst1.s16        {q3}, [r0], r1              // store 8 results of the second row; r0 += dstride
>+.endr
>+    pop             {r4}                        // restore the caller's r4
>+    bx              lr
>+endfunc
>+
>+function x265_pixel_sub_ps_16x32_neon           // a = b0 - b1 for a 16-wide, 32-row block: u8 in, s16 out; r0=a, r1=dstride, r2=b0, r3=b1
>+    push            {r4, r5}                    // r4 (sstride0) and r5 (loop counter) must be preserved for the caller
>+    lsl             r1, r1, #1                  // r1 = destination stride in bytes (2 bytes per int16_t)
>+    ldr             r4, [sp, #8]                // r4 = sstride0: first stack argument, now 8 bytes above the saved pair
>+    ldr             r12, [sp, #12]              // r12 = sstride1: second stack argument
>+    mov             r5, #4                      // 4 loop passes x 8 rows per pass = 32 rows
>+loop_sub_16x32:
>+    subs            r5, r5, #1                  // decrement early; the flags are consumed by the bne after the unrolled body
>+.rept 4                                         // 4 unrolled iterations x 2 rows each = 8 rows per pass
>+    vld1.u8         {q0}, [r2], r4              // 16 pixels of b0 (d0 = low 8, d1 = high 8); r2 += sstride0
>+    vld1.u8         {q1}, [r3], r12             // 16 pixels of b1 (d2, d3); r3 += sstride1
>+    vld1.u8         {q2}, [r2], r4              // next row of b0 (d4, d5)
>+    vld1.u8         {q3}, [r3], r12             // next row of b1 (d6, d7)
>+    vsubl.u8        q8, d0, d2                  // first row, pixels 0-7: b0 - b1 widened to s16
>+    vsubl.u8        q9, d1, d3                  // first row, pixels 8-15
>+    vsubl.u8        q10, d4, d6                 // second row, pixels 0-7
>+    vsubl.u8        q11, d5, d7                 // second row, pixels 8-15
>+    vst1.s16        {q8, q9}, [r0], r1          // store 16 results of the first row; r0 += dstride
>+    vst1.s16        {q10, q11}, [r0], r1        // store 16 results of the second row; r0 += dstride
>+.endr
>+    bne             loop_sub_16x32              // repeat until r5 reaches zero (flags from the subs above)
>+    pop             {r4, r5}                    // restore the caller's r4 and r5
>+    bx              lr
>+endfunc
>+
>+function x265_pixel_sub_ps_32x64_neon           // a = b0 - b1 for a 32-wide, 64-row block: u8 in, s16 out; r0=a, r1=dstride, r2=b0, r3=b1
>+    push            {r4, r5}                    // r4 (sstride0) and r5 (loop counter) must be preserved for the caller
>+    lsl             r1, r1, #1                  // r1 = destination stride in bytes (2 bytes per int16_t)
>+    ldr             r4, [sp, #8]                // r4 = sstride0: first stack argument, now 8 bytes above the saved pair
>+    ldr             r12, [sp, #12]              // r12 = sstride1: second stack argument
>+    sub             r1, #32                     // the first store of each row already advances r0 by 32 bytes via write-back
>+    mov             r5, #16                     // 16 loop passes x 4 rows per pass = 64 rows
>+loop_sub_32x64:
>+    subs            r5, r5, #1                  // decrement early; the flags are consumed by the bne after the unrolled body
>+.rept 4                                         // 4 unrolled iterations x 1 row each = 4 rows per pass
>+    vld1.u8         {q0, q1}, [r2], r4          // 32 pixels of b0 (d0-d3); r2 += sstride0
>+    vld1.u8         {q2, q3}, [r3], r12         // 32 pixels of b1 (d4-d7); r3 += sstride1
>+    vsubl.u8        q8, d0, d4                  // pixels 0-7: b0 - b1 widened to s16
>+    vsubl.u8        q9, d1, d5                  // pixels 8-15
>+    vsubl.u8        q10, d2, d6                 // pixels 16-23
>+    vsubl.u8        q11, d3, d7                 // pixels 24-31
>+    vst1.s16        {q8, q9}, [r0]!             // store results 0-15; write-back advances r0 by the 32 bytes written
>+    vst1.s16        {q10, q11}, [r0], r1        // store results 16-31; r0 += (dstride bytes - 32), i.e. start of the next row
>+.endr
>+    bne             loop_sub_32x64              // repeat until r5 reaches zero (flags from the subs above)
>+    pop             {r4, r5}                    // restore the caller's r4 and r5
>+    bx              lr
>+endfunc
>+
> // void x265_pixel_add_ps_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
> function x265_pixel_add_ps_4x4_neon
>     push            {r4}
>diff -r a28ba6131b58 -r eed7e0677046 source/common/arm/pixel.h
>--- a/source/common/arm/pixel.h	Tue Apr 19 12:12:00 2016 +0530
>+++ b/source/common/arm/pixel.h	Wed Apr 20 15:18:13 2016 +0530
>@@ -157,6 +157,10 @@
> void x265_pixel_sub_ps_16x16_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
> void x265_pixel_sub_ps_32x32_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
> void x265_pixel_sub_ps_64x64_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
>+void x265_pixel_sub_ps_4x8_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
>+void x265_pixel_sub_ps_8x16_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
>+void x265_pixel_sub_ps_16x32_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
>+void x265_pixel_sub_ps_32x64_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
> 
> void x265_pixel_add_ps_4x4_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
> void x265_pixel_add_ps_8x8_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20160421/f01f5e96/attachment-0001.html>


More information about the x265-devel mailing list