[x265] [PATCH] arm: Implement pixel_ssd_s ARM NEON asm

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Thu Feb 25 09:28:26 CET 2016


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1456136894 -19800
#      Mon Feb 22 15:58:14 2016 +0530
# Node ID ed3dd1a26cb5801e306db8f1d4a52cd1f4d6620b
# Parent  4a1b8f3c0c7385ff19fd61133e0af4464510e9aa
arm: Implement pixel_ssd_s ARM NEON asm

diff -r 4a1b8f3c0c73 -r ed3dd1a26cb5 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp	Thu Feb 25 12:15:51 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp	Mon Feb 22 15:58:14 2016 +0530
@@ -42,6 +42,12 @@
 {
     if (cpuMask & X265_CPU_NEON)
     {
+        // ssd_s
+        p.cu[BLOCK_4x4].ssd_s   = PFX(pixel_ssd_s_4x4_neon);
+        p.cu[BLOCK_8x8].ssd_s   = PFX(pixel_ssd_s_8x8_neon);
+        p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16x16_neon);
+        p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32x32_neon);
+
         // sse_ss
         p.cu[BLOCK_4x4].sse_ss   = PFX(pixel_sse_ss_4x4_neon);
         p.cu[BLOCK_8x8].sse_ss   = PFX(pixel_sse_ss_8x8_neon);
diff -r 4a1b8f3c0c73 -r ed3dd1a26cb5 source/common/arm/pixel.h
--- a/source/common/arm/pixel.h	Thu Feb 25 12:15:51 2016 +0530
+++ b/source/common/arm/pixel.h	Mon Feb 22 15:58:14 2016 +0530
@@ -123,6 +123,12 @@
 sse_t x265_pixel_sse_ss_32x32_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2);
 sse_t x265_pixel_sse_ss_64x64_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2);
 
+sse_t x265_pixel_ssd_s_4x4_neon(const int16_t* a, intptr_t dstride);   // ssd_s: sum of squared int16 samples of the NxN block at a; the asm scales dstride by 2, i.e. treats it as a count of int16 elements
+sse_t x265_pixel_ssd_s_8x8_neon(const int16_t* a, intptr_t dstride);
+sse_t x265_pixel_ssd_s_16x16_neon(const int16_t* a, intptr_t dstride);
+sse_t x265_pixel_ssd_s_32x32_neon(const int16_t* a, intptr_t dstride);
+sse_t x265_pixel_ssd_s_64x64_neon(const int16_t* a, intptr_t dstride); // NOTE(review): this changeset adds no 64x64 implementation or dispatch entry - confirm one exists elsewhere, or drop this prototype
+
 void x265_pixel_sub_ps_4x4_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
 void x265_pixel_sub_ps_8x8_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
 void x265_pixel_sub_ps_16x16_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
diff -r 4a1b8f3c0c73 -r ed3dd1a26cb5 source/common/arm/ssd-a.S
--- a/source/common/arm/ssd-a.S	Thu Feb 25 12:15:51 2016 +0530
+++ b/source/common/arm/ssd-a.S	Mon Feb 22 15:58:14 2016 +0530
@@ -371,4 +371,99 @@
     bx          lr
 endfunc
 
+function x265_pixel_ssd_s_4x4_neon      @ r0 = a (int16 block), r1 = dstride; returns in r0 the sum of squares of the 4x4 block
+    add         r1, r1                  @ r1 *= 2: stride becomes a byte offset (2 bytes per int16)
+    vld1.s16    {d4}, [r0], r1          @ d4 = row 0 (4 x s16), then r0 += r1
+    vld1.s16    {d5}, [r0], r1          @ d5 = row 1
+    vld1.s16    {d6}, [r0], r1          @ d6 = row 2
+    vld1.s16    {d7}, [r0]              @ d7 = row 3 (last row, no post-increment)
+    vmull.s16   q0, d4, d4              @ q0 = row0^2 widened to 4 x s32 (initialises accumulator 0)
+    vmull.s16   q1, d5, d5              @ q1 = row1^2 (second, independent accumulator)
+    vmlal.s16   q0, d6, d6              @ q0 += row2^2
+    vmlal.s16   q1, d7, d7              @ q1 += row3^2
+    vadd.s32    q0, q1                  @ merge the two accumulators
+    vadd.s32    d0, d0, d1              @ fold 4 lanes down to 2
+    vpadd.s32   d0, d0, d0              @ fold 2 lanes down to 1 (total ends up in both lanes)
+    vmov.32     r0, d0[0]               @ return value = total
+    bx          lr
+endfunc
 
+function x265_pixel_ssd_s_8x8_neon      @ r0 = a (int16 block), r1 = dstride; returns in r0 the sum of squares of the 8x8 block
+    add         r1, r1                  @ r1 *= 2: stride becomes a byte offset (2 bytes per int16)
+    vld1.s16    {q8}, [r0], r1          @ q8 = row 0 (8 x s16), then r0 += r1
+    vld1.s16    {q9}, [r0], r1          @ q9 = row 1
+    vmull.s16   q0, d16, d16            @ q0 = squares of row 0 low half; vmull overwrites, so no explicit clear is needed
+    vmull.s16   q1, d17, d17            @ q1 = squares of row 0 high half
+    vmlal.s16   q0, d18, d18            @ q0 += squares of row 1 low half
+    vmlal.s16   q1, d19, d19            @ q1 += squares of row 1 high half
+.rept 3                                 @ remaining 6 rows, two per repetition
+    vld1.s16    {q8}, [r0], r1          @ next even row
+    vld1.s16    {q9}, [r0], r1          @ next odd row
+    vmlal.s16   q0, d16, d16            @ low halves accumulate into q0
+    vmlal.s16   q1, d17, d17            @ high halves accumulate into q1
+    vmlal.s16   q0, d18, d18
+    vmlal.s16   q1, d19, d19
+.endr
+    vadd.s32    q0, q1                  @ merge the two accumulators
+    vadd.s32    d0, d0, d1              @ fold 4 lanes down to 2
+    vpadd.s32   d0, d0, d0              @ fold 2 lanes down to 1
+    vmov.32     r0, d0[0]               @ return value = total
+    bx          lr
+endfunc
+
+function x265_pixel_ssd_s_16x16_neon    @ r0 = a (int16 block), r1 = dstride; returns in r0 the sum of squares of the 16x16 block
+    add         r1, r1                  @ r1 *= 2: stride becomes a byte offset (2 bytes per int16)
+    mov         r12, #4                 @ loop counter: 4 iterations x 4 rows = 16 rows
+    veor.u8     q0, q0                  @ clear accumulator 0 (loop body only uses vmlal)
+    veor.u8     q1, q1                  @ clear accumulator 1
+
+.loop_ssd_s_16:
+    subs        r12, #1                 @ decrement early; the NEON instructions below leave the flags alone, so bne still sees this result
+.rept 2                                 @ two rows per repetition
+    vld1.s16    {q8-q9}, [r0], r1       @ one full row: 16 x s16 = 32 bytes, then r0 += r1
+    vld1.s16    {q10-q11}, [r0], r1     @ the following row
+    vmlal.s16   q0, d16, d16            @ even d registers accumulate into q0 ...
+    vmlal.s16   q1, d17, d17            @ ... odd d registers into q1
+    vmlal.s16   q0, d18, d18
+    vmlal.s16   q1, d19, d19
+    vmlal.s16   q0, d20, d20
+    vmlal.s16   q1, d21, d21
+    vmlal.s16   q0, d22, d22
+    vmlal.s16   q1, d23, d23
+.endr
+    bne         .loop_ssd_s_16          @ loop while r12 != 0
+    vadd.s32    q0, q1                  @ merge the two accumulators
+    vadd.s32    d0, d0, d1              @ fold 4 lanes down to 2
+    vpadd.s32   d0, d0, d0              @ fold 2 lanes down to 1
+    vmov.32     r0, d0[0]               @ return value = total
+    bx          lr
+endfunc
+
+function x265_pixel_ssd_s_32x32_neon    @ r0 = a (int16 block), r1 = dstride; returns in r0 the sum of squares of the 32x32 block
+    add         r1, r1                  @ r1 *= 2: stride becomes a byte offset (2 bytes per int16)
+    sub         r1, #32                 @ the first load of each row already advances r0 by 32 bytes, so step by the remainder
+    mov         r12, #8                 @ loop counter: 8 iterations x 4 rows = 32 rows
+    veor.u8     q0, q0                  @ clear accumulator 0 (loop body only uses vmlal)
+    veor.u8     q1, q1                  @ clear accumulator 1
+
+.loop_ssd_s_32:
+    subs        r12, #1                 @ decrement early; the NEON instructions below leave the flags alone, so bne still sees this result
+.rept 4                                 @ one 32-sample row per repetition
+    vld1.s16    {q8-q9}, [r0]!          @ first 16 samples of the row; write-back moves r0 on by 32 bytes
+    vld1.s16    {q10-q11}, [r0], r1     @ last 16 samples; r0 += r1 lands on the start of the next row
+    vmlal.s16   q0, d16, d16            @ even d registers accumulate into q0 ...
+    vmlal.s16   q1, d17, d17            @ ... odd d registers into q1
+    vmlal.s16   q0, d18, d18
+    vmlal.s16   q1, d19, d19
+    vmlal.s16   q0, d20, d20
+    vmlal.s16   q1, d21, d21
+    vmlal.s16   q0, d22, d22
+    vmlal.s16   q1, d23, d23
+.endr
+    bne         .loop_ssd_s_32          @ loop while r12 != 0
+    vadd.s32    q0, q1                  @ merge the two accumulators; NOTE(review): all 1024 squares are summed in 32 bits - presumably inputs are small residuals so this cannot wrap, confirm
+    vadd.s32    d0, d0, d1              @ fold 4 lanes down to 2
+    vpadd.s32   d0, d0, d0              @ fold 2 lanes down to 1
+    vmov.32     r0, d0[0]               @ return value = total
+    bx          lr
+endfunc


More information about the x265-devel mailing list