[x265] [PATCH] arm: Implement pixel_sse_ss ARM NEON asm
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Thu Feb 25 09:27:45 CET 2016
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1456382751 -19800
# Thu Feb 25 12:15:51 2016 +0530
# Node ID 4a1b8f3c0c7385ff19fd61133e0af4464510e9aa
# Parent 45c0dbd43dec24608199362a86bfba6ef91cacca
arm: Implement pixel_sse_ss ARM NEON asm
diff -r 45c0dbd43dec -r 4a1b8f3c0c73 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp Mon Feb 22 18:22:37 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp Thu Feb 25 12:15:51 2016 +0530
@@ -42,6 +42,13 @@
{
if (cpuMask & X265_CPU_NEON)
{
+ // sse_ss
+ p.cu[BLOCK_4x4].sse_ss = PFX(pixel_sse_ss_4x4_neon);
+ p.cu[BLOCK_8x8].sse_ss = PFX(pixel_sse_ss_8x8_neon);
+ p.cu[BLOCK_16x16].sse_ss = PFX(pixel_sse_ss_16x16_neon);
+ p.cu[BLOCK_32x32].sse_ss = PFX(pixel_sse_ss_32x32_neon);
+ p.cu[BLOCK_64x64].sse_ss = PFX(pixel_sse_ss_64x64_neon);
+
// pixel_sub_ps
p.cu[BLOCK_4x4].sub_ps = PFX(pixel_sub_ps_4x4_neon);
p.cu[BLOCK_8x8].sub_ps = PFX(pixel_sub_ps_8x8_neon);
diff -r 45c0dbd43dec -r 4a1b8f3c0c73 source/common/arm/pixel.h
--- a/source/common/arm/pixel.h Mon Feb 22 18:22:37 2016 +0530
+++ b/source/common/arm/pixel.h Thu Feb 25 12:15:51 2016 +0530
@@ -117,6 +117,12 @@
sse_t x265_pixel_sse_pp_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
sse_t x265_pixel_sse_pp_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_ss_4x4_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2); // sum of squared int16 differences over a 4x4 block; strides are in int16 elements
+sse_t x265_pixel_sse_ss_8x8_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2); // same computation over an 8x8 block
+sse_t x265_pixel_sse_ss_16x16_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2); // same computation over a 16x16 block
+sse_t x265_pixel_sse_ss_32x32_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2); // same computation over a 32x32 block
+sse_t x265_pixel_sse_ss_64x64_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2); // same computation over a 64x64 block
+
void x265_pixel_sub_ps_4x4_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
void x265_pixel_sub_ps_8x8_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
void x265_pixel_sub_ps_16x16_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
diff -r 45c0dbd43dec -r 4a1b8f3c0c73 source/common/arm/ssd-a.S
--- a/source/common/arm/ssd-a.S Mon Feb 22 18:22:37 2016 +0530
+++ b/source/common/arm/ssd-a.S Thu Feb 25 12:15:51 2016 +0530
@@ -194,3 +194,181 @@
vmov.32 r0, d0[0]
bx lr
endfunc
+
+@ sse_t pixel_sse_ss_4x4(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
+@ r0 = pix1, r1 = stride_pix1, r2 = pix2, r3 = stride_pix2 (int16 units); r0 returns the sum of (pix1 - pix2)^2.
+function x265_pixel_sse_ss_4x4_neon
+    add         r1, r1                  @ strides: int16 units -> bytes
+    add         r3, r3
+
+    vld1.s16    {d16}, [r0], r1         @ row 0 of pix1 (4 samples)
+    vld1.s16    {d18}, [r2], r3         @ row 0 of pix2
+    vsub.s16    d4, d16, d18            @ D-form: only d16/d18 are loaded; NOTE(review): wraps beyond int16, confirm input range
+    vld1.s16    {d16}, [r0], r1         @ row 1, loaded early to overlap with the multiply
+    vmull.s16   q0, d4, d4              @ first row initialises the 4 x 32-bit accumulator
+    vld1.s16    {d18}, [r2], r3
+
+    vsub.s16    d4, d16, d18
+    vld1.s16    {d16}, [r0], r1         @ row 2
+    vmlal.s16   q0, d4, d4
+    vld1.s16    {d18}, [r2], r3
+
+    vsub.s16    d4, d16, d18
+    vld1.s16    {d16}, [r0], r1         @ row 3
+    vmlal.s16   q0, d4, d4
+    vld1.s16    {d18}, [r2], r3
+    vsub.s16    d4, d16, d18
+    vmlal.s16   q0, d4, d4
+    vadd.s32    d0, d0, d1              @ fold 4 lanes -> 2
+    vpadd.s32   d0, d0, d0              @ fold 2 lanes -> 1
+    vmov.32     r0, d0[0]
+    bx          lr
+endfunc
+
+@ sse_t pixel_sse_ss_8x8(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
+@ r0 = pix1, r1 = stride_pix1, r2 = pix2, r3 = stride_pix2 (int16 units); r0 returns the sum of (pix1 - pix2)^2.
+function x265_pixel_sse_ss_8x8_neon
+    lsl         r1, r1, #1              @ strides: int16 units -> bytes
+    lsl         r3, r3, #1
+    vld1.16     {d16-d17}, [r0], r1     @ row 0 of pix1 (8 samples)
+    vld1.16     {d18-d19}, [r2], r3     @ row 0 of pix2
+    vsub.i16    q8, q8, q9              @ 16-bit difference; NOTE(review): wraps beyond int16, confirm input range
+    vmull.s16   q0, d16, d16            @ row 0 initialises both accumulators, so no zeroing is needed
+    vmull.s16   q1, d17, d17
+.rept 7
+    vld1.16     {d16-d17}, [r0], r1     @ rows 1..7
+    vld1.16     {d18-d19}, [r2], r3
+    vsub.i16    q8, q8, q9
+    vmlal.s16   q0, d16, d16            @ low half of the row -> q0
+    vmlal.s16   q1, d17, d17            @ high half of the row -> q1
+.endr
+    vadd.i32    q0, q0, q1              @ merge the two accumulators
+    vadd.i32    d0, d0, d1              @ fold 4 lanes -> 2
+    vpadd.i32   d0, d0, d0              @ fold 2 lanes -> 1
+    vmov.32     r0, d0[0]
+    bx          lr
+endfunc
+
+@ sse_t pixel_sse_ss_16x16(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
+@ r0 = pix1, r1 = stride_pix1, r2 = pix2, r3 = stride_pix2 (int16 units); r0 returns the sum of (pix1 - pix2)^2.
+function x265_pixel_sse_ss_16x16_neon
+    lsl         r1, r1, #1              @ strides: int16 units -> bytes
+    lsl         r3, r3, #1
+    mov         r12, #4                 @ 4 passes x 4 unrolled rows = 16 rows
+    vmov.i32    q0, #0                  @ two independent 4 x 32-bit accumulators
+    vmov.i32    q1, #0
+.loop_sse_ss_16:
+    subs        r12, r12, #1            @ flags stay intact: the NEON code below does not write them
+.rept 4
+    vld1.16     {d16-d19}, [r0], r1     @ one 16-sample row of pix1
+    vld1.16     {d20-d23}, [r2], r3     @ matching row of pix2
+    vsub.i16    q8, q8, q10             @ 16-bit difference; NOTE(review): wraps beyond int16, confirm input range
+    vsub.i16    q9, q9, q11
+    vmlal.s16   q0, d16, d16
+    vmlal.s16   q1, d17, d17
+    vmlal.s16   q0, d18, d18
+    vmlal.s16   q1, d19, d19
+.endr
+    bne         .loop_sse_ss_16
+    vadd.i32    q0, q0, q1              @ merge the two accumulators
+    vadd.i32    d0, d0, d1              @ fold 4 lanes -> 2
+    vpadd.i32   d0, d0, d0              @ fold 2 lanes -> 1
+    vmov.32     r0, d0[0]
+    bx          lr
+endfunc
+
+@ sse_t pixel_sse_ss_32x32(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
+@ r0 = pix1, r1 = stride_pix1, r2 = pix2, r3 = stride_pix2 (int16 units); r0 returns the sum of (pix1 - pix2)^2.
+function x265_pixel_sse_ss_32x32_neon
+    lsl         r1, r1, #1              @ strides: int16 units -> bytes
+    lsl         r3, r3, #1
+    sub         r1, r1, #32             @ the first half-row load already advances the pointer by 32 bytes
+    sub         r3, r3, #32
+    mov         r12, #8                 @ 8 passes x 4 unrolled rows = 32 rows
+    vmov.i32    q0, #0                  @ two independent 4 x 32-bit accumulators
+    vmov.i32    q1, #0
+.loop_sse_ss_32:
+    subs        r12, r12, #1            @ flags stay intact: the NEON code below does not write them
+.rept 4
+    vld1.16     {d16-d19}, [r0]!        @ samples 0..15 of the row, pointer += 32 bytes
+    vld1.16     {d20-d23}, [r2]!
+    vsub.i16    q8, q8, q10             @ 16-bit difference; NOTE(review): wraps beyond int16, confirm input range
+    vsub.i16    q9, q9, q11
+    vmlal.s16   q0, d16, d16
+    vmlal.s16   q1, d17, d17
+    vmlal.s16   q0, d18, d18
+    vmlal.s16   q1, d19, d19
+    vld1.16     {d16-d19}, [r0], r1     @ samples 16..31, then step to the start of the next row
+    vld1.16     {d20-d23}, [r2], r3
+    vsub.i16    q8, q8, q10
+    vsub.i16    q9, q9, q11
+    vmlal.s16   q0, d16, d16
+    vmlal.s16   q1, d17, d17
+    vmlal.s16   q0, d18, d18
+    vmlal.s16   q1, d19, d19
+.endr
+    bne         .loop_sse_ss_32
+    vadd.i32    q0, q0, q1              @ merge the two accumulators
+    vadd.i32    d0, d0, d1              @ fold 4 lanes -> 2
+    vpadd.i32   d0, d0, d0              @ fold 2 lanes -> 1
+    vmov.32     r0, d0[0]
+    bx          lr
+endfunc
+
+@ sse_t pixel_sse_ss_64x64(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
+@ r0 = pix1, r1 = stride_pix1, r2 = pix2, r3 = stride_pix2 (int16 units); r0 returns the sum of (pix1 - pix2)^2.
+function x265_pixel_sse_ss_64x64_neon
+    lsl         r1, r1, #1              @ strides: int16 units -> bytes
+    lsl         r3, r3, #1
+    sub         r1, r1, #96             @ three quarter-row loads already advance the pointer by 3 x 32 bytes
+    sub         r3, r3, #96
+    mov         r12, #32                @ 32 passes x 2 unrolled rows = 64 rows
+    vmov.i32    q0, #0                  @ two independent 4 x 32-bit accumulators
+    vmov.i32    q1, #0
+
+.loop_sse_ss_64:
+    subs        r12, r12, #1            @ flags stay intact: the NEON code below does not write them
+.rept 2
+    vld1.16     {d16-d19}, [r0]!        @ samples 0..15 of the row, pointer += 32 bytes
+    vld1.16     {d20-d23}, [r2]!
+    vsub.i16    q8, q8, q10             @ 16-bit difference; NOTE(review): wraps beyond int16, confirm input range
+    vsub.i16    q9, q9, q11
+    vmlal.s16   q0, d16, d16
+    vmlal.s16   q1, d17, d17
+    vmlal.s16   q0, d18, d18
+    vmlal.s16   q1, d19, d19
+    vld1.16     {d16-d19}, [r0]!        @ samples 16..31
+    vld1.16     {d20-d23}, [r2]!
+    vsub.i16    q8, q8, q10
+    vsub.i16    q9, q9, q11
+    vmlal.s16   q0, d16, d16
+    vmlal.s16   q1, d17, d17
+    vmlal.s16   q0, d18, d18
+    vmlal.s16   q1, d19, d19
+    vld1.16     {d16-d19}, [r0]!        @ samples 32..47
+    vld1.16     {d20-d23}, [r2]!
+    vsub.i16    q8, q8, q10
+    vsub.i16    q9, q9, q11
+    vmlal.s16   q0, d16, d16
+    vmlal.s16   q1, d17, d17
+    vmlal.s16   q0, d18, d18
+    vmlal.s16   q1, d19, d19
+
+    vld1.16     {d16-d19}, [r0], r1     @ samples 48..63, then step to the start of the next row
+    vld1.16     {d20-d23}, [r2], r3
+    vsub.i16    q8, q8, q10
+    vsub.i16    q9, q9, q11
+    vmlal.s16   q0, d16, d16
+    vmlal.s16   q1, d17, d17
+    vmlal.s16   q0, d18, d18
+    vmlal.s16   q1, d19, d19
+.endr
+    bne         .loop_sse_ss_64
+    vadd.i32    q0, q0, q1              @ merge the two accumulators
+    vadd.i32    d0, d0, d1              @ fold 4 lanes -> 2
+    vpadd.i32   d0, d0, d0              @ fold 2 lanes -> 1
+    vmov.32     r0, d0[0]
+    bx          lr
+endfunc
+
+
More information about the x265-devel mailing list