[x265] [PATCH Review only] asm: code for sse_pp_12x16 routine
murugan at multicorewareinc.com
murugan at multicorewareinc.com
Thu Nov 21 15:48:49 CET 2013
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1385045293 -19800
# Thu Nov 21 20:18:13 2013 +0530
# Node ID 289b23f3ec9e955bfdfd8a1d4ff1910a4f8a7e95
# Parent 44d15f8ce9403cc8c8d97bffee355c1e24ad1271
asm: code for sse_pp_12x16 routine
diff -r 44d15f8ce940 -r 289b23f3ec9e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Nov 21 14:44:06 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Nov 21 20:18:13 2013 +0530
@@ -564,6 +564,7 @@
PIXEL_AVG_W4(ssse3);
p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
+ p.sse_pp[LUMA_12x16] = x265_pixel_ssd_12x16_ssse3;
p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_ssse3;
p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_ssse3;
diff -r 44d15f8ce940 -r 289b23f3ec9e source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Thu Nov 21 14:44:06 2013 +0530
+++ b/source/common/x86/pixel-a.asm Thu Nov 21 20:18:13 2013 +0530
@@ -537,6 +537,86 @@
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
+; int pixel_ssd_12x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;
+; Returns the sum of squared differences between two 12x16 blocks of 8-bit
+; pixels.
+;   r0 = src1, r1 = stride1, r2 = src2, r3 = stride2
+;   r4d = loop counter (4 iterations, 4 rows per iteration)
+;   m6  = running dword partial sums, m7 = zero (used for byte->word unpack)
+;
+; Every row is fetched with a 16-byte unaligned load, so the 4 bytes that
+; follow the 12 used pixels are read and then discarded.
+; NOTE(review): callers presumably guarantee those bytes are readable - confirm.
+;
+; Bytes are widened with punpcklbw against a zero register rather than
+; pmovzxbw: pmovzxbw is an SSE4.1 instruction, while this entry point is
+; declared and dispatched as ssse3.
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal pixel_ssd_12x16, 4, 5, 8, src1, stride1, src2, stride2
+
+    pxor        m6, m6                  ; accumulator = 0
+    pxor        m7, m7                  ; constant zero for unpacking
+    mov         r4d, 4
+
+.loop:
+%rep 2                                  ; two row pairs per loop iteration
+    movu        m0, [r0]                ; src1 row n
+    movu        m1, [r2]                ; src2 row n
+    movu        m2, [r0 + r1]           ; src1 row n+1
+    movu        m3, [r2 + r3]           ; src2 row n+1
+
+    ; Gather columns 8..11 of both rows into the low 8 bytes of m4 / m5
+    mova        m4, m0
+    mova        m5, m1
+    punpckhdq   m4, m2
+    punpckhdq   m5, m3
+
+    ; Zero-extend the low 8 bytes of each register to 8 words
+    punpcklbw   m0, m7                  ; src1 row n,   cols 0..7
+    punpcklbw   m1, m7                  ; src2 row n,   cols 0..7
+    punpcklbw   m2, m7                  ; src1 row n+1, cols 0..7
+    punpcklbw   m3, m7                  ; src2 row n+1, cols 0..7
+    punpcklbw   m4, m7                  ; src1 cols 8..11 of both rows
+    punpcklbw   m5, m7                  ; src2 cols 8..11 of both rows
+
+    psubw       m0, m1
+    psubw       m2, m3
+    psubw       m4, m5
+
+    pmaddwd     m0, m0                  ; square and add adjacent pairs
+    pmaddwd     m2, m2
+    pmaddwd     m4, m4
+
+    paddd       m6, m0
+    paddd       m6, m2
+    paddd       m6, m4
+
+    lea         r0, [r0 + 2 * r1]       ; advance both sources by two rows
+    lea         r2, [r2 + 2 * r3]
+%endrep
+
+    dec         r4d
+    jnz         .loop
+
+    ; HADDD is defined outside this section; presumably it sums the dwords
+    ; of m6 into its low dword using m1 as scratch - confirm in x86util.
+    HADDD       m6, m1
+    movd        eax, m6
+
+    RET
+
+;-----------------------------------------------------------------------------
; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2,
; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
;
diff -r 44d15f8ce940 -r 289b23f3ec9e source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Thu Nov 21 14:44:06 2013 +0530
+++ b/source/common/x86/pixel.h Thu Nov 21 20:18:13 2013 +0530
@@ -372,5 +372,6 @@
uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
void x265_transpose32_sse2(pixel *dest, pixel *src, intptr_t stride);
void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride);
+int x265_pixel_ssd_12x16_ssse3(pixel *, intptr_t, pixel *, intptr_t); // 12x16 sum of squared differences (asm in pixel-a.asm); args: src1, stride1, src2, stride2
#endif // ifndef X265_I386_PIXEL_H
More information about the x265-devel
mailing list