[x265] [PATCH] asm: assembly code for pixel_sad_12x16
chen
chenm003 at 163.com
Wed Oct 30 17:03:11 CET 2013
+%macro PROCESS_SAD_12x4 0
+ movu m1, [r2]
+ movu m2, [r0]
+ pand m1, m4
+ pand m2, m4
+ psadbw m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+ movu m1, [r2]
+ movu m2, [r0]
+ pand m1, m4
+ pand m2, m4
+ psadbw m1, m2
+ paddd m0, m1
>>+ lea r2, [r2 + r3]
>>+ lea r0, [r0 + r1]
>>+ movu m1, [r2]
>>+ movu m2, [r0]
We don't need to recompute the address with lea every time we add the stride to it. We should first try to form the address directly in the addressing mode, using a scale factor of 1, 2, 4, or 8 on the stride; only when that is not possible should we compute it with a separate instruction.
For example, the four instructions above can be replaced with just these two:
movu m1, [r2 + 2 * r3]
movu m2, [r0 + 2 * r1]
+ pand m1, m4
+ pand m2, m4
+ psadbw m1, m2
+ paddd m0, m1
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+ movu m1, [r2]
+ movu m2, [r0]
+ pand m1, m4
+ pand m2, m4
+ psadbw m1, m2
+ paddd m0, m1
+%endmacro
+
%macro PROCESS_SAD_16x4 0
movu m1, [r2]
movu m2, [r2 + r3]
@@ -1007,6 +1041,29 @@
movd eax, m0
RET
+;-----------------------------------------------------------------------------
+; int pixel_sad_12x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
+;-----------------------------------------------------------------------------
+cglobal pixel_sad_12x16, 4,4,4
+ mova m4, [MSK]
+ pxor m0, m0
+
+ PROCESS_SAD_12x4
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+ PROCESS_SAD_12x4
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+ PROCESS_SAD_12x4
+ lea r2, [r2 + r3]
+ lea r0, [r0 + r1]
+ PROCESS_SAD_12x4
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+
%endmacro
The lea instruction is overused here; please eliminate the redundant ones, and use the available registers to save load operations.
Excuse me, I forgot something: for 12xN, using MOVQ+MOVD is better than MOVU+PAND.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131031/54a60561/attachment.html>
More information about the x265-devel
mailing list