2590 down to 2139.<br><br><div class="gmail_quote">On Thu, Feb 2, 2012 at 4:41 PM, George Stephanos <span dir="ltr">&lt;<a href="mailto:gaf.stephanos@gmail.com">gaf.stephanos@gmail.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
---<br>
common/arm/pixel-a.S | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++<br>
<div class="im"> common/arm/pixel.h | 2 +<br>
common/pixel.c | 1 +<br>
</div> 3 files changed, 79 insertions(+), 0 deletions(-)<br>
<br>
diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S<br>
index db32671..521c7c6 100644<br>
--- a/common/arm/pixel-a.S<br>
+++ b/common/arm/pixel-a.S<br>
@@ -1414,3 +1414,79 @@ function x264_intra_sad_x3_8x8c_neon<br>
<div class="im"><br>
pop {r4, pc}<br>
.endfunc<br>
+<br>
+function x264_intra_sad_x3_16x16_neon<br>
+ push {r4, lr}<br>
</div>+ vmov.i8 q8, #0<br>
<div class="im">+ sub lr, r1, #FDEC_STRIDE<br>
+ mov r3, #FENC_STRIDE<br>
+ vld1.8 {q0}, [lr]<br>
+ mov r4, #FDEC_STRIDE<br>
+ sub lr, r1, #1<br>
+<br>
</div>+.set Y, 0<br>
<div class="im">+.rept 16<br>
+ vld1.8 {q1}, [r0], r3<br>
</div>+ vld1.8 {d28[]}, [lr], r4<br>
+.if Y == 0<br>
+ vabdl.u8 q2, d0, d2<br>
+ vabdl.u8 q3, d1, d3<br>
+ vabdl.u8 q10, d28, d2<br>
+ vabdl.u8 q11, d28, d3<br>
+.else<br>
<div class="im">+ vabal.u8 q2, d0, d2<br>
+ vabal.u8 q3, d1, d3<br>
</div>+ vabal.u8 q10, d28, d2<br>
+ vabal.u8 q11, d28, d3<br>
+.endif<br>
+ vaddw.u8 q8, d28<br>
+.set Y, -1<br>
+.endr<br>
+ vmov.i8 d17, #0<br>
<div class="im">+<br>
+ vadd.u16 d4, d6<br>
+ vadd.u16 d20, d22<br>
+ vaddl.u8 q0, d0, d1<br>
+<br>
+ vadd.u16 d5, d7<br>
+ vadd.u16 d21, d23<br>
+ vadd.u16 d0, d1<br>
+<br>
+ vadd.u16 d4, d5<br>
+ vadd.u16 d20, d21<br>
</div>+ vpadd.u16 d0, d17<br>
+<br>
+ vpadd.u16 d4, d17<br>
+ vpadd.u16 d20, d17<br>
+ vpadd.u16 d0, d17<br>
+<br>
+ vpadd.u16 d4, d20<br>
+ vadd.u16 d0, d16<br>
+ vst1.64 {d4}, [r2,:64]!<br>
<div class="im">+<br>
+ vrshr.u16 d0, #5<br>
+ sub r0, r0, r3, lsl #4<br>
+ vdup.8 d0, d0[0]<br>
+<br>
</div>+.set Y, 0<br>
<div class="im">+.rept 16<br>
+ vld1.8 {q1}, [r0], r3<br>
</div>+.if Y == 0<br>
+ vabdl.u8 q12, d0, d2<br>
+ vabdl.u8 q13, d0, d3<br>
+.else<br>
<div class="im">+ vabal.u8 q12, d0, d2<br>
+ vabal.u8 q13, d0, d3<br>
</div>+.endif<br>
+.set Y, -1<br>
+.endr<br>
<div class="im">+<br>
+ vadd.u16 d24, d26<br>
+ vadd.u16 d25, d27<br>
+ vadd.u16 d24, d25<br>
</div>+ vpadd.u16 d24, d17<br>
+ vpadd.u16 d24, d17<br>
+ vst1.32 {d24[0]}, [r2,:32]<br>
<div class="HOEnZb"><div class="h5">+<br>
+ pop {r4, pc}<br>
+.endfunc<br>
diff --git a/common/arm/pixel.h b/common/arm/pixel.h<br>
index 506cf59..f29ddb3 100644<br>
--- a/common/arm/pixel.h<br>
+++ b/common/arm/pixel.h<br>
@@ -72,4 +72,6 @@ float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );<br>
void x264_intra_sad_x3_4x4_armv6( uint8_t *, uint8_t *, int * );<br>
void x264_intra_sad_x3_8x8_neon( uint8_t *, uint8_t *, int * );<br>
void x264_intra_sad_x3_8x8c_neon( uint8_t *, uint8_t *, int * );<br>
+void x264_intra_sad_x3_16x16_neon( uint8_t *, uint8_t *, int * );<br>
+<br>
#endif<br>
diff --git a/common/pixel.c b/common/pixel.c<br>
index f6d6a04..d4ae1df 100644<br>
--- a/common/pixel.c<br>
+++ b/common/pixel.c<br>
@@ -1214,6 +1214,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )<br>
{<br>
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_neon;<br>
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_neon;<br>
+ pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_neon;<br>
INIT5( sad, _neon );<br>
INIT5( sad_aligned, _neon );<br>
INIT7( sad_x3, _neon );<br>
--<br>
1.7.4.1<br>
<br>
</div></div></blockquote></div><br>