<div dir="ltr">Down from 1205 to 868!<div>I would really like to know why I can't align the first store to anything.<br><br><div class="gmail_quote">On Thu, Feb 2, 2012 at 4:10 AM, George Stephanos <span dir="ltr">&lt;<a href="mailto:gaf.stephanos@gmail.com">gaf.stephanos@gmail.com</a>&gt;</span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">---<br>
common/arm/pixel-a.S | 47 +++++++++++++++++++++++++++++++++++++++++++++++<br>
<div class="im"> common/arm/pixel.h | 1 +<br>
common/pixel.c | 1 +<br>
</div> 3 files changed, 49 insertions(+), 0 deletions(-)<br>
<br>
diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S<br>
index da5f36c..995049b 100644<br>
--- a/common/arm/pixel-a.S<br>
+++ b/common/arm/pixel-a.S<br>
@@ -1289,3 +1289,50 @@ function x264_intra_sad_x3_4x4_armv6<br>
str r5, [r2, #8]<br>
pop {r4-r8,pc}<br>
.endfunc<br>
+<br>
+function x264_intra_sad_x3_8x8_neon<br>
<div class="im">+ add r1, #7<br>
+ vld1.8 {d4}, [r1]<br>
+ add r1, #9<br>
+ vrev64.8 d4, d4<br>
</div>+ vld1.8 {d0}, [r1]<br>
+<br>
+ mov r3, #FENC_STRIDE<br>
<div class="im">+<br>
+ vaddl.u8 q12, d0, d4<br>
+ vadd.u16 d24, d25<br>
</div>+ vmov.i8 q1, #0<br>
+ vpadd.u16 d24, d24<br>
+ vmov.i8 q3, #0<br>
+ vpadd.u16 d24, d24<br>
+ vmov.i8 q13, #0<br>
<div class="im">+ vrshr.u16 d24, #4<br>
+ vdup.8 d24, d24[0]<br>
+<br>
</div>+.irpc Y,0246<br>
+ vld1.8 {d16}, [r0], r3<br>
+ vld1.8 {d17}, [r0], r3<br>
<div class="im">+ vdup.8 d5, d4[\Y]<br>
+ vabal.u8 q1, d16, d0<br>
+ vabal.u8 q3, d16, d5<br>
+ vabal.u8 q13, d16, d24<br>
+ vdup.8 d5, d4[\Y+1]<br>
+ vabal.u8 q1, d17, d0<br>
+ vabal.u8 q3, d17, d5<br>
+ vabal.u8 q13, d17, d24<br>
</div>+.endr<br>
+ vmov.i8 d0, #0<br>
<div class="im">+<br>
+ vadd.u16 d2, d3<br>
</div>+ vadd.u16 d6, d7<br>
+ vadd.u16 d26, d27<br>
+ vpadd.u16 d2, d0<br>
+ vpadd.u16 d6, d0<br>
+ vpadd.u16 d26, d0<br>
+ vpadd.u16 d2, d6<br>
+ vpadd.u16 d26, d26<br>
+ vst1.64 {d2}, [r2]!<br>
+ vst1.32 {d26[0]}, [r2,:32]<br>
+<br>
+ bx lr<br>
+.endfunc<br>
<div class="HOEnZb"><div class="h5">diff --git a/common/arm/pixel.h b/common/arm/pixel.h<br>
index 3e02982..07a72c2 100644<br>
--- a/common/arm/pixel.h<br>
+++ b/common/arm/pixel.h<br>
@@ -70,4 +70,5 @@ void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, int,<br>
float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );<br>
<br>
void x264_intra_sad_x3_4x4_armv6( uint8_t *, uint8_t *, int * );<br>
+void x264_intra_sad_x3_8x8_neon( uint8_t *, uint8_t *, int * );<br>
#endif<br>
diff --git a/common/pixel.c b/common/pixel.c<br>
index 0949405..af7006f 100644<br>
--- a/common/pixel.c<br>
+++ b/common/pixel.c<br>
@@ -1212,6 +1212,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )<br>
}<br>
if( cpu&amp;X264_CPU_NEON )<br>
{<br>
+ pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_neon;<br>
INIT5( sad, _neon );<br>
INIT5( sad_aligned, _neon );<br>
INIT7( sad_x3, _neon );<br>
--<br>
1.7.4.1<br>
<br>
</div></div></blockquote></div><br></div></div>