<div dir="ltr"><br><br><div class="gmail_quote">---------- Forwarded message ----------<br>From: <b class="gmail_sendername"></b> <span dir="ltr">&lt;<a href="mailto:dnyaneshwar@multicorewareinc.com" target="_blank">dnyaneshwar@multicorewareinc.com</a>&gt;</span><br>

Date: Tue, Oct 1, 2013 at 6:38 PM<br>Subject: [x265] [PATCH] Replace sad_12, sad_24, sad_32 vector class functions with intrinsics<br>To: <a href="mailto:x265-devel@videolan.org" target="_blank">x265-devel@videolan.org</a><br>
<br><br># HG changeset patch<br>

# User Dnyaneshwar<br>
# Date 1380631342 -19800<br>
#      Tue Oct 01 18:12:22 2013 +0530<br>
# Node ID ecc483a16f1d9e0163182d090c18fad3f1616ab5<br>
# Parent  a03659cfa9574a2639292e427b2cb3d080c648ad<br>
Replace sad_12, sad_24, sad_32 vector class functions with intrinsics.<br>
<br>
Performance improvement measured is close to 1.5x<br>
<br>
diff -r a03659cfa957 -r ecc483a16f1d source/common/vec/pixel8.inc<br>
--- a/source/common/vec/pixel8.inc      Mon Sep 30 21:26:49 2013 -0500<br>
+++ b/source/common/vec/pixel8.inc      Tue Oct 01 18:12:22 2013 +0530<br>
@@ -840,51 +840,324 @@<br>
 }<br>
<br>
 #endif /* if HAVE_MMX */<br>
+<br>
+template&lt;int ly&gt;<br>
+int sad_12(pixel *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride)<br>
+{<br>
+    assert((ly % 4) == 0);<br>+    __m128i sum0 = _mm_setzero_si128();<br>+    __m128i sum1 = _mm_setzero_si128();<br>>>+    __m128i T00, T01, T02, T03;<br>>>+    __m128i T10, T11, T12, T13;<br>>>+    __m128i T20, T21, T22, T23;<br>

>>+</div><div class="gmail_quote">I think we can move the above declarations into the local blocks.</div><div class="gmail_quote"><br>>>+    __m128i mask;<br>>>+    mask = _mm_set_epi32(0x0, 0xffffffff, 0xffffffff, 0xffffffff);</div>

<div class="gmail_quote"><br></div><div class="gmail_quote">The following syntax seems clearer to me when a variable is declared and initialized at the same time, and it has the same effect:</div><div class="gmail_quote">__m128i mask = _mm_set_epi32(0x0, 0xffffffff, 0xffffffff, 0xffffffff);<br>

</div><div class="gmail_quote"><br>
+<br>
+    if (ly == 4)<br>
+    {</div><div class="gmail_quote"><br></div><div class="gmail_quote">This is where you would declare your temporary registers, like this (this applies to all blocks, including loops):</div><div class="gmail_quote"><br></div><div class="gmail_quote">

 __m128i T00, T01, T02, T03;<br> __m128i T10, T11, T12, T13;<br> __m128i T20, T21, T22, T23;<br></div><div class="gmail_quote"><br>
+        T00 = _mm_load_si128((__m128i*)(fenc));<br>
+        T00 = _mm_and_si128(T00, mask);<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + fencstride));<br>
+        T01 = _mm_and_si128(T01, mask);<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + (2) * fencstride));<br>
+        T02 = _mm_and_si128(T02, mask);<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + (3) * fencstride));<br>
+        T03 = _mm_and_si128(T03, mask);<br>
+<br>
+        T10 = _mm_load_si128((__m128i*)(fref));<br>
+        T10 = _mm_and_si128(T10, mask);<br>
+        T11 = _mm_load_si128((__m128i*)(fref + frefstride));<br>
+        T11 = _mm_and_si128(T11, mask);<br>
+        T12 = _mm_load_si128((__m128i*)(fref + (2) * frefstride));<br>
+        T12 = _mm_and_si128(T12, mask);<br>
+        T13 = _mm_load_si128((__m128i*)(fref + (3) * frefstride));<br>
+        T13 = _mm_and_si128(T13, mask);<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>>>+        sum0 = _mm_add_epi16(sum0, T20);<br>>>+        sum0 = _mm_add_epi16(sum0, T21);<br>>>+        sum0 = _mm_add_epi16(sum0, T22);<br>>>+        sum0 = _mm_add_epi16(sum0, T23);</div><div class="gmail_quote">

<br></div><div class="gmail_quote">         You can replace this with the following:</div><div class="gmail_quote"><br></div><div class="gmail_quote">        T20 = _mm_add_epi16(T20, T21);<br>        T22 = _mm_add_epi16(T22, T23);<br>

<div class="gmail_quote">        sum0 = _mm_add_epi16(T20, T22); </div></div><div class="gmail_quote">
+    }<br>
+    else if (ly == 8)<br>
+    {</div><div class="gmail_quote">
+        T00 = _mm_load_si128((__m128i*)(fenc));<br>
+        T00 = _mm_and_si128(T00, mask);<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + fencstride));<br>
+        T01 = _mm_and_si128(T01, mask);<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + (2) * fencstride));<br>
+        T02 = _mm_and_si128(T02, mask);<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + (3) * fencstride));<br>
+        T03 = _mm_and_si128(T03, mask);<br>
+<br>
+        T10 = _mm_load_si128((__m128i*)(fref));<br>
+        T10 = _mm_and_si128(T10, mask);<br>
+        T11 = _mm_load_si128((__m128i*)(fref + frefstride));<br>
+        T11 = _mm_and_si128(T11, mask);<br>
+        T12 = _mm_load_si128((__m128i*)(fref + (2) * frefstride));<br>
+        T12 = _mm_and_si128(T12, mask);<br>
+        T13 = _mm_load_si128((__m128i*)(fref + (3) * frefstride));<br>
+        T13 = _mm_and_si128(T13, mask);<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>>>+        sum0 = _mm_add_epi16(sum0, T20);<br>>>+        sum0 = _mm_add_epi16(sum0, T21);<br>>>+        sum0 = _mm_add_epi16(sum0, T22);<br>>>+        sum0 = _mm_add_epi16(sum0, T23);</div><div class="gmail_quote">

<br></div><div class="gmail_quote">This can be replaced as above (apply wherever it logically fits):</div><div class="gmail_quote"><br></div><div class="gmail_quote">        T20 = _mm_add_epi16(T20, T21);<br>        T22 = _mm_add_epi16(T22, T23);<br>

<div class="gmail_quote">        sum0 = _mm_add_epi16(T20, T22); </div></div><div class="gmail_quote"><br></div><div class="gmail_quote">
+        T00 = _mm_load_si128((__m128i*)(fenc + (4) * fencstride));<br>
+        T00 = _mm_and_si128(T00, mask);<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + (5) * fencstride));<br>
+        T01 = _mm_and_si128(T01, mask);<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + (6) * fencstride));<br>
+        T02 = _mm_and_si128(T02, mask);<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + (7) * fencstride));<br>
+        T03 = _mm_and_si128(T03, mask);<br>
+<br>
+        T10 = _mm_load_si128((__m128i*)(fref + (4) * frefstride));<br>
+        T10 = _mm_and_si128(T10, mask);<br>
+        T11 = _mm_load_si128((__m128i*)(fref + (5) * frefstride));<br>
+        T11 = _mm_and_si128(T11, mask);<br>
+        T12 = _mm_load_si128((__m128i*)(fref + (6) * frefstride));<br>
+        T12 = _mm_and_si128(T12, mask);<br>
+        T13 = _mm_load_si128((__m128i*)(fref + (7) * frefstride));<br>
+        T13 = _mm_and_si128(T13, mask);<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi16(sum0, T20);<br>
+        sum0 = _mm_add_epi16(sum0, T21);<br>
+        sum0 = _mm_add_epi16(sum0, T22);<br>
+        sum0 = _mm_add_epi16(sum0, T23);<br>
+    }<br>
+    else if (ly == 16)<br>
+    {<br>
+        T00 = _mm_load_si128((__m128i*)(fenc));<br>
+        T00 = _mm_and_si128(T00, mask);<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + fencstride));<br>
+        T01 = _mm_and_si128(T01, mask);<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + (2) * fencstride));<br>
+        T02 = _mm_and_si128(T02, mask);<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + (3) * fencstride));<br>
+        T03 = _mm_and_si128(T03, mask);<br>
+<br>
+        T10 = _mm_load_si128((__m128i*)(fref));<br>
+        T10 = _mm_and_si128(T10, mask);<br>
+        T11 = _mm_load_si128((__m128i*)(fref + frefstride));<br>
+        T11 = _mm_and_si128(T11, mask);<br>
+        T12 = _mm_load_si128((__m128i*)(fref + (2) * frefstride));<br>
+        T12 = _mm_and_si128(T12, mask);<br>
+        T13 = _mm_load_si128((__m128i*)(fref + (3) * frefstride));<br>
+        T13 = _mm_and_si128(T13, mask);<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi16(sum0, T20);<br>
+        sum0 = _mm_add_epi16(sum0, T21);<br>
+        sum0 = _mm_add_epi16(sum0, T22);<br>
+        sum0 = _mm_add_epi16(sum0, T23);<br>
+<br>
+        T00 = _mm_load_si128((__m128i*)(fenc + (4) * fencstride));<br>
+        T00 = _mm_and_si128(T00, mask);<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + (5) * fencstride));<br>
+        T01 = _mm_and_si128(T01, mask);<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + (6) * fencstride));<br>
+        T02 = _mm_and_si128(T02, mask);<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + (7) * fencstride));<br>
+        T03 = _mm_and_si128(T03, mask);<br>
+<br>
+        T10 = _mm_load_si128((__m128i*)(fref + (4) * frefstride));<br>
+        T10 = _mm_and_si128(T10, mask);<br>
+        T11 = _mm_load_si128((__m128i*)(fref + (5) * frefstride));<br>
+        T11 = _mm_and_si128(T11, mask);<br>
+        T12 = _mm_load_si128((__m128i*)(fref + (6) * frefstride));<br>
+        T12 = _mm_and_si128(T12, mask);<br>
+        T13 = _mm_load_si128((__m128i*)(fref + (7) * frefstride));<br>
+        T13 = _mm_and_si128(T13, mask);<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi16(sum0, T20);<br>
+        sum0 = _mm_add_epi16(sum0, T21);<br>
+        sum0 = _mm_add_epi16(sum0, T22);<br>
+        sum0 = _mm_add_epi16(sum0, T23);<br>
+<br>
+        T00 = _mm_load_si128((__m128i*)(fenc + (8) * fencstride));<br>
+        T00 = _mm_and_si128(T00, mask);<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + (9) * fencstride));<br>
+        T01 = _mm_and_si128(T01, mask);<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + (10) * fencstride));<br>
+        T02 = _mm_and_si128(T02, mask);<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + (11) * fencstride));<br>
+        T03 = _mm_and_si128(T03, mask);<br>
+<br>
+        T10 = _mm_load_si128((__m128i*)(fref + (8) * frefstride));<br>
+        T10 = _mm_and_si128(T10, mask);<br>
+        T11 = _mm_load_si128((__m128i*)(fref + (9) * frefstride));<br>
+        T11 = _mm_and_si128(T11, mask);<br>
+        T12 = _mm_load_si128((__m128i*)(fref + (10) * frefstride));<br>
+        T12 = _mm_and_si128(T12, mask);<br>
+        T13 = _mm_load_si128((__m128i*)(fref + (11) * frefstride));<br>
+        T13 = _mm_and_si128(T13, mask);<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi16(sum0, T20);<br>
+        sum0 = _mm_add_epi16(sum0, T21);<br>
+        sum0 = _mm_add_epi16(sum0, T22);<br>
+        sum0 = _mm_add_epi16(sum0, T23);<br>
+<br>
+        T00 = _mm_load_si128((__m128i*)(fenc + (12) * fencstride));<br>
+        T00 = _mm_and_si128(T00, mask);<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + (13) * fencstride));<br>
+        T01 = _mm_and_si128(T01, mask);<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + (14) * fencstride));<br>
+        T02 = _mm_and_si128(T02, mask);<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + (15) * fencstride));<br>
+        T03 = _mm_and_si128(T03, mask);<br>
+<br>
+        T10 = _mm_load_si128((__m128i*)(fref + (12) * frefstride));<br>
+        T10 = _mm_and_si128(T10, mask);<br>
+        T11 = _mm_load_si128((__m128i*)(fref + (13) * frefstride));<br>
+        T11 = _mm_and_si128(T11, mask);<br>
+        T12 = _mm_load_si128((__m128i*)(fref + (14) * frefstride));<br>
+        T12 = _mm_and_si128(T12, mask);<br>
+        T13 = _mm_load_si128((__m128i*)(fref + (15) * frefstride));<br>
+        T13 = _mm_and_si128(T13, mask);<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi16(sum0, T20);<br>
+        sum0 = _mm_add_epi16(sum0, T21);<br>
+        sum0 = _mm_add_epi16(sum0, T22);<br>
+        sum0 = _mm_add_epi16(sum0, T23);<br>
+    }<br>
+    else if ((ly % 8) == 0)<br>
+    {<br>
+        for (int i = 0; i < ly; i += 8)<br>
+        {<br>
+            T00 = _mm_load_si128((__m128i*)(fenc + (i) * fencstride));<br>
+            T00 = _mm_and_si128(T00, mask);<br>
+            T01 = _mm_load_si128((__m128i*)(fenc + (i + 1) * fencstride));<br>
+            T01 = _mm_and_si128(T01, mask);<br>
+            T02 = _mm_load_si128((__m128i*)(fenc + (i + 2) * fencstride));<br>
+            T02 = _mm_and_si128(T02, mask);<br>
+            T03 = _mm_load_si128((__m128i*)(fenc + (i + 3) * fencstride));<br>
+            T03 = _mm_and_si128(T03, mask);<br>
+<br>
+            T10 = _mm_load_si128((__m128i*)(fref + (i) * frefstride));<br>
+            T10 = _mm_and_si128(T10, mask);<br>
+            T11 = _mm_load_si128((__m128i*)(fref + (i + 1) * frefstride));<br>
+            T11 = _mm_and_si128(T11, mask);<br>
+            T12 = _mm_load_si128((__m128i*)(fref + (i + 2) * frefstride));<br>
+            T12 = _mm_and_si128(T12, mask);<br>
+            T13 = _mm_load_si128((__m128i*)(fref + (i + 3) * frefstride));<br>
+            T13 = _mm_and_si128(T13, mask);<br>
+<br>
+            T20 = _mm_sad_epu8(T00, T10);<br>
+            T21 = _mm_sad_epu8(T01, T11);<br>
+            T22 = _mm_sad_epu8(T02, T12);<br>
+            T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+            sum0 = _mm_add_epi16(sum0, T20);<br>
+            sum0 = _mm_add_epi16(sum0, T21);<br>
+            sum0 = _mm_add_epi16(sum0, T22);<br>
+            sum0 = _mm_add_epi16(sum0, T23);<br>
+<br>
+            T00 = _mm_load_si128((__m128i*)(fenc + (i + 4) * fencstride));<br>
+            T00 = _mm_and_si128(T00, mask);<br>
+            T01 = _mm_load_si128((__m128i*)(fenc + (i + 5) * fencstride));<br>
+            T01 = _mm_and_si128(T01, mask);<br>
+            T02 = _mm_load_si128((__m128i*)(fenc + (i + 6) * fencstride));<br>
+            T02 = _mm_and_si128(T02, mask);<br>
+            T03 = _mm_load_si128((__m128i*)(fenc + (i + 7) * fencstride));<br>
+            T03 = _mm_and_si128(T03, mask);<br>
+<br>
+            T10 = _mm_load_si128((__m128i*)(fref + (i + 4) * frefstride));<br>
+            T10 = _mm_and_si128(T10, mask);<br>
+            T11 = _mm_load_si128((__m128i*)(fref + (i + 5) * frefstride));<br>
+            T11 = _mm_and_si128(T11, mask);<br>
+            T12 = _mm_load_si128((__m128i*)(fref + (i + 6) * frefstride));<br>
+            T12 = _mm_and_si128(T12, mask);<br>
+            T13 = _mm_load_si128((__m128i*)(fref + (i + 7) * frefstride));<br>
+            T13 = _mm_and_si128(T13, mask);<br>
+<br>
+            T20 = _mm_sad_epu8(T00, T10);<br>
+            T21 = _mm_sad_epu8(T01, T11);<br>
+            T22 = _mm_sad_epu8(T02, T12);<br>
+            T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+            sum0 = _mm_add_epi16(sum0, T20);<br>
+            sum0 = _mm_add_epi16(sum0, T21);<br>
+            sum0 = _mm_add_epi16(sum0, T22);<br>
+            sum0 = _mm_add_epi16(sum0, T23);<br>
+        }<br>
+    }<br>
+    else<br>
+    {<br>
+        for (int i = 0; i < ly; i += 4)<br>
+        {<br>
+            T00 = _mm_load_si128((__m128i*)(fenc + (i) * fencstride));<br>
+            T00 = _mm_and_si128(T00, mask);<br>
+            T01 = _mm_load_si128((__m128i*)(fenc + (i + 1) * fencstride));<br>
+            T01 = _mm_and_si128(T01, mask);<br>
+            T02 = _mm_load_si128((__m128i*)(fenc + (i + 2) * fencstride));<br>
+            T02 = _mm_and_si128(T02, mask);<br>
+            T03 = _mm_load_si128((__m128i*)(fenc + (i + 3) * fencstride));<br>
+            T03 = _mm_and_si128(T03, mask);<br>
+<br>
+            T10 = _mm_load_si128((__m128i*)(fref + (i) * frefstride));<br>
+            T10 = _mm_and_si128(T10, mask);<br>
+            T11 = _mm_load_si128((__m128i*)(fref + (i + 1) * frefstride));<br>
+            T11 = _mm_and_si128(T11, mask);<br>
+            T12 = _mm_load_si128((__m128i*)(fref + (i + 2) * frefstride));<br>
+            T12 = _mm_and_si128(T12, mask);<br>
+            T13 = _mm_load_si128((__m128i*)(fref + (i + 3) * frefstride));<br>
+            T13 = _mm_and_si128(T13, mask);<br>
+<br>
+            T20 = _mm_sad_epu8(T00, T10);<br>
+            T21 = _mm_sad_epu8(T01, T11);<br>
+            T22 = _mm_sad_epu8(T02, T12);<br>
+            T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+            sum0 = _mm_add_epi16(sum0, T20);<br>
+            sum0 = _mm_add_epi16(sum0, T21);<br>
+            sum0 = _mm_add_epi16(sum0, T22);<br>
+            sum0 = _mm_add_epi16(sum0, T23);<br>
+        }<br>
+    }<br>
+    sum1 = _mm_shuffle_epi32(sum0, 2);<br>
+    sum0 = _mm_add_epi32(sum0, sum1);<br>
+<br>
+    return _mm_cvtsi128_si32(sum0);<br>
+}<br>
+<br>>> #endif /* SSE41 */</div><div class="gmail_quote"><br></div><div class="gmail_quote">You can guard all the functions with a single #if INSTRSET >= X265_CPU_LEVEL_SSE41 macro and move it to the end. You have also forgotten to remove the template metaprogramming code used by the old vector code; I mean things like this:</div>
<div class="gmail_quote"><div class="gmail_quote"><br></div><div class="gmail_quote">template&lt;int size&gt;</div><div class="gmail_quote">ALWAYSINLINE void unrollFunc_64(pixel *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride, Vec8us&amp; sad)</div>
<div class="gmail_quote">{</div><div class="gmail_quote">    unrollFunc_64&lt;1&gt;(fenc, fencstride, fref, frefstride, sad);</div><div class="gmail_quote">    unrollFunc_64&lt;size - 1&gt;(fenc + fencstride, fencstride, fref + frefstride, frefstride, sad);</div>
<div class="gmail_quote">}</div></div><div class="gmail_quote"><br></div><div class="gmail_quote">Please remove these for all the sizes that you have converted to intrinsics.</div><div class="gmail_quote">
<br>
-template&lt;int ly&gt;<br>
-int sad_12(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)<br>
-{<br>
-    Vec16uc m1, n1;<br>
-<br>
-    Vec4i sum(0);<br>
-    Vec8us sad(0);<br>
-    int max_iterators = (ly >> 4) << 4;<br>
-    int row;<br>
-<br>
-    for (row = 0; row < max_iterators; row += 16)<br>
-    {<br>
-        for (int i = 0; i < 16; i++)<br>
-        {<br>
-            m1.load_a(fenc);<br>
-            m1.cutoff(12);<br>
-            n1.load(fref);<br>
-            n1.cutoff(12);<br>
-            sad.addSumAbsDiff(m1, n1);<br>
-<br>
-            fenc += fencstride;<br>
-            fref += frefstride;<br>
-        }<br>
-<br>
-        sum += extend_low(sad) + extend_high(sad);<br>
-        sad = 0;<br>
-    }<br>
-<br>
-    while (row++ < ly)<br>
-    {<br>
-        m1.load_a(fenc);<br>
-        m1.cutoff(12);<br>
-        n1.load(fref);<br>
-        n1.cutoff(12);<br>
-        sad.addSumAbsDiff(m1, n1);<br>
-<br>
-        fenc += fencstride;<br>
-        fref += frefstride;<br>
-    }<br>
-<br>
-    sum += extend_low(sad) + extend_high(sad);<br>
-    return horizontal_add(sum);<br>
-}<br>
<br>
 #if INSTRSET >= X265_CPU_LEVEL_SSE41<br>
 template&lt;int ly&gt;<br>
@@ -1123,56 +1396,486 @@<br>
<br>
 #endif /* if INSTRSET >= X265_CPU_LEVEL_SSE41 */<br>
<br>
-template&lt;int ly&gt;<br>
-int sad_24(pixel *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride)<br>
-{<br>
-    Vec16uc m1, n1;<br>
-<br>
-    Vec4i sum(0);<br>
-    Vec8us sad(0);<br>
-    int max_iterators = (ly >> 4) << 4;<br>
-    int row;<br>
-<br>
-    for (row = 0; row < max_iterators; row += 16)<br>
-    {<br>
-        for (int i = 0; i < 16; i++)<br>
-        {<br>
-            m1.load_a(fenc);<br>
-            n1.load(fref);<br>
-            sad.addSumAbsDiff(m1, n1);<br>
-<br>
-            m1.load_a(fenc + 16);<br>
-            m1.cutoff(8);<br>
-            n1.load(fref + 16);<br>
-            n1.cutoff(8);<br>
-            sad.addSumAbsDiff(m1, n1);<br>
-<br>
-            fenc += fencstride;<br>
-            fref += frefstride;<br>
-        }<br>
-<br>
-        sum += extend_low(sad) + extend_high(sad);<br>
-        sad = 0;<br>
-    }<br>
-<br>
-    while (row++ < ly)<br>
-    {<br>
-        m1.load_a(fenc);<br>
-        n1.load(fref);<br>
-        sad.addSumAbsDiff(m1, n1);<br>
-<br>
-        m1.load_a(fenc + 16);<br>
-        m1.cutoff(8);<br>
-        n1.load(fref + 16);<br>
-        n1.cutoff(8);<br>
-        sad.addSumAbsDiff(m1, n1);<br>
-<br>
-        fenc += fencstride;<br>
-        fref += frefstride;<br>
-    }<br>
-<br>
-    sum += extend_low(sad) + extend_high(sad);<br>
-    return horizontal_add(sum);<br>
+template&lt;int ly&gt;<br>
+int sad_24(pixel *fenc, intptr_t fencstride, pixel *fref, intptr_t frefstride)<br>
+{<br>
+    assert((ly % 4) == 0);<br>
+    __m128i sum0 = _mm_setzero_si128();<br>
+    __m128i sum1 = _mm_setzero_si128();<br>
+    __m128i T00, T01, T02, T03;<br>
+    __m128i T10, T11, T12, T13;<br>
+    __m128i T20, T21, T22, T23;<br>
+<br>
+    if (ly == 4)<br>
+    {<br>
+        T00 = _mm_load_si128((__m128i*)(fenc));<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + fencstride));<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + (2) * fencstride));<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + (3) * fencstride));<br>
+<br>
+        T10 = _mm_load_si128((__m128i*)(fref));<br>
+        T11 = _mm_load_si128((__m128i*)(fref + frefstride));<br>
+        T12 = _mm_load_si128((__m128i*)(fref + (2) * frefstride));<br>
+        T13 = _mm_load_si128((__m128i*)(fref + (3) * frefstride));<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi16(sum0, T20);<br>
+        sum0 = _mm_add_epi16(sum0, T21);<br>
+        sum0 = _mm_add_epi16(sum0, T22);<br>
+        sum0 = _mm_add_epi16(sum0, T23);<br>
+<br>
+        T00 = _mm_loadl_epi64((__m128i*)(fenc + 16));<br>
+        T01 = _mm_loadl_epi64((__m128i*)(fenc + (fencstride + 16)));<br>
+        T01 = _mm_unpacklo_epi64(T00, T01);<br>
+<br>
+        T02 = _mm_loadl_epi64((__m128i*)(fenc + ((2) * fencstride) + 16));<br>
+        T03 = _mm_loadl_epi64((__m128i*)(fenc + ((3) * fencstride) + 16));<br>
+        T03 = _mm_unpacklo_epi64(T02, T03);<br>
+<br>
+        T10 = _mm_loadl_epi64((__m128i*)(fref + ((0) * frefstride) + 16));<br>
+        T11 = _mm_loadl_epi64((__m128i*)(fref + ((1) * frefstride) + 16));<br>
+        T11 = _mm_unpacklo_epi64(T10, T11);<br>
+<br>
+        T12 = _mm_loadl_epi64((__m128i*)(fref + ((2) * frefstride) + 16));<br>
+        T13 = _mm_loadl_epi64((__m128i*)(fref + ((3) * frefstride) + 16));<br>
+        T13 = _mm_unpacklo_epi64(T12, T13);<br>
+<br>
+        T20 = _mm_setzero_si128();<br>
+        T21 = _mm_setzero_si128();<br>
+<br>
+        T20 = _mm_sad_epu8(T01, T11);<br>
+        T21 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi32(sum0, T20);<br>
+        sum0 = _mm_add_epi32(sum0, T21);<br>
+    }<br>
+    else if (ly == 8)<br>
+    {<br>
+        T00 = _mm_load_si128((__m128i*)(fenc));<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + fencstride));<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + (2) * fencstride));<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + (3) * fencstride));<br>
+<br>
+        T10 = _mm_load_si128((__m128i*)(fref));<br>
+        T11 = _mm_load_si128((__m128i*)(fref + frefstride));<br>
+        T12 = _mm_load_si128((__m128i*)(fref + (2) * frefstride));<br>
+        T13 = _mm_load_si128((__m128i*)(fref + (3) * frefstride));<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi16(sum0, T20);<br>
+        sum0 = _mm_add_epi16(sum0, T21);<br>
+        sum0 = _mm_add_epi16(sum0, T22);<br>
+        sum0 = _mm_add_epi16(sum0, T23);<br>
+<br>
+        T00 = _mm_loadl_epi64((__m128i*)(fenc + 16));<br>
+        T01 = _mm_loadl_epi64((__m128i*)(fenc + (fencstride + 16)));<br>
+        T01 = _mm_unpacklo_epi64(T00, T01);<br>
+<br>
+        T02 = _mm_loadl_epi64((__m128i*)(fenc + ((2) * fencstride) + 16));<br>
+        T03 = _mm_loadl_epi64((__m128i*)(fenc + ((3) * fencstride) + 16));<br>
+        T03 = _mm_unpacklo_epi64(T02, T03);<br>
+<br>
+        T10 = _mm_loadl_epi64((__m128i*)(fref + ((0) * frefstride) + 16));<br>
+        T11 = _mm_loadl_epi64((__m128i*)(fref + ((1) * frefstride) + 16));<br>
+        T11 = _mm_unpacklo_epi64(T10, T11);<br>
+<br>
+        T12 = _mm_loadl_epi64((__m128i*)(fref + ((2) * frefstride) + 16));<br>
+        T13 = _mm_loadl_epi64((__m128i*)(fref + ((3) * frefstride) + 16));<br>
+        T13 = _mm_unpacklo_epi64(T12, T13);<br>
+<br>
+        T20 = _mm_setzero_si128();<br>
+        T21 = _mm_setzero_si128();<br>
+<br>
+        T20 = _mm_sad_epu8(T01, T11);<br>
+        T21 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi32(sum0, T20);<br>
+        sum0 = _mm_add_epi32(sum0, T21);<br>
+<br>
+        T00 = _mm_load_si128((__m128i*)(fenc + (4) * fencstride));<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + (5) * fencstride));<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + (6) * fencstride));<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + (7) * fencstride));<br>
+<br>
+        T10 = _mm_load_si128((__m128i*)(fref + (4) * frefstride));<br>
+        T11 = _mm_load_si128((__m128i*)(fref + (5) * frefstride));<br>
+        T12 = _mm_load_si128((__m128i*)(fref + (6) * frefstride));<br>
+        T13 = _mm_load_si128((__m128i*)(fref + (7) * frefstride));<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi16(sum0, T20);<br>
+        sum0 = _mm_add_epi16(sum0, T21);<br>
+        sum0 = _mm_add_epi16(sum0, T22);<br>
+        sum0 = _mm_add_epi16(sum0, T23);<br>
+<br>
+        T00 = _mm_loadl_epi64((__m128i*)(fenc + ((4) * fencstride) + 16));<br>
+        T01 = _mm_loadl_epi64((__m128i*)(fenc + ((5) * fencstride) + 16));<br>
+        T01 = _mm_unpacklo_epi64(T00, T01);<br>
+<br>
+        T02 = _mm_loadl_epi64((__m128i*)(fenc + ((6) * fencstride) + 16));<br>
+        T03 = _mm_loadl_epi64((__m128i*)(fenc + ((7) * fencstride) + 16));<br>
+        T03 = _mm_unpacklo_epi64(T02, T03);<br>
+<br>
+        T10 = _mm_loadl_epi64((__m128i*)(fref + ((4) * frefstride) + 16));<br>
+        T11 = _mm_loadl_epi64((__m128i*)(fref + ((5) * frefstride) + 16));<br>
+        T11 = _mm_unpacklo_epi64(T10, T11);<br>
+<br>
+        T12 = _mm_loadl_epi64((__m128i*)(fref + ((6) * frefstride) + 16));<br>
+        T13 = _mm_loadl_epi64((__m128i*)(fref + ((7) * frefstride) + 16));<br>
+        T13 = _mm_unpacklo_epi64(T12, T13);<br>
+<br>
+        T20 = _mm_setzero_si128();<br>
+        T21 = _mm_setzero_si128();<br>
+<br>
+        T20 = _mm_sad_epu8(T01, T11);<br>
+        T21 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi32(sum0, T20);<br>
+        sum0 = _mm_add_epi32(sum0, T21);<br>
+    }<br>
+    else if (ly == 16)<br>
+    {<br>
+        T00 = _mm_load_si128((__m128i*)(fenc));<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + fencstride));<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + (2) * fencstride));<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + (3) * fencstride));<br>
+<br>
+        T10 = _mm_load_si128((__m128i*)(fref));<br>
+        T11 = _mm_load_si128((__m128i*)(fref + frefstride));<br>
+        T12 = _mm_load_si128((__m128i*)(fref + (2) * frefstride));<br>
+        T13 = _mm_load_si128((__m128i*)(fref + (3) * frefstride));<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi16(sum0, T20);<br>
+        sum0 = _mm_add_epi16(sum0, T21);<br>
+        sum0 = _mm_add_epi16(sum0, T22);<br>
+        sum0 = _mm_add_epi16(sum0, T23);<br>
+<br>
+        T00 = _mm_loadl_epi64((__m128i*)(fenc + 16));<br>
+        T01 = _mm_loadl_epi64((__m128i*)(fenc + fencstride + 16));<br>
+        T01 = _mm_unpacklo_epi64(T00, T01);<br>
+<br>
+        T02 = _mm_loadl_epi64((__m128i*)(fenc + ((2) * fencstride) + 16));<br>
+        T03 = _mm_loadl_epi64((__m128i*)(fenc + ((3) * fencstride) + 16));<br>
+        T03 = _mm_unpacklo_epi64(T02, T03);<br>
+<br>
+        T10 = _mm_loadl_epi64((__m128i*)(fref + 16));<br>
+        T11 = _mm_loadl_epi64((__m128i*)(fref + (frefstride + 16)));<br>
+        T11 = _mm_unpacklo_epi64(T10, T11);<br>
+<br>
+        T12 = _mm_loadl_epi64((__m128i*)(fref + ((2) * frefstride) + 16));<br>
+        T13 = _mm_loadl_epi64((__m128i*)(fref + ((3) * frefstride) + 16));<br>
+        T13 = _mm_unpacklo_epi64(T12, T13);<br>
+<br>
+        T20 = _mm_setzero_si128();<br>
+        T21 = _mm_setzero_si128();<br>
+<br>
+        T20 = _mm_sad_epu8(T01, T11);<br>
+        T21 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi32(sum0, T20);<br>
+        sum0 = _mm_add_epi32(sum0, T21);<br>
+<br>
+        T00 = _mm_load_si128((__m128i*)(fenc + (4) * fencstride));<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + (5) * fencstride));<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + (6) * fencstride));<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + (7) * fencstride));<br>
+<br>
+        T10 = _mm_load_si128((__m128i*)(fref + (4) * frefstride));<br>
+        T11 = _mm_load_si128((__m128i*)(fref + (5) * frefstride));<br>
+        T12 = _mm_load_si128((__m128i*)(fref + (6) * frefstride));<br>
+        T13 = _mm_load_si128((__m128i*)(fref + (7) * frefstride));<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi16(sum0, T20);<br>
+        sum0 = _mm_add_epi16(sum0, T21);<br>
+        sum0 = _mm_add_epi16(sum0, T22);<br>
+        sum0 = _mm_add_epi16(sum0, T23);<br>
+<br>
+        T00 = _mm_loadl_epi64((__m128i*)(fenc + ((4) * fencstride) + 16));<br>
+        T01 = _mm_loadl_epi64((__m128i*)(fenc + ((5) * fencstride) + 16));<br>
+        T01 = _mm_unpacklo_epi64(T00, T01);<br>
+<br>
+        T02 = _mm_loadl_epi64((__m128i*)(fenc + ((6) * fencstride) + 16));<br>
+        T03 = _mm_loadl_epi64((__m128i*)(fenc + ((7) * fencstride) + 16));<br>
+        T03 = _mm_unpacklo_epi64(T02, T03);<br>
+<br>
+        T10 = _mm_loadl_epi64((__m128i*)(fref + ((4) * frefstride) + 16));<br>
+        T11 = _mm_loadl_epi64((__m128i*)(fref + ((5) * frefstride) + 16));<br>
+        T11 = _mm_unpacklo_epi64(T10, T11);<br>
+<br>
+        T12 = _mm_loadl_epi64((__m128i*)(fref + ((6) * frefstride) + 16));<br>
+        T13 = _mm_loadl_epi64((__m128i*)(fref + ((7) * frefstride) + 16));<br>
+        T13 = _mm_unpacklo_epi64(T12, T13);<br>
+<br>
+        T20 = _mm_setzero_si128();<br>
+        T21 = _mm_setzero_si128();<br>
+<br>
+        T20 = _mm_sad_epu8(T01, T11);<br>
+        T21 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi32(sum0, T20);<br>
+        sum0 = _mm_add_epi32(sum0, T21);<br>
+<br>
+        T00 = _mm_load_si128((__m128i*)(fenc + (8) * fencstride));<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + (9) * fencstride));<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + (10) * fencstride));<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + (11) * fencstride));<br>
+<br>
+        T10 = _mm_load_si128((__m128i*)(fref + (8) * frefstride));<br>
+        T11 = _mm_load_si128((__m128i*)(fref + (9) * frefstride));<br>
+        T12 = _mm_load_si128((__m128i*)(fref + (10) * frefstride));<br>
+        T13 = _mm_load_si128((__m128i*)(fref + (11) * frefstride));<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi16(sum0, T20);<br>
+        sum0 = _mm_add_epi16(sum0, T21);<br>
+        sum0 = _mm_add_epi16(sum0, T22);<br>
+        sum0 = _mm_add_epi16(sum0, T23);<br>
+<br>
+        T00 = _mm_loadl_epi64((__m128i*)(fenc + ((8) * fencstride) + 16));<br>
+        T01 = _mm_loadl_epi64((__m128i*)(fenc + ((9) * fencstride) + 16));<br>
+        T01 = _mm_unpacklo_epi64(T00, T01);<br>
+<br>
+        T02 = _mm_loadl_epi64((__m128i*)(fenc + ((10) * fencstride) + 16));<br>
+        T03 = _mm_loadl_epi64((__m128i*)(fenc + ((11) * fencstride) + 16));<br>
+        T03 = _mm_unpacklo_epi64(T02, T03);<br>
+<br>
+        T10 = _mm_loadl_epi64((__m128i*)(fref + ((8) * frefstride) + 16));<br>
+        T11 = _mm_loadl_epi64((__m128i*)(fref + ((9) * frefstride) + 16));<br>
+        T11 = _mm_unpacklo_epi64(T10, T11);<br>
+<br>
+        T12 = _mm_loadl_epi64((__m128i*)(fref + ((10) * frefstride) + 16));<br>
+        T13 = _mm_loadl_epi64((__m128i*)(fref + ((11) * frefstride) + 16));<br>
+        T13 = _mm_unpacklo_epi64(T12, T13);<br>
+<br>
+        T20 = _mm_setzero_si128();<br>
+        T21 = _mm_setzero_si128();<br>
+<br>
+        T20 = _mm_sad_epu8(T01, T11);<br>
+        T21 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi32(sum0, T20);<br>
+        sum0 = _mm_add_epi32(sum0, T21);<br>
+<br>
+        T00 = _mm_load_si128((__m128i*)(fenc + (12) * fencstride));<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + (13) * fencstride));<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + (14) * fencstride));<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + (15) * fencstride));<br>
+<br>
+        T10 = _mm_load_si128((__m128i*)(fref + (12) * frefstride));<br>
+        T11 = _mm_load_si128((__m128i*)(fref + (13) * frefstride));<br>
+        T12 = _mm_load_si128((__m128i*)(fref + (14) * frefstride));<br>
+        T13 = _mm_load_si128((__m128i*)(fref + (15) * frefstride));<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi16(sum0, T20);<br>
+        sum0 = _mm_add_epi16(sum0, T21);<br>
+        sum0 = _mm_add_epi16(sum0, T22);<br>
+        sum0 = _mm_add_epi16(sum0, T23);<br>
+<br>
+        T00 = _mm_loadl_epi64((__m128i*)(fenc + ((12) * fencstride) + 16));<br>
+        T01 = _mm_loadl_epi64((__m128i*)(fenc + ((13) * fencstride) + 16));<br>
+        T01 = _mm_unpacklo_epi64(T00, T01);<br>
+<br>
+        T02 = _mm_loadl_epi64((__m128i*)(fenc + ((14) * fencstride) + 16));<br>
+        T03 = _mm_loadl_epi64((__m128i*)(fenc + ((15) * fencstride) + 16));<br>
+        T03 = _mm_unpacklo_epi64(T02, T03);<br>
+<br>
+        T10 = _mm_loadl_epi64((__m128i*)(fref + ((12) * frefstride) + 16));<br>
+        T11 = _mm_loadl_epi64((__m128i*)(fref + ((13) * frefstride) + 16));<br>
+        T11 = _mm_unpacklo_epi64(T10, T11);<br>
+<br>
+        T12 = _mm_loadl_epi64((__m128i*)(fref + ((14) * frefstride) + 16));<br>
+        T13 = _mm_loadl_epi64((__m128i*)(fref + ((15) * frefstride) + 16));<br>
+        T13 = _mm_unpacklo_epi64(T12, T13);<br>
+<br>
+        T20 = _mm_setzero_si128();<br>
+        T21 = _mm_setzero_si128();<br>
+<br>
+        T20 = _mm_sad_epu8(T01, T11);<br>
+        T21 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi32(sum0, T20);<br>
+        sum0 = _mm_add_epi32(sum0, T21);<br>
+    }<br>
+    else if ((ly % 8) == 0)<br>
+    {<br>
+        for (int i = 0; i < ly; i += 8)<br>
+        {<br>
+            T00 = _mm_load_si128((__m128i*)(fenc + (i) * fencstride));<br>
+            T01 = _mm_load_si128((__m128i*)(fenc + (i + 1) * fencstride));<br>
+            T02 = _mm_load_si128((__m128i*)(fenc + (i + 2) * fencstride));<br>
+            T03 = _mm_load_si128((__m128i*)(fenc + (i + 3) * fencstride));<br>
+<br>
+            T10 = _mm_load_si128((__m128i*)(fref + (i) * frefstride));<br>
+            T11 = _mm_load_si128((__m128i*)(fref + (i + 1) * frefstride));<br>
+            T12 = _mm_load_si128((__m128i*)(fref + (i + 2) * frefstride));<br>
+            T13 = _mm_load_si128((__m128i*)(fref + (i + 3) * frefstride));<br>
+<br>
+            T20 = _mm_sad_epu8(T00, T10);<br>
+            T21 = _mm_sad_epu8(T01, T11);<br>
+            T22 = _mm_sad_epu8(T02, T12);<br>
+            T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+            sum0 = _mm_add_epi32(sum0, T20);<br>
+            sum0 = _mm_add_epi32(sum0, T21);<br>
+            sum0 = _mm_add_epi32(sum0, T22);<br>
+            sum0 = _mm_add_epi32(sum0, T23);<br>
+<br>
+            T00 = _mm_loadl_epi64((__m128i*)(fenc + ((i) * fencstride) + 16));<br>
+            T01 = _mm_loadl_epi64((__m128i*)(fenc + ((i + 1) * fencstride) + 16));<br>
+            T01 = _mm_unpacklo_epi64(T00, T01);<br>
+<br>
+            T02 = _mm_loadl_epi64((__m128i*)(fenc + ((i + 2) * fencstride) + 16));<br>
+            T03 = _mm_loadl_epi64((__m128i*)(fenc + ((i + 3) * fencstride) + 16));<br>
+            T03 = _mm_unpacklo_epi64(T02, T03);<br>
+<br>
+            T10 = _mm_loadl_epi64((__m128i*)(fref + ((i) * frefstride) + 16));<br>
+            T11 = _mm_loadl_epi64((__m128i*)(fref + ((i + 1) * frefstride) + 16));<br>
+            T11 = _mm_unpacklo_epi64(T10, T11);<br>
+<br>
+            T12 = _mm_loadl_epi64((__m128i*)(fref + ((i + 2) * frefstride) + 16));<br>
+            T13 = _mm_loadl_epi64((__m128i*)(fref + ((i + 3) * frefstride) + 16));<br>
+            T13 = _mm_unpacklo_epi64(T12, T13);<br>
+<br>
+            T20 = _mm_setzero_si128();<br>
+            T21 = _mm_setzero_si128();<br>
+<br>
+            T20 = _mm_sad_epu8(T01, T11);<br>
+            T21 = _mm_sad_epu8(T03, T13);<br>
+<br>
+            sum0 = _mm_add_epi32(sum0, T20);<br>
+            sum0 = _mm_add_epi32(sum0, T21);<br>
+<br>
+            T00 = _mm_load_si128((__m128i*)(fenc + (i + 4) * fencstride));<br>
+            T01 = _mm_load_si128((__m128i*)(fenc + (i + 5) * fencstride));<br>
+            T02 = _mm_load_si128((__m128i*)(fenc + (i + 6) * fencstride));<br>
+            T03 = _mm_load_si128((__m128i*)(fenc + (i + 7) * fencstride));<br>
+<br>
+            T10 = _mm_load_si128((__m128i*)(fref + (i + 4) * frefstride));<br>
+            T11 = _mm_load_si128((__m128i*)(fref + (i + 5) * frefstride));<br>
+            T12 = _mm_load_si128((__m128i*)(fref + (i + 6) * frefstride));<br>
+            T13 = _mm_load_si128((__m128i*)(fref + (i + 7) * frefstride));<br>
+<br>
+            T20 = _mm_sad_epu8(T00, T10);<br>
+            T21 = _mm_sad_epu8(T01, T11);<br>
+            T22 = _mm_sad_epu8(T02, T12);<br>
+            T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+            sum0 = _mm_add_epi32(sum0, T20);<br>
+            sum0 = _mm_add_epi32(sum0, T21);<br>
+            sum0 = _mm_add_epi32(sum0, T22);<br>
+            sum0 = _mm_add_epi32(sum0, T23);<br>
+<br>
+            T00 = _mm_loadl_epi64((__m128i*)(fenc + ((i + 4) * fencstride) + 16));<br>
+            T01 = _mm_loadl_epi64((__m128i*)(fenc + ((i + 5) * fencstride) + 16));<br>
+            T01 = _mm_unpacklo_epi64(T00, T01);<br>
+<br>
+            T02 = _mm_loadl_epi64((__m128i*)(fenc + ((i + 6) * fencstride) + 16));<br>
+            T03 = _mm_loadl_epi64((__m128i*)(fenc + ((i + 7) * fencstride) + 16));<br>
+            T03 = _mm_unpacklo_epi64(T02, T03);<br>
+<br>
+            T10 = _mm_loadl_epi64((__m128i*)(fref + ((i + 4) * frefstride) + 16));<br>
+            T11 = _mm_loadl_epi64((__m128i*)(fref + ((i + 5) * frefstride) + 16));<br>
+            T11 = _mm_unpacklo_epi64(T10, T11);<br>
+<br>
+            T12 = _mm_loadl_epi64((__m128i*)(fref + ((i + 6) * frefstride) + 16));<br>
+            T13 = _mm_loadl_epi64((__m128i*)(fref + ((i + 7) * frefstride) + 16));<br>
+            T13 = _mm_unpacklo_epi64(T12, T13);<br>
+<br>
+            T20 = _mm_setzero_si128();<br>
+            T21 = _mm_setzero_si128();<br>
+<br>
+            T20 = _mm_sad_epu8(T01, T11);<br>
+            T21 = _mm_sad_epu8(T03, T13);<br>
+<br>
+            sum0 = _mm_add_epi32(sum0, T20);<br>
+            sum0 = _mm_add_epi32(sum0, T21);<br>
+        }<br>
+    }<br>
+    else<br>
+    {<br>
+        for (int i = 0; i < ly; i += 4)<br>
+        {<br>
+            T00 = _mm_load_si128((__m128i*)(fenc + (i) * fencstride));<br>
+            T01 = _mm_load_si128((__m128i*)(fenc + (i + 1) * fencstride));<br>
+            T02 = _mm_load_si128((__m128i*)(fenc + (i + 2) * fencstride));<br>
+            T03 = _mm_load_si128((__m128i*)(fenc + (i + 3) * fencstride));<br>
+<br>
+            T10 = _mm_load_si128((__m128i*)(fref + (i) * frefstride));<br>
+            T11 = _mm_load_si128((__m128i*)(fref + (i + 1) * frefstride));<br>
+            T12 = _mm_load_si128((__m128i*)(fref + (i + 2) * frefstride));<br>
+            T13 = _mm_load_si128((__m128i*)(fref + (i + 3) * frefstride));<br>
+<br>
+            T20 = _mm_sad_epu8(T00, T10);<br>
+            T21 = _mm_sad_epu8(T01, T11);<br>
+            T22 = _mm_sad_epu8(T02, T12);<br>
+            T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+            sum0 = _mm_add_epi16(sum0, T20);<br>
+            sum0 = _mm_add_epi16(sum0, T21);<br>
+            sum0 = _mm_add_epi16(sum0, T22);<br>
+            sum0 = _mm_add_epi16(sum0, T23);<br>
+<br>
+            T00 = _mm_loadl_epi64((__m128i*)(fenc + ((i) * fencstride) + 16));<br>
+            T01 = _mm_loadl_epi64((__m128i*)(fenc + ((i + 1) * fencstride) + 16));<br>
+            T01 = _mm_unpacklo_epi64(T00, T01);<br>
+<br>
+            T02 = _mm_loadl_epi64((__m128i*)(fenc + ((i + 2) * fencstride) + 16));<br>
+            T03 = _mm_loadl_epi64((__m128i*)(fenc + ((i + 3) * fencstride) + 16));<br>
+            T03 = _mm_unpacklo_epi64(T02, T03);<br>
+<br>
+            T10 = _mm_loadl_epi64((__m128i*)(fref + ((i) * frefstride) + 16));<br>
+            T11 = _mm_loadl_epi64((__m128i*)(fref + ((i + 1) * frefstride) + 16));<br>
+            T11 = _mm_unpacklo_epi64(T10, T11);<br>
+<br>
+            T12 = _mm_loadl_epi64((__m128i*)(fref + ((i + 2) * frefstride) + 16));<br>
+            T13 = _mm_loadl_epi64((__m128i*)(fref + ((i + 3) * frefstride) + 16));<br>
+            T13 = _mm_unpacklo_epi64(T12, T13);<br>
+<br>
+            T20 = _mm_setzero_si128();<br>
+            T21 = _mm_setzero_si128();<br>
+<br>
+            T20 = _mm_sad_epu8(T01, T11);<br>
+            T21 = _mm_sad_epu8(T03, T13);<br>
+<br>
+            sum0 = _mm_add_epi32(sum0, T20);<br>
+            sum0 = _mm_add_epi32(sum0, T21);<br>
+        }<br>
+    }<br>
+    sum1 = _mm_shuffle_epi32(sum0, 2);<br>
+    sum0 = _mm_add_epi32(sum0, sum1);<br>
+<br>
+    return _mm_cvtsi128_si32(sum0);<br>
 }<br>
<br>
 template&lt;int size&gt;<br>
@@ -1196,30 +1899,437 @@<br>
     sad.addSumAbsDiff(m1, n1);<br>
 }<br>
<br>
-template&lt;int ly&gt;<br>
-int sad_32(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)<br>
-{<br>
-    Vec4i sum(0);<br>
-    Vec8us sad;<br>
-    int max_iterators = (ly >> 2) << 2;<br>
-    int row;<br>
-    if (ly == 4)<br>
-    {<br>
-        sad = 0;<br>
-        unrollFunc_32<4>(fenc, fencstride, fref, frefstride, sad);<br>
-        sum += extend_low(sad) + extend_high(sad);<br>
-        return horizontal_add(sum);<br>
-    }<br>
-    for (row = 0; row < max_iterators; row += 4)<br>
-    {<br>
-        sad = 0;<br>
-        unrollFunc_32<4>(fenc, fencstride, fref, frefstride, sad);<br>
-        sum += extend_low(sad) + extend_high(sad);<br>
-        fenc += fencstride * 4;<br>
-        fref += frefstride * 4;<br>
-    }<br>
-<br>
-    return horizontal_add(sum);<br>
+template&lt;int ly&gt;<br>
+int sad_32(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)<br>
+{<br>
+    assert((ly % 4) == 0);<br>
+<br>
+    __m128i sum0 = _mm_setzero_si128();<br>
+    __m128i sum1 = _mm_setzero_si128();<br>
+    __m128i T00, T01, T02, T03;<br>
+    __m128i T10, T11, T12, T13;<br>
+    __m128i T20, T21, T22, T23;<br>
+<br>
+    if (ly == 4)<br>
+    {<br>
+        T00 = _mm_load_si128((__m128i*)(fenc));<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + fencstride));<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + (2) * fencstride));<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + (3) * fencstride));<br>
+<br>
+        T10 = _mm_loadu_si128((__m128i*)(fref));<br>
+        T11 = _mm_loadu_si128((__m128i*)(fref + frefstride));<br>
+        T12 = _mm_loadu_si128((__m128i*)(fref + (2) * frefstride));<br>
+        T13 = _mm_loadu_si128((__m128i*)(fref + (3) * frefstride));<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi32(sum0, T20);<br>
+        sum0 = _mm_add_epi32(sum0, T21);<br>
+        sum0 = _mm_add_epi32(sum0, T22);<br>
+        sum0 = _mm_add_epi32(sum0, T23);<br>
+<br>
+        T00 = _mm_load_si128((__m128i*)(fenc + 16));<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + fencstride + 16));<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + ((2) * fencstride) + 16));<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + ((3) * fencstride) + 16));<br>
+<br>
+        T10 = _mm_loadu_si128((__m128i*)(fref + 16));<br>
+        T11 = _mm_loadu_si128((__m128i*)(fref + frefstride + 16));<br>
+        T12 = _mm_loadu_si128((__m128i*)(fref + ((2) * frefstride) + 16));<br>
+        T13 = _mm_loadu_si128((__m128i*)(fref + ((3) * frefstride) + 16));<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi32(sum0, T20);<br>
+        sum0 = _mm_add_epi32(sum0, T21);<br>
+        sum0 = _mm_add_epi32(sum0, T22);<br>
+        sum0 = _mm_add_epi32(sum0, T23);<br>
+    }<br>
+    else if (ly == 8)<br>
+    {<br>
+        T00 = _mm_load_si128((__m128i*)(fenc));<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + fencstride));<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + (2) * fencstride));<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + (3) * fencstride));<br>
+<br>
+        T10 = _mm_loadu_si128((__m128i*)(fref));<br>
+        T11 = _mm_loadu_si128((__m128i*)(fref + frefstride));<br>
+        T12 = _mm_loadu_si128((__m128i*)(fref + (2) * frefstride));<br>
+        T13 = _mm_loadu_si128((__m128i*)(fref + (3) * frefstride));<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi32(sum0, T20);<br>
+        sum0 = _mm_add_epi32(sum0, T21);<br>
+        sum0 = _mm_add_epi32(sum0, T22);<br>
+        sum0 = _mm_add_epi32(sum0, T23);<br>
+<br>
+        T00 = _mm_load_si128((__m128i*)(fenc + 16));<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + fencstride + 16));<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + ((2) * fencstride) + 16));<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + ((3) * fencstride) + 16));<br>
+<br>
+        T10 = _mm_loadu_si128((__m128i*)(fref + 16));<br>
+        T11 = _mm_loadu_si128((__m128i*)(fref + frefstride + 16));<br>
+        T12 = _mm_loadu_si128((__m128i*)(fref + ((2) * frefstride) + 16));<br>
+        T13 = _mm_loadu_si128((__m128i*)(fref + ((3) * frefstride) + 16));<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi32(sum0, T20);<br>
+        sum0 = _mm_add_epi32(sum0, T21);<br>
+        sum0 = _mm_add_epi32(sum0, T22);<br>
+        sum0 = _mm_add_epi32(sum0, T23);<br>
+<br>
+        T00 = _mm_load_si128((__m128i*)(fenc + (4) * fencstride));<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + (5) * fencstride));<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + (6) * fencstride));<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + (7) * fencstride));<br>
+<br>
+        T10 = _mm_loadu_si128((__m128i*)(fref + (4) * frefstride));<br>
+        T11 = _mm_loadu_si128((__m128i*)(fref + (5) * frefstride));<br>
+        T12 = _mm_loadu_si128((__m128i*)(fref + (6) * frefstride));<br>
+        T13 = _mm_loadu_si128((__m128i*)(fref + (7) * frefstride));<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi32(sum0, T20);<br>
+        sum0 = _mm_add_epi32(sum0, T21);<br>
+        sum0 = _mm_add_epi32(sum0, T22);<br>
+        sum0 = _mm_add_epi32(sum0, T23);<br>
+<br>
+        T00 = _mm_load_si128((__m128i*)(fenc + ((4) * fencstride) + 16));<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + ((5) * fencstride) + 16));<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + ((6) * fencstride) + 16));<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + ((7) * fencstride) + 16));<br>
+<br>
+        T10 = _mm_loadu_si128((__m128i*)(fref + ((4) * frefstride) + 16));<br>
+        T11 = _mm_loadu_si128((__m128i*)(fref + ((5) * frefstride) + 16));<br>
+        T12 = _mm_loadu_si128((__m128i*)(fref + ((6) * frefstride) + 16));<br>
+        T13 = _mm_loadu_si128((__m128i*)(fref + ((7) * frefstride) + 16));<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi32(sum0, T20);<br>
+        sum0 = _mm_add_epi32(sum0, T21);<br>
+        sum0 = _mm_add_epi32(sum0, T22);<br>
+        sum0 = _mm_add_epi32(sum0, T23);<br>
+    }<br>
+    else if (ly == 16)<br>
+    {<br>
+        T00 = _mm_load_si128((__m128i*)(fenc));<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + fencstride));<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + (2) * fencstride));<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + (3) * fencstride));<br>
+<br>
+        T10 = _mm_loadu_si128((__m128i*)(fref));<br>
+        T11 = _mm_loadu_si128((__m128i*)(fref + frefstride));<br>
+        T12 = _mm_loadu_si128((__m128i*)(fref + (2) * frefstride));<br>
+        T13 = _mm_loadu_si128((__m128i*)(fref + (3) * frefstride));<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi32(sum0, T20);<br>
+        sum0 = _mm_add_epi32(sum0, T21);<br>
+        sum0 = _mm_add_epi32(sum0, T22);<br>
+        sum0 = _mm_add_epi32(sum0, T23);<br>
+<br>
+        T00 = _mm_load_si128((__m128i*)(fenc + 16));<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + fencstride + 16));<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + ((2) * fencstride) + 16));<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + ((3) * fencstride) + 16));<br>
+<br>
+        T10 = _mm_loadu_si128((__m128i*)(fref + 16));<br>
+        T11 = _mm_loadu_si128((__m128i*)(fref + frefstride + 16));<br>
+        T12 = _mm_loadu_si128((__m128i*)(fref + ((2) * frefstride) + 16));<br>
+        T13 = _mm_loadu_si128((__m128i*)(fref + ((3) * frefstride) + 16));<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi32(sum0, T20);<br>
+        sum0 = _mm_add_epi32(sum0, T21);<br>
+        sum0 = _mm_add_epi32(sum0, T22);<br>
+        sum0 = _mm_add_epi32(sum0, T23);<br>
+<br>
+        T00 = _mm_load_si128((__m128i*)(fenc + (4) * fencstride));<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + (5) * fencstride));<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + (6) * fencstride));<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + (7) * fencstride));<br>
+<br>
+        T10 = _mm_loadu_si128((__m128i*)(fref + (4) * frefstride));<br>
+        T11 = _mm_loadu_si128((__m128i*)(fref + (5) * frefstride));<br>
+        T12 = _mm_loadu_si128((__m128i*)(fref + (6) * frefstride));<br>
+        T13 = _mm_loadu_si128((__m128i*)(fref + (7) * frefstride));<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi32(sum0, T20);<br>
+        sum0 = _mm_add_epi32(sum0, T21);<br>
+        sum0 = _mm_add_epi32(sum0, T22);<br>
+        sum0 = _mm_add_epi32(sum0, T23);<br>
+<br>
+        T00 = _mm_load_si128((__m128i*)(fenc + ((4) * fencstride) + 16));<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + ((5) * fencstride) + 16));<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + ((6) * fencstride) + 16));<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + ((7) * fencstride) + 16));<br>
+<br>
+        T10 = _mm_loadu_si128((__m128i*)(fref + ((4) * frefstride) + 16));<br>
+        T11 = _mm_loadu_si128((__m128i*)(fref + ((5) * frefstride) + 16));<br>
+        T12 = _mm_loadu_si128((__m128i*)(fref + ((6) * frefstride) + 16));<br>
+        T13 = _mm_loadu_si128((__m128i*)(fref + ((7) * frefstride) + 16));<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi32(sum0, T20);<br>
+        sum0 = _mm_add_epi32(sum0, T21);<br>
+        sum0 = _mm_add_epi32(sum0, T22);<br>
+        sum0 = _mm_add_epi32(sum0, T23);<br>
+<br>
+        T00 = _mm_load_si128((__m128i*)(fenc + (8) * fencstride));<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + (9) * fencstride));<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + (10) * fencstride));<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + (11) * fencstride));<br>
+<br>
+        T10 = _mm_loadu_si128((__m128i*)(fref + (8) * frefstride));<br>
+        T11 = _mm_loadu_si128((__m128i*)(fref + (9) * frefstride));<br>
+        T12 = _mm_loadu_si128((__m128i*)(fref + (10) * frefstride));<br>
+        T13 = _mm_loadu_si128((__m128i*)(fref + (11) * frefstride));<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi32(sum0, T20);<br>
+        sum0 = _mm_add_epi32(sum0, T21);<br>
+        sum0 = _mm_add_epi32(sum0, T22);<br>
+        sum0 = _mm_add_epi32(sum0, T23);<br>
+<br>
+        T00 = _mm_load_si128((__m128i*)(fenc + ((8) * fencstride) + 16));<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + ((9) * fencstride) + 16));<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + ((10) * fencstride) + 16));<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + ((11) * fencstride) + 16));<br>
+<br>
+        T10 = _mm_loadu_si128((__m128i*)(fref + ((8) * frefstride) + 16));<br>
+        T11 = _mm_loadu_si128((__m128i*)(fref + ((9) * frefstride) + 16));<br>
+        T12 = _mm_loadu_si128((__m128i*)(fref + ((10) * frefstride) + 16));<br>
+        T13 = _mm_loadu_si128((__m128i*)(fref + ((11) * frefstride) + 16));<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi16(sum0, T20);<br>
+        sum0 = _mm_add_epi16(sum0, T21);<br>
+        sum0 = _mm_add_epi16(sum0, T22);<br>
+        sum0 = _mm_add_epi16(sum0, T23);<br>
+        T00 = _mm_load_si128((__m128i*)(fenc + (12) * fencstride));<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + (13) * fencstride));<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + (14) * fencstride));<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + (15) * fencstride));<br>
+<br>
+        T10 = _mm_loadu_si128((__m128i*)(fref + (12) * frefstride));<br>
+        T11 = _mm_loadu_si128((__m128i*)(fref + (13) * frefstride));<br>
+        T12 = _mm_loadu_si128((__m128i*)(fref + (14) * frefstride));<br>
+        T13 = _mm_loadu_si128((__m128i*)(fref + (15) * frefstride));<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi32(sum0, T20);<br>
+        sum0 = _mm_add_epi32(sum0, T21);<br>
+        sum0 = _mm_add_epi32(sum0, T22);<br>
+        sum0 = _mm_add_epi32(sum0, T23);<br>
+<br>
+        T00 = _mm_load_si128((__m128i*)(fenc + ((12) * fencstride) + 16));<br>
+        T01 = _mm_load_si128((__m128i*)(fenc + ((13) * fencstride) + 16));<br>
+        T02 = _mm_load_si128((__m128i*)(fenc + ((14) * fencstride) + 16));<br>
+        T03 = _mm_load_si128((__m128i*)(fenc + ((15) * fencstride) + 16));<br>
+<br>
+        T10 = _mm_loadu_si128((__m128i*)(fref + ((12) * frefstride) + 16));<br>
+        T11 = _mm_loadu_si128((__m128i*)(fref + ((13) * frefstride) + 16));<br>
+        T12 = _mm_loadu_si128((__m128i*)(fref + ((14) * frefstride) + 16));<br>
+        T13 = _mm_loadu_si128((__m128i*)(fref + ((15) * frefstride) + 16));<br>
+<br>
+        T20 = _mm_sad_epu8(T00, T10);<br>
+        T21 = _mm_sad_epu8(T01, T11);<br>
+        T22 = _mm_sad_epu8(T02, T12);<br>
+        T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+        sum0 = _mm_add_epi32(sum0, T20);<br>
+        sum0 = _mm_add_epi32(sum0, T21);<br>
+        sum0 = _mm_add_epi32(sum0, T22);<br>
+        sum0 = _mm_add_epi32(sum0, T23);<br>
+    }<br>
+    else if ((ly % 8) == 0)<br>
+    {<br>
+        for (int i = 0; i < ly; i += 8)<br>
+        {<br>
+            T00 = _mm_load_si128((__m128i*)(fenc + (i) * fencstride));<br>
+            T01 = _mm_load_si128((__m128i*)(fenc + (i + 1) * fencstride));<br>
+            T02 = _mm_load_si128((__m128i*)(fenc + (i + 2) * fencstride));<br>
+            T03 = _mm_load_si128((__m128i*)(fenc + (i + 3) * fencstride));<br>
+<br>
+            T10 = _mm_loadu_si128((__m128i*)(fref + (i) * frefstride));<br>
+            T11 = _mm_loadu_si128((__m128i*)(fref + (i + 1) * frefstride));<br>
+            T12 = _mm_loadu_si128((__m128i*)(fref + (i + 2) * frefstride));<br>
+            T13 = _mm_loadu_si128((__m128i*)(fref + (i + 3) * frefstride));<br>
+<br>
+            T20 = _mm_sad_epu8(T00, T10);<br>
+            T21 = _mm_sad_epu8(T01, T11);<br>
+            T22 = _mm_sad_epu8(T02, T12);<br>
+            T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+            sum0 = _mm_add_epi32(sum0, T20);<br>
+            sum0 = _mm_add_epi32(sum0, T21);<br>
+            sum0 = _mm_add_epi32(sum0, T22);<br>
+            sum0 = _mm_add_epi32(sum0, T23);<br>
+<br>
+            T00 = _mm_load_si128((__m128i*)(fenc + ((i) * fencstride) + 16));<br>
+            T01 = _mm_load_si128((__m128i*)(fenc + ((i + 1) * fencstride) + 16));<br>
+            T02 = _mm_load_si128((__m128i*)(fenc + ((i + 2) * fencstride) + 16));<br>
+            T03 = _mm_load_si128((__m128i*)(fenc + ((i + 3) * fencstride) + 16));<br>
+<br>
+            T10 = _mm_loadu_si128((__m128i*)(fref + ((i) * frefstride) + 16));<br>
+            T11 = _mm_loadu_si128((__m128i*)(fref + ((i + 1) * frefstride) + 16));<br>
+            T12 = _mm_loadu_si128((__m128i*)(fref + ((i + 2) * frefstride) + 16));<br>
+            T13 = _mm_loadu_si128((__m128i*)(fref + ((i + 3) * frefstride) + 16));<br>
+<br>
+            T20 = _mm_sad_epu8(T00, T10);<br>
+            T21 = _mm_sad_epu8(T01, T11);<br>
+            T22 = _mm_sad_epu8(T02, T12);<br>
+            T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+            sum0 = _mm_add_epi32(sum0, T20);<br>
+            sum0 = _mm_add_epi32(sum0, T21);<br>
+            sum0 = _mm_add_epi32(sum0, T22);<br>
+            sum0 = _mm_add_epi32(sum0, T23);<br>
+<br>
+            T00 = _mm_load_si128((__m128i*)(fenc + (i + 4) * fencstride));<br>
+            T01 = _mm_load_si128((__m128i*)(fenc + (i + 5) * fencstride));<br>
+            T02 = _mm_load_si128((__m128i*)(fenc + (i + 6) * fencstride));<br>
+            T03 = _mm_load_si128((__m128i*)(fenc + (i + 7) * fencstride));<br>
+<br>
+            T10 = _mm_loadu_si128((__m128i*)(fref + (i + 4) * frefstride));<br>
+            T11 = _mm_loadu_si128((__m128i*)(fref + (i + 5) * frefstride));<br>
+            T12 = _mm_loadu_si128((__m128i*)(fref + (i + 6) * frefstride));<br>
+            T13 = _mm_loadu_si128((__m128i*)(fref + (i + 7) * frefstride));<br>
+<br>
+            T20 = _mm_sad_epu8(T00, T10);<br>
+            T21 = _mm_sad_epu8(T01, T11);<br>
+            T22 = _mm_sad_epu8(T02, T12);<br>
+            T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+            sum0 = _mm_add_epi32(sum0, T20);<br>
+            sum0 = _mm_add_epi32(sum0, T21);<br>
+            sum0 = _mm_add_epi32(sum0, T22);<br>
+            sum0 = _mm_add_epi32(sum0, T23);<br>
+<br>
+            T00 = _mm_load_si128((__m128i*)(fenc + ((i + 4) * fencstride) + 16));<br>
+            T01 = _mm_load_si128((__m128i*)(fenc + ((i + 5) * fencstride) + 16));<br>
+            T02 = _mm_load_si128((__m128i*)(fenc + ((i + 6) * fencstride) + 16));<br>
+            T03 = _mm_load_si128((__m128i*)(fenc + ((i + 7) * fencstride) + 16));<br>
+<br>
+            T10 = _mm_loadu_si128((__m128i*)(fref + ((i + 4) * frefstride) + 16));<br>
+            T11 = _mm_loadu_si128((__m128i*)(fref + ((i + 5) * frefstride) + 16));<br>
+            T12 = _mm_loadu_si128((__m128i*)(fref + ((i + 6) * frefstride) + 16));<br>
+            T13 = _mm_loadu_si128((__m128i*)(fref + ((i + 7) * frefstride) + 16));<br>
+<br>
+            T20 = _mm_sad_epu8(T00, T10);<br>
+            T21 = _mm_sad_epu8(T01, T11);<br>
+            T22 = _mm_sad_epu8(T02, T12);<br>
+            T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+            sum0 = _mm_add_epi32(sum0, T20);<br>
+            sum0 = _mm_add_epi32(sum0, T21);<br>
+            sum0 = _mm_add_epi32(sum0, T22);<br>
+            sum0 = _mm_add_epi32(sum0, T23);<br>
+        }<br>
+    }<br>
+    else if ((ly % 4) == 0)<br>
+    {<br>
+        for (int i = 0; i < ly; i += 4)<br>
+        {<br>
+            T00 = _mm_load_si128((__m128i*)(fenc + (i) * fencstride));<br>
+            T01 = _mm_load_si128((__m128i*)(fenc + (i + 1) * fencstride));<br>
+            T02 = _mm_load_si128((__m128i*)(fenc + (i + 2) * fencstride));<br>
+            T03 = _mm_load_si128((__m128i*)(fenc + (i + 3) * fencstride));<br>
+<br>
+            T10 = _mm_loadu_si128((__m128i*)(fref + (i) * frefstride));<br>
+            T11 = _mm_loadu_si128((__m128i*)(fref + (i + 1) * frefstride));<br>
+            T12 = _mm_loadu_si128((__m128i*)(fref + (i + 2) * frefstride));<br>
+            T13 = _mm_loadu_si128((__m128i*)(fref + (i + 3) * frefstride));<br>
+<br>
+            T20 = _mm_sad_epu8(T00, T10);<br>
+            T21 = _mm_sad_epu8(T01, T11);<br>
+            T22 = _mm_sad_epu8(T02, T12);<br>
+            T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+            sum0 = _mm_add_epi32(sum0, T20);<br>
+            sum0 = _mm_add_epi32(sum0, T21);<br>
+            sum0 = _mm_add_epi32(sum0, T22);<br>
+            sum0 = _mm_add_epi32(sum0, T23);<br>
+<br>
+            T00 = _mm_load_si128((__m128i*)(fenc + ((i) * fencstride) + 16));<br>
+            T01 = _mm_load_si128((__m128i*)(fenc + ((i + 1) * fencstride) + 16));<br>
+            T02 = _mm_load_si128((__m128i*)(fenc + ((i + 2) * fencstride) + 16));<br>
+            T03 = _mm_load_si128((__m128i*)(fenc + ((i + 3) * fencstride) + 16));<br>
+<br>
+            T10 = _mm_loadu_si128((__m128i*)(fref + ((i + 0) * frefstride) + 16));<br>
+            T11 = _mm_loadu_si128((__m128i*)(fref + ((i + 1) * frefstride) + 16));<br>
+            T12 = _mm_loadu_si128((__m128i*)(fref + ((i + 2) * frefstride) + 16));<br>
+            T13 = _mm_loadu_si128((__m128i*)(fref + ((i + 3) * frefstride) + 16));<br>
+<br>
+            T20 = _mm_sad_epu8(T00, T10);<br>
+            T21 = _mm_sad_epu8(T01, T11);<br>
+            T22 = _mm_sad_epu8(T02, T12);<br>
+            T23 = _mm_sad_epu8(T03, T13);<br>
+<br>
+            sum0 = _mm_add_epi32(sum0, T20);<br>
+            sum0 = _mm_add_epi32(sum0, T21);<br>
+            sum0 = _mm_add_epi32(sum0, T22);<br>
+            sum0 = _mm_add_epi32(sum0, T23);<br>
+        }<br>
+    }<br>
+<br>
+    sum1 = _mm_shuffle_epi32(sum0, 2);<br>
+    sum0 = _mm_add_epi32(sum0, sum1);<br>
+<br>
+    return _mm_cvtsi128_si32(sum0);<br>
 }</div><div class="gmail_quote"><br></div><div class="gmail_quote">>> The above comments apply to all three functions. Also, please submit no more than one function per patch.<br>
<br>
 template&lt;int size&gt;</div><div class="gmail_quote"><br></div><div class="gmail_quote">Regards,</div><div class="gmail_quote">Praveen<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org" target="_blank">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</div><br></div>