[x265-commits] [x265] asm: assembly code for pixel_satd_64x32 and pixel_satd_64x48

Fri Nov 15 05:26:47 CET 2013

details:   http://hg.videolan.org/x265/rev/ed1dab579cb1
branches:  
changeset: 5088:ed1dab579cb1
user:      Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
date:      Thu Nov 14 16:42:05 2013 +0530
description:
asm: assembly code for pixel_satd_64x32 and pixel_satd_64x48
Subject: [x265] asm: assembly code for pixel_satd_64x64

details:   http://hg.videolan.org/x265/rev/99b64d267788
branches:  
changeset: 5089:99b64d267788
user:      Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
date:      Thu Nov 14 17:34:34 2013 +0530
description:
asm: assembly code for pixel_satd_64x64
Subject: [x265] asm: assembly code for pixel_satd_32x64 and pixel_satd_48x64

details:   http://hg.videolan.org/x265/rev/84f9ced21747
branches:  
changeset: 5090:84f9ced21747
user:      Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
date:      Thu Nov 14 19:08:25 2013 +0530
description:
asm: assembly code for pixel_satd_32x64 and pixel_satd_48x64
Subject: [x265] Unit test code for Pixel scaling

details:   http://hg.videolan.org/x265/rev/38e124ec202c
branches:  
changeset: 5091:38e124ec202c
user:      Murugan Vairavel <murugan at multicorewareinc.com>
date:      Thu Nov 14 14:58:40 2013 +0530
description:
Unit test code for Pixel scaling
Subject: [x265] asm: code for scale1D_128to64 routine

details:   http://hg.videolan.org/x265/rev/05484f075744
branches:  
changeset: 5092:05484f075744
user:      Murugan Vairavel <murugan at multicorewareinc.com>
date:      Thu Nov 14 16:57:19 2013 +0530
description:
asm: code for scale1D_128to64 routine
Subject: [x265] reverted chroma_copy_pp asm integration code, avoiding csp break

details:   http://hg.videolan.org/x265/rev/b0ce6bd99b15
branches:  
changeset: 5093:b0ce6bd99b15
user:      Praveen Tiwari
date:      Thu Nov 14 15:26:09 2013 +0530
description:
reverted chroma_copy_pp asm integration code, avoiding csp break
Subject: [x265] no-rdo early exit: giving weightage to the cost of that CU and neighbour CU's for early exit

details:   http://hg.videolan.org/x265/rev/1a033fe23a3e
branches:  
changeset: 5094:1a033fe23a3e
user:      Sumalatha Polureddy
date:      Thu Nov 14 16:35:13 2013 +0530
description:
no-rdo early exit: giving weightage to the cost of that CU and neighbour CU's for early exit

Early exit is done when the CU cost at depth "n" is less than the sum of 60% of the
average cost of that CU at the same depth and 40% of the average cost of the
neighbouring CUs at the same depth.

The performance, bitrate increase and PSNR comparisons are given below
(values are listed as: early exit OFF / already existing early exit / new early exit).
CLI: x265.exe input.y4m -o abc.hevc -r recon.y4m --rd 1 --ref 1

BasketballDrive_1920x1080_50
Time taken to encode: 704/585/564s
bitrate: 3650/3696/3696
PSNR: 36.7/36.67/36.67
perf improvement: 16.9% (compared to early exit OFF and already existing early exit)
perf improvement: 19.8% (compared to early exit OFF and new early exit)

Cactus_1920x1080_50
Time taken to encode: 526/443/436s
bitrate: 2787/2831/2833
PSNR: 35.527/35.48/35.48
perf improvement: 15.7% (compared to early exit OFF and already existing early exit)
perf improvement: 17.1% (compared to early exit OFF and new early exit)

Kimono1_1920x1080_24
Time taken to encode: 279/235/238s
bitrate: 1243/1252/1252
PSNR: 38.16/38.158/38.159
perf improvement: 15.7% (compared to early exit OFF and already existing early exit)
perf improvement: 14.6% (compared to early exit OFF and new early exit)

FourPeople_1280x720_60
Time taken to encode: 169/157/157s  16.9%/19.8%
bitrate: 486/489/489
PSNR: 39.09/39.052/39.042
perf improvement: 7.1% (compared to early exit OFF and already existing early exit)
perf improvement: 7.1% (compared to early exit OFF and new early exit)

big_buck_bunny_360p24
Time taken to encode: 1739/1511/1505s  16.9%/19.8%
bitrate: 174.9/175.38/175.5
PSNR: 37.798/37.746/37.752
perf improvement: 13.1% (compared to early exit OFF and already existing early exit)
perf improvement: 13.4% (compared to early exit OFF and new early exit)

PartyScene_832x480_50
Time taken to encode: 123/120/120s  16.9%/19.8%
bitrate: 208/208/208
PSNR: 40.344/40.33/40.332
perf improvement: 2.4% (compared to early exit OFF and already existing early exit)
perf improvement: 2.4% (compared to early exit OFF and new early exit)
Subject: [x265] asm: assembly code for calcrecon[]

details:   http://hg.videolan.org/x265/rev/1b9545e23e36
branches:  
changeset: 5095:1b9545e23e36
user:      Min Chen <chenm003 at 163.com>
date:      Thu Nov 14 16:45:03 2013 +0800
description:
asm: assembly code for calcrecon[]
Subject: [x265] Pulling x264 weight decision into x265 lookahead

details:   http://hg.videolan.org/x265/rev/61f9fc2e91d2
branches:  
changeset: 5096:61f9fc2e91d2
user:      Shazeb Nawaz Khan <shazeb at multicorewareinc.com>
date:      Thu Nov 14 18:04:11 2013 +0530
description:
Pulling x264 weight decision into x265 lookahead
Subject: [x265] Using weighted lowres ref frames in cost estimation in lookahead

details:   http://hg.videolan.org/x265/rev/899731955c6d
branches:  
changeset: 5097:899731955c6d
user:      Shazeb Nawaz Khan <shazeb at multicorewareinc.com>
date:      Thu Nov 14 18:36:33 2013 +0530
description:
Using weighted lowres ref frames in cost estimation in lookahead
Subject: [x265] slicetype: optimize away mcWeight helper function

details:   http://hg.videolan.org/x265/rev/ba00da135945
branches:  
changeset: 5098:ba00da135945
user:      Steve Borho <steve at borho.org>
date:      Thu Nov 14 13:18:13 2013 -0600
description:
slicetype: optimize away mcWeight helper function
Subject: [x265] slicetype: since w is an auto-var there is no need to zero at early-outs

details:   http://hg.videolan.org/x265/rev/02fd5b099fa3
branches:  
changeset: 5099:02fd5b099fa3
user:      Steve Borho <steve at borho.org>
date:      Thu Nov 14 13:26:33 2013 -0600
description:
slicetype: since w is an auto-var there is no need to zero at early-outs
Subject: [x265] slicetype: use x265 style camelCase auto vars

details:   http://hg.videolan.org/x265/rev/82b9f30398ae
branches:  
changeset: 5100:82b9f30398ae
user:      Steve Borho <steve at borho.org>
date:      Thu Nov 14 13:27:48 2013 -0600
description:
slicetype: use x265 style camelCase auto vars
Subject: [x265] slicetype: remove unnecessary lines, simplify a few things

details:   http://hg.videolan.org/x265/rev/31bbe5e1142e
branches:  
changeset: 5101:31bbe5e1142e
user:      Steve Borho <steve at borho.org>
date:      Thu Nov 14 13:31:20 2013 -0600
description:
slicetype: remove unnecessary lines, simplify a few things
Subject: [x265] slicetype: correct non-denom round, improve some comments

details:   http://hg.videolan.org/x265/rev/ee42f57411ae
branches:  
changeset: 5102:ee42f57411ae
user:      Steve Borho <steve at borho.org>
date:      Thu Nov 14 13:38:07 2013 -0600
description:
slicetype: correct non-denom round, improve some comments

diffstat:

 source/Lib/TLibCommon/TComSlice.h     |    15 +
 source/Lib/TLibEncoder/TEncSearch.cpp |     4 +-
 source/common/lowres.cpp              |    14 +
 source/common/lowres.h                |     5 +-
 source/common/x86/asm-primitives.cpp  |    20 +-
 source/common/x86/pixel-a.asm         |  1192 +++++++++++++++++++++++++++++++++
 source/common/x86/pixel-util.asm      |   372 ++++++++++
 source/common/x86/pixel.h             |     7 +
 source/encoder/compress.cpp           |    81 +-
 source/encoder/slicetype.cpp          |   171 ++++-
 source/encoder/slicetype.h            |     6 +
 source/test/pixelharness.cpp          |    55 +-
 source/test/pixelharness.h            |     2 +
 13 files changed, 1869 insertions(+), 75 deletions(-)

diffs (truncated from 2322 to 300 lines):

diff -r e871fe75d5ab -r ee42f57411ae source/Lib/TLibCommon/TComSlice.h

--- a/source/Lib/TLibCommon/TComSlice.h	Wed Nov 13 13:52:43 2013 +0000
+++ b/source/Lib/TLibCommon/TComSlice.h	Thu Nov 14 13:38:07 2013 -0600
@@ -42,6 +42,7 @@
 #include "TComRom.h"
 #include "x265.h"  // NAL type enums
 #include "piclist.h"
+#include "common.h"
 
 #include <cstring>
 #include <assert.h>
@@ -1256,6 +1257,20 @@ struct WpScalingParam
 
     // Weighted prediction scaling values built from above parameters (bitdepth scaled):
     int         w, o, offset, shift, round;
+
+    /* makes a non-h265 weight (i.e. fix7), into an h265 weight */
+    void setFromWeightAndOffset(int weight, int offset)
+    {
+        inputOffset = offset;
+        log2WeightDenom = 7;
+        inputWeight = weight;
+        while (log2WeightDenom > 0 && (inputWeight > 127))
+        {
+            log2WeightDenom--;
+            inputWeight >>= 1;
+        }
+        inputWeight = X265_MIN(inputWeight, 127);
+    }
 };
 
 typedef WpScalingParam wpScalingParam;
diff -r e871fe75d5ab -r ee42f57411ae source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Wed Nov 13 13:52:43 2013 +0000
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Thu Nov 14 13:38:07 2013 -0600
@@ -1812,14 +1812,14 @@ void TEncSearch::estIntraPredQT(TComData
                 if (bChromaSame)
                     primitives.luma_copy_pp[part](dst, dststride, src, srcstride);
                 else
-                    primitives.chroma_copy_pp[part](dst, dststride, src, srcstride);
+                    primitives.blockcpy_pp(compWidth, compHeight, dst, dststride, src, srcstride);
 
                 dst         = cu->getPic()->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder);
                 src         = reconYuv->getCrAddr(partOffset);
                 if (bChromaSame)
                     primitives.luma_copy_pp[part](dst, dststride, src, srcstride);
                 else
-                    primitives.chroma_copy_pp[part](dst, dststride, src, srcstride);
+                    primitives.blockcpy_pp(compWidth, compHeight, dst, dststride, src, srcstride);
             }
         }
 
diff -r e871fe75d5ab -r ee42f57411ae source/common/lowres.cpp
--- a/source/common/lowres.cpp	Wed Nov 13 13:52:43 2013 +0000
+++ b/source/common/lowres.cpp	Thu Nov 14 13:38:07 2013 -0600
@@ -40,6 +40,7 @@ void Lowres::create(TComPic *pic, int bf
     int cuWidth = (width + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
     int cuHeight = (lines + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
     int cuCount = cuWidth * cuHeight;
+    extHeight = lines + 2 * orig->getLumaMarginY();
 
     /* rounding the width to multiple of lowres CU size */
     width = cuWidth * X265_LOWRES_CU_SIZE;
@@ -115,6 +116,19 @@ void Lowres::destroy(int bframes)
     X265_FREE(invQscaleFactor);
 }
 
+void Lowres::initWeighted(Lowres *ref, wpScalingParam *w)
+{
+    isWeighted = true;
+    int correction = (IF_INTERNAL_PREC - X265_DEPTH);
+    for (int i = 0; i < 4; i++)
+    {
+        // Adding (IF_INTERNAL_PREC - X265_DEPTH) to cancel effect of pixel to short conversion inside the primitive
+        primitives.weightpUniPixel(ref->buffer[i], this->buffer[i], lumaStride, lumaStride, lumaStride, extHeight, w->inputWeight, (1 << (w->log2WeightDenom - 1 + correction)), (w->log2WeightDenom + correction), w->inputOffset);
+    }
+
+    fpelPlane = lowresPlane[0];
+}
+
 // (re) initialize lowres state
 void Lowres::init(TComPicYuv *orig, int poc, int type, int bframes)
 {
diff -r e871fe75d5ab -r ee42f57411ae source/common/lowres.h
--- a/source/common/lowres.h	Wed Nov 13 13:52:43 2013 +0000
+++ b/source/common/lowres.h	Thu Nov 14 13:38:07 2013 -0600
@@ -33,6 +33,7 @@ namespace x265 {
 
 class TComPic;
 class TComPicYuv;
+typedef struct WpScalingParam wpScalingParam;
 
 struct ReferencePlanes
 {
@@ -61,7 +62,7 @@ struct ReferencePlanes
 
             MV qmvB = qmv + MV((qmv.x & 1) * 2, (qmv.y & 1) * 2);
             int hpelB = (qmvB.y & 2) | ((qmvB.x & 2) >> 1);
-            
+
             pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvB.x >> 2) + (qmvB.y >> 2) * lumaStride;
             primitives.pixelavg_pp[LUMA_8x8](buf, outstride, frefA, lumaStride, frefB, lumaStride, 32);
             return buf;
@@ -100,6 +101,7 @@ struct ReferencePlanes
 struct Lowres : public ReferencePlanes
 {
     pixel *buffer[4];
+    int extHeight;
 
     int    frameNum;         // Presentation frame number
     int    sliceType;        // Slice type decided by lookahead
@@ -132,6 +134,7 @@ struct Lowres : public ReferencePlanes
     void create(TComPic *pic, int bframes, int32_t *aqMode);
     void destroy(int bframes);
     void init(TComPicYuv *orig, int poc, int sliceType, int bframes);
+    void initWeighted(Lowres *ref, wpScalingParam *w);
 };
 }
 
diff -r e871fe75d5ab -r ee42f57411ae source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Nov 13 13:52:43 2013 +0000
+++ b/source/common/x86/asm-primitives.cpp	Thu Nov 14 13:38:07 2013 -0600
@@ -61,11 +61,11 @@ extern "C" {
 #define HEVC_SATD(cpu) \
     p.satd[LUMA_32x32] = x265_pixel_satd_32x32_ ## cpu; \
     p.satd[LUMA_24x32] = x265_pixel_satd_24x32_ ## cpu; \
-    p.satd[LUMA_64x64] = cmp<64, 64, 16, 16, x265_pixel_satd_16x16_ ## cpu>; \
-    p.satd[LUMA_64x32] = cmp<64, 32, 16, 16, x265_pixel_satd_16x16_ ## cpu>; \
-    p.satd[LUMA_32x64] = cmp<32, 64, 16, 16, x265_pixel_satd_16x16_ ## cpu>; \
-    p.satd[LUMA_64x48] = cmp<64, 48, 16, 16, x265_pixel_satd_16x16_ ## cpu>; \
-    p.satd[LUMA_48x64] = cmp<48, 64, 16, 16, x265_pixel_satd_16x16_ ## cpu>; \
+    p.satd[LUMA_64x64] = x265_pixel_satd_64x64_ ## cpu; \
+    p.satd[LUMA_64x32] = x265_pixel_satd_64x32_ ## cpu; \
+    p.satd[LUMA_32x64] = x265_pixel_satd_32x64_ ## cpu; \
+    p.satd[LUMA_64x48] = x265_pixel_satd_64x48_ ## cpu; \
+    p.satd[LUMA_48x64] = x265_pixel_satd_48x64_ ## cpu; \
     p.satd[LUMA_64x16] = x265_pixel_satd_64x16_ ## cpu
 
 #define ASSGN_SSE(cpu) \
@@ -448,6 +448,8 @@ void Setup_Assembly_Primitives(EncoderPr
 
         p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
         p.ipfilter_ss[FILTER_V_S_S_8] = x265_interp_8tap_v_ss_sse2;
+        p.calcrecon[BLOCK_4x4] = x265_calcRecons4_sse2;
+        p.calcrecon[BLOCK_8x8] = x265_calcRecons8_sse2;
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
@@ -460,6 +462,8 @@ void Setup_Assembly_Primitives(EncoderPr
         PIXEL_AVG(ssse3);
         PIXEL_AVG_W4(ssse3);
 
+        p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
+
         p.sad_x4[LUMA_8x4] = x265_pixel_sad_x4_8x4_ssse3;
         p.sad_x4[LUMA_8x8] = x265_pixel_sad_x4_8x8_ssse3;
         p.sad_x3[LUMA_8x16] = x265_pixel_sad_x3_8x16_ssse3;
@@ -519,7 +523,7 @@ void Setup_Assembly_Primitives(EncoderPr
 
         CHROMA_FILTERS(_sse4);
         LUMA_FILTERS(_sse4);
-
+        HEVC_SATD(sse4);
         p.chroma_copy_sp[CHROMA_2x4] = x265_blockcopy_sp_2x4_sse4;
         p.chroma_copy_sp[CHROMA_2x8] = x265_blockcopy_sp_2x8_sse4;
         p.chroma_copy_sp[CHROMA_6x8] = x265_blockcopy_sp_6x8_sse4;
@@ -527,6 +531,10 @@ void Setup_Assembly_Primitives(EncoderPr
         p.chroma_vsp[CHROMA_2x4] = x265_interp_4tap_vert_sp_2x4_sse4;
         p.chroma_vsp[CHROMA_2x8] = x265_interp_4tap_vert_sp_2x8_sse4;
         p.chroma_vsp[CHROMA_6x8] = x265_interp_4tap_vert_sp_6x8_sse4;
+
+        p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse4;
+        p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse4;
+        p.calcrecon[BLOCK_64x64] = x265_calcRecons64_sse4;
     }
     if (cpuMask & X265_CPU_AVX)
     {
diff -r e871fe75d5ab -r ee42f57411ae source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Wed Nov 13 13:52:43 2013 +0000
+++ b/source/common/x86/pixel-a.asm	Thu Nov 14 13:38:07 2013 -0600
@@ -1318,6 +1318,45 @@ VAR2_8x8_AVX2 16, 7
 %endif
 %endmacro
 
+%macro SATD_8x4_1_SSE 10
+%if %1
+    HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
+%else
+    HADAMARD4_V %2, %3, %4, %5, %6
+    ; doing the abs first is a slight advantage
+    ABSW2 m%2, m%4, m%2, m%4, m%6, m%7
+    ABSW2 m%3, m%5, m%3, m%5, m%6, m%7
+    HADAMARD 1, max, %2, %4, %6, %7
+%endif
+
+    pxor m%10, m%10
+    mova m%9, m%2
+    punpcklwd m%9, m%10
+    paddd m%8, m%9
+    mova m%9, m%2
+    punpckhwd m%9, m%10
+    paddd m%8, m%9
+
+%if %1
+    pxor m%10, m%10
+    mova m%9, m%4
+    punpcklwd m%9, m%10
+    paddd m%8, m%9
+    mova m%9, m%4
+    punpckhwd m%9, m%10
+    paddd m%8, m%9
+%else
+    HADAMARD 1, max, %3, %5, %6, %7
+    pxor m%10, m%10
+    mova m%9, m%3
+    punpcklwd m%9, m%10
+    paddd m%8, m%9
+    mova m%9, m%3
+    punpckhwd m%9, m%10
+    paddd m%8, m%9
+%endif
+%endmacro
+
 %macro SATD_START_MMX 0
     FIX_STRIDES r1, r3
     lea  r4, [3*r1] ; 3*stride1
@@ -1650,6 +1689,20 @@ cglobal pixel_satd_8x8_internal
     SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
     ret
 
+cglobal pixel_satd_8x8_internal2
+%if WIN64
+    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
+    SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13
+    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
+    SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 12, 13
+%else
+    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
+    SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5
+    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
+    SATD_8x4_1_SSE vertical, 0, 1, 2, 3, 4, 5, 6, 4, 5
+%endif
+    ret
+
 ; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers)
 ; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
 %if HIGH_BIT_DEPTH == 0 && (WIN64 || UNIX64) && notcpuflag(avx)
@@ -1663,6 +1716,14 @@ cglobal pixel_satd_16x4_internal
     SATD_8x4_SSE 0, 4, 8, 5, 9, 6, 3, 10
     ret
 
+cglobal pixel_satd_16x4_internal2
+    LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
+    lea  r2, [r2+4*r3]
+    lea  r0, [r0+4*r1]
+    SATD_8x4_1_SSE 0, 0, 1, 2, 3, 6, 11, 10, 12, 13
+    SATD_8x4_1_SSE 0, 4, 8, 5, 9, 6, 3, 10, 12, 13
+    ret
+
 cglobal pixel_satd_16x4, 4,6,12
     SATD_START_SSE2 m10, m7
 %if vertical
@@ -1819,6 +1880,122 @@ cglobal pixel_satd_32x32, 4,8,8    ;if W
     call pixel_satd_16x4_internal
     SATD_END_SSE2 m10
 
+cglobal pixel_satd_32x64, 4,8,8    ;if WIN64 && notcpuflag(avx)
+    SATD_START_SSE2 m10, m7
+    mov r6, r0
+    mov r7, r2
+%if vertical
+    mova m7, [pw_00ff]
+%endif
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    lea r0, [r6 + 16]
+    lea r2, [r7 + 16]
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2
+    call pixel_satd_16x4_internal2