[x265-commits] [x265] asm: interp_4tap_horiz_pp sse3

Wed Apr 22 18:31:36 CEST 2015

details:   http://hg.videolan.org/x265/rev/878ae164f423
branches:  
changeset: 10248:878ae164f423
user:      David T Yuen <dtyx265 at gmail.com>
date:      Tue Apr 21 20:37:47 2015 -0700
description:
asm: interp_4tap_horiz_pp sse3

This replaces the C code for 6x8, 6x16, 8x2, 8x4, 8x6, 8x8, 8x12, 8x16, 8x32, 8x64, 12x16, 12x32, 16x8, 16x12,
16x16, 16x24, 16x32, 16x64, 24x32, 24x64, 32x8, 32x16, 32x24, 32x32, 32x48, 32x64, 48x64, 64x16, 64x32,
64x48, 64x64

Macros are used to add the primitives to asm-primitives.cpp

64-bit

./test/TestBench --testbench interp | grep hpp
chroma_hpp[  8x8]	3.02x 	 3087.49  	 9315.21
chroma_hpp[16x16]	3.09x 	 11813.11 	 36504.26
chroma_hpp[32x32]	3.45x 	 46862.27 	 161615.81
chroma_hpp[  8x4]	2.94x 	 1567.50  	 4614.73
chroma_hpp[ 16x8]	3.10x 	 5930.00  	 18377.70
chroma_hpp[ 8x16]	3.02x 	 6130.00  	 18520.00
chroma_hpp[32x16]	3.46x 	 23330.07 	 80829.76
chroma_hpp[16x32]	3.16x 	 23572.66 	 74452.23
chroma_hpp[  8x6]	2.93x 	 2339.99  	 6863.20
chroma_hpp[  6x8]	2.52x 	 2812.50  	 7075.69
chroma_hpp[  8x2]	2.25x 	 812.50   	 1830.00
chroma_hpp[16x12]	3.10x 	 8875.07  	 27545.60
chroma_hpp[12x16]	2.80x 	 9810.20  	 27476.36
chroma_hpp[ 16x4]	3.05x 	 2995.20  	 9144.59
chroma_hpp[32x24]	3.45x 	 34970.45 	 120594.13
chroma_hpp[24x32]	3.49x 	 35116.79 	 122662.94
chroma_hpp[ 32x8]	3.45x 	 11699.82 	 40402.34
chroma_hpp[ 8x32]	3.00x 	 12210.00 	 36603.46
chroma_hpp[ 8x16]	3.02x 	 6130.00  	 18520.00
chroma_hpp[16x32]	3.09x 	 23573.10 	 72827.95
chroma_hpp[32x64]	3.58x 	 93938.63 	 335978.50
chroma_hpp[  8x8]	3.02x 	 3087.49  	 9314.74
chroma_hpp[16x16]	3.09x 	 11815.00 	 36545.97
chroma_hpp[ 8x32]	3.02x 	 12212.27 	 36870.14
chroma_hpp[32x32]	3.45x 	 46748.56 	 161259.67
chroma_hpp[16x64]	3.18x 	 47185.50 	 150017.53
chroma_hpp[ 8x12]	3.04x 	 4607.50  	 14000.63
chroma_hpp[ 6x16]	2.49x 	 5570.10  	 13870.01
chroma_hpp[  8x4]	2.94x 	 1570.00  	 4613.64
chroma_hpp[16x24]	3.08x 	 17690.69 	 54547.18
chroma_hpp[12x32]	2.80x 	 19618.33 	 54833.57
chroma_hpp[ 16x8]	3.10x 	 5932.57  	 18377.34
chroma_hpp[32x48]	3.45x 	 70041.92 	 241370.78
chroma_hpp[24x64]	3.53x 	 70596.84 	 249020.33
chroma_hpp[32x16]	3.44x 	 23374.66 	 80340.53
chroma_hpp[ 8x64]	3.00x 	 24422.17 	 73313.97
chroma_hpp[  8x8]	3.01x 	 3090.00  	 9314.26
chroma_hpp[16x16]	3.11x 	 11810.00 	 36736.14
chroma_hpp[32x32]	3.47x 	 46771.40 	 162154.16
chroma_hpp[64x64]	3.25x 	 195843.97 	 636910.44
chroma_hpp[  8x4]	2.94x 	 1570.00  	 4613.35
chroma_hpp[ 16x8]	3.10x 	 5933.42  	 18381.31
chroma_hpp[ 8x16]	3.02x 	 6131.43  	 18520.17
chroma_hpp[32x16]	3.42x 	 23450.76 	 80160.37
chroma_hpp[16x32]	3.09x 	 23619.58 	 73027.41
chroma_hpp[64x32]	3.42x 	 92894.85 	 318107.38
chroma_hpp[32x64]	3.48x 	 93646.98 	 325950.78
chroma_hpp[16x12]	3.10x 	 8874.99  	 27503.11
chroma_hpp[12x16]	2.83x 	 9809.99  	 27769.48
chroma_hpp[ 16x4]	3.05x 	 2994.99  	 9138.53
chroma_hpp[32x24]	3.42x 	 35123.29 	 120115.27
chroma_hpp[24x32]	3.53x 	 35143.41 	 124032.27
chroma_hpp[ 32x8]	3.46x 	 11692.58 	 40400.25
chroma_hpp[ 8x32]	3.02x 	 12212.50 	 36843.57
chroma_hpp[64x48]	3.36x 	 140979.36 	 473912.28
chroma_hpp[48x64]	3.43x 	 140712.88 	 482047.69
chroma_hpp[64x16]	3.39x 	 46530.16 	 157859.31
chroma_hpp[16x64]	3.08x 	 47197.85 	 145477.02

32-bit

./test/TestBench --testbench interp | grep hpp
chroma_hpp[  8x8]	2.96x 	 3164.98  	 9354.15
chroma_hpp[16x16]	3.07x 	 11885.01 	 36438.13
chroma_hpp[32x32]	3.48x 	 46818.91 	 162929.45
chroma_hpp[  8x4]	2.86x 	 1645.00  	 4703.57
chroma_hpp[ 16x8]	3.06x 	 6005.10  	 18378.64
chroma_hpp[ 8x16]	2.97x 	 6205.00  	 18429.90
chroma_hpp[32x16]	3.46x 	 23463.52 	 81110.52
chroma_hpp[16x32]	3.10x 	 23700.07 	 73429.12
chroma_hpp[  8x6]	2.89x 	 2404.99  	 6942.73
chroma_hpp[  6x8]	2.46x 	 2905.00  	 7155.45
chroma_hpp[  8x2]	2.69x 	 885.00   	 2379.96
chroma_hpp[16x12]	3.07x 	 8945.04  	 27458.99
chroma_hpp[12x16]	2.81x 	 9862.55  	 27753.80
chroma_hpp[ 16x4]	3.01x 	 3065.00  	 9231.22
chroma_hpp[32x24]	3.45x 	 35140.03 	 121204.09
chroma_hpp[24x32]	3.51x 	 35262.80 	 123779.88
chroma_hpp[ 32x8]	3.47x 	 11765.00 	 40847.72
chroma_hpp[ 8x32]	2.98x 	 12285.00 	 36623.77
chroma_hpp[ 8x16]	2.97x 	 6205.00  	 18429.95
chroma_hpp[16x32]	3.08x 	 23691.43 	 72971.20
chroma_hpp[32x64]	3.47x 	 93595.39 	 324758.03
chroma_hpp[  8x8]	2.95x 	 3165.39  	 9353.01
chroma_hpp[16x16]	3.07x 	 11885.00 	 36438.18
chroma_hpp[ 8x32]	2.98x 	 12285.21 	 36614.84
chroma_hpp[32x32]	3.48x 	 46794.59 	 162647.84
chroma_hpp[16x64]	3.08x 	 47299.79 	 145605.62
chroma_hpp[ 8x12]	2.98x 	 4685.06  	 13949.95
chroma_hpp[ 6x16]	2.46x 	 5672.50  	 13972.76
chroma_hpp[  8x4]	2.86x 	 1645.00  	 4702.53
chroma_hpp[16x24]	3.06x 	 17765.06 	 54398.70
chroma_hpp[12x32]	2.79x 	 19676.93 	 54843.11
chroma_hpp[ 16x8]	3.06x 	 6005.12  	 18377.65
chroma_hpp[32x48]	3.46x 	 70176.74 	 243033.73
chroma_hpp[24x64]	3.51x 	 70367.40 	 246988.72
chroma_hpp[32x16]	3.47x 	 23405.43 	 81235.64
chroma_hpp[ 8x64]	2.97x 	 24490.71 	 72757.92
chroma_hpp[  8x8]	2.95x 	 3165.00  	 9352.45
chroma_hpp[16x16]	3.07x 	 11885.00 	 36437.35
chroma_hpp[32x32]	3.48x 	 46781.39 	 162731.84
chroma_hpp[64x64]	3.28x 	 193972.66 	 635870.62
chroma_hpp[  8x4]	2.86x 	 1645.00  	 4702.79
chroma_hpp[ 16x8]	3.06x 	 6005.00  	 18377.74
chroma_hpp[ 8x16]	2.97x 	 6205.04  	 18430.28
chroma_hpp[32x16]	3.46x 	 23452.05 	 81121.86
chroma_hpp[16x32]	3.07x 	 23695.18 	 72740.23
chroma_hpp[64x32]	3.42x 	 92974.16 	 317723.12
chroma_hpp[32x64]	3.47x 	 93467.95 	 324431.16
chroma_hpp[16x12]	3.07x 	 8945.09  	 27457.70
chroma_hpp[12x16]	2.79x 	 9862.54  	 27477.89
chroma_hpp[ 16x4]	3.01x 	 3065.02  	 9231.55
chroma_hpp[32x24]	3.45x 	 35161.96 	 121188.20
chroma_hpp[24x32]	3.51x 	 35275.57 	 123776.31
chroma_hpp[ 32x8]	3.47x 	 11765.00 	 40847.59
chroma_hpp[ 8x32]	2.98x 	 12285.06 	 36637.80
chroma_hpp[64x48]	3.41x 	 139693.42 	 476274.88
chroma_hpp[48x64]	3.44x 	 139707.61 	 480515.22
chroma_hpp[64x16]	3.41x 	 46575.90 	 158769.59
chroma_hpp[16x64]	3.08x 	 47262.82 	 145408.81
Subject: [x265] sao: remove saoCuOrgE3_2Rows function and modify saoCuOrgE3 primitive to handle width=16 separately

details:   http://hg.videolan.org/x265/rev/ee1f11edcb30
branches:  
changeset: 10249:ee1f11edcb30
user:      Divya Manivannan <divya at multicorewareinc.com>
date:      Wed Apr 22 11:59:36 2015 +0530
description:
sao: remove saoCuOrgE3_2Rows function and modify saoCuOrgE3 primitive to handle width=16 separately
Subject: [x265] asm: saoCuOrgE3 avx2 code for width>16: improve 508c->427c

details:   http://hg.videolan.org/x265/rev/996d9020066d
branches:  
changeset: 10250:996d9020066d
user:      Divya Manivannan <divya at multicorewareinc.com>
date:      Wed Apr 22 12:25:09 2015 +0530
description:
asm: saoCuOrgE3 avx2 code for width>16: improve 508c->427c
Subject: [x265] asm: avx2 code for satd_16xN, improved by over ~50% compared to SSE code

details:   http://hg.videolan.org/x265/rev/065a227130f9
branches:  
changeset: 10251:065a227130f9
user:      Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date:      Wed Apr 22 11:19:18 2015 +0530
description:
asm: avx2 code for satd_16xN, improved by over ~50% compared to SSE code
Subject: [x265] asm: avx2 code for satd_48x64 and 64xN, improved by over ~50% compared to SSE

details:   http://hg.videolan.org/x265/rev/e6f6db57a39e
branches:  
changeset: 10252:e6f6db57a39e
user:      Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date:      Wed Apr 22 11:27:39 2015 +0530
description:
asm: avx2 code for satd_48x64 and 64xN, improved by over ~50% compared to SSE
Subject: [x265] asm: avx2 10bit code for scale2D_64to32

details:   http://hg.videolan.org/x265/rev/f1dc261cbfc3
branches:  
changeset: 10253:f1dc261cbfc3
user:      Rajesh Paulraj <rajesh at multicorewareinc.com>
date:      Wed Apr 22 19:52:06 2015 +0530
description:
asm: avx2 10bit code for scale2D_64to32

AVX2:
scale2D_64to32  17.07x   3873.16   44301.99
SSSE3:
scale2D_64to32  2.75x    14407.30  39553.04
Subject: [x265] asm: fix bug in generic version findPosLast_x64 and improve testbench on it

details:   http://hg.videolan.org/x265/rev/dc81ab3cc80e
branches:  
changeset: 10254:dc81ab3cc80e
user:      Min Chen <chenm003 at 163.com>
date:      Wed Apr 22 21:30:33 2015 +0800
description:
asm: fix bug in generic version findPosLast_x64 and improve testbench on it
Subject: [x265] asm: rename findPosLast to scanPosLast and modify its API

details:   http://hg.videolan.org/x265/rev/649d12bcfbf2
branches:  
changeset: 10255:649d12bcfbf2
user:      Min Chen <chenm003 at 163.com>
date:      Wed Apr 22 21:30:37 2015 +0800
description:
asm: rename findPosLast to scanPosLast and modify its API
Subject: [x265] testbench: support BMI2

details:   http://hg.videolan.org/x265/rev/ee74adac6826
branches:  
changeset: 10256:ee74adac6826
user:      Min Chen <chenm003 at 163.com>
date:      Wed Apr 22 21:31:11 2015 +0800
description:
testbench: support BMI2
Subject: [x265] testbench: fix testbench crash when no coeff in block

details:   http://hg.videolan.org/x265/rev/9dce93005839
branches:  
changeset: 10257:9dce93005839
user:      Min Chen <chenm003 at 163.com>
date:      Wed Apr 22 21:31:15 2015 +0800
description:
testbench: fix testbench crash when no coeff in block
Subject: [x265] asm: avx2+bmi2 version of scanPosLast, 27.6k -> 6.8k cycles

details:   http://hg.videolan.org/x265/rev/e52ac44c7a49
branches:  
changeset: 10258:e52ac44c7a49
user:      Min Chen <chenm003 at 163.com>
date:      Wed Apr 22 21:31:19 2015 +0800
description:
asm: avx2+bmi2 version of scanPosLast, 27.6k -> 6.8k cycles
Subject: [x265] testbench: fix table fault when trSize more than 8

details:   http://hg.videolan.org/x265/rev/ea49b9759a4d
branches:  
changeset: 10259:ea49b9759a4d
user:      Min Chen <chenm003 at 163.com>
date:      Wed Apr 22 21:31:23 2015 +0800
description:
testbench: fix table fault when trSize more than 8
Subject: [x265] threadpool: use Win7 version macro directly, for more clarity

details:   http://hg.videolan.org/x265/rev/859daedfbb29
branches:  
changeset: 10260:859daedfbb29
user:      Steve Borho <steve at borho.org>
date:      Wed Apr 22 11:06:24 2015 -0500
description:
threadpool: use Win7 version macro directly, for more clarity

diffstat:

 source/common/dct.cpp                |    4 +-
 source/common/loopfilter.cpp         |   23 +-
 source/common/primitives.h           |    8 +-
 source/common/quant.cpp              |    4 +-
 source/common/threadpool.cpp         |    8 +-
 source/common/x86/asm-primitives.cpp |   45 ++-
 source/common/x86/const-a.asm        |    1 +
 source/common/x86/ipfilter8.asm      |  231 +++++++++++++++++++++
 source/common/x86/ipfilter8.h        |   32 +++
 source/common/x86/loopfilter.asm     |  159 ++++----------
 source/common/x86/loopfilter.h       |    2 +-
 source/common/x86/pixel-a.asm        |  372 +++++++++++++++++++++++++++++++++++
 source/common/x86/pixel-util.h       |    4 +-
 source/common/x86/pixel-util8.asm    |  174 ++++++++++++----
 source/encoder/entropy.cpp           |    2 +-
 source/encoder/sao.cpp               |   39 +--
 source/test/pixelharness.cpp         |   74 ++++--
 source/test/pixelharness.h           |    6 +-
 source/test/testbench.cpp            |    1 +
 19 files changed, 913 insertions(+), 276 deletions(-)

diffs (truncated from 1678 to 300 lines):

diff -r 86268e498680 -r 859daedfbb29 source/common/dct.cpp

--- a/source/common/dct.cpp	Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/dct.cpp	Wed Apr 22 11:06:24 2015 -0500
@@ -752,7 +752,7 @@ void denoiseDct_c(int16_t* dctCoef, uint
     }
 }
 
-int findPosLast_c(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig)
+int scanPosLast_c(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* /*scanCG4x4*/, const int /*trSize*/)
 {
     memset(coeffNum, 0, MLS_GRP_NUM * sizeof(*coeffNum));
     memset(coeffFlag, 0, MLS_GRP_NUM * sizeof(*coeffFlag));
@@ -848,7 +848,7 @@ void setupDCTPrimitives_c(EncoderPrimiti
     p.cu[BLOCK_16x16].copy_cnt = copy_count<16>;
     p.cu[BLOCK_32x32].copy_cnt = copy_count<32>;
 
-    p.findPosLast = findPosLast_c;
+    p.scanPosLast = scanPosLast_c;
     p.findPosFirstLast = findPosFirstLast_c;
 }
 }
diff -r 86268e498680 -r 859daedfbb29 source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp	Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/loopfilter.cpp	Wed Apr 22 11:06:24 2015 -0500
@@ -122,25 +122,6 @@ void processSaoCUE3(pixel *rec, int8_t *
     }
 }
 
-void processSaoCUE3_2Rows(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff)
-{
-    int8_t signDown;
-    int8_t edgeType;
-
-    for (int y = 0; y < 2; y++)
-    {
-        for (int x = startX + 1; x < endX; x++)
-        {
-            signDown = signOf(rec[x] - rec[x + stride]);
-            edgeType = signDown + upBuff1[x] + 2;
-            upBuff1[x - 1] = -signDown;
-            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
-        }
-        upBuff1[endX - 1] = upBuff[y];
-        rec += stride + 1;
-    }
-}
-
 void processSaoCUB0(pixel* rec, const int8_t* offset, int ctuWidth, int ctuHeight, intptr_t stride)
 {
     #define SAO_BO_BITS 5
@@ -164,8 +145,8 @@ void setupLoopFilterPrimitives_c(Encoder
     p.saoCuOrgE1 = processSaoCUE1;
     p.saoCuOrgE1_2Rows = processSaoCUE1_2Rows;
     p.saoCuOrgE2 = processSaoCUE2;
-    p.saoCuOrgE3 = processSaoCUE3;
-    p.saoCuOrgE3_2Rows = processSaoCUE3_2Rows;
+    p.saoCuOrgE3[0] = processSaoCUE3;
+    p.saoCuOrgE3[1] = processSaoCUE3;
     p.saoCuOrgB0 = processSaoCUB0;
     p.sign = calSign;
 }
diff -r 86268e498680 -r 859daedfbb29 source/common/primitives.h
--- a/source/common/primitives.h	Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/primitives.h	Wed Apr 22 11:06:24 2015 -0500
@@ -172,7 +172,6 @@ typedef void (*saoCuOrgE0_t)(pixel* rec,
 typedef void (*saoCuOrgE1_t)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
 typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
 typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
-typedef void (*saoCuOrgE3_2Rows_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff);
 typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
@@ -180,7 +179,7 @@ typedef void (*planecopy_sp_t) (const ui
 
 typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
 
-typedef int (*findPosLast_t)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig);
+typedef int (*scanPosLast_t)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
 typedef uint32_t (*findPosFirstLast_t)(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
 
 /* Function pointers to optimized encoder primitives. Each pointer can reference
@@ -278,8 +277,7 @@ struct EncoderPrimitives
     saoCuOrgE0_t          saoCuOrgE0;
     saoCuOrgE1_t          saoCuOrgE1, saoCuOrgE1_2Rows;
     saoCuOrgE2_t          saoCuOrgE2;
-    saoCuOrgE3_t          saoCuOrgE3;
-    saoCuOrgE3_2Rows_t    saoCuOrgE3_2Rows;
+    saoCuOrgE3_t          saoCuOrgE3[2];
     saoCuOrgB0_t          saoCuOrgB0;
 
     downscale_t           frameInitLowres;
@@ -293,7 +291,7 @@ struct EncoderPrimitives
     weightp_pp_t          weight_pp;
 
 
-    findPosLast_t         findPosLast;
+    scanPosLast_t         scanPosLast;
     findPosFirstLast_t    findPosFirstLast;
 
     /* There is one set of chroma primitives per color space. An encoder will
diff -r 86268e498680 -r 859daedfbb29 source/common/quant.cpp
--- a/source/common/quant.cpp	Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/quant.cpp	Wed Apr 22 11:06:24 2015 -0500
@@ -580,12 +580,12 @@ uint32_t Quant::rdoQuant(const CUData& c
     uint16_t coeffFlag[MLS_GRP_NUM];    // bit mask map for non-zero coeff
 
 #if CHECKED_BUILD || _DEBUG
-    // clean output buffer, the asm version of findPosLast Never output anything after latest non-zero coeff group
+    // clean output buffer, the asm version of scanPosLast Never output anything after latest non-zero coeff group
     memset(coeffNum, 0, sizeof(coeffNum));
     memset(coeffSign, 0, sizeof(coeffNum));
     memset(coeffFlag, 0, sizeof(coeffNum));
 #endif
-    const int lastScanPos = primitives.findPosLast(codeParams.scan, dstCoeff, coeffSign, coeffFlag, coeffNum, numSig);
+    const int lastScanPos = primitives.scanPosLast(codeParams.scan, dstCoeff, coeffSign, coeffFlag, coeffNum, numSig, g_scan4x4[codeParams.scanType], trSize);
     const int cgLastScanPos = (lastScanPos >> LOG2_SCAN_SET_SIZE);
 
 
diff -r 86268e498680 -r 859daedfbb29 source/common/threadpool.cpp
--- a/source/common/threadpool.cpp	Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/threadpool.cpp	Wed Apr 22 11:06:24 2015 -0500
@@ -232,7 +232,7 @@ ThreadPool* ThreadPool::allocThreadPools
     int cpuCount = getCpuCount();
     bool bNumaSupport = false;
 
-#if _WIN32_WINNT >= 0x0601
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
     bNumaSupport = true;
 #elif HAVE_LIBNUMA
     bNumaSupport = numa_available() >= 0;
@@ -241,7 +241,7 @@ ThreadPool* ThreadPool::allocThreadPools
 
     for (int i = 0; i < cpuCount; i++)
     {
-#if _WIN32_WINNT >= 0x0601
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
         UCHAR node;
         if (GetNumaProcessorNode((UCHAR)i, &node))
             cpusPerNode[X265_MIN(node, (UCHAR)MAX_NODE_NUM)]++;
@@ -408,7 +408,7 @@ void ThreadPool::setCurrentThreadAffinit
 /* static */
 void ThreadPool::setThreadNodeAffinity(int numaNode)
 {
-#if _WIN32_WINNT >= 0x0601
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
     GROUP_AFFINITY groupAffinity;
     if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &groupAffinity))
     {
@@ -433,7 +433,7 @@ void ThreadPool::setThreadNodeAffinity(i
 /* static */
 int ThreadPool::getNumaNodeCount()
 {
-#if _WIN32_WINNT >= 0x0601
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
     ULONG num = 1;
     if (GetNumaHighestNodeNumber(&num))
         num++;
diff -r 86268e498680 -r 859daedfbb29 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Wed Apr 22 11:06:24 2015 -0500
@@ -801,7 +801,7 @@ void setupAssemblyPrimitives(EncoderPrim
 #endif
 
 #if X86_64
-    //p.findPosLast = x265_findPosLast_x64;
+    p.scanPosLast = x265_scanPosLast_x64;
 #endif
 
     if (cpuMask & X265_CPU_SSE2)
@@ -1196,6 +1196,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.dequant_normal  = x265_dequant_normal_avx2;
 
         p.scale1D_128to64 = x265_scale1D_128to64_avx2;
+        p.scale2D_64to32 = x265_scale2D_64to32_avx2;
         // p.weight_pp = x265_weight_pp_avx2; fails tests
 
         p.cu[BLOCK_16x16].calcresidual = x265_getResidual16_avx2;
@@ -1268,8 +1269,8 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = x265_filterPixelToShort_32x48_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = x265_filterPixelToShort_32x64_avx2;
 
-        if ((cpuMask & X265_CPU_BMI1) && (cpuMask & X265_CPU_BMI2))
-            p.findPosLast = x265_findPosLast_x64_bmi2;
+        if (cpuMask & X265_CPU_BMI2)
+            p.scanPosLast = x265_scanPosLast_avx2_bmi2;
     }
 }
 #else // if HIGH_BIT_DEPTH
@@ -1277,7 +1278,7 @@ void setupAssemblyPrimitives(EncoderPrim
 void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // 8bpp
 {
 #if X86_64
-    //p.findPosLast = x265_findPosLast_x64;
+    p.scanPosLast = x265_scanPosLast_x64;
 #endif
 
     if (cpuMask & X265_CPU_SSE2)
@@ -1407,18 +1408,9 @@ void setupAssemblyPrimitives(EncoderPrim
     }
     if (cpuMask & X265_CPU_SSE3)
     {
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_hpp = x265_interp_4tap_horiz_pp_2x4_sse3;
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_hpp = x265_interp_4tap_horiz_pp_2x8_sse3;
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_hpp = x265_interp_4tap_horiz_pp_4x2_sse3;
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_hpp = x265_interp_4tap_horiz_pp_4x4_sse3;
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_hpp = x265_interp_4tap_horiz_pp_4x8_sse3;
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_hpp = x265_interp_4tap_horiz_pp_4x16_sse3;
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_hpp = x265_interp_4tap_horiz_pp_2x8_sse3;
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_hpp = x265_interp_4tap_horiz_pp_2x16_sse3;
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_hpp = x265_interp_4tap_horiz_pp_4x4_sse3;
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_hpp = x265_interp_4tap_horiz_pp_4x8_sse3;
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_hpp = x265_interp_4tap_horiz_pp_4x16_sse3;
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_hpp = x265_interp_4tap_horiz_pp_4x32_sse3;
+        ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
+        ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
+        ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
@@ -1519,8 +1511,8 @@ void setupAssemblyPrimitives(EncoderPrim
         p.saoCuOrgE1 = x265_saoCuOrgE1_sse4;
         p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_sse4;
         p.saoCuOrgE2 = x265_saoCuOrgE2_sse4;
-        p.saoCuOrgE3 = x265_saoCuOrgE3_sse4;
-        p.saoCuOrgE3_2Rows = x265_saoCuOrgE3_2Rows_sse4;
+        p.saoCuOrgE3[0] = x265_saoCuOrgE3_sse4;
+        p.saoCuOrgE3[1] = x265_saoCuOrgE3_sse4;
         p.saoCuOrgB0 = x265_saoCuOrgB0_sse4;
 
         LUMA_ADDAVG(sse4);
@@ -1728,7 +1720,8 @@ void setupAssemblyPrimitives(EncoderPrim
         p.saoCuOrgE0 = x265_saoCuOrgE0_avx2;
         p.saoCuOrgE1 = x265_saoCuOrgE1_avx2;
         p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_avx2;
-        p.saoCuOrgE3 = x265_saoCuOrgE3_avx2;
+        p.saoCuOrgE3[0] = x265_saoCuOrgE3_avx2;
+        p.saoCuOrgE3[1] = x265_saoCuOrgE3_32_avx2;
         p.saoCuOrgB0 = x265_saoCuOrgB0_avx2;
 
         p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
@@ -1846,11 +1839,21 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_8x16].satd  = x265_pixel_satd_8x16_avx2;
         p.pu[LUMA_8x8].satd   = x265_pixel_satd_8x8_avx2;
 
+        p.pu[LUMA_16x4].satd  = x265_pixel_satd_16x4_avx2;
+        p.pu[LUMA_16x12].satd = x265_pixel_satd_16x12_avx2;
+        p.pu[LUMA_16x32].satd = x265_pixel_satd_16x32_avx2;
+        p.pu[LUMA_16x64].satd = x265_pixel_satd_16x64_avx2;
+
         p.pu[LUMA_32x8].satd   = x265_pixel_satd_32x8_avx2;
         p.pu[LUMA_32x16].satd   = x265_pixel_satd_32x16_avx2;
         p.pu[LUMA_32x24].satd   = x265_pixel_satd_32x24_avx2;
         p.pu[LUMA_32x32].satd   = x265_pixel_satd_32x32_avx2;
         p.pu[LUMA_32x64].satd   = x265_pixel_satd_32x64_avx2;
+        p.pu[LUMA_48x64].satd   = x265_pixel_satd_48x64_avx2;
+        p.pu[LUMA_64x16].satd   = x265_pixel_satd_64x16_avx2;
+        p.pu[LUMA_64x32].satd   = x265_pixel_satd_64x32_avx2;
+        p.pu[LUMA_64x48].satd   = x265_pixel_satd_64x48_avx2;
+        p.pu[LUMA_64x64].satd   = x265_pixel_satd_64x64_avx2;
 
         p.pu[LUMA_32x8].sad = x265_pixel_sad_32x8_avx2;
         p.pu[LUMA_32x16].sad = x265_pixel_sad_32x16_avx2;
@@ -2400,8 +2403,8 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hps = x265_interp_4tap_horiz_ps_32x24_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hps = x265_interp_4tap_horiz_ps_32x8_avx2;
 
-        if ((cpuMask & X265_CPU_BMI1) && (cpuMask & X265_CPU_BMI2))
-            p.findPosLast = x265_findPosLast_x64_bmi2;
+        if (cpuMask & X265_CPU_BMI2)
+            p.scanPosLast = x265_scanPosLast_avx2_bmi2;
     }
 #endif
 }
diff -r 86268e498680 -r 859daedfbb29 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/x86/const-a.asm	Wed Apr 22 11:06:24 2015 -0500
@@ -37,6 +37,7 @@ const pb_2,                 times 32 db 
 const pb_3,                 times 16 db 3
 const pb_4,                 times 32 db 4
 const pb_8,                 times 32 db 8
+const pb_15,                times 32 db 15
 const pb_16,                times 32 db 16
 const pb_32,                times 32 db 32
 const pb_64,                times 32 db 64
diff -r 86268e498680 -r 859daedfbb29 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/x86/ipfilter8.asm	Wed Apr 22 11:06:24 2015 -0500
@@ -594,6 +594,237 @@ cglobal interp_4tap_horiz_pp_4x32, 4, 6,
     mov         [dstq + dststrideq], r4w
 %endmacro
 
+%macro FILTER_H4_w6_sse2 0
+    pxor        m4, m4
+    movh        m0, [srcq - 1]
+    movh        m5, [srcq]
+    punpckldq   m0, m5
+    movhlps     m2, m0
+    punpcklbw   m0, m4
+    punpcklbw   m2, m4
+    movd        m1, [srcq + 1]