[x265-commits] [x265] cmake: spacing nits

Deepthi Nandakumar deepthi at multicorewareinc.com
Thu May 14 02:56:54 CEST 2015


details:   http://hg.videolan.org/x265/rev/cff417129da5
branches:  stable
changeset: 10431:cff417129da5
user:      Deepthi Nandakumar <deepthi at multicorewareinc.com>
date:      Wed May 13 13:10:16 2015 +0530
description:
cmake: spacing nits

Subject: [x265] pixel-util: correct typo, which prevented Windows header files from being included

details:   http://hg.videolan.org/x265/rev/971bd26ef18b
branches:  stable
changeset: 10432:971bd26ef18b
user:      Deepthi Nandakumar <deepthi at multicorewareinc.com>
date:      Wed May 13 16:56:02 2015 +0530
description:
pixel-util: correct typo, which prevented Windows header files from being included

Subject: [x265] Merge with stable

details:   http://hg.videolan.org/x265/rev/0ffdd0f379ad
branches:  
changeset: 10433:0ffdd0f379ad
user:      Deepthi Nandakumar <deepthi at multicorewareinc.com>
date:      Wed May 13 18:12:50 2015 +0530
description:
Merge with stable

Subject: [x265] sao: improve calcSaoStatsCu, use local buffer to reduce array index operators

details:   http://hg.videolan.org/x265/rev/d9ee6b8a5e08
branches:  
changeset: 10434:d9ee6b8a5e08
user:      Min Chen <chenm003 at 163.com>
date:      Wed May 13 16:52:41 2015 -0700
description:
sao: improve calcSaoStatsCu, use local buffer to reduce array index operators
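
For illustration, the general shape of that optimization (hypothetical names and dimensions; the actual sao.cpp hunk lies beyond the truncated diff below): accumulate into a small stack-local buffer inside the pixel loop, then fold the totals into the deeply indexed stats array once at the end, so the hot loop performs a single index operation per pixel.

    #include <cstdint>

    enum { NUM_PLANES = 3, NUM_EDGE_TYPES = 5, NUM_CATEGORIES = 5 };

    // Hypothetical sketch of the pattern only; these names and dimensions are
    // not taken from x265's calcSaoStatsCu.  edgeCat[i] is assumed to be in
    // [0, NUM_CATEGORIES).
    static void accumulateEdgeStats(int32_t stats[NUM_PLANES][NUM_EDGE_TYPES][NUM_CATEGORIES],
                                    int plane, int edgeType,
                                    const int8_t* edgeCat, int count)
    {
        int32_t local[NUM_CATEGORIES] = { 0 };      // small stack-local buffer

        for (int i = 0; i < count; i++)
            local[edgeCat[i]]++;                    // one index op per pixel

        for (int c = 0; c < NUM_CATEGORIES; c++)
            stats[plane][edgeType][c] += local[c];  // triple indexing only here
    }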

Subject: [x265] two inputs version of signOf

details:   http://hg.videolan.org/x265/rev/f37bee37def5
branches:  
changeset: 10435:f37bee37def5
user:      Min Chen <chenm003 at 163.com>
date:      Wed May 13 16:52:45 2015 -0700
description:
two inputs version of signOf
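
A hedged guess at what a two-input signOf computes (the helper added by this commit sits in the truncated part of the diff): the sign of a difference, evaluated branchlessly, shown here next to a generic single-input sign for comparison.

    // Hypothetical sketch, not the committed code.
    static inline int signOf2(int a, int b)
    {
        return (a > b) - (a < b);   // 1 if a > b, -1 if a < b, 0 if equal
    }

    // Generic one-input sign for comparison.
    static inline int signOf(int x)
    {
        return (x > 0) - (x < 0);
    }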

Subject: [x265] asm: enable sa8d_8x8_avx2, 375c -> 322c

details:   http://hg.videolan.org/x265/rev/98ad31a8bfa8
branches:  
changeset: 10436:98ad31a8bfa8
user:      Min Chen <chenm003 at 163.com>
date:      Wed May 13 16:52:56 2015 -0700
description:
asm: enable sa8d_8x8_avx2, 375c -> 322c
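
sa8d is the 8x8-Hadamard-based SATD cost; "375c -> 322c" reads as the measured cycle count before and after switching to the AVX2 kernel. A scalar sketch of what the primitive computes follows; the (sum + 2) >> 2 scaling is the normalization I believe the x264/x265 C reference uses, stated here from memory rather than from this diff.

    #include <cstdint>
    #include <cstdlib>

    // 1-D 8-point Hadamard butterfly; output ordering does not matter because
    // only the sum of absolute values is used afterwards.
    static void hadamard8(int d[8])
    {
        int t[8], u[8];
        for (int i = 0; i < 8; i += 2)
        {
            t[i]     = d[i] + d[i + 1];
            t[i + 1] = d[i] - d[i + 1];
        }
        for (int i = 0; i < 8; i += 4)
        {
            u[i]     = t[i]     + t[i + 2];
            u[i + 1] = t[i + 1] + t[i + 3];
            u[i + 2] = t[i]     - t[i + 2];
            u[i + 3] = t[i + 1] - t[i + 3];
        }
        for (int i = 0; i < 4; i++)
        {
            d[i]     = u[i] + u[i + 4];
            d[i + 4] = u[i] - u[i + 4];
        }
    }

    // Scalar reference for sa8d on one 8x8 block: Hadamard-transform the
    // residual and sum the absolute coefficients.
    static int sa8d_8x8_ref(const uint8_t* pix1, intptr_t stride1,
                            const uint8_t* pix2, intptr_t stride2)
    {
        int diff[8][8];
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                diff[y][x] = pix1[y * stride1 + x] - pix2[y * stride2 + x];

        for (int y = 0; y < 8; y++)
            hadamard8(diff[y]);                     // transform rows

        int sum = 0;
        for (int x = 0; x < 8; x++)
        {
            int col[8];
            for (int y = 0; y < 8; y++)
                col[y] = diff[y][x];
            hadamard8(col);                         // transform columns
            for (int y = 0; y < 8; y++)
                sum += std::abs(col[y]);
        }
        return (sum + 2) >> 2;                      // assumed reference scaling
    }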

Subject: [x265] asm: AVX2 version sa8d[16x16], 1913c(AVX) -> 1620c(AVX2)

details:   http://hg.videolan.org/x265/rev/479087422e29
branches:  
changeset: 10437:479087422e29
user:      Min Chen <chenm003 at 163.com>
date:      Wed May 13 16:52:59 2015 -0700
description:
asm: AVX2 version sa8d[16x16], 1913c(AVX) -> 1620c(AVX2)

diffstat:

 source/CMakeLists.txt                |     4 +-
 source/common/x86/asm-primitives.cpp |    67 +
 source/common/x86/const-a.asm        |     2 +-
 source/common/x86/intrapred.h        |     1 +
 source/common/x86/intrapred16.asm    |   327 +++++++++
 source/common/x86/ipfilter8.asm      |  1219 +++++++++++++++++++++++++++------
 source/common/x86/ipfilter8.h        |    22 +
 source/common/x86/pixel-a.asm        |   108 +++
 source/common/x86/pixel-util.h       |     4 +-
 source/common/x86/sad16-a.asm        |   371 ++++++++++
 source/encoder/entropy.cpp           |    74 +-
 source/encoder/sao.cpp               |   109 ++-
 12 files changed, 2012 insertions(+), 296 deletions(-)

diffs (truncated from 2792 to 300 lines):

diff -r 37abdd8805b1 -r 479087422e29 source/CMakeLists.txt
--- a/source/CMakeLists.txt	Tue May 12 09:56:50 2015 -0500
+++ b/source/CMakeLists.txt	Wed May 13 16:52:59 2015 -0700
@@ -94,7 +94,7 @@ endif(X64 AND NOT WIN32)
 if(CMAKE_GENERATOR STREQUAL "Xcode")
   set(XCODE 1)
 endif()
-if (APPLE)
+if(APPLE)
   add_definitions(-DMACOS)
 endif()
 
@@ -296,7 +296,7 @@ if(WARNINGS_AS_ERRORS)
     endif()
 endif(WARNINGS_AS_ERRORS)
 
-if (WIN32)
+if(WIN32)
     # Visual leak detector
     find_package(VLD QUIET)
     if(VLD_FOUND)
diff -r 37abdd8805b1 -r 479087422e29 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue May 12 09:56:50 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Wed May 13 16:52:59 2015 -0700
@@ -1181,6 +1181,20 @@ void setupAssemblyPrimitives(EncoderPrim
     }
     if (cpuMask & X265_CPU_AVX2)
     {
+        p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
+        p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2;
+        p.cu[BLOCK_16x16].psy_cost_ss = x265_psyCost_ss_16x16_avx2;
+        p.cu[BLOCK_32x32].psy_cost_ss = x265_psyCost_ss_32x32_avx2;
+        p.cu[BLOCK_64x64].psy_cost_ss = x265_psyCost_ss_64x64_avx2;
+
+        p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_avx2;
+
+        p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = x265_intra_pred_planar16_avx2;
+        p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = x265_intra_pred_planar32_avx2;
+
+        p.cu[BLOCK_16x16].intra_pred[DC_IDX] = x265_intra_pred_dc16_avx2;
+        p.cu[BLOCK_32x32].intra_pred[DC_IDX] = x265_intra_pred_dc32_avx2;
+
         p.pu[LUMA_48x64].satd = x265_pixel_satd_48x64_avx2;
 
         p.pu[LUMA_64x16].satd = x265_pixel_satd_64x16_avx2;
@@ -1264,6 +1278,12 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_16x12].sad = x265_pixel_sad_16x12_avx2;
         p.pu[LUMA_16x16].sad = x265_pixel_sad_16x16_avx2;
         p.pu[LUMA_16x32].sad = x265_pixel_sad_16x32_avx2;
+        p.pu[LUMA_16x64].sad = x265_pixel_sad_16x64_avx2;
+        p.pu[LUMA_32x8].sad = x265_pixel_sad_32x8_avx2;
+        p.pu[LUMA_32x16].sad = x265_pixel_sad_32x16_avx2;
+        p.pu[LUMA_32x24].sad = x265_pixel_sad_32x24_avx2;
+        p.pu[LUMA_32x32].sad = x265_pixel_sad_32x32_avx2;
+        p.pu[LUMA_32x64].sad = x265_pixel_sad_32x64_avx2;
 
         p.pu[LUMA_16x4].convert_p2s = x265_filterPixelToShort_16x4_avx2;
         p.pu[LUMA_16x8].convert_p2s = x265_filterPixelToShort_16x8_avx2;
@@ -1392,6 +1412,17 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vpp = x265_interp_4tap_vert_pp_8x8_sse2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vpp = x265_interp_4tap_vert_pp_8x16_sse2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vpp = x265_interp_4tap_vert_pp_8x32_sse2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_vpp = x265_interp_4tap_vert_pp_12x16_sse2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vpp = x265_interp_4tap_vert_pp_16x4_sse2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = x265_interp_4tap_vert_pp_16x8_sse2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vpp = x265_interp_4tap_vert_pp_16x12_sse2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = x265_interp_4tap_vert_pp_16x16_sse2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vpp = x265_interp_4tap_vert_pp_16x32_sse2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vpp = x265_interp_4tap_vert_pp_24x32_sse2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vpp = x265_interp_4tap_vert_pp_32x8_sse2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vpp = x265_interp_4tap_vert_pp_32x16_sse2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = x265_interp_4tap_vert_pp_32x24_sse2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = x265_interp_4tap_vert_pp_32x32_sse2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].filter_vpp = x265_interp_4tap_vert_pp_6x16_sse2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vpp = x265_interp_4tap_vert_pp_8x4_sse2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vpp = x265_interp_4tap_vert_pp_8x4_sse2;
@@ -1400,10 +1431,39 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vpp = x265_interp_4tap_vert_pp_8x16_sse2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vpp = x265_interp_4tap_vert_pp_8x32_sse2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vpp = x265_interp_4tap_vert_pp_8x64_sse2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_vpp = x265_interp_4tap_vert_pp_12x32_sse2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vpp = x265_interp_4tap_vert_pp_16x8_sse2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vpp = x265_interp_4tap_vert_pp_16x16_sse2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = x265_interp_4tap_vert_pp_16x24_sse2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vpp = x265_interp_4tap_vert_pp_16x32_sse2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vpp = x265_interp_4tap_vert_pp_16x64_sse2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vpp = x265_interp_4tap_vert_pp_24x64_sse2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vpp = x265_interp_4tap_vert_pp_32x16_sse2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vpp = x265_interp_4tap_vert_pp_32x32_sse2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vpp = x265_interp_4tap_vert_pp_32x48_sse2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vpp = x265_interp_4tap_vert_pp_32x64_sse2;
         p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vpp = x265_interp_4tap_vert_pp_8x4_sse2;
         p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vpp = x265_interp_4tap_vert_pp_8x8_sse2;
         p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vpp = x265_interp_4tap_vert_pp_8x16_sse2;
         p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vpp = x265_interp_4tap_vert_pp_8x32_sse2;
+        p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_vpp = x265_interp_4tap_vert_pp_12x16_sse2;
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vpp = x265_interp_4tap_vert_pp_16x4_sse2;
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vpp = x265_interp_4tap_vert_pp_16x8_sse2;
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vpp = x265_interp_4tap_vert_pp_16x12_sse2;
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vpp = x265_interp_4tap_vert_pp_16x16_sse2;
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vpp = x265_interp_4tap_vert_pp_16x32_sse2;
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vpp = x265_interp_4tap_vert_pp_16x64_sse2;
+        p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vpp = x265_interp_4tap_vert_pp_24x32_sse2;
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vpp = x265_interp_4tap_vert_pp_32x8_sse2;
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vpp = x265_interp_4tap_vert_pp_32x16_sse2;
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vpp = x265_interp_4tap_vert_pp_32x24_sse2;
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vpp = x265_interp_4tap_vert_pp_32x32_sse2;
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vpp = x265_interp_4tap_vert_pp_32x64_sse2;
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vpp = x265_interp_4tap_vert_pp_48x64_sse2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = x265_interp_4tap_vert_pp_64x16_sse2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = x265_interp_4tap_vert_pp_64x32_sse2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vpp = x265_interp_4tap_vert_pp_64x48_sse2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vpp = x265_interp_4tap_vert_pp_64x64_sse2;
 #endif
 
         ALL_LUMA_PU(luma_hpp, interp_8tap_horiz_pp, sse2);
@@ -1880,6 +1940,11 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = x265_addAvg_32x48_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = x265_addAvg_32x64_avx2;
 
+        p.cu[BLOCK_8x8].sa8d = x265_pixel_sa8d_8x8_avx2;
+        p.cu[BLOCK_16x16].sa8d = x265_pixel_sa8d_16x16_avx2;
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = x265_pixel_sa8d_8x8_avx2;
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = x265_pixel_sa8d_16x16_avx2;
+
         p.cu[BLOCK_16x16].add_ps = x265_pixel_add_ps_16x16_avx2;
         p.cu[BLOCK_32x32].add_ps = x265_pixel_add_ps_32x32_avx2;
         p.cu[BLOCK_64x64].add_ps = x265_pixel_add_ps_64x64_avx2;
@@ -2617,6 +2682,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_vps = x265_interp_4tap_vert_ps_8x12_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].filter_vps = x265_interp_4tap_vert_ps_2x4_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vps = x265_interp_4tap_vert_ps_16x24_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_vps = x265_interp_4tap_vert_ps_2x16_avx2;
 
         //i444 for chroma_vps
         p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vps = x265_interp_4tap_vert_ps_4x4_avx2;
@@ -2662,6 +2728,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_vpp = x265_interp_4tap_vert_pp_8x12_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].filter_vpp = x265_interp_4tap_vert_pp_2x4_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = x265_interp_4tap_vert_pp_16x24_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_vpp = x265_interp_4tap_vert_pp_2x16_avx2;
 
         //i444 for chroma_vpp
         p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vpp = x265_interp_4tap_vert_pp_4x4_avx2;
diff -r 37abdd8805b1 -r 479087422e29 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Tue May 12 09:56:50 2015 -0500
+++ b/source/common/x86/const-a.asm	Wed May 13 16:52:59 2015 -0700
@@ -62,7 +62,7 @@ const pb_000000000000000F,           db 
 ;; 16-bit constants
 
 const pw_1,                 times 16 dw 1
-const pw_2,                 times  8 dw 2
+const pw_2,                 times 16 dw 2
 const pw_m2,                times  8 dw -2
 const pw_4,                 times  8 dw 4
 const pw_8,                 times  8 dw 8
diff -r 37abdd8805b1 -r 479087422e29 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Tue May 12 09:56:50 2015 -0500
+++ b/source/common/x86/intrapred.h	Wed May 13 16:52:59 2015 -0700
@@ -34,6 +34,7 @@ void x265_intra_pred_dc4_sse4(pixel* dst
 void x265_intra_pred_dc8_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
 void x265_intra_pred_dc16_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
 void x265_intra_pred_dc32_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
+void x265_intra_pred_dc16_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
 void x265_intra_pred_dc32_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
 
 void x265_intra_pred_planar4_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
diff -r 37abdd8805b1 -r 479087422e29 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Tue May 12 09:56:50 2015 -0500
+++ b/source/common/x86/intrapred16.asm	Wed May 13 16:52:59 2015 -0700
@@ -89,7 +89,9 @@ cextern pw_1
 cextern pw_2
 cextern pw_4
 cextern pw_8
+cextern pw_15
 cextern pw_16
+cextern pw_31
 cextern pw_32
 cextern pw_1023
 cextern pd_16
@@ -103,6 +105,8 @@ cextern multi_2Row
 cextern pw_swap
 cextern pb_unpackwq1
 cextern pb_unpackwq2
+cextern pw_planar16_mul
+cextern pw_planar32_mul
 
 ;-----------------------------------------------------------------------------------
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
@@ -448,6 +452,218 @@ cglobal intra_pred_dc32, 3, 4, 6
 %endrep
     RET
 
+;-------------------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
+;-------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal intra_pred_dc16, 3, 9, 4
+    mov             r3d,                 r4m
+    add             r1d,                 r1d
+    movu            m0,                  [r2 + 66]
+    movu            m2,                  [r2 +  2]
+    paddw           m0,                  m2
+
+    vextracti128    xm1,                 m0, 1
+    paddw           xm0,                 xm1
+    movhlps         xm1,                 xm0
+    paddw           xm0,                 xm1
+    phaddw          xm0,                 xm0
+    pmaddwd         xm0,                 [pw_1]
+    paddd           xm0,                 [pd_16]
+    psrad           xm0,                 5
+    movd            r5d,                 xm0
+    vpbroadcastw    m0,                  xm0
+
+    test            r3d,                 r3d
+
+    ; store DC 16x16
+    lea             r6,                  [r1 + r1 * 2]        ; index 3
+    lea             r7,                  [r1 + r1 * 4]        ; index 5
+    lea             r8,                  [r6 + r1 * 4]        ; index 7
+    lea             r4,                  [r0 + r8 * 1]        ; base + 7
+
+    movu            [r0],                m0
+    movu            [r0 + r1],           m0
+    movu            [r0 + r1 * 2],       m0
+    movu            [r0 + r6],           m0
+    movu            [r0 + r1 * 4],       m0
+    movu            [r0 + r7],           m0
+    movu            [r0 + r6 * 2],       m0
+    movu            [r4],                m0
+    movu            [r0 + r1 * 8],       m0
+    movu            [r4 + r1 * 2],       m0
+    movu            [r0 + r7 * 2],       m0
+    movu            [r4 + r1 * 4],       m0
+    movu            [r0 + r6 * 4],       m0
+    movu            [r4 + r6 * 2],       m0
+    movu            [r4 + r8],           m0
+    movu            [r4 + r1 * 8],       m0
+
+    ; Do DC Filter
+    jz              .end
+    mova            m1,                  [pw_2]
+    pmullw          m1,                  m0
+    paddw           m1,                  [pw_2]
+    movd            r3d,                 xm1
+    paddw           m1,                  m0
+
+    ; filter top
+    movu            m2,                  [r2 + 2]
+    paddw           m2,                  m1
+    psraw           m2,                  2
+    movu            [r0],                m2
+
+    ; filter top-left
+    movzx           r3d,                 r3w
+    movzx           r5d, word            [r2 + 66]
+    add             r3d,                 r5d
+    movzx           r5d, word            [r2 + 2]
+    add             r5d,                 r3d
+    shr             r5d,                 2
+    mov             [r0],                r5w
+
+    ; filter left
+    movu            m2,                  [r2 + 68]
+    paddw           m2,                  m1
+    psraw           m2,                  2
+    vextracti128    xm3,                 m2, 1
+
+    movq            r3,                  xm2
+    pshufd          xm2,                 xm2, 0xEE
+    mov             [r0 + r1],           r3w
+    shr             r3,                  16
+    mov             [r0 + r1 * 2],       r3w
+    shr             r3,                  16
+    mov             [r0 + r6],           r3w
+    shr             r3,                  16
+    mov             [r0 + r1 * 4],       r3w
+    movq            r3,                  xm2
+    mov             [r0 + r7],           r3w
+    shr             r3,                  16
+    mov             [r0 + r6 * 2],       r3w
+    shr             r3,                  16
+    mov             [r4],                r3w
+    shr             r3,                  16
+    mov             [r0 + r1 * 8],       r3w
+
+    movq            r3,                  xm3
+    pshufd          xm3,                 xm3, 0xEE
+    mov             [r4 + r1 * 2],       r3w
+    shr             r3,                  16
+    mov             [r0 + r7 * 2],       r3w
+    shr             r3,                  16
+    mov             [r4 + r1 * 4],       r3w
+    shr             r3,                  16
+    mov             [r0 + r6 * 4],       r3w
+    movq            r3,                  xm3
+    mov             [r4 + r6 * 2],       r3w
+    shr             r3,                  16
+    mov             [r4 + r8],           r3w

