[x265-commits] [x265] asm: added code for blockcopy_ss and cleaned up asm primitives of blockcopy

Murugan Vairavel murugan at multicorewareinc.com
Fri Mar 7 22:34:48 CET 2014


details:   http://hg.videolan.org/x265/rev/2bf727dca27d
branches:  
changeset: 6419:2bf727dca27d
user:      Murugan Vairavel <murugan at multicorewareinc.com>
date:      Fri Mar 07 15:11:13 2014 +0530
description:
asm: added code for blockcopy_ss and cleaned up asm primitives of blockcopy

diffstat:

 source/common/x86/asm-primitives.cpp |   197 ++-
 source/common/x86/blockcopy8.asm     |  1647 ++++++++++++++++++---------------
 source/common/x86/blockcopy8.h       |   203 +--
 3 files changed, 1100 insertions(+), 947 deletions(-)

diffs (truncated from 2526 to 300 lines):

diff -r 33b67a53b6de -r 2bf727dca27d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Mar 06 21:27:55 2014 -0600
+++ b/source/common/x86/asm-primitives.cpp	Fri Mar 07 15:11:13 2014 +0530
@@ -400,8 +400,7 @@ extern "C" {
     p.luma_hpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu; \
     p.luma_hps[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \
     p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \
-    p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu; \
-    p.luma_copy_ps[LUMA_ ## W ## x ## H] = x265_blockcopy_ps_ ## W ## x ## H ## cpu;
+    p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu;
 #endif
 
 #define SETUP_LUMA_SUB_FUNC_DEF(W, H, cpu) \
@@ -414,38 +413,91 @@ extern "C" {
 #define SETUP_LUMA_SS_FUNC_DEF(W, H, cpu) \
     p.luma_vss[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ss_ ## W ## x ## H ## cpu;
 
-#define SETUP_LUMA_BLOCKCOPY_FUNC_DEF(W, H, cpu) \
-    p.luma_copy_pp[LUMA_ ## W ## x ## H] = x265_blockcopy_pp_ ## W ## x ## H ## cpu;
+#define SETUP_LUMA_BLOCKCOPY(type, W, H, cpu) \
+    p.luma_copy_ ## type[LUMA_ ## W ## x ## H] = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
 
-#define SETUP_CHROMA_FROM_LUMA(W1, H1, W2, H2, cpu) \
-    p.chroma[X265_CSP_I420].copy_pp[LUMA_ ## W1 ## x ## H1] = x265_blockcopy_pp_ ## W2 ## x ## H2 ## cpu;
+#define SETUP_CHROMA_BLOCKCOPY(type, W, H, cpu) \
+    p.chroma[X265_CSP_I420].copy_ ## type[CHROMA_ ## W ## x ## H] = x265_blockcopy_ ## type ##_ ## W ## x ## H ## cpu;
 
-// For X265_CSP_I420 chroma width and height will be half of luma width and height
-#define CHROMA_BLOCKCOPY(cpu) \
-    SETUP_CHROMA_FROM_LUMA(8,   8, 4,  4,  cpu); \
-    SETUP_CHROMA_FROM_LUMA(8,   4, 4,  2,  cpu); \
-    SETUP_CHROMA_FROM_LUMA(4,   8, 2,  4,  cpu); \
-    SETUP_CHROMA_FROM_LUMA(16, 16, 8,  8,  cpu); \
-    SETUP_CHROMA_FROM_LUMA(16,  8, 8,  4,  cpu); \
-    SETUP_CHROMA_FROM_LUMA(8,  16, 4,  8,  cpu); \
-    SETUP_CHROMA_FROM_LUMA(16, 12, 8,  6,  cpu); \
-    SETUP_CHROMA_FROM_LUMA(12, 16, 6,  8,  cpu); \
-    SETUP_CHROMA_FROM_LUMA(16,  4, 8,  2,  cpu); \
-    SETUP_CHROMA_FROM_LUMA(4,  16, 2,  8,  cpu); \
-    SETUP_CHROMA_FROM_LUMA(32, 32, 16, 16, cpu); \
-    SETUP_CHROMA_FROM_LUMA(32, 16, 16, 8,  cpu); \
-    SETUP_CHROMA_FROM_LUMA(16, 32, 8,  16, cpu); \
-    SETUP_CHROMA_FROM_LUMA(32, 24, 16, 12, cpu); \
-    SETUP_CHROMA_FROM_LUMA(24, 32, 12, 16, cpu); \
-    SETUP_CHROMA_FROM_LUMA(32,  8, 16, 4,  cpu); \
-    SETUP_CHROMA_FROM_LUMA(8,  32, 4,  16, cpu); \
-    SETUP_CHROMA_FROM_LUMA(64, 64, 32, 32, cpu); \
-    SETUP_CHROMA_FROM_LUMA(64, 32, 32, 16, cpu); \
-    SETUP_CHROMA_FROM_LUMA(32, 64, 16, 32, cpu); \
-    SETUP_CHROMA_FROM_LUMA(64, 48, 32, 24, cpu); \
-    SETUP_CHROMA_FROM_LUMA(48, 64, 24, 32, cpu); \
-    SETUP_CHROMA_FROM_LUMA(64, 16, 32, 8,  cpu); \
-    SETUP_CHROMA_FROM_LUMA(16, 64, 8,  32, cpu);
+#define CHROMA_BLOCKCOPY(type ,cpu) \
+    SETUP_CHROMA_BLOCKCOPY(type, 2,  4,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY(type, 2,  8,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY(type, 4,  2,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY(type, 4,  4,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY(type, 4,  8,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY(type, 4,  16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY(type, 6,  8,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY(type, 8,  2,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY(type, 8,  4,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY(type, 8,  6,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY(type, 8,  8,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY(type, 8,  16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY(type, 8,  32, cpu); \
+    SETUP_CHROMA_BLOCKCOPY(type, 12, 16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY(type, 16, 4,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY(type, 16, 8,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY(type, 16, 12, cpu); \
+    SETUP_CHROMA_BLOCKCOPY(type, 16, 16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY(type, 16, 32, cpu); \
+    SETUP_CHROMA_BLOCKCOPY(type, 24, 32, cpu); \
+    SETUP_CHROMA_BLOCKCOPY(type, 32, 8,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY(type, 32, 16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY(type, 32, 24, cpu); \
+    SETUP_CHROMA_BLOCKCOPY(type, 32, 32, cpu);
+
+#define LUMA_BLOCKCOPY(type, cpu) \
+    SETUP_LUMA_BLOCKCOPY(type, 4,   4, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 8,   8, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 8,   4, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 4,   8, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 16, 16, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 16,  8, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 8,  16, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 16, 12, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 12, 16, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 16,  4, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 4,  16, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 32, 32, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 32, 16, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 16, 32, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 32, 24, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 24, 32, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 32,  8, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 8,  32, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 64, 64, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 64, 32, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 32, 64, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 64, 48, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 48, 64, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 64, 16, cpu); \
+    SETUP_LUMA_BLOCKCOPY(type, 16, 64, cpu);
+
+#define SETUP_CHROMA_BLOCKCOPY_SP(W, H, cpu) \
+    p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] = x265_blockcopy_sp_ ## W ## x ## H ## cpu;
+
+#define CHROMA_BLOCKCOPY_SP(cpu) \
+    SETUP_CHROMA_BLOCKCOPY_SP(4,  2,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP(4,  4,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP(4,  8,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP(4,  16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP(8,  2,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP(8,  4,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP(8,  6,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP(8,  8,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP(8,  16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP(8,  32, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP(12, 16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP(16, 4,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP(16, 8,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP(16, 12, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP(16, 16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP(16, 32, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP(24, 32, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP(32, 8,  cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP(32, 16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP(32, 24, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_SP(32, 32, cpu);
+
 
 #define SETUP_CHROMA_LUMA(W1, H1, W2, H2, cpu) \
     p.chroma[X265_CSP_I420].sub_ps[LUMA_ ## W1 ## x ## H1] = x265_pixel_sub_ps_ ## W2 ## x ## H2 ## cpu; \
@@ -585,33 +637,6 @@ extern "C" {
     SETUP_LUMA_SS_FUNC_DEF(64, 16, cpu); \
     SETUP_LUMA_SS_FUNC_DEF(16, 64, cpu);
 
-#define LUMA_BLOCKCOPY(cpu) \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(4,   4, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(8,   8, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(8,   4, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(4,   8, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 16, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16,  8, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(8,  16, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 12, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(12, 16, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16,  4, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(4,  16, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(32, 32, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(32, 16, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 32, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(32, 24, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(24, 32, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(32,  8, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(8,  32, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 64, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 32, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(32, 64, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 48, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(48, 64, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 16, cpu); \
-    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 64, cpu);
-
 #define SETUP_PIXEL_VAR_DEF(W, H, cpu) \
     p.var[BLOCK_ ## W ## x ## H] = x265_pixel_var_ ## W ## x ## H ## cpu;
 
@@ -963,8 +988,8 @@ void Setup_Assembly_Primitives(EncoderPr
         CHROMA_PIXELSUB_PS(_sse2);
         LUMA_PIXELSUB(_sse2);
 
-        CHROMA_BLOCKCOPY(_sse2);
-        LUMA_BLOCKCOPY(_sse2);
+        CHROMA_BLOCKCOPY(ss, _sse2);
+        LUMA_BLOCKCOPY(ss, _sse2);
 
         CHROMA_VERT_FILTERS(_sse2);
         p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_sse2;
@@ -1043,14 +1068,16 @@ void Setup_Assembly_Primitives(EncoderPr
 
     for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
     {
-        p.luma_copy_ps[i] = (copy_ps_t)p.luma_copy_pp[i];
-        p.luma_copy_sp[i] = (copy_sp_t)p.luma_copy_pp[i];
+        p.luma_copy_ps[i] = (copy_ps_t)p.luma_copy_ss[i];
+        p.luma_copy_sp[i] = (copy_sp_t)p.luma_copy_ss[i];
+        p.luma_copy_pp[i] = (copy_pp_t)p.luma_copy_ss[i];
     }
 
     for (int i = 0; i < NUM_CHROMA_PARTITIONS; i++)
     {
-        p.chroma[X265_CSP_I420].copy_ps[i] = (copy_ps_t)p.chroma[X265_CSP_I420].copy_pp[i];
-        p.chroma[X265_CSP_I420].copy_sp[i] = (copy_sp_t)p.chroma[X265_CSP_I420].copy_pp[i];
+        p.chroma[X265_CSP_I420].copy_ps[i] = (copy_ps_t)p.chroma[X265_CSP_I420].copy_ss[i];
+        p.chroma[X265_CSP_I420].copy_sp[i] = (copy_sp_t)p.chroma[X265_CSP_I420].copy_ss[i];
+        p.chroma[X265_CSP_I420].copy_pp[i] = (copy_pp_t)p.chroma[X265_CSP_I420].copy_ss[i];
     }
 
 #else // if HIGH_BIT_DEPTH
@@ -1096,8 +1123,12 @@ void Setup_Assembly_Primitives(EncoderPr
         INIT2(sad_x4, _sse2);
         HEVC_SATD(sse2);
 
-        CHROMA_BLOCKCOPY(_sse2);
-        LUMA_BLOCKCOPY(_sse2);
+        CHROMA_BLOCKCOPY(ss, _sse2);
+        CHROMA_BLOCKCOPY(pp, _sse2);
+        LUMA_BLOCKCOPY(ss, _sse2);
+        LUMA_BLOCKCOPY(pp, _sse2);
+        LUMA_BLOCKCOPY(sp, _sse2);
+        CHROMA_BLOCKCOPY_SP(_sse2);
 
         CHROMA_SS_FILTERS_420(_sse2);
         CHROMA_SS_FILTERS_444(_sse2);
@@ -1110,34 +1141,6 @@ void Setup_Assembly_Primitives(EncoderPr
         // until all partitions are coded and commit smaller patches, easier to
         // review.
 
-        p.chroma[X265_CSP_I420].copy_sp[CHROMA_4x2] = x265_blockcopy_sp_4x2_sse2;
-        p.chroma[X265_CSP_I420].copy_sp[CHROMA_4x4] = x265_blockcopy_sp_4x4_sse2;
-        p.chroma[X265_CSP_I420].copy_sp[CHROMA_4x8] = x265_blockcopy_sp_4x8_sse2;
-        p.chroma[X265_CSP_I420].copy_sp[CHROMA_4x16] = x265_blockcopy_sp_4x16_sse2;
-        p.chroma[X265_CSP_I420].copy_sp[CHROMA_8x2] = x265_blockcopy_sp_8x2_sse2;
-        p.chroma[X265_CSP_I420].copy_sp[CHROMA_8x4] = x265_blockcopy_sp_8x4_sse2;
-        p.chroma[X265_CSP_I420].copy_sp[CHROMA_8x6] = x265_blockcopy_sp_8x6_sse2;
-        p.chroma[X265_CSP_I420].copy_sp[CHROMA_8x8] = x265_blockcopy_sp_8x8_sse2;
-        p.chroma[X265_CSP_I420].copy_sp[CHROMA_8x16] = x265_blockcopy_sp_8x16_sse2;
-        p.chroma[X265_CSP_I420].copy_sp[CHROMA_12x16] = x265_blockcopy_sp_12x16_sse2;
-        p.chroma[X265_CSP_I420].copy_sp[CHROMA_16x4] = x265_blockcopy_sp_16x4_sse2;
-        p.chroma[X265_CSP_I420].copy_sp[CHROMA_16x8] = x265_blockcopy_sp_16x8_sse2;
-        p.chroma[X265_CSP_I420].copy_sp[CHROMA_16x12] = x265_blockcopy_sp_16x12_sse2;
-        p.chroma[X265_CSP_I420].copy_sp[CHROMA_16x16] = x265_blockcopy_sp_16x16_sse2;
-        p.chroma[X265_CSP_I420].copy_sp[CHROMA_16x32] = x265_blockcopy_sp_16x32_sse2;
-        p.chroma[X265_CSP_I420].copy_sp[CHROMA_24x32] = x265_blockcopy_sp_24x32_sse2;
-        p.chroma[X265_CSP_I420].copy_sp[CHROMA_32x8] = x265_blockcopy_sp_32x8_sse2;
-        p.chroma[X265_CSP_I420].copy_sp[CHROMA_32x16] = x265_blockcopy_sp_32x16_sse2;
-        p.chroma[X265_CSP_I420].copy_sp[CHROMA_32x24] = x265_blockcopy_sp_32x24_sse2;
-        p.chroma[X265_CSP_I420].copy_sp[CHROMA_32x32] = x265_blockcopy_sp_32x32_sse2;
-
-        p.luma_copy_sp[LUMA_32x64] = x265_blockcopy_sp_32x64_sse2;
-        p.luma_copy_sp[LUMA_16x64] = x265_blockcopy_sp_16x64_sse2;
-        p.luma_copy_sp[LUMA_48x64] = x265_blockcopy_sp_48x64_sse2;
-        p.luma_copy_sp[LUMA_64x16] = x265_blockcopy_sp_64x16_sse2;
-        p.luma_copy_sp[LUMA_64x32] = x265_blockcopy_sp_64x32_sse2;
-        p.luma_copy_sp[LUMA_64x48] = x265_blockcopy_sp_64x48_sse2;
-        p.luma_copy_sp[LUMA_64x64] = x265_blockcopy_sp_64x64_sse2;
         p.blockfill_s[BLOCK_4x4] = x265_blockfill_s_4x4_sse2;
         p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2;
         p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2;
@@ -1227,9 +1230,12 @@ void Setup_Assembly_Primitives(EncoderPr
         LUMA_SP_FILTERS(_sse4);
         LUMA_FILTERS(_sse4);
         ASSGN_SSE_SS(sse4);
+
         p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x4] = x265_blockcopy_sp_2x4_sse4;
         p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x8] = x265_blockcopy_sp_2x8_sse4;
         p.chroma[X265_CSP_I420].copy_sp[CHROMA_6x8] = x265_blockcopy_sp_6x8_sse4;
+        CHROMA_BLOCKCOPY(ps, _sse4);
+        LUMA_BLOCKCOPY(ps, _sse4);
 
         p.chroma[X265_CSP_I420].filter_vsp[CHROMA_2x4] = x265_interp_4tap_vert_sp_2x4_sse4;
         p.chroma[X265_CSP_I420].filter_vsp[CHROMA_2x8] = x265_interp_4tap_vert_sp_2x8_sse4;
@@ -1305,6 +1311,7 @@ void Setup_Assembly_Primitives(EncoderPr
         p.chroma[X265_CSP_I444].copy_pp[i] = p.luma_copy_pp[i];
         p.chroma[X265_CSP_I444].copy_ps[i] = p.luma_copy_ps[i];
         p.chroma[X265_CSP_I444].copy_sp[i] = p.luma_copy_sp[i];
+        p.chroma[X265_CSP_I444].copy_ss[i] = p.luma_copy_ss[i];
         p.chroma[X265_CSP_I444].add_ps[i]  = p.luma_add_ps[i];
         p.chroma[X265_CSP_I444].sub_ps[i]  = p.luma_sub_ps[i];
         p.chroma[X265_CSP_I444].addAvg[i]  = p.luma_addAvg[i];
diff -r 33b67a53b6de -r 2bf727dca27d source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Thu Mar 06 21:27:55 2014 -0600
+++ b/source/common/x86/blockcopy8.asm	Fri Mar 07 15:11:13 2014 +0530
@@ -35,22 +35,7 @@ SECTION .text
 ; void blockcopy_pp_2x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal blockcopy_pp_2x4, 4, 7, 0, dest, deststride, src, srcstride
-%if HIGH_BIT_DEPTH
-    add    r1,     r1
-    add    r3,     r3
-    mov    r4d,    [r2]
-    mov    r5d,    [r2 + r3]
-    lea    r2,     [r2 + r3 * 2]
-    mov    r6d,    [r2]
-    mov    r3d,    [r2 + r3]
-
-    mov    [r0],         r4d
-    mov    [r0 + r1],    r5d
-    lea    r0,           [r0 + 2 * r1]
-    mov    [r0],         r6d
-    mov    [r0 + r1],    r3d
-%else
+cglobal blockcopy_pp_2x4, 4, 7, 0
     mov    r4w,    [r2]
     mov    r5w,    [r2 + r3]
     lea    r2,     [r2 + r3 * 2]
@@ -62,43 +47,13 @@ cglobal blockcopy_pp_2x4, 4, 7, 0, dest,


More information about the x265-commits mailing list