[x265-commits] [x265] asm: Assembly SSE2/AVX2 for planecopy_sp_shl
Min Chen
chenm003 at 163.com
Tue Jul 14 06:00:53 CEST 2015
details: http://hg.videolan.org/x265/rev/4d6157ca8d15
branches:
changeset: 10813:4d6157ca8d15
user: Min Chen <chenm003 at 163.com>
date: Mon Jul 13 17:37:57 2015 -0700
description:
asm: Assembly SSE2/AVX2 for planecopy_sp_shl
Subject: [x265] asm: fix Main12 bugs in sad_mmx2 & sad_sse2
details: http://hg.videolan.org/x265/rev/7d272c772caf
branches:
changeset: 10814:7d272c772caf
user: Min Chen <chenm003 at 163.com>
date: Mon Jul 13 17:37:59 2015 -0700
description:
asm: fix Main12 bugs in sad_mmx2 & sad_sse2
Subject: [x265] asm: enable SSE2 of sad[8x16]
details: http://hg.videolan.org/x265/rev/8023786c5247
branches:
changeset: 10815:8023786c5247
user: Min Chen <chenm003 at 163.com>
date: Mon Jul 13 17:38:02 2015 -0700
description:
asm: enable SSE2 of sad[8x16]
diffstat:
source/common/x86/asm-primitives.cpp | 8 +
source/common/x86/pixel-a.asm | 216 ++++++++++++++++++++++++++++++++++-
source/common/x86/pixel.h | 2 +
source/common/x86/sad16-a.asm | 62 ++++++---
source/test/pixelharness.cpp | 13 +-
5 files changed, 277 insertions(+), 24 deletions(-)
diffs (truncated from 481 to 300 lines):
diff -r fbfe1d0c586f -r 8023786c5247 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jul 13 18:00:30 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Jul 13 17:38:02 2015 -0700
@@ -889,6 +889,7 @@ void setupAssemblyPrimitives(EncoderPrim
p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_sse2);
p.pu[LUMA_16x8].sad = PFX(pixel_sad_16x8_sse2);
+ p.pu[LUMA_8x16].sad = PFX(pixel_sad_8x16_sse2);
HEVC_SAD(sse2);
p.pu[LUMA_4x4].sad_x3 = PFX(pixel_sad_x3_4x4_mmx2);
@@ -1012,6 +1013,9 @@ void setupAssemblyPrimitives(EncoderPrim
LUMA_VSS_FILTERS(sse2);
p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
+ // TODO: planecopy_sp is really planecopy_SC now; this must be fixed
+ //p.planecopy_sp = PFX(downShift_16_sse2);
+ p.planecopy_sp_shl = PFX(upShift_16_sse2);
ALL_CHROMA_420_PU(p2s, filterPixelToShort, sse2);
ALL_CHROMA_422_PU(p2s, filterPixelToShort, sse2);
@@ -1292,6 +1296,10 @@ void setupAssemblyPrimitives(EncoderPrim
{
p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
+ // TODO: planecopy_sp is really planecopy_SC now; this must be fixed
+ //p.planecopy_sp = PFX(downShift_16_avx2);
+ p.planecopy_sp_shl = PFX(upShift_16_avx2);
+
p.saoCuOrgE0 = PFX(saoCuOrgE0_avx2);
p.saoCuOrgE1 = PFX(saoCuOrgE1_avx2);
p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_avx2);
diff -r fbfe1d0c586f -r 8023786c5247 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Mon Jul 13 18:00:30 2015 +0530
+++ b/source/common/x86/pixel-a.asm Mon Jul 13 17:38:02 2015 -0700
@@ -70,6 +70,7 @@ cextern popcnt_table
cextern pd_2
cextern hmul_16p
cextern pb_movemask
+cextern pw_pixel_max
;=============================================================================
; SATD
@@ -7092,7 +7093,7 @@ cglobal pixel_sa8d_32x32, 4,8,8
; Input 10bit, Output 8bit
;------------------------------------------------------------------------------------------------------------------------
-;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
+;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
;------------------------------------------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal downShift_16, 7,7,3
@@ -7466,6 +7467,219 @@ cglobal upShift_8, 6,7,4
%endif
%endmacro
+
+; Input 10bit, Output 12bit
+;------------------------------------------------------------------------------------------------------------------------
+;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
+;------------------------------------------------------------------------------------------------------------------------
+; Copies a plane of 16-bit samples, storing (src << shift) & pw_pixel_max.
+;   r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4d = width, r5d = height, r6m = shift
+; The 'mask' argument is never read; the constant pw_pixel_max is applied instead.
+; Every row except the last is handled in groups of 16 pixels, so those rows may
+; read/write up to the next multiple of 16 pixels.  The last row is handled
+; exactly (16/8/4/2/1 pixel steps) so nothing past 'width' is touched there.
+INIT_XMM sse2
+cglobal upShift_16, 6,7,4
+    movd        m0, r6m                 ; m0 = shift
+    mova        m3, [pw_pixel_max]
+    FIX_STRIDES r1d, r3d
+    dec         r5d                     ; r5d = number of rows before the last one
+    jz         .lastRow                 ; height == 1: the row loop below is a do-while and must be skipped
+.loopH:
+    xor         r6d, r6d                ; r6 = pixel offset inside the current row
+.loopW:
+    movu        m1, [r0 + r6 * SIZEOF_PIXEL]
+    movu        m2, [r0 + r6 * SIZEOF_PIXEL + mmsize]
+    psllw       m1, m0
+    psllw       m2, m0
+    ; TODO: if input always valid, we can remove below 2 instructions.
+    pand        m1, m3
+    pand        m2, m3
+    movu        [r2 + r6 * SIZEOF_PIXEL], m1
+    movu        [r2 + r6 * SIZEOF_PIXEL + mmsize], m2
+
+    add         r6, mmsize * 2 / SIZEOF_PIXEL
+    cmp         r6d, r4d
+    jl         .loopW
+
+    ; move to next row
+    add         r0, r1
+    add         r2, r3
+    dec         r5d
+    jnz        .loopH
+
+; process the last row of the frame exactly [handles a width that is not a multiple of 16]
+.lastRow:
+    cmp         r4d, 16
+    jl         .process8                ; fewer than 16 pixels: never enter the 16-pixel loop
+
+.loop16:
+    movu        m1, [r0]
+    movu        m2, [r0 + mmsize]
+    psllw       m1, m0
+    psllw       m2, m0
+    pand        m1, m3
+    pand        m2, m3
+    movu        [r2], m1
+    movu        [r2 + mmsize], m2
+
+    add         r0, 2 * mmsize
+    add         r2, 2 * mmsize
+    sub         r4d, 16
+    jz         .end
+    cmp         r4d, 16                 ; only loop again while a full group of 16 remains
+    jge        .loop16
+
+.process8:
+    cmp         r4d, 8
+    jl         .process4
+    movu        m1, [r0]
+    psllw       m1, m0                  ; left shift, same as every other path (was psrlw)
+    pand        m1, m3
+    movu        [r2], m1
+
+    add         r0, mmsize
+    add         r2, mmsize
+    sub         r4d, 8
+    jz         .end
+
+.process4:
+    cmp         r4d, 4
+    jl         .process2
+    movh        m1, [r0]
+    psllw       m1, m0
+    pand        m1, m3
+    movh        [r2], m1
+
+    add         r0, 8
+    add         r2, 8
+    sub         r4d, 4
+    jz         .end
+
+.process2:
+    cmp         r4d, 2
+    jl         .process1
+    movd        m1, [r0]
+    psllw       m1, m0
+    pand        m1, m3
+    movd        [r2], m1
+
+    add         r0, 4
+    add         r2, 4
+    sub         r4d, 2
+    jz         .end
+
+.process1:
+    test        r4d, r4d
+    jle        .end                     ; nothing left (width <= 0)
+    movzx       r3d, word [r0]          ; load exactly one pixel; r3 (dstStride) is free in the last row
+    movd        m1, r3d
+    psllw       m1, m0
+    pand        m1, m3
+    movd        r3d, m1
+    mov         [r2], r3w
+.end:
+    RET
+
+; Input 10bit, Output 12bit
+;-------------------------------------------------------------------------------------------------------------------------------------
+;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
+;-------------------------------------------------------------------------------------------------------------------------------------
+; Copies a plane of 16-bit samples, storing (src << shift) & pw_pixel_max.
+;   r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4d = width, r5d = height, r6m = shift
+; The 'mask' argument is never read; the constant pw_pixel_max is applied instead.
+; Every row except the last is handled in groups of 32 pixels, so those rows may
+; read/write up to the next multiple of 32 pixels.  The last row is handled
+; exactly (32/16/8/4/2/1 pixel steps) so nothing past 'width' is touched there.
+; TODO: NO TEST CODE!
+INIT_YMM avx2
+cglobal upShift_16, 6,7,4
+    movd        xm0, r6m                ; m0 = shift
+    vbroadcasti128 m3, [pw_pixel_max]
+    FIX_STRIDES r1d, r3d
+    dec         r5d                     ; r5d = number of rows before the last one
+    jz         .lastRow                 ; height == 1: the row loop below is a do-while and must be skipped
+.loopH:
+    xor         r6d, r6d                ; r6 = pixel offset inside the current row
+.loopW:
+    movu        m1, [r0 + r6 * SIZEOF_PIXEL]
+    movu        m2, [r0 + r6 * SIZEOF_PIXEL + mmsize]
+    psllw       m1, xm0
+    psllw       m2, xm0
+    pand        m1, m3
+    pand        m2, m3
+    movu        [r2 + r6 * SIZEOF_PIXEL], m1
+    movu        [r2 + r6 * SIZEOF_PIXEL + mmsize], m2
+
+    add         r6, mmsize * 2 / SIZEOF_PIXEL
+    cmp         r6d, r4d
+    jl         .loopW
+
+    ; move to next row
+    add         r0, r1
+    add         r2, r3
+    dec         r5d
+    jnz        .loopH
+
+; process the last row of the frame exactly [handles a width that is not a multiple of 32]
+.lastRow:
+    mov         r6d, r4d
+    and         r4d, 31                 ; r4d = leftover pixels after the 32-pixel groups
+    shr         r6d, 5                  ; r6d = number of full 32-pixel groups
+    jz         .tail                    ; width < 32: entering the loop with 0 would wrap the counter
+
+.loop32:
+    movu        m1, [r0]
+    movu        m2, [r0 + mmsize]
+    psllw       m1, xm0
+    psllw       m2, xm0
+    pand        m1, m3
+    pand        m2, m3
+    movu        [r2], m1
+    movu        [r2 + mmsize], m2
+
+    add         r0, 2*mmsize
+    add         r2, 2*mmsize
+    dec         r6d
+    jnz        .loop32
+
+.tail:
+    test        r4d, r4d
+    jz         .end                     ; width was a multiple of 32: no leftover pixel to write
+
+    cmp         r4d, 16
+    jl         .process8
+    movu        m1, [r0]
+    psllw       m1, xm0
+    pand        m1, m3
+    movu        [r2], m1
+
+    add         r0, mmsize
+    add         r2, mmsize
+    sub         r4d, 16
+    jz         .end
+
+.process8:
+    cmp         r4d, 8
+    jl         .process4
+    movu        xm1, [r0]
+    psllw       xm1, xm0
+    pand        xm1, xm3
+    movu        [r2], xm1
+
+    add         r0, 16
+    add         r2, 16
+    sub         r4d, 8
+    jz         .end
+
+.process4:
+    cmp         r4d, 4
+    jl         .process2
+    movq        xm1, [r0]
+    psllw       xm1, xm0
+    pand        xm1, xm3
+    movq        [r2], xm1
+
+    add         r0, 8
+    add         r2, 8
+    sub         r4d, 4
+    jz         .end
+
+.process2:
+    cmp         r4d, 2
+    jl         .process1
+    movd        xm1, [r0]
+    psllw       xm1, xm0
+    pand        xm1, xm3
+    movd        [r2], xm1
+
+    add         r0, 4
+    add         r2, 4
+    sub         r4d, 2
+    jz         .end
+
+.process1:
+    ; exactly one pixel remains here (zero was filtered out at .tail and after every step)
+    movzx       r3d, word [r0]          ; load exactly one pixel; r3 (dstStride) is free in the last row
+    movd        xm1, r3d
+    psllw       xm1, xm0
+    pand        xm1, xm3
+    movd        r3d, xm1
+    mov         [r2], r3w
+.end:
+    RET
+
+
;---------------------------------------------------------------------------------------------------------------------
;int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
;---------------------------------------------------------------------------------------------------------------------
diff -r fbfe1d0c586f -r 8023786c5247 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Mon Jul 13 18:00:30 2015 +0530
+++ b/source/common/x86/pixel.h Mon Jul 13 17:38:02 2015 -0700
@@ -30,6 +30,8 @@
void PFX(downShift_16_sse2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
void PFX(downShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
+void PFX(upShift_16_sse2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask); // assigned to p.planecopy_sp_shl in setupAssemblyPrimitives (SSE2 path)
+void PFX(upShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask); // assigned to p.planecopy_sp_shl in setupAssemblyPrimitives (AVX2 path)
void PFX(upShift_8_sse4)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
void PFX(upShift_8_avx2)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
diff -r fbfe1d0c586f -r 8023786c5247 source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm Mon Jul 13 18:00:30 2015 +0530
+++ b/source/common/x86/sad16-a.asm Mon Jul 13 17:38:02 2015 -0700
@@ -6,6 +6,7 @@
;* Authors: Oskar Arvidsson <oskar at irock.se>
;* Henrik Gramner <henrik at gramner.com>
;* Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
+;* Min Chen <chenm003 at 163.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
@@ -51,8 +52,14 @@ cextern pw_1
lea r2, [r2+2*r3]
paddw m1, m2
paddw m3, m4
+ %if BIT_DEPTH <= 10
More information about the x265-commits
mailing list