[x265-commits] [x265] asm: intra_pred_ang4_2_sse2 16-bit
David T Yuen
dtyx265 at gmail.com
Sun Apr 5 22:07:55 CEST 2015
details: http://hg.videolan.org/x265/rev/8b33482063b6
branches:
changeset: 10045:8b33482063b6
user: David T Yuen <dtyx265 at gmail.com>
date: Fri Apr 03 07:56:12 2015 -0700
description:
asm: intra_pred_ang4_2_sse2 16-bit
This is backported from sse4 code and replaces c code.
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[ 2\]"
intra_ang_4x4[ 2] 8.76x 142.46 1248.07
Subject: [x265] asm: intra_pred_ang4_3_sse2 16-bit
details: http://hg.videolan.org/x265/rev/dff81bfb03b2
branches:
changeset: 10046:dff81bfb03b2
user: David T Yuen <dtyx265 at gmail.com>
date: Fri Apr 03 08:18:32 2015 -0700
description:
asm: intra_pred_ang4_3_sse2 16-bit
This is backported from sse4 code and replaces c code.
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[ 3\]"
intra_ang_4x4[ 3] 3.65x 517.65 1888.27
transposed mode 33
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[33\]"
intra_ang_4x4[33] 3.30x 412.82 1360.41
Subject: [x265] asm: intra_pred_ang4_4_sse2 16-bit
details: http://hg.videolan.org/x265/rev/22237958e739
branches:
changeset: 10047:22237958e739
user: David T Yuen <dtyx265 at gmail.com>
date: Fri Apr 03 08:24:22 2015 -0700
description:
asm: intra_pred_ang4_4_sse2 16-bit
This is backported from sse4 code and replaces c code.
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[ 4\]"
intra_ang_4x4[ 4] 4.29x 475.00 2037.57
transposed mode 32
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[32\]"
intra_ang_4x4[32] 3.82x 394.97 1510.33
Subject: [x265] asm: intra_pred_ang4_5_sse2 16-bit
details: http://hg.videolan.org/x265/rev/303e667be3a1
branches:
changeset: 10048:303e667be3a1
user: David T Yuen <dtyx265 at gmail.com>
date: Fri Apr 03 08:28:05 2015 -0700
description:
asm: intra_pred_ang4_5_sse2 16-bit
This is backported from sse4 code and replaces c code.
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[ 5\]"
intra_ang_4x4[ 5] 4.10x 497.50 2037.46
transposed mode 31
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[31\]"
intra_ang_4x4[31] 3.82x 394.97 1510.36
Subject: [x265] asm: intra_pred_ang4_6_sse2 16-bit
details: http://hg.videolan.org/x265/rev/2f61369d17ac
branches:
changeset: 10049:2f61369d17ac
user: David T Yuen <dtyx265 at gmail.com>
date: Fri Apr 03 08:31:18 2015 -0700
description:
asm: intra_pred_ang4_6_sse2 16-bit
This is backported from sse4 code and replaces c code.
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[ 6\]"
intra_ang_4x4[ 6] 4.63x 442.50 2047.46
transposed mode 30
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[30\]"
intra_ang_4x4[30] 4.20x 360.02 1512.88
Subject: [x265] asm: intra_pred_ang4_7_sse2 16-bit
details: http://hg.videolan.org/x265/rev/fefaa22f64e9
branches:
changeset: 10050:fefaa22f64e9
user: David T Yuen <dtyx265 at gmail.com>
date: Fri Apr 03 08:34:09 2015 -0700
description:
asm: intra_pred_ang4_7_sse2 16-bit
This is backported from sse4 code and replaces c code.
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[ 7\]"
intra_ang_4x4[ 7] 4.08x 465.00 1894.96
transposed mode 29
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[29\]"
intra_ang_4x4[29] 3.60x 377.48 1360.65
Subject: [x265] asm: intra_pred_ang4_8_sse2 16-bit
details: http://hg.videolan.org/x265/rev/1bb0face473b
branches:
changeset: 10051:1bb0face473b
user: David T Yuen <dtyx265 at gmail.com>
date: Fri Apr 03 08:37:03 2015 -0700
description:
asm: intra_pred_ang4_8_sse2 16-bit
This is backported from sse4 code and replaces c code.
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[ 8\]"
intra_ang_4x4[ 8] 4.34x 460.13 1994.96
transposed mode 28
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[28\]"
intra_ang_4x4[28] 3.63x 402.73 1461.14
Subject: [x265] asm: intra_pred_ang4_9_sse2 16-bit
details: http://hg.videolan.org/x265/rev/69ac3280d8e8
branches:
changeset: 10052:69ac3280d8e8
user: David T Yuen <dtyx265 at gmail.com>
date: Fri Apr 03 08:40:26 2015 -0700
description:
asm: intra_pred_ang4_9_sse2 16-bit
This is backported from sse4 code and replaces c code.
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[ 9\]"
intra_ang_4x4[ 9] 4.33x 457.50 1982.47
transposed mode 27
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[27\]"
intra_ang_4x4[27] 3.61x 402.54 1453.19
Subject: [x265] asm: intra_pred_ang4_10_sse2 16-bit
details: http://hg.videolan.org/x265/rev/8370818534eb
branches:
changeset: 10053:8370818534eb
user: David T Yuen <dtyx265 at gmail.com>
date: Fri Apr 03 08:46:08 2015 -0700
description:
asm: intra_pred_ang4_10_sse2 16-bit
This is backported from sse4 code and replaces c code.
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[10\]"
intra_ang_4x4[10] 6.40x 197.60 1264.07
Subject: [x265] asm: intra_pred_ang4_26_sse2 16-bit
details: http://hg.videolan.org/x265/rev/15aff00db638
branches:
changeset: 10054:15aff00db638
user: David T Yuen <dtyx265 at gmail.com>
date: Fri Apr 03 08:50:10 2015 -0700
description:
asm: intra_pred_ang4_26_sse2 16-bit
This is backported from sse4 code and replaces c code.
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[26\]"
intra_ang_4x4[26] 3.76x 200.35 754.13
Subject: [x265] asm: intra_pred_ang4_11_sse2 16-bit
details: http://hg.videolan.org/x265/rev/b532a1ee8ac0
branches:
changeset: 10055:b532a1ee8ac0
user: David T Yuen <dtyx265 at gmail.com>
date: Fri Apr 03 08:53:42 2015 -0700
description:
asm: intra_pred_ang4_11_sse2 16-bit
This is backported from sse4 code and replaces c code.
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[11\]"
intra_ang_4x4[11] 4.19x 462.50 1938.83
transposed mode 25
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[25\]"
intra_ang_4x4[25] 3.80x 384.97 1462.87
Subject: [x265] asm: intra_pred_ang4_12_sse2 16-bit
details: http://hg.videolan.org/x265/rev/4af86806ec56
branches:
changeset: 10056:4af86806ec56
user: David T Yuen <dtyx265 at gmail.com>
date: Fri Apr 03 08:56:27 2015 -0700
description:
asm: intra_pred_ang4_12_sse2 16-bit
This is backported from sse4 code and replaces c code.
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[12\]"
intra_ang_4x4[12] 4.40x 462.50 2032.77
transposed mode 24
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[24\]"
intra_ang_4x4[24] 3.76x 402.72 1514.26
Subject: [x265] asm: intra_pred_ang4_13_sse2 16-bit
details: http://hg.videolan.org/x265/rev/9ee4868f8206
branches:
changeset: 10057:9ee4868f8206
user: David T Yuen <dtyx265 at gmail.com>
date: Fri Apr 03 09:00:07 2015 -0700
description:
asm: intra_pred_ang4_13_sse2 16-bit
This is backported from sse4 code and replaces c code.
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[13\]"
intra_ang_4x4[13] 4.11x 525.09 2155.55
transposed mode 23
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[23\]"
intra_ang_4x4[23] 3.53x 460.18 1623.95
Subject: [x265] asm: intra_pred_ang4_14_sse2 16-bit
details: http://hg.videolan.org/x265/rev/52d41e99a056
branches:
changeset: 10058:52d41e99a056
user: David T Yuen <dtyx265 at gmail.com>
date: Fri Apr 03 09:02:38 2015 -0700
description:
asm: intra_pred_ang4_14_sse2 16-bit
This is backported from sse4 code and replaces c code.
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[14\]"
intra_ang_4x4[14] 4.25x 504.99 2147.57
transposed mode 22
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[22\]"
intra_ang_4x4[22] 3.79x 442.58 1675.51
Subject: [x265] asm: intra_pred_ang4_15_sse2 16-bit
details: http://hg.videolan.org/x265/rev/7a58f674172b
branches:
changeset: 10059:7a58f674172b
user: David T Yuen <dtyx265 at gmail.com>
date: Fri Apr 03 09:05:30 2015 -0700
description:
asm: intra_pred_ang4_15_sse2 16-bit
This is backported from sse4 code and replaces c code.
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[15\]"
intra_ang_4x4[15] 4.12x 502.63 2073.01
transposed mode 21
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[21\]"
intra_ang_4x4[21] 3.80x 425.10 1616.30
Subject: [x265] asm: intra_pred_ang4_16_sse2 16-bit
details: http://hg.videolan.org/x265/rev/ce56052e5afd
branches:
changeset: 10060:ce56052e5afd
user: David T Yuen <dtyx265 at gmail.com>
date: Fri Apr 03 09:09:04 2015 -0700
description:
asm: intra_pred_ang4_16_sse2 16-bit
This is backported from sse4 code and replaces c code.
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[16\]"
intra_ang_4x4[16] 4.30x 502.59 2162.63
transposed mode 20
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[20\]"
intra_ang_4x4[20] 4.03x 420.12 1693.37
Subject: [x265] asm: intra_pred_ang4_17_sse2 16-bit
details: http://hg.videolan.org/x265/rev/bbbee479041b
branches:
changeset: 10061:bbbee479041b
user: David T Yuen <dtyx265 at gmail.com>
date: Fri Apr 03 09:20:42 2015 -0700
description:
asm: intra_pred_ang4_17_sse2 16-bit
This is backported from sse4 code and replaces c code.
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[17\]"
intra_ang_4x4[17] 3.80x 580.18 2202.73
transposed mode 19
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[19\]"
intra_ang_4x4[19] 3.39x 512.79 1735.90
Subject: [x265] asm: intra_pred_ang4_18_sse2 16-bit
details: http://hg.videolan.org/x265/rev/562ce7cbd673
branches:
changeset: 10062:562ce7cbd673
user: David T Yuen <dtyx265 at gmail.com>
date: Fri Apr 03 09:22:47 2015 -0700
description:
asm: intra_pred_ang4_18_sse2 16-bit
This is backported from sse4 code and replaces c code.
./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[18\]"
intra_ang_4x4[18] 8.54x 127.48 1088.34
Subject: [x265] cli: move raw bitstream output to separate file
details: http://hg.videolan.org/x265/rev/4433f6cf89c7
branches:
changeset: 10063:4433f6cf89c7
user: Xinyue Lu <i at 7086.in>
date: Fri Apr 03 16:22:38 2015 -0700
description:
cli: move raw bitstream output to separate file
Timebase and PTS are passed to output module
Subject: [x265] encoder: do not disable the thread pool if lookahead-slices is enabled
details: http://hg.videolan.org/x265/rev/fa6451ec733a
branches:
changeset: 10064:fa6451ec733a
user: Steve Borho <steve at borho.org>
date: Fri Apr 03 22:39:23 2015 -0500
description:
encoder: do not disable the thread pool if lookahead-slices is enabled
The user is asking for a pool feature, give them a thread-pool.
This fixes a regression test which used --no-wpp --lookahead-slices 2. By
default this had no thread pool and so lookahead-slices 2 was ignored. But if
the --pme spot-check was randomly selected, it would re-enable the pool and
re-enable lookahead-slices and the outputs would change. After this change,
the lookahead slices prevent --no-wpp from disabling the pool itself.
Subject: [x265] threading: introduce poke() method for ThreadSafeInteger
details: http://hg.videolan.org/x265/rev/1d8ce4e9eb13
branches: stable
changeset: 10065:1d8ce4e9eb13
user: Steve Borho <steve at borho.org>
date: Sat Apr 04 14:04:14 2015 -0500
description:
threading: introduce poke() method for ThreadSafeInteger
Sometimes we need to waken all threads that are blocked on a TSI,
but do not want to change its value (we want the blocked threads to
re-check other state variables). We were using set(get()) for this
but that is a race hazard. We've seen one deadlock in the vicinity
of this code and so are removing this hazard even though we are not
certain this was the root cause of the deadlock.
Subject: [x265] Merge with stable
details: http://hg.videolan.org/x265/rev/e0523096bb21
branches:
changeset: 10066:e0523096bb21
user: Steve Borho <steve at borho.org>
date: Sat Apr 04 15:06:40 2015 -0400
description:
Merge with stable
Subject: [x265] asm: intra_pred4_x filtering
details: http://hg.videolan.org/x265/rev/fa7802c5d94f
branches:
changeset: 10067:fa7802c5d94f
user: David T Yuen <dtyx265 at gmail.com>
date: Sat Apr 04 11:42:34 2015 -0700
description:
asm: intra_pred4_x filtering
Use r4 to hold address of constant to reduce code size
Subject: [x265] asm: intra_pred_ang4_26_sse2
details: http://hg.videolan.org/x265/rev/ea9d4f255b91
branches:
changeset: 10068:ea9d4f255b91
user: David T Yuen <dtyx265 at gmail.com>
date: Sat Apr 04 11:55:13 2015 -0700
description:
asm: intra_pred_ang4_26_sse2
changed r1 to r1d to reduce code size
Subject: [x265] asm: intra_pred_ang4_18
details: http://hg.videolan.org/x265/rev/fca904380abf
branches:
changeset: 10069:fca904380abf
user: David T Yuen <dtyx265 at gmail.com>
date: Sat Apr 04 12:00:59 2015 -0700
description:
asm: intra_pred_ang4_18
Changed third pshuflw parameter from hexadecimal to quaternary
The value is unchanged, so this patch is strictly cosmetic and therefore optional.
Subject: [x265] cli: tweak output file logging
details: http://hg.videolan.org/x265/rev/ebe5e57c4b45
branches:
changeset: 10070:ebe5e57c4b45
user: Steve Borho <steve at borho.org>
date: Sat Apr 04 15:11:39 2015 -0500
description:
cli: tweak output file logging
Prefix log messages for raw output bitstreams with "raw"; presumably MKV or MP4
muxers would use "mkv" or "mp4".
diffstat:
doc/reST/cli.rst | 8 +-
doc/reST/threading.rst | 1 +
source/common/threading.h | 16 +
source/common/x86/asm-primitives.cpp | 33 ++
source/common/x86/intrapred16.asm | 502 +++++++++++++++++++++++++++++++++++
source/encoder/encoder.cpp | 6 +-
source/encoder/ratecontrol.cpp | 4 +-
source/input/input.h | 2 +
source/output/output.cpp | 10 +-
source/output/output.h | 33 ++-
source/output/raw.cpp | 77 +++++
source/output/raw.h | 64 ++++
source/x265.cpp | 83 +++-
13 files changed, 803 insertions(+), 36 deletions(-)
diffs (truncated from 1128 to 300 lines):
diff -r 335c728bbd62 -r ebe5e57c4b45 doc/reST/cli.rst
--- a/doc/reST/cli.rst Fri Apr 03 14:27:32 2015 -0500
+++ b/doc/reST/cli.rst Sat Apr 04 15:11:39 2015 -0500
@@ -201,11 +201,11 @@ Performance Options
their node, they will not be allowed to migrate between nodes, but they
will be allowed to move between CPU cores within their node.
- If the three pool features: :option:`--wpp` :option:`--pmode` and
- :option:`--pme` are all disabled, then :option:`--pools` is ignored
- and no thread pools are created.
+ If the four pool features: :option:`--wpp`, :option:`--pmode`,
+ :option:`--pme` and :option:`--lookahead-slices` are all disabled,
+ then :option:`--pools` is ignored and no thread pools are created.
- If "none" is specified, then all three of the thread pool features are
+ If "none" is specified, then all four of the thread pool features are
implicitly disabled.
Multiple thread pools will be allocated for any NUMA node with more than
diff -r 335c728bbd62 -r ebe5e57c4b45 doc/reST/threading.rst
--- a/doc/reST/threading.rst Fri Apr 03 14:27:32 2015 -0500
+++ b/doc/reST/threading.rst Sat Apr 04 15:11:39 2015 -0500
@@ -225,6 +225,7 @@ scene cuts and slice types) uses the thr
lowres cost analysis to worker threads. It will use bonded task groups
to perform batches of frame cost estimates, and it may optionally use
bonded task groups to measure single frame cost estimates using slices.
+(see :option:`--lookahead-slices`)
The function slicetypeDecide() itself is also be performed by a worker
thread if your encoder has a thread pool, else it runs within the
diff -r 335c728bbd62 -r ebe5e57c4b45 source/common/threading.h
--- a/source/common/threading.h Fri Apr 03 14:27:32 2015 -0500
+++ b/source/common/threading.h Sat Apr 04 15:11:39 2015 -0500
@@ -189,6 +189,14 @@ public:
LeaveCriticalSection(&m_cs);
}
+ void poke(void)
+ {
+ /* awaken all waiting threads, but make no change */
+ EnterCriticalSection(&m_cs);
+ WakeAllConditionVariable(&m_cv);
+ LeaveCriticalSection(&m_cs);
+ }
+
void incr()
{
EnterCriticalSection(&m_cs);
@@ -370,6 +378,14 @@ public:
pthread_mutex_unlock(&m_mutex);
}
+ void poke(void)
+ {
+ /* awaken all waiting threads, but make no change */
+ pthread_mutex_lock(&m_mutex);
+ pthread_cond_broadcast(&m_cond);
+ pthread_mutex_unlock(&m_mutex);
+ }
+
void incr()
{
pthread_mutex_lock(&m_mutex);
diff -r 335c728bbd62 -r ebe5e57c4b45 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Apr 03 14:27:32 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Sat Apr 04 15:11:39 2015 -0500
@@ -879,6 +879,39 @@ void setupAssemblyPrimitives(EncoderPrim
p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = x265_intra_pred_planar16_sse2;
p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = x265_intra_pred_planar32_sse2;
+ p.cu[BLOCK_4x4].intra_pred[2] = x265_intra_pred_ang4_2_sse2;
+ p.cu[BLOCK_4x4].intra_pred[3] = x265_intra_pred_ang4_3_sse2;
+ p.cu[BLOCK_4x4].intra_pred[4] = x265_intra_pred_ang4_4_sse2;
+ p.cu[BLOCK_4x4].intra_pred[5] = x265_intra_pred_ang4_5_sse2;
+ p.cu[BLOCK_4x4].intra_pred[6] = x265_intra_pred_ang4_6_sse2;
+ p.cu[BLOCK_4x4].intra_pred[7] = x265_intra_pred_ang4_7_sse2;
+ p.cu[BLOCK_4x4].intra_pred[8] = x265_intra_pred_ang4_8_sse2;
+ p.cu[BLOCK_4x4].intra_pred[9] = x265_intra_pred_ang4_9_sse2;
+ p.cu[BLOCK_4x4].intra_pred[10] = x265_intra_pred_ang4_10_sse2;
+ p.cu[BLOCK_4x4].intra_pred[11] = x265_intra_pred_ang4_11_sse2;
+ p.cu[BLOCK_4x4].intra_pred[12] = x265_intra_pred_ang4_12_sse2;
+ p.cu[BLOCK_4x4].intra_pred[13] = x265_intra_pred_ang4_13_sse2;
+ p.cu[BLOCK_4x4].intra_pred[14] = x265_intra_pred_ang4_14_sse2;
+ p.cu[BLOCK_4x4].intra_pred[15] = x265_intra_pred_ang4_15_sse2;
+ p.cu[BLOCK_4x4].intra_pred[16] = x265_intra_pred_ang4_16_sse2;
+ p.cu[BLOCK_4x4].intra_pred[17] = x265_intra_pred_ang4_17_sse2;
+ p.cu[BLOCK_4x4].intra_pred[18] = x265_intra_pred_ang4_18_sse2;
+ p.cu[BLOCK_4x4].intra_pred[19] = x265_intra_pred_ang4_17_sse2;
+ p.cu[BLOCK_4x4].intra_pred[20] = x265_intra_pred_ang4_16_sse2;
+ p.cu[BLOCK_4x4].intra_pred[21] = x265_intra_pred_ang4_15_sse2;
+ p.cu[BLOCK_4x4].intra_pred[22] = x265_intra_pred_ang4_14_sse2;
+ p.cu[BLOCK_4x4].intra_pred[23] = x265_intra_pred_ang4_13_sse2;
+ p.cu[BLOCK_4x4].intra_pred[24] = x265_intra_pred_ang4_12_sse2;
+ p.cu[BLOCK_4x4].intra_pred[25] = x265_intra_pred_ang4_11_sse2;
+ p.cu[BLOCK_4x4].intra_pred[26] = x265_intra_pred_ang4_26_sse2;
+ p.cu[BLOCK_4x4].intra_pred[27] = x265_intra_pred_ang4_9_sse2;
+ p.cu[BLOCK_4x4].intra_pred[28] = x265_intra_pred_ang4_8_sse2;
+ p.cu[BLOCK_4x4].intra_pred[29] = x265_intra_pred_ang4_7_sse2;
+ p.cu[BLOCK_4x4].intra_pred[30] = x265_intra_pred_ang4_6_sse2;
+ p.cu[BLOCK_4x4].intra_pred[31] = x265_intra_pred_ang4_5_sse2;
+ p.cu[BLOCK_4x4].intra_pred[32] = x265_intra_pred_ang4_4_sse2;
+ p.cu[BLOCK_4x4].intra_pred[33] = x265_intra_pred_ang4_3_sse2;
+
p.cu[BLOCK_4x4].sse_ss = x265_pixel_ssd_ss_4x4_mmx2;
ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
diff -r 335c728bbd62 -r ebe5e57c4b45 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Fri Apr 03 14:27:32 2015 -0500
+++ b/source/common/x86/intrapred16.asm Sat Apr 04 15:11:39 2015 -0500
@@ -690,6 +690,508 @@ cglobal intra_pred_planar32, 3,3,16
%endrep
RET
+;-----------------------------------------------------------------------------------------
+; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
+;-----------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal intra_pred_ang4_2, 3,5,4
+ lea r4, [r2 + 4]
+ add r2, 20
+ cmp r3m, byte 34
+ cmove r2, r4
+
+ add r1, r1
+ movu m0, [r2]
+ movh [r0], m0
+ psrldq m0, 2
+ movh [r0 + r1], m0
+ psrldq m0, 2
+ movh [r0 + r1 * 2], m0
+ lea r1, [r1 * 3]
+ psrldq m0, 2
+ movh [r0 + r1], m0
+ RET
+
+cglobal intra_pred_ang4_3, 3,5,8
+ mov r4d, 2
+ cmp r3m, byte 33
+ mov r3d, 18
+ cmove r3d, r4d
+
+ movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
+
+ mova m2, m0
+ psrldq m0, 2
+ punpcklwd m2, m0 ; [5 4 4 3 3 2 2 1]
+ mova m3, m0
+ psrldq m0, 2
+ punpcklwd m3, m0 ; [6 5 5 4 4 3 3 2]
+ mova m4, m0
+ psrldq m0, 2
+ punpcklwd m4, m0 ; [7 6 6 5 5 4 4 3]
+ mova m5, m0
+ psrldq m0, 2
+ punpcklwd m5, m0 ; [8 7 7 6 6 5 5 4]
+
+
+ lea r3, [ang_table + 20 * 16]
+ mova m0, [r3 + 6 * 16] ; [26]
+ mova m1, [r3] ; [20]
+ mova m6, [r3 - 6 * 16] ; [14]
+ mova m7, [r3 - 12 * 16] ; [ 8]
+ jmp .do_filter4x4
+
+
+ALIGN 16
+.do_filter4x4:
+ lea r4, [pd_16]
+ pmaddwd m2, m0
+ paddd m2, [r4]
+ psrld m2, 5
+
+ pmaddwd m3, m1
+ paddd m3, [r4]
+ psrld m3, 5
+ packssdw m2, m3
+
+ pmaddwd m4, m6
+ paddd m4, [r4]
+ psrld m4, 5
+
+ pmaddwd m5, m7
+ paddd m5, [r4]
+ psrld m5, 5
+ packssdw m4, m5
+
+ jz .store
+
+ ; transpose 4x4
+ punpckhwd m0, m2, m4
+ punpcklwd m2, m4
+ punpckhwd m4, m2, m0
+ punpcklwd m2, m0
+
+.store:
+ add r1, r1
+ movh [r0], m2
+ movhps [r0 + r1], m2
+ movh [r0 + r1 * 2], m4
+ lea r1, [r1 * 3]
+ movhps [r0 + r1], m4
+ RET
+
+cglobal intra_pred_ang4_4, 3,5,8
+ mov r4d, 2
+ cmp r3m, byte 32
+ mov r3d, 18
+ cmove r3d, r4d
+
+ movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
+ mova m2, m0
+ psrldq m0, 2
+ punpcklwd m2, m0 ; [5 4 4 3 3 2 2 1]
+ mova m3, m0
+ psrldq m0, 2
+ punpcklwd m3, m0 ; [6 5 5 4 4 3 3 2]
+ mova m4, m3
+ mova m5, m0
+ psrldq m0, 2
+ punpcklwd m5, m0 ; [7 6 6 5 5 4 4 3]
+
+ lea r3, [ang_table + 18 * 16]
+ mova m0, [r3 + 3 * 16] ; [21]
+ mova m1, [r3 - 8 * 16] ; [10]
+ mova m6, [r3 + 13 * 16] ; [31]
+ mova m7, [r3 + 2 * 16] ; [20]
+ jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+
+cglobal intra_pred_ang4_5, 3,5,8
+ mov r4d, 2
+ cmp r3m, byte 31
+ mov r3d, 18
+ cmove r3d, r4d
+
+ movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
+ mova m2, m0
+ psrldq m0, 2
+ punpcklwd m2, m0 ; [5 4 4 3 3 2 2 1]
+ mova m3, m0
+ psrldq m0, 2
+ punpcklwd m3, m0 ; [6 5 5 4 4 3 3 2]
+ mova m4, m3
+ mova m5, m0
+ psrldq m0, 2
+ punpcklwd m5, m0 ; [7 6 6 5 5 4 4 3]
+
+ lea r3, [ang_table + 10 * 16]
+ mova m0, [r3 + 7 * 16] ; [17]
+ mova m1, [r3 - 8 * 16] ; [ 2]
+ mova m6, [r3 + 9 * 16] ; [19]
+ mova m7, [r3 - 6 * 16] ; [ 4]
+ jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+
+cglobal intra_pred_ang4_6, 3,5,8
+ mov r4d, 2
+ cmp r3m, byte 30
+ mov r3d, 18
+ cmove r3d, r4d
+
+ movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
+ mova m2, m0
+ psrldq m0, 2
+ punpcklwd m2, m0 ; [5 4 4 3 3 2 2 1]
+ mova m3, m2
+ mova m4, m0
+ psrldq m0, 2
+ punpcklwd m4, m0 ; [6 5 5 4 4 3 3 2]
+ mova m5, m4
+
+ lea r3, [ang_table + 19 * 16]
+ mova m0, [r3 - 6 * 16] ; [13]
+ mova m1, [r3 + 7 * 16] ; [26]
+ mova m6, [r3 - 12 * 16] ; [ 7]
+ mova m7, [r3 + 1 * 16] ; [20]
+ jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+
+cglobal intra_pred_ang4_7, 3,5,8
+ mov r4d, 2
+ cmp r3m, byte 29
+ mov r3d, 18
+ cmove r3d, r4d
+
+ movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
+ mova m2, m0
+ psrldq m0, 2
+ punpcklwd m2, m0 ; [5 4 4 3 3 2 2 1]
+ mova m3, m2
+ mova m4, m2
+ mova m5, m0
+ psrldq m0, 2
+ punpcklwd m5, m0 ; [6 5 5 4 4 3 3 2]
+
+ lea r3, [ang_table + 20 * 16]
+ mova m0, [r3 - 11 * 16] ; [ 9]
+ mova m1, [r3 - 2 * 16] ; [18]
+ mova m6, [r3 + 7 * 16] ; [27]
+ mova m7, [r3 - 16 * 16] ; [ 4]
+ jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+
+cglobal intra_pred_ang4_8, 3,5,8
More information about the x265-commits
mailing list