[x265-commits] [x265] asm: intra_pred_ang4_2_sse2 16-bit

David T Yuen dtyx265 at gmail.com
Sun Apr 5 22:07:55 CEST 2015


details:   http://hg.videolan.org/x265/rev/8b33482063b6
branches:  
changeset: 10045:8b33482063b6
user:      David T Yuen <dtyx265 at gmail.com>
date:      Fri Apr 03 07:56:12 2015 -0700
description:
asm: intra_pred_ang4_2_sse2 16-bit

This is backported from sse4 code and replaces c code.

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[ 2\]"
intra_ang_4x4[ 2]	8.76x 	 142.46   	 1248.07
Subject: [x265] asm: intra_pred_ang4_3_sse2 16-bit

details:   http://hg.videolan.org/x265/rev/dff81bfb03b2
branches:  
changeset: 10046:dff81bfb03b2
user:      David T Yuen <dtyx265 at gmail.com>
date:      Fri Apr 03 08:18:32 2015 -0700
description:
asm: intra_pred_ang4_3_sse2 16-bit

This is backported from sse4 code and replaces c code.

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[ 3\]"
intra_ang_4x4[ 3]	3.65x 	 517.65   	 1888.27

transposed mode 33

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[33\]"
intra_ang_4x4[33]	3.30x 	 412.82   	 1360.41
Subject: [x265] asm: intra_pred_ang4_4_sse2 16-bit

details:   http://hg.videolan.org/x265/rev/22237958e739
branches:  
changeset: 10047:22237958e739
user:      David T Yuen <dtyx265 at gmail.com>
date:      Fri Apr 03 08:24:22 2015 -0700
description:
asm: intra_pred_ang4_4_sse2 16-bit

This is backported from sse4 code and replaces c code.

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[ 4\]"
intra_ang_4x4[ 4]	4.29x 	 475.00   	 2037.57

transposed mode 32

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[32\]"
intra_ang_4x4[32]	3.82x 	 394.97   	 1510.33
Subject: [x265] asm: intra_pred_ang4_5_sse2 16-bit

details:   http://hg.videolan.org/x265/rev/303e667be3a1
branches:  
changeset: 10048:303e667be3a1
user:      David T Yuen <dtyx265 at gmail.com>
date:      Fri Apr 03 08:28:05 2015 -0700
description:
asm: intra_pred_ang4_5_sse2 16-bit

This is backported from sse4 code and replaces c code.

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[ 5\]"
intra_ang_4x4[ 5]	4.10x 	 497.50   	 2037.46

transposed mode 31

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[31\]"
intra_ang_4x4[31]	3.82x 	 394.97   	 1510.36
Subject: [x265] asm: intra_pred_ang4_6_sse2 16-bit

details:   http://hg.videolan.org/x265/rev/2f61369d17ac
branches:  
changeset: 10049:2f61369d17ac
user:      David T Yuen <dtyx265 at gmail.com>
date:      Fri Apr 03 08:31:18 2015 -0700
description:
asm: intra_pred_ang4_6_sse2 16-bit

This is backported from sse4 code and replaces c code.

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[ 6\]"
intra_ang_4x4[ 6]	4.63x 	 442.50   	 2047.46

transposed mode 30

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[30\]"
intra_ang_4x4[30]	4.20x 	 360.02   	 1512.88
Subject: [x265] asm: intra_pred_ang4_7_sse2 16-bit

details:   http://hg.videolan.org/x265/rev/fefaa22f64e9
branches:  
changeset: 10050:fefaa22f64e9
user:      David T Yuen <dtyx265 at gmail.com>
date:      Fri Apr 03 08:34:09 2015 -0700
description:
asm: intra_pred_ang4_7_sse2 16-bit

This is backported from sse4 code and replaces c code.

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[ 7\]"
intra_ang_4x4[ 7]	4.08x 	 465.00   	 1894.96

transposed mode 29

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[29\]"
intra_ang_4x4[29]	3.60x 	 377.48   	 1360.65
Subject: [x265] asm: intra_pred_ang4_8_sse2 16-bit

details:   http://hg.videolan.org/x265/rev/1bb0face473b
branches:  
changeset: 10051:1bb0face473b
user:      David T Yuen <dtyx265 at gmail.com>
date:      Fri Apr 03 08:37:03 2015 -0700
description:
asm: intra_pred_ang4_8_sse2 16-bit

This is backported from sse4 code and replaces c code.

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[ 8\]"
intra_ang_4x4[ 8]	4.34x 	 460.13   	 1994.96

transposed mode 28

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[28\]"
intra_ang_4x4[28]	3.63x 	 402.73   	 1461.14
Subject: [x265] asm: intra_pred_ang4_9_sse2 16-bit

details:   http://hg.videolan.org/x265/rev/69ac3280d8e8
branches:  
changeset: 10052:69ac3280d8e8
user:      David T Yuen <dtyx265 at gmail.com>
date:      Fri Apr 03 08:40:26 2015 -0700
description:
asm: intra_pred_ang4_9_sse2 16-bit

This is backported from sse4 code and replaces c code.

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[ 9\]"
intra_ang_4x4[ 9]	4.33x 	 457.50   	 1982.47

transposed mode 27

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[27\]"
intra_ang_4x4[27]	3.61x 	 402.54   	 1453.19
Subject: [x265] asm: intra_pred_ang4_10_sse2 16-bit

details:   http://hg.videolan.org/x265/rev/8370818534eb
branches:  
changeset: 10053:8370818534eb
user:      David T Yuen <dtyx265 at gmail.com>
date:      Fri Apr 03 08:46:08 2015 -0700
description:
asm: intra_pred_ang4_10_sse2 16-bit

This is backported from sse4 code and replaces c code.

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[10\]"
intra_ang_4x4[10]	6.40x 	 197.60   	 1264.07
Subject: [x265] asm: intra_pred_ang4_26_sse2 16-bit

details:   http://hg.videolan.org/x265/rev/15aff00db638
branches:  
changeset: 10054:15aff00db638
user:      David T Yuen <dtyx265 at gmail.com>
date:      Fri Apr 03 08:50:10 2015 -0700
description:
asm: intra_pred_ang4_26_sse2 16-bit

This is backported from sse4 code and replaces c code.

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[26\]"
intra_ang_4x4[26]	3.76x 	 200.35   	 754.13
Subject: [x265] asm: intra_pred_ang4_11_sse2 16-bit

details:   http://hg.videolan.org/x265/rev/b532a1ee8ac0
branches:  
changeset: 10055:b532a1ee8ac0
user:      David T Yuen <dtyx265 at gmail.com>
date:      Fri Apr 03 08:53:42 2015 -0700
description:
asm: intra_pred_ang4_11_sse2 16-bit

This is backported from sse4 code and replaces c code.

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[11\]"
intra_ang_4x4[11]	4.19x 	 462.50   	 1938.83

transposed mode 25

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[25\]"
intra_ang_4x4[25]	3.80x 	 384.97   	 1462.87
Subject: [x265] asm: intra_pred_ang4_12_sse2 16-bit

details:   http://hg.videolan.org/x265/rev/4af86806ec56
branches:  
changeset: 10056:4af86806ec56
user:      David T Yuen <dtyx265 at gmail.com>
date:      Fri Apr 03 08:56:27 2015 -0700
description:
asm: intra_pred_ang4_12_sse2 16-bit

This is backported from sse4 code and replaces c code.

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[12\]"
intra_ang_4x4[12]	4.40x 	 462.50   	 2032.77

transposed mode 24

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[24\]"
intra_ang_4x4[24]	3.76x 	 402.72   	 1514.26
Subject: [x265] asm: intra_pred_ang4_13_sse2 16-bit

details:   http://hg.videolan.org/x265/rev/9ee4868f8206
branches:  
changeset: 10057:9ee4868f8206
user:      David T Yuen <dtyx265 at gmail.com>
date:      Fri Apr 03 09:00:07 2015 -0700
description:
asm: intra_pred_ang4_13_sse2 16-bit

This is backported from sse4 code and replaces c code.

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[13\]"
intra_ang_4x4[13]	4.11x 	 525.09   	 2155.55

transposed mode 23

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[23\]"
intra_ang_4x4[23]	3.53x 	 460.18   	 1623.95
Subject: [x265] asm: intra_pred_ang4_14_sse2 16-bit

details:   http://hg.videolan.org/x265/rev/52d41e99a056
branches:  
changeset: 10058:52d41e99a056
user:      David T Yuen <dtyx265 at gmail.com>
date:      Fri Apr 03 09:02:38 2015 -0700
description:
asm: intra_pred_ang4_14_sse2 16-bit

This is backported from sse4 code and replaces c code.

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[14\]"
intra_ang_4x4[14]	4.25x 	 504.99   	 2147.57

transposed mode 22

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[22\]"
intra_ang_4x4[22]	3.79x 	 442.58   	 1675.51
Subject: [x265] asm: intra_pred_ang4_15_sse2 16-bit

details:   http://hg.videolan.org/x265/rev/7a58f674172b
branches:  
changeset: 10059:7a58f674172b
user:      David T Yuen <dtyx265 at gmail.com>
date:      Fri Apr 03 09:05:30 2015 -0700
description:
asm: intra_pred_ang4_15_sse2 16-bit

This is backported from sse4 code and replaces c code.

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[15\]"
intra_ang_4x4[15]	4.12x 	 502.63   	 2073.01

transposed mode 21

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[21\]"
intra_ang_4x4[21]	3.80x 	 425.10   	 1616.30
Subject: [x265] asm: intra_pred_ang4_16_sse2 16-bit

details:   http://hg.videolan.org/x265/rev/ce56052e5afd
branches:  
changeset: 10060:ce56052e5afd
user:      David T Yuen <dtyx265 at gmail.com>
date:      Fri Apr 03 09:09:04 2015 -0700
description:
asm: intra_pred_ang4_16_sse2 16-bit

This is backported from sse4 code and replaces c code.

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[16\]"
intra_ang_4x4[16]	4.30x 	 502.59   	 2162.63

transposed mode 20

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[20\]"
intra_ang_4x4[20]	4.03x 	 420.12   	 1693.37
Subject: [x265] asm: intra_pred_ang4_17_sse2 16-bit

details:   http://hg.videolan.org/x265/rev/bbbee479041b
branches:  
changeset: 10061:bbbee479041b
user:      David T Yuen <dtyx265 at gmail.com>
date:      Fri Apr 03 09:20:42 2015 -0700
description:
asm: intra_pred_ang4_17_sse2 16-bit

This is backported from sse4 code and replaces c code.

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[17\]"
intra_ang_4x4[17]	3.80x 	 580.18   	 2202.73

transposed mode 19

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[19\]"
intra_ang_4x4[19]	3.39x 	 512.79   	 1735.90
Subject: [x265] asm: intra_pred_ang4_18_sse2 16-bit

details:   http://hg.videolan.org/x265/rev/562ce7cbd673
branches:  
changeset: 10062:562ce7cbd673
user:      David T Yuen <dtyx265 at gmail.com>
date:      Fri Apr 03 09:22:47 2015 -0700
description:
asm: intra_pred_ang4_18_sse2 16-bit

This is backported from sse4 code and replaces c code.

./test/TestBench --testbench intrapred | grep "intra_ang_4x4\[18\]"
intra_ang_4x4[18]	8.54x 	 127.48   	 1088.34
Subject: [x265] cli: move raw bitstream output to separate file

details:   http://hg.videolan.org/x265/rev/4433f6cf89c7
branches:  
changeset: 10063:4433f6cf89c7
user:      Xinyue Lu <i at 7086.in>
date:      Fri Apr 03 16:22:38 2015 -0700
description:
cli: move raw bitstream output to separate file

Timebase and PTS are passed to output module
Subject: [x265] encoder: do not disable the thread pool if lookahead-slices is enabled

details:   http://hg.videolan.org/x265/rev/fa6451ec733a
branches:  
changeset: 10064:fa6451ec733a
user:      Steve Borho <steve at borho.org>
date:      Fri Apr 03 22:39:23 2015 -0500
description:
encoder: do not disable the thread pool if lookahead-slices is enabled

The user is asking for a pool feature, give them a thread-pool.

This fixes a regression test which used --no-wpp --lookahead-slices 2. By
default this had no thread pool and so lookahead-slices 2 was ignored. But if
the --pme spot-check was randomly selected, it would re-enable the pool and
re-enable lookahead-slices and the outputs would change. After this change,
the lookahead slices prevent --no-wpp from disabling the pool itself.
Subject: [x265] threading: introduce poke() method for ThreadSafeInteger

details:   http://hg.videolan.org/x265/rev/1d8ce4e9eb13
branches:  stable
changeset: 10065:1d8ce4e9eb13
user:      Steve Borho <steve at borho.org>
date:      Sat Apr 04 14:04:14 2015 -0500
description:
threading: introduce poke() method for ThreadSafeInteger

Sometimes we need to waken all threads that are blocked on a TSI,
but do not want to change its value (we want the blocked threads to
re-check other state variables). We were using set(get()) for this
but that is a race hazard.  We've seen one deadlock in the vicinity
of this code and so are removing this hazard even though we are not
certain this was the root cause of the deadlock.
Subject: [x265] Merge with stable

details:   http://hg.videolan.org/x265/rev/e0523096bb21
branches:  
changeset: 10066:e0523096bb21
user:      Steve Borho <steve at borho.org>
date:      Sat Apr 04 15:06:40 2015 -0400
description:
Merge with stable
Subject: [x265] asm: intra_pred4_x filtering

details:   http://hg.videolan.org/x265/rev/fa7802c5d94f
branches:  
changeset: 10067:fa7802c5d94f
user:      David T Yuen <dtyx265 at gmail.com>
date:      Sat Apr 04 11:42:34 2015 -0700
description:
asm: intra_pred4_x filtering

Use r4 to hold address of constant to reduce code size
Subject: [x265] asm: intra_pred_ang4_26_sse2

details:   http://hg.videolan.org/x265/rev/ea9d4f255b91
branches:  
changeset: 10068:ea9d4f255b91
user:      David T Yuen <dtyx265 at gmail.com>
date:      Sat Apr 04 11:55:13 2015 -0700
description:
asm: intra_pred_ang4_26_sse2

changed r1 to r1d to reduce code size
Subject: [x265] asm: intra_pred_ang4_18

details:   http://hg.videolan.org/x265/rev/fca904380abf
branches:  
changeset: 10069:fca904380abf
user:      David T Yuen <dtyx265 at gmail.com>
date:      Sat Apr 04 12:00:59 2015 -0700
description:
asm: intra_pred_ang4_18

Changed third pshuflw parameter from hexadecimal to quaternary
The value is unchanged, so this patch is strictly cosmetic and therefore optional.
Subject: [x265] cli: tweak output file logging

details:   http://hg.videolan.org/x265/rev/ebe5e57c4b45
branches:  
changeset: 10070:ebe5e57c4b45
user:      Steve Borho <steve at borho.org>
date:      Sat Apr 04 15:11:39 2015 -0500
description:
cli: tweak output file logging

Prefix log messages for raw output bitstreams with 'raw', presumably MKV or MP4
muxers would use "mkv" or "mp4"

diffstat:

 doc/reST/cli.rst                     |    8 +-
 doc/reST/threading.rst               |    1 +
 source/common/threading.h            |   16 +
 source/common/x86/asm-primitives.cpp |   33 ++
 source/common/x86/intrapred16.asm    |  502 +++++++++++++++++++++++++++++++++++
 source/encoder/encoder.cpp           |    6 +-
 source/encoder/ratecontrol.cpp       |    4 +-
 source/input/input.h                 |    2 +
 source/output/output.cpp             |   10 +-
 source/output/output.h               |   33 ++-
 source/output/raw.cpp                |   77 +++++
 source/output/raw.h                  |   64 ++++
 source/x265.cpp                      |   83 +++-
 13 files changed, 803 insertions(+), 36 deletions(-)

diffs (truncated from 1128 to 300 lines):

diff -r 335c728bbd62 -r ebe5e57c4b45 doc/reST/cli.rst
--- a/doc/reST/cli.rst	Fri Apr 03 14:27:32 2015 -0500
+++ b/doc/reST/cli.rst	Sat Apr 04 15:11:39 2015 -0500
@@ -201,11 +201,11 @@ Performance Options
 	their node, they will not be allowed to migrate between nodes, but they
 	will be allowed to move between CPU cores within their node.
 
-	If the three pool features: :option:`--wpp` :option:`--pmode` and
-	:option:`--pme` are all disabled, then :option:`--pools` is ignored
-	and no thread pools are created.
+	If the four pool features: :option:`--wpp`, :option:`--pmode`,
+	:option:`--pme` and :option:`--lookahead-slices` are all disabled,
+	then :option:`--pools` is ignored and no thread pools are created.
 
-	If "none" is specified, then all three of the thread pool features are
+	If "none" is specified, then all four of the thread pool features are
 	implicitly disabled.
 
 	Multiple thread pools will be allocated for any NUMA node with more than
diff -r 335c728bbd62 -r ebe5e57c4b45 doc/reST/threading.rst
--- a/doc/reST/threading.rst	Fri Apr 03 14:27:32 2015 -0500
+++ b/doc/reST/threading.rst	Sat Apr 04 15:11:39 2015 -0500
@@ -225,6 +225,7 @@ scene cuts and slice types) uses the thr
 lowres cost analysis to worker threads. It will use bonded task groups
 to perform batches of frame cost estimates, and it may optionally use
 bonded task groups to measure single frame cost estimates using slices.
+(see :option:`--lookahead-slices`)
 
 The function slicetypeDecide() itself is also be performed by a worker
 thread if your encoder has a thread pool, else it runs within the
diff -r 335c728bbd62 -r ebe5e57c4b45 source/common/threading.h
--- a/source/common/threading.h	Fri Apr 03 14:27:32 2015 -0500
+++ b/source/common/threading.h	Sat Apr 04 15:11:39 2015 -0500
@@ -189,6 +189,14 @@ public:
         LeaveCriticalSection(&m_cs);
     }
 
+    void poke(void)
+    {
+        /* awaken all waiting threads, but make no change to the stored value */
+        EnterCriticalSection(&m_cs);      // hold m_cs while signalling
+        WakeAllConditionVariable(&m_cv);  // wake every thread waiting on m_cv
+        LeaveCriticalSection(&m_cs);
+    }
+
     void incr()
     {
         EnterCriticalSection(&m_cs);
@@ -370,6 +378,14 @@ public:
         pthread_mutex_unlock(&m_mutex);
     }
 
+    void poke(void)
+    {
+        /* awaken all waiting threads, but make no change to the stored value */
+        pthread_mutex_lock(&m_mutex);     // hold m_mutex while signalling
+        pthread_cond_broadcast(&m_cond);  // wake every thread waiting on m_cond
+        pthread_mutex_unlock(&m_mutex);
+    }
+
     void incr()
     {
         pthread_mutex_lock(&m_mutex);
diff -r 335c728bbd62 -r ebe5e57c4b45 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Apr 03 14:27:32 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Sat Apr 04 15:11:39 2015 -0500
@@ -879,6 +879,39 @@ void setupAssemblyPrimitives(EncoderPrim
         p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = x265_intra_pred_planar16_sse2;
         p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = x265_intra_pred_planar32_sse2;
 
+        p.cu[BLOCK_4x4].intra_pred[2] = x265_intra_pred_ang4_2_sse2;
+        p.cu[BLOCK_4x4].intra_pred[3] = x265_intra_pred_ang4_3_sse2;
+        p.cu[BLOCK_4x4].intra_pred[4] = x265_intra_pred_ang4_4_sse2;
+        p.cu[BLOCK_4x4].intra_pred[5] = x265_intra_pred_ang4_5_sse2;
+        p.cu[BLOCK_4x4].intra_pred[6] = x265_intra_pred_ang4_6_sse2;
+        p.cu[BLOCK_4x4].intra_pred[7] = x265_intra_pred_ang4_7_sse2;
+        p.cu[BLOCK_4x4].intra_pred[8] = x265_intra_pred_ang4_8_sse2;
+        p.cu[BLOCK_4x4].intra_pred[9] = x265_intra_pred_ang4_9_sse2;
+        p.cu[BLOCK_4x4].intra_pred[10] = x265_intra_pred_ang4_10_sse2;
+        p.cu[BLOCK_4x4].intra_pred[11] = x265_intra_pred_ang4_11_sse2;
+        p.cu[BLOCK_4x4].intra_pred[12] = x265_intra_pred_ang4_12_sse2;
+        p.cu[BLOCK_4x4].intra_pred[13] = x265_intra_pred_ang4_13_sse2;
+        p.cu[BLOCK_4x4].intra_pred[14] = x265_intra_pred_ang4_14_sse2;
+        p.cu[BLOCK_4x4].intra_pred[15] = x265_intra_pred_ang4_15_sse2;
+        p.cu[BLOCK_4x4].intra_pred[16] = x265_intra_pred_ang4_16_sse2;
+        p.cu[BLOCK_4x4].intra_pred[17] = x265_intra_pred_ang4_17_sse2;
+        p.cu[BLOCK_4x4].intra_pred[18] = x265_intra_pred_ang4_18_sse2;
+        p.cu[BLOCK_4x4].intra_pred[19] = x265_intra_pred_ang4_17_sse2;
+        p.cu[BLOCK_4x4].intra_pred[20] = x265_intra_pred_ang4_16_sse2;
+        p.cu[BLOCK_4x4].intra_pred[21] = x265_intra_pred_ang4_15_sse2;
+        p.cu[BLOCK_4x4].intra_pred[22] = x265_intra_pred_ang4_14_sse2;
+        p.cu[BLOCK_4x4].intra_pred[23] = x265_intra_pred_ang4_13_sse2;
+        p.cu[BLOCK_4x4].intra_pred[24] = x265_intra_pred_ang4_12_sse2;
+        p.cu[BLOCK_4x4].intra_pred[25] = x265_intra_pred_ang4_11_sse2;
+        p.cu[BLOCK_4x4].intra_pred[26] = x265_intra_pred_ang4_26_sse2;
+        p.cu[BLOCK_4x4].intra_pred[27] = x265_intra_pred_ang4_9_sse2;
+        p.cu[BLOCK_4x4].intra_pred[28] = x265_intra_pred_ang4_8_sse2;
+        p.cu[BLOCK_4x4].intra_pred[29] = x265_intra_pred_ang4_7_sse2;
+        p.cu[BLOCK_4x4].intra_pred[30] = x265_intra_pred_ang4_6_sse2;
+        p.cu[BLOCK_4x4].intra_pred[31] = x265_intra_pred_ang4_5_sse2;
+        p.cu[BLOCK_4x4].intra_pred[32] = x265_intra_pred_ang4_4_sse2;
+        p.cu[BLOCK_4x4].intra_pred[33] = x265_intra_pred_ang4_3_sse2;
+
         p.cu[BLOCK_4x4].sse_ss = x265_pixel_ssd_ss_4x4_mmx2;
         ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
 
diff -r 335c728bbd62 -r ebe5e57c4b45 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Fri Apr 03 14:27:32 2015 -0500
+++ b/source/common/x86/intrapred16.asm	Sat Apr 04 15:11:39 2015 -0500
@@ -690,6 +690,508 @@ cglobal intra_pred_planar32, 3,3,16
 %endrep
     RET
 
+;-----------------------------------------------------------------------------------------
+; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
+;-----------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal intra_pred_ang4_2, 3,5,4                ; 4x4 copy-style predictor; r0 = dst, r1 = dstStride, r2 = src, dirMode read via r3m
+    lea         r4,            [r2 + 4]         ; alternative source pointer: src + 4 bytes
+    add         r2,            20               ; default source pointer: src + 20 bytes
+    cmp         r3m,           byte 34          ; dirMode == 34 ?
+    cmove       r2,            r4               ; if so, read from src + 4 instead
+
+    add         r1,            r1               ; r1 = 2 * dstStride (presumably pixels -> bytes for 16-bit samples)
+    movu        m0,            [r2]             ; unaligned 16-byte load = 8 words
+    movh        [r0],          m0               ; row 0 <- low 4 words
+    psrldq      m0,            2                ; drop the lowest word
+    movh        [r0 + r1],     m0               ; row 1
+    psrldq      m0,            2                ; drop the lowest word
+    movh        [r0 + r1 * 2], m0               ; row 2
+    lea         r1,            [r1 * 3]         ; r1 = 3 * row pitch
+    psrldq      m0,            2                ; drop the lowest word
+    movh        [r0 + r1],     m0               ; row 3
+    RET
+
+cglobal intra_pred_ang4_3, 3,5,8    ; 4x4 angular predictor; r0 = dst, r1 = dstStride, r2 = src, dirMode read via r3m
+    mov         r4d, 2              ; byte offset into src used when dirMode == 33
+    cmp         r3m, byte 33        ; ZF from this compare is consumed by cmove below AND by jz in .do_filter4x4
+    mov         r3d, 18             ; default byte offset into src (mov does not touch flags)
+    cmove       r3d, r4d            ; dirMode == 33: offset = 2
+
+    movu        m0, [r2 + r3]   ; [8 7 6 5 4 3 2 1]
+
+    mova        m2, m0
+    psrldq      m0, 2
+    punpcklwd   m2, m0      ; [5 4 4 3 3 2 2 1]
+    mova        m3, m0
+    psrldq      m0, 2
+    punpcklwd   m3, m0      ; [6 5 5 4 4 3 3 2]
+    mova        m4, m0
+    psrldq      m0, 2
+    punpcklwd   m4, m0      ; [7 6 6 5 5 4 4 3]
+    mova        m5, m0
+    psrldq      m0, 2
+    punpcklwd   m5, m0      ; [8 7 7 6 6 5 5 4]
+
+
+    lea         r3, [ang_table + 20 * 16]   ; base = ang_table entry 20; entries are addressed 16 bytes apart
+    mova        m0, [r3 + 6 * 16]   ; [26]
+    mova        m1, [r3]            ; [20]
+    mova        m6, [r3 - 6 * 16]   ; [14]
+    mova        m7, [r3 - 12 * 16]  ; [ 8]
+    jmp        .do_filter4x4        ; skips the ALIGN padding in front of the shared tail
+
+
+ALIGN 16
+.do_filter4x4:                      ; shared tail, also entered by jmp from other ang4_N functions; expects sample pairs in m2-m5, table rows in m0/m1/m6/m7, ZF from the caller's cmp
+    lea     r4, [pd_16]             ; address of the constant added before each shift (presumably packed dword 16 = rounding term)
+    pmaddwd m2, m0                  ; multiply word pairs by table row and add adjacent products -> dwords
+    paddd   m2, [r4]
+    psrld   m2, 5                   ; >> 5
+
+    pmaddwd m3, m1
+    paddd   m3, [r4]
+    psrld   m3, 5
+    packssdw m2, m3                 ; m2 = results of m2 (low half) and m3 (high half) packed back to words
+
+    pmaddwd m4, m6
+    paddd   m4, [r4]
+    psrld   m4, 5
+
+    pmaddwd m5, m7
+    paddd   m5, [r4]
+    psrld   m5, 5
+    packssdw m4, m5                 ; m4 = results of m4 (low half) and m5 (high half) packed back to words
+
+    jz         .store               ; ZF still holds the caller's dirMode compare (no instruction since then writes flags): equal -> store untransposed
+
+    ; transpose 4x4
+    punpckhwd    m0, m2, m4
+    punpcklwd    m2, m4
+    punpckhwd    m4, m2, m0
+    punpcklwd    m2, m0
+
+.store:
+    add         r1, r1              ; r1 = 2 * dstStride (presumably pixels -> bytes for 16-bit samples)
+    movh        [r0], m2            ; row 0 <- low half of m2
+    movhps      [r0 + r1], m2       ; row 1 <- high half of m2
+    movh        [r0 + r1 * 2], m4   ; row 2 <- low half of m4
+    lea         r1, [r1 * 3]        ; r1 = 3 * row pitch
+    movhps      [r0 + r1], m4       ; row 3 <- high half of m4
+    RET
+
+cglobal intra_pred_ang4_4, 3,5,8    ; 4x4 angular predictor; r0 = dst, r1 = dstStride, r2 = src, dirMode read via r3m
+    mov         r4d, 2              ; byte offset into src used when dirMode == 32
+    cmp         r3m, byte 32        ; ZF must stay intact until the jz inside .do_filter4x4
+    mov         r3d, 18             ; default byte offset into src (mov does not touch flags)
+    cmove       r3d, r4d            ; dirMode == 32: offset = 2
+
+    movu        m0, [r2 + r3]   ; [8 7 6 5 4 3 2 1]
+    mova        m2, m0
+    psrldq      m0, 2
+    punpcklwd   m2, m0      ; [5 4 4 3 3 2 2 1]
+    mova        m3, m0
+    psrldq      m0, 2
+    punpcklwd   m3, m0      ; [6 5 5 4 4 3 3 2]
+    mova        m4, m3      ; third vector reuses the same sample pairs as the second
+    mova        m5, m0
+    psrldq      m0, 2
+    punpcklwd   m5, m0      ; [7 6 6 5 5 4 4 3]
+
+    lea         r3, [ang_table + 18 * 16]   ; base = ang_table entry 18; entries are addressed 16 bytes apart
+    mova        m0, [r3 +  3 * 16]  ; [21]
+    mova        m1, [r3 -  8 * 16]  ; [10]
+    mova        m6, [r3 + 13 * 16]  ; [31]
+    mova        m7, [r3 +  2 * 16]  ; [20]
+    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)  ; tail-jump into the multiply/round/store code of intra_pred_ang4_3
+
+cglobal intra_pred_ang4_5, 3,5,8    ; 4x4 angular predictor; r0 = dst, r1 = dstStride, r2 = src, dirMode read via r3m
+    mov         r4d, 2              ; byte offset into src used when dirMode == 31
+    cmp         r3m, byte 31        ; ZF must stay intact until the jz inside .do_filter4x4
+    mov         r3d, 18             ; default byte offset into src (mov does not touch flags)
+    cmove       r3d, r4d            ; dirMode == 31: offset = 2
+
+    movu        m0, [r2 + r3]   ; [8 7 6 5 4 3 2 1]
+    mova        m2, m0
+    psrldq      m0, 2
+    punpcklwd   m2, m0      ; [5 4 4 3 3 2 2 1]
+    mova        m3, m0
+    psrldq      m0, 2
+    punpcklwd   m3, m0      ; [6 5 5 4 4 3 3 2]
+    mova        m4, m3      ; third vector reuses the same sample pairs as the second
+    mova        m5, m0
+    psrldq      m0, 2
+    punpcklwd   m5, m0      ; [7 6 6 5 5 4 4 3]
+
+    lea         r3, [ang_table + 10 * 16]   ; base = ang_table entry 10; entries are addressed 16 bytes apart
+    mova        m0, [r3 +  7 * 16]  ; [17]
+    mova        m1, [r3 -  8 * 16]  ; [ 2]
+    mova        m6, [r3 +  9 * 16]  ; [19]
+    mova        m7, [r3 -  6 * 16]  ; [ 4]
+    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)  ; tail-jump into the multiply/round/store code of intra_pred_ang4_3
+
+cglobal intra_pred_ang4_6, 3,5,8    ; 4x4 angular predictor; r0 = dst, r1 = dstStride, r2 = src, dirMode read via r3m
+    mov         r4d, 2              ; byte offset into src used when dirMode == 30
+    cmp         r3m, byte 30        ; ZF must stay intact until the jz inside .do_filter4x4
+    mov         r3d, 18             ; default byte offset into src (mov does not touch flags)
+    cmove       r3d, r4d            ; dirMode == 30: offset = 2
+
+    movu        m0, [r2 + r3]   ; [8 7 6 5 4 3 2 1]
+    mova        m2, m0
+    psrldq      m0, 2
+    punpcklwd   m2, m0      ; [5 4 4 3 3 2 2 1]
+    mova        m3, m2      ; second vector reuses the same sample pairs as the first
+    mova        m4, m0
+    psrldq      m0, 2
+    punpcklwd   m4, m0      ; [6 5 5 4 4 3 3 2]
+    mova        m5, m4      ; fourth vector reuses the same sample pairs as the third
+
+    lea         r3, [ang_table + 19 * 16]   ; base = ang_table entry 19; entries are addressed 16 bytes apart
+    mova        m0, [r3 -  6 * 16]  ; [13]
+    mova        m1, [r3 +  7 * 16]  ; [26]
+    mova        m6, [r3 - 12 * 16]  ; [ 7]
+    mova        m7, [r3 +  1 * 16]  ; [20]
+    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)  ; tail-jump into the multiply/round/store code of intra_pred_ang4_3
+
+cglobal intra_pred_ang4_7, 3,5,8    ; 4x4 angular predictor; r0 = dst, r1 = dstStride, r2 = src, dirMode read via r3m
+    mov         r4d, 2              ; byte offset into src used when dirMode == 29
+    cmp         r3m, byte 29        ; ZF must stay intact until the jz inside .do_filter4x4
+    mov         r3d, 18             ; default byte offset into src (mov does not touch flags)
+    cmove       r3d, r4d            ; dirMode == 29: offset = 2
+
+    movu        m0, [r2 + r3]   ; [8 7 6 5 4 3 2 1]
+    mova        m2, m0
+    psrldq      m0, 2
+    punpcklwd   m2, m0      ; [5 4 4 3 3 2 2 1]
+    mova        m3, m2      ; second vector reuses the same sample pairs as the first
+    mova        m4, m2      ; third vector reuses them as well
+    mova        m5, m0
+    psrldq      m0, 2
+    punpcklwd   m5, m0      ; [6 5 5 4 4 3 3 2]
+
+    lea         r3, [ang_table + 20 * 16]   ; base = ang_table entry 20; entries are addressed 16 bytes apart
+    mova        m0, [r3 - 11 * 16]  ; [ 9]
+    mova        m1, [r3 -  2 * 16]  ; [18]
+    mova        m6, [r3 +  7 * 16]  ; [27]
+    mova        m7, [r3 - 16 * 16]  ; [ 4]
+    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)  ; tail-jump into the multiply/round/store code of intra_pred_ang4_3
+
+cglobal intra_pred_ang4_8, 3,5,8


More information about the x265-commits mailing list