[x265] [PATCH 2/6] AArch64: Optimize SBD interp_hv_pp_neon function

Gerda Zsejke More gerdazsejke.more at arm.com
Thu Jun 19 08:36:10 UTC 2025


The existing interp_hv_pp_neon function performs horizontal
interpolation, stores the result in an intermediate buffer, and then
applies vertical interpolation on that buffer. This can be optimized
by merging the horizontal and vertical interpolation, eliminating the
need for the intermediate buffer.

This optimization is applicable to all block sizes and gives a
performance uplift of up to 8%.
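
For reference, here is a rough scalar sketch of the fused approach (plain
C, purely illustrative; the helper name, the fixed-size row window and the
rounding parameters are placeholders, not the NEON code in this patch).
Rather than filtering the whole block horizontally into a temporary array,
it keeps only the last eight horizontally filtered rows in a small ring and
runs the vertical filter over that window:

    #include <stddef.h>
    #include <stdint.h>

    static void hv_merged_sketch(const uint8_t *src, ptrdiff_t srcStride,
                                 uint8_t *dst, ptrdiff_t dstStride,
                                 const int16_t hC[8], const int16_t vC[8],
                                 int width, int height,
                                 int vShift, int vOffset)
    {
        int16_t win[8][64];        // ring of filtered rows, assumes width <= 64

        src -= 3 * srcStride + 3;  // centre the 8-tap filters (N_TAPS / 2 - 1)

        for (int r = 0; r < height + 7; r++)
        {
            int16_t *row = win[r & 7];
            for (int c = 0; c < width; c++)   // horizontal pass for row r
            {
                int sum = 0;
                for (int t = 0; t < 8; t++)
                    sum += hC[t] * src[r * srcStride + c + t];
                row[c] = (int16_t)sum;
            }

            if (r < 7)
                continue;  // need 8 filtered rows before the vertical pass

            for (int c = 0; c < width; c++)   // vertical pass, output row r - 7
            {
                int sum = vOffset;
                for (int t = 0; t < 8; t++)
                    sum += vC[t] * win[(r - 7 + t) & 7][c];
                int val = sum >> vShift;
                dst[(r - 7) * dstStride + c] =
                    (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
            }
        }
    }

The NEON version below applies the same idea per 16-, 8- and 4-wide column
strip, carrying the seven previously filtered rows in vector registers
between iterations of the row loop.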

Co-authored-by: Arpad Panyik arpad.panyik at arm.com
---
 source/common/aarch64/filter-prim.cpp | 362 +++++++++++++++++++++++++-
 1 file changed, 360 insertions(+), 2 deletions(-)

diff --git a/source/common/aarch64/filter-prim.cpp b/source/common/aarch64/filter-prim.cpp
index e4f4e4773..c28572881 100644
--- a/source/common/aarch64/filter-prim.cpp
+++ b/source/common/aarch64/filter-prim.cpp
@@ -2060,6 +2060,286 @@ void interp8_vert_sp_neon(const int16_t *src, intptr_t srcStride, pixel *dst,
     }
 }
 
+template<int coeffIdx, int coeffIdy, int width, int height>
+void interp8_hv_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst,
+                        intptr_t dstStride)
+{
+    const int N_TAPS = 8;
+    const int v_shift = IF_FILTER_PREC + IF_INTERNAL_PREC - X265_DEPTH;
+    // Subtract 8 from shift since we account for that in table lookups.
+    const int v_shift_offset = v_shift - 8;
+    const int16x8_t v_filter = vld1q_s16(X265_NS::g_lumaFilter[coeffIdy]);
+    const uint16x8_t h_offset = vdupq_n_u16((uint16_t)-IF_INTERNAL_OFFS);
+    const int32x4_t v_offset = vdupq_n_s32((1 << (v_shift - 1)) +
+                                           (IF_INTERNAL_OFFS << IF_FILTER_PREC));
+    const uint8x16_t shr_tbl = vld1q_u8(vert_shr_tbl);
+
+    src -= (N_TAPS / 2 - 1) * srcStride + (N_TAPS / 2 - 1);
+
+    int col = 0;
+    for (; col + 16 <= width; col += 16)
+    {
+        const pixel *s = src;
+        pixel *d = dst;
+
+        uint8x16_t h_s[N_TAPS];
+        int16x8_t v_s0[11], v_s1[11];
+
+        load_u8x16xn<8>(s + 0 * srcStride, 1, h_s);
+        filter8_u8x16<coeffIdx>(h_s, h_offset, v_s0[0], v_s1[0]);
+
+        load_u8x16xn<8>(s + 1 * srcStride, 1, h_s);
+        filter8_u8x16<coeffIdx>(h_s, h_offset, v_s0[1], v_s1[1]);
+
+        load_u8x16xn<8>(s + 2 * srcStride, 1, h_s);
+        filter8_u8x16<coeffIdx>(h_s, h_offset, v_s0[2], v_s1[2]);
+
+        load_u8x16xn<8>(s + 3 * srcStride, 1, h_s);
+        filter8_u8x16<coeffIdx>(h_s, h_offset, v_s0[3], v_s1[3]);
+
+        load_u8x16xn<8>(s + 4 * srcStride, 1, h_s);
+        filter8_u8x16<coeffIdx>(h_s, h_offset, v_s0[4], v_s1[4]);
+
+        load_u8x16xn<8>(s + 5 * srcStride, 1, h_s);
+        filter8_u8x16<coeffIdx>(h_s, h_offset, v_s0[5], v_s1[5]);
+
+        load_u8x16xn<8>(s + 6 * srcStride, 1, h_s);
+        filter8_u8x16<coeffIdx>(h_s, h_offset, v_s0[6], v_s1[6]);
+
+        s += 7 * srcStride;
+
+        for (int row = 0; row < height; row += 4)
+        {
+            uint8x8_t res_lo[4], res_hi[4];
+            int32x4_t sum_lo[8], sum_hi[8];
+
+            load_u8x16xn<8>(s + 0 * srcStride, 1, h_s);
+            filter8_u8x16<coeffIdx>(h_s, h_offset, v_s0[7], v_s1[7]);
+            filter8_s16x8<coeffIdy>(v_s0 + 0, v_filter, v_offset, sum_lo[0], sum_hi[0]);
+            filter8_s16x8<coeffIdy>(v_s1 + 0, v_filter, v_offset, sum_lo[1], sum_hi[1]);
+            v_s0[0] = v_s0[4];
+            v_s1[0] = v_s1[4];
+            res_lo[0] = vqshrun_n_s16(vtbl2q_s32_s16(sum_lo[0], sum_hi[0], shr_tbl),
+                                      v_shift_offset);
+            res_hi[0] = vqshrun_n_s16(vtbl2q_s32_s16(sum_lo[1], sum_hi[1], shr_tbl),
+                                      v_shift_offset);
+
+            load_u8x16xn<8>(s + 1 * srcStride, 1, h_s);
+            filter8_u8x16<coeffIdx>(h_s, h_offset, v_s0[8], v_s1[8]);
+            filter8_s16x8<coeffIdy>(v_s0 + 1, v_filter, v_offset, sum_lo[2], sum_hi[2]);
+            filter8_s16x8<coeffIdy>(v_s1 + 1, v_filter, v_offset, sum_lo[3], sum_hi[3]);
+            v_s0[1] = v_s0[5];
+            v_s1[1] = v_s1[5];
+            res_lo[1] = vqshrun_n_s16(vtbl2q_s32_s16(sum_lo[2], sum_hi[2], shr_tbl),
+                                      v_shift_offset);
+            res_hi[1] = vqshrun_n_s16(vtbl2q_s32_s16(sum_lo[3], sum_hi[3], shr_tbl),
+                                      v_shift_offset);
+
+            load_u8x16xn<8>(s + 2 * srcStride, 1, h_s);
+            filter8_u8x16<coeffIdx>(h_s, h_offset, v_s0[9], v_s1[9]);
+            filter8_s16x8<coeffIdy>(v_s0 + 2, v_filter, v_offset, sum_lo[4], sum_hi[4]);
+            filter8_s16x8<coeffIdy>(v_s1 + 2, v_filter, v_offset, sum_lo[5], sum_hi[5]);
+            v_s0[2] = v_s0[6];
+            v_s1[2] = v_s1[6];
+            res_lo[2] = vqshrun_n_s16(vtbl2q_s32_s16(sum_lo[4], sum_hi[4], shr_tbl),
+                                      v_shift_offset);
+            res_hi[2] = vqshrun_n_s16(vtbl2q_s32_s16(sum_lo[5], sum_hi[5], shr_tbl),
+                                      v_shift_offset);
+
+            load_u8x16xn<8>(s + 3 * srcStride, 1, h_s);
+            filter8_u8x16<coeffIdx>(h_s, h_offset, v_s0[10], v_s1[10]);
+            filter8_s16x8<coeffIdy>(v_s0 + 3, v_filter, v_offset, sum_lo[6], sum_hi[6]);
+            filter8_s16x8<coeffIdy>(v_s1 + 3, v_filter, v_offset, sum_lo[7], sum_hi[7]);
+            v_s0[3] = v_s0[7];
+            v_s1[3] = v_s1[7];
+            res_lo[3] = vqshrun_n_s16(vtbl2q_s32_s16(sum_lo[6], sum_hi[6], shr_tbl),
+                                      v_shift_offset);
+            res_hi[3] = vqshrun_n_s16(vtbl2q_s32_s16(sum_lo[7], sum_hi[7], shr_tbl),
+                                      v_shift_offset);
+
+            vst1q_u8(d + 0 * dstStride, vcombine_u8(res_lo[0], res_hi[0]));
+            vst1q_u8(d + 1 * dstStride, vcombine_u8(res_lo[1], res_hi[1]));
+            vst1q_u8(d + 2 * dstStride, vcombine_u8(res_lo[2], res_hi[2]));
+            vst1q_u8(d + 3 * dstStride, vcombine_u8(res_lo[3], res_hi[3]));
+
+            v_s0[4] = v_s0[8];
+            v_s1[4] = v_s1[8];
+            v_s0[5] = v_s0[9];
+            v_s1[5] = v_s1[9];
+            v_s0[6] = v_s0[10];
+            v_s1[6] = v_s1[10];
+
+            s += 4 * srcStride;
+            d += 4 * dstStride;
+        }
+
+        src += 16;
+        dst += 16;
+    }
+
+    for (; col + 8 <= width; col += 8)
+    {
+        const pixel *s = src;
+        pixel *d = dst;
+
+        uint8x8_t h_s[N_TAPS];
+        int16x8_t v_s[11];
+
+        load_u8x8xn<8>(s + 0 * srcStride, 1, h_s);
+        filter8_u8x8<coeffIdx>(h_s, h_offset, v_s[0]);
+
+        load_u8x8xn<8>(s + 1 * srcStride, 1, h_s);
+        filter8_u8x8<coeffIdx>(h_s, h_offset, v_s[1]);
+
+        load_u8x8xn<8>(s + 2 * srcStride, 1, h_s);
+        filter8_u8x8<coeffIdx>(h_s, h_offset, v_s[2]);
+
+        load_u8x8xn<8>(s + 3 * srcStride, 1, h_s);
+        filter8_u8x8<coeffIdx>(h_s, h_offset, v_s[3]);
+
+        load_u8x8xn<8>(s + 4 * srcStride, 1, h_s);
+        filter8_u8x8<coeffIdx>(h_s, h_offset, v_s[4]);
+
+        load_u8x8xn<8>(s + 5 * srcStride, 1, h_s);
+        filter8_u8x8<coeffIdx>(h_s, h_offset, v_s[5]);
+
+        load_u8x8xn<8>(s + 6 * srcStride, 1, h_s);
+        filter8_u8x8<coeffIdx>(h_s, h_offset, v_s[6]);
+
+        s += 7 * srcStride;
+
+        for (int row = 0; row < height; row += 4)
+        {
+            uint8x8_t res[4];
+            int32x4_t sum_lo[4], sum_hi[4];
+
+            load_u8x8xn<8>(s + 0 * srcStride, 1, h_s);
+            filter8_u8x8<coeffIdx>(h_s, h_offset, v_s[7]);
+            filter8_s16x8<coeffIdy>(v_s + 0, v_filter, v_offset, sum_lo[0], sum_hi[0]);
+
+            load_u8x8xn<8>(s + 1 * srcStride, 1, h_s);
+            filter8_u8x8<coeffIdx>(h_s, h_offset, v_s[8]);
+            filter8_s16x8<coeffIdy>(v_s + 1, v_filter, v_offset, sum_lo[1], sum_hi[1]);
+            v_s[0] = v_s[4];
+            v_s[1] = v_s[5];
+            res[0] = vqshrun_n_s16(vtbl2q_s32_s16(sum_lo[0], sum_hi[0], shr_tbl),
+                                   v_shift_offset);
+            res[1] = vqshrun_n_s16(vtbl2q_s32_s16(sum_lo[1], sum_hi[1], shr_tbl),
+                                   v_shift_offset);
+
+            load_u8x8xn<8>(s + 2 * srcStride, 1, h_s);
+            filter8_u8x8<coeffIdx>(h_s, h_offset, v_s[9]);
+            filter8_s16x8<coeffIdy>(v_s + 2, v_filter, v_offset, sum_lo[2], sum_hi[2]);
+
+            load_u8x8xn<8>(s + 3 * srcStride, 1, h_s);
+            filter8_u8x8<coeffIdx>(h_s, h_offset, v_s[10]);
+            filter8_s16x8<coeffIdy>(v_s + 3, v_filter, v_offset, sum_lo[3], sum_hi[3]);
+            v_s[2] = v_s[6];
+            v_s[3] = v_s[7];
+            res[2] = vqshrun_n_s16(vtbl2q_s32_s16(sum_lo[2], sum_hi[2], shr_tbl),
+                                   v_shift_offset);
+            res[3] = vqshrun_n_s16(vtbl2q_s32_s16(sum_lo[3], sum_hi[3], shr_tbl),
+                                   v_shift_offset);
+
+            store_u8xnxm<8, 4>(d + 0, dstStride, res);
+
+            v_s[4] = v_s[8];
+            v_s[5] = v_s[9];
+            v_s[6] = v_s[10];
+
+            s += 4 * srcStride;
+            d += 4 * dstStride;
+        }
+
+        src += 8;
+        dst += 8;
+    }
+
+    if (width % 8 != 0)
+    {
+        const pixel *s = src;
+        pixel *d = dst;
+
+        uint8x8_t h_s[N_TAPS];
+        int16x8_t t_v_s[11];
+        int16x4_t v_s[11];
+
+        load_u8x8xn<8>(s + 0 * srcStride, 1, h_s);
+        filter8_u8x8<coeffIdx>(h_s, h_offset, t_v_s[0]);
+        v_s[0] = vget_low_s16(t_v_s[0]);
+
+        load_u8x8xn<8>(s + 1 * srcStride, 1, h_s);
+        filter8_u8x8<coeffIdx>(h_s, h_offset, t_v_s[1]);
+        v_s[1] = vget_low_s16(t_v_s[1]);
+
+        load_u8x8xn<8>(s + 2 * srcStride, 1, h_s);
+        filter8_u8x8<coeffIdx>(h_s, h_offset, t_v_s[2]);
+        v_s[2] = vget_low_s16(t_v_s[2]);
+
+        load_u8x8xn<8>(s + 3 * srcStride, 1, h_s);
+        filter8_u8x8<coeffIdx>(h_s, h_offset, t_v_s[3]);
+        v_s[3] = vget_low_s16(t_v_s[3]);
+
+        load_u8x8xn<8>(s + 4 * srcStride, 1, h_s);
+        filter8_u8x8<coeffIdx>(h_s, h_offset, t_v_s[4]);
+        v_s[4] = vget_low_s16(t_v_s[4]);
+
+        load_u8x8xn<8>(s + 5 * srcStride, 1, h_s);
+        filter8_u8x8<coeffIdx>(h_s, h_offset, t_v_s[5]);
+        v_s[5] = vget_low_s16(t_v_s[5]);
+
+        load_u8x8xn<8>(s + 6 * srcStride, 1, h_s);
+        filter8_u8x8<coeffIdx>(h_s, h_offset, t_v_s[6]);
+        v_s[6] = vget_low_s16(t_v_s[6]);
+
+        s += 7 * srcStride;
+
+        for (int row = 0; row < height; row += 4)
+        {
+            int32x4_t sum[4];
+            uint8x8_t res[2];
+
+            load_u8x8xn<8>(s + 0 * srcStride, 1, h_s);
+            filter8_u8x8<coeffIdx>(h_s, h_offset, t_v_s[7]);
+            v_s[7] = vget_low_s16(t_v_s[7]);
+            filter8_s16x4<coeffIdy>(v_s + 0, v_filter, v_offset, sum[0]);
+            v_s[0] = v_s[4];
+
+            load_u8x8xn<8>(s + 1 * srcStride, 1, h_s);
+            filter8_u8x8<coeffIdx>(h_s, h_offset, t_v_s[8]);
+            v_s[8] = vget_low_s16(t_v_s[8]);
+            filter8_s16x4<coeffIdy>(v_s + 1, v_filter, v_offset, sum[1]);
+            v_s[1] = v_s[5];
+
+            load_u8x8xn<8>(s + 2 * srcStride, 1, h_s);
+            filter8_u8x8<coeffIdx>(h_s, h_offset, t_v_s[9]);
+            v_s[9] = vget_low_s16(t_v_s[9]);
+            filter8_s16x4<coeffIdy>(v_s + 2, v_filter, v_offset, sum[2]);
+            v_s[2] = v_s[6];
+
+            load_u8x8xn<8>(s + 3 * srcStride, 1, h_s);
+            filter8_u8x8<coeffIdx>(h_s, h_offset, t_v_s[10]);
+            v_s[10] = vget_low_s16(t_v_s[10]);
+            filter8_s16x4<coeffIdy>(v_s + 3, v_filter, v_offset, sum[3]);
+            v_s[3] = v_s[7];
+
+            res[0] = vqshrun_n_s16(vtbl2q_s32_s16(sum[0], sum[1], shr_tbl),
+                                   v_shift_offset);
+            res[1] = vqshrun_n_s16(vtbl2q_s32_s16(sum[2], sum[3], shr_tbl),
+                                   v_shift_offset);
+
+            store_u8x4_strided_xN<4>(d + 0 * dstStride, dstStride, res);
+
+            v_s[4] = v_s[8];
+            v_s[5] = v_s[9];
+            v_s[6] = v_s[10];
+
+            s += 4 * srcStride;
+            d += 4 * dstStride;
+        }
+    }
+}
+
 #else // !HIGH_BIT_DEPTH
 
 #if X265_DEPTH == 10
@@ -4735,15 +5015,93 @@ void interp_vert_sp_neon(const int16_t *src, intptr_t srcStride, pixel *dst,
     }
 }
 
+#if HIGH_BIT_DEPTH
+template<int N, int width, int height>
+void interp_hv_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst,
+                       intptr_t dstStride, int idxX, int idxY)
+{
+    ALIGN_VAR_32(int16_t, immed[width * (height + N - 1)]);
+
+    interp_horiz_ps_neon<N, width, height>(src, srcStride, immed, width, idxX, 1);
+    interp_vert_sp_neon<N, width, height>(immed + (N / 2 - 1) * width, width, dst,
+                                          dstStride, idxY);
+}
+
+#else // HIGH_BIT_DEPTH
+
 template<int N, int width, int height>
-void interp_hv_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
+void interp_hv_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst,
+                       intptr_t dstStride, int idxX, int idxY)
 {
+// Use the merged hv paths with Clang only, as performance with GCC is worse than the
+// existing approach of doing horizontal and vertical interpolation separately.
+#ifdef __clang__
+    switch (idxX)
+    {
+    case 1:
+    {
+        switch (idxY)
+        {
+        case 1:
+            return interp8_hv_pp_neon<1, 1, width, height>(src, srcStride, dst,
+                                                           dstStride);
+        case 2:
+            return interp8_hv_pp_neon<1, 2, width, height>(src, srcStride, dst,
+                                                           dstStride);
+        case 3:
+            return interp8_hv_pp_neon<1, 3, width, height>(src, srcStride, dst,
+                                                           dstStride);
+        }
+
+        break;
+    }
+    case 2:
+    {
+        switch (idxY)
+        {
+        case 1:
+            return interp8_hv_pp_neon<2, 1, width, height>(src, srcStride, dst,
+                                                           dstStride);
+        case 2:
+            return interp8_hv_pp_neon<2, 2, width, height>(src, srcStride, dst,
+                                                           dstStride);
+        case 3:
+            return interp8_hv_pp_neon<2, 3, width, height>(src, srcStride, dst,
+                                                           dstStride);
+        }
+
+        break;
+    }
+    case 3:
+    {
+        switch (idxY)
+        {
+        case 1:
+            return interp8_hv_pp_neon<3, 1, width, height>(src, srcStride, dst,
+                                                           dstStride);
+        case 2:
+            return interp8_hv_pp_neon<3, 2, width, height>(src, srcStride, dst,
+                                                           dstStride);
+        case 3:
+            return interp8_hv_pp_neon<3, 3, width, height>(src, srcStride, dst,
+                                                           dstStride);
+        }
+
+        break;
+    }
+    }
+
+#else // __clang__
     ALIGN_VAR_32(int16_t, immed[width * (height + N - 1)]);
 
     interp_horiz_ps_neon<N, width, height>(src, srcStride, immed, width, idxX, 1);
-    interp_vert_sp_neon<N, width, height>(immed + (N / 2 - 1) * width, width, dst, dstStride, idxY);
+    interp_vert_sp_neon<N, width, height>(immed + (N / 2 - 1) * width, width, dst,
+                                          dstStride, idxY);
+#endif // __clang__
 }
 
+#endif // HIGH_BIT_DEPTH
+
 template<int width, int height>
 void filterPixelToShort_neon(const pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
 {
-- 
2.39.5 (Apple Git-154)
