[vlc-devel] [PATCH 05/16] bitslice transform: rewrite
Serg Chernyavskiy
glenvt18 at gmail.com
Thu Jul 30 14:14:39 CEST 2015
Please review.
2015-06-26 14:19 GMT+03:00 glenvt18 <glenvt18 at gmail.com>:
> What has been done:
> 1. New optimized matrix transpose routines.
> 2. New BS_SWAPxx target-specific macros can be defined for further speed-up.
> 3. Separate 32 bit and all other stream transforms.
> 4. Packet data are read/written in native endianness. Endianness is now
> handled by transpose routines. No need for endianness conversion anymore.
> 5. Use unaligned 32-bit memory access for ARM if supported by the target.
> ---
> configure.ac | 15 ++-
> src/Makefile.am | 27 ++---
> src/dvbcsa_bs_stream.c | 4 +-
> src/dvbcsa_bs_transpose.c | 112 ------------------
> src/dvbcsa_bs_transpose.h | 71 ++++++++++++
> src/dvbcsa_bs_transpose128.c | 209 ---------------------------------
> src/dvbcsa_bs_transpose32.c | 185 -----------------------------
> src/dvbcsa_bs_transpose64.c | 186 -----------------------------
> src/dvbcsa_bs_transpose_block.c | 97 ++++++++++++++++
> src/dvbcsa_bs_transpose_stream.c | 231 +++++++++++++++++++++++++++++++++++++
> src/dvbcsa_bs_transpose_stream32.c | 150 ++++++++++++++++++++++++
> src/dvbcsa_pv.h | 78 +++++++++++--
> 12 files changed, 637 insertions(+), 728 deletions(-)
> delete mode 100644 src/dvbcsa_bs_transpose.c
> create mode 100644 src/dvbcsa_bs_transpose.h
> delete mode 100644 src/dvbcsa_bs_transpose128.c
> delete mode 100644 src/dvbcsa_bs_transpose32.c
> delete mode 100644 src/dvbcsa_bs_transpose64.c
> create mode 100644 src/dvbcsa_bs_transpose_block.c
> create mode 100644 src/dvbcsa_bs_transpose_stream.c
> create mode 100644 src/dvbcsa_bs_transpose_stream32.c
>
> diff --git a/configure.ac b/configure.ac
> index cefdf8a..4dd0726 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -32,23 +32,26 @@ AC_C_CONST
> AC_C_INLINE
> AC_CHECK_SIZEOF(long)
>
> +AC_C_BIGENDIAN(
> + AC_DEFINE(DVBCSA_ENDIAN_BIG, 1, [Target is big-endian]),
> + AC_DEFINE(DVBCSA_ENDIAN_LITTLE, 1, [Target is little-endian]),
> + AC_MSG_ERROR(unknown endianess),
> + AC_MSG_ERROR(universial endianess not supported)
> +)
> +
> if test "$enable_mmx" = "yes" ; then
> - transpose_64=yes
> AC_DEFINE(DVBCSA_USE_MMX, 1, Using MMX bitslice.)
> GCC_CFLAGS="$GCC_CFLAGS -mmmx"
>
> elif test "$enable_sse2" = "yes" ; then
> - transpose_128=yes
> AC_DEFINE(DVBCSA_USE_SSE, 1, Using SSE2 bitslice.)
> GCC_CFLAGS="$GCC_CFLAGS -msse -msse2"
>
> elif test "$enable_altivec" = "yes" ; then
> - transpose_128=yes
> AC_DEFINE(DVBCSA_USE_ALTIVEC, 1, Using AltiVec bitslice.)
> GCC_CFLAGS="$GCC_CFLAGS -maltivec -mabi=altivec"
>
> elif test "$enable_neon" = "yes" ; then
> - transpose_128=yes
> AC_DEFINE(DVBCSA_USE_NEON, 1, Using NEON bitslice.)
> GCC_CFLAGS="$GCC_CFLAGS -mfpu=neon"
>
> @@ -57,13 +60,11 @@ elif test "$enable_uint32" = "yes" ; then
> AC_DEFINE(DVBCSA_USE_UINT32, 1, Using 32 bits integer bitslice.)
>
> elif test "$enable_uint64" = "yes" ; then
> - transpose_64=yes
> AC_DEFINE(DVBCSA_USE_UINT64, 1, Using 64 bits integer bitslice.)
>
> else
> case $ac_cv_sizeof_long in
> 8)
> - transpose_64=yes
> AC_DEFINE(DVBCSA_USE_UINT64, 1, Using 64 bits integer bitslice.)
> ;;
> *)
> @@ -73,8 +74,6 @@ else
> esac
> fi
>
> -AM_CONDITIONAL(TRANSPOSE_128, test "$transpose_128" = "yes")
> -AM_CONDITIONAL(TRANSPOSE_64, test "$transpose_64" = "yes")
> AM_CONDITIONAL(TRANSPOSE_32, test "$transpose_32" = "yes")
>
> if test "$GCC" = "yes" ; then
> diff --git a/src/Makefile.am b/src/Makefile.am
> index 3bad07a..996ce4d 100644
> --- a/src/Makefile.am
> +++ b/src/Makefile.am
> @@ -3,23 +3,20 @@ SUBDIRS = dvbcsa
>
> lib_LTLIBRARIES = libdvbcsa.la
>
> -libdvbcsa_la_SOURCES = dvbcsa_algo.c dvbcsa_block.c dvbcsa_bs_algo.c \
> - dvbcsa_bs_block.c dvbcsa_bs_key.c dvbcsa_bs_stream.c \
> - dvbcsa_stream.c dvbcsa_bs.h dvbcsa_pv.h dvbcsa_bs_uint64.h \
> - dvbcsa_bs_uint32.h dvbcsa_bs_mmx.h dvbcsa_bs_sse.h \
> - dvbcsa_bs_altivec.h dvbcsa_bs_neon.h dvbcsa_bs_transpose.c dvbcsa_key.c \
> - dvbcsa_bs_stream_kernel.inc dvbcsa_bs_stream_kernel.h
> -
> -if TRANSPOSE_128
> -libdvbcsa_la_SOURCES += dvbcsa_bs_transpose128.c
> -endif
> -
> -if TRANSPOSE_64
> -libdvbcsa_la_SOURCES += dvbcsa_bs_transpose64.c
> -endif
> +libdvbcsa_la_SOURCES = \
> + dvbcsa_algo.c dvbcsa_block.c dvbcsa_stream.c dvbcsa_key.c \
> + dvbcsa_bs_algo.c dvbcsa_bs_block.c dvbcsa_bs_stream.c dvbcsa_bs_key.c \
> + dvbcsa_bs_transpose_block.c \
> + dvbcsa_bs_stream_kernel.inc dvbcsa_bs_stream_kernel.h \
> + dvbcsa_bs.h dvbcsa_pv.h dvbcsa_bs_transpose.h \
> + dvbcsa_bs_uint32.h dvbcsa_bs_uint64.h \
> + dvbcsa_bs_mmx.h dvbcsa_bs_sse.h \
> + dvbcsa_bs_altivec.h dvbcsa_bs_neon.h
>
> if TRANSPOSE_32
> -libdvbcsa_la_SOURCES += dvbcsa_bs_transpose32.c
> +libdvbcsa_la_SOURCES += dvbcsa_bs_transpose_stream32.c
> +else
> +libdvbcsa_la_SOURCES += dvbcsa_bs_transpose_stream.c
> endif
>
> libdvbcsa_la_LDFLAGS = -version-info 1:1:0 $(libtool_flags)
> diff --git a/src/dvbcsa_bs_stream.c b/src/dvbcsa_bs_stream.c
> index 688a70d..41c0099 100644
> --- a/src/dvbcsa_bs_stream.c
> +++ b/src/dvbcsa_bs_stream.c
> @@ -80,9 +80,7 @@ dvbcsa_bs_stream_cipher_batch(const struct dvbcsa_bs_key_s *key,
> for (h = 8; h < maxlen; h += 8)
> {
> dvbcsa_bs_stream_cipher_kernel(&regs);
> - for (i = 0; i < 8; i++)
> - dvbcsa_bs_stream_transpose_out(pcks, h + i, regs.cb + i * 8);
> -
> + dvbcsa_bs_stream_transpose_out(pcks, h, regs.cb);
> }
>
> }
> diff --git a/src/dvbcsa_bs_transpose.c b/src/dvbcsa_bs_transpose.c
> deleted file mode 100644
> index 57d208e..0000000
> --- a/src/dvbcsa_bs_transpose.c
> +++ /dev/null
> @@ -1,112 +0,0 @@
> -/*
> -
> - This file is part of libdvbcsa.
> -
> - libdvbcsa is free software; you can redistribute it and/or modify
> - it under the terms of the GNU General Public License as published
> - by the Free Software Foundation; either version 2 of the License,
> - or (at your option) any later version.
> -
> - libdvbcsa is distributed in the hope that it will be useful, but
> - WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - General Public License for more details.
> -
> - You should have received a copy of the GNU General Public License
> - along with libdvbcsa; if not, write to the Free Software
> - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
> - 02111-1307 USA
> -
> - Based on FFdecsa, Copyright (C) 2003-2004 fatih89r
> -
> - (c) 2006-2008 Alexandre Becoulet <alexandre.becoulet at free.fr>
> -
> -*/
> -
> -#include "dvbcsa/dvbcsa.h"
> -#include "dvbcsa_bs.h"
> -
> -/***********************************************************************
> - Block cipher transpose
> - */
> -
> -void dvbcsa_bs_block_transpose_in (dvbcsa_bs_word_t *out,
> - const struct dvbcsa_bs_batch_s *pcks,
> - unsigned int offset)
> -{
> - uint32_t *ri = (uint32_t *) out;
> - unsigned int j, i, k;
> -
> - for (i = 0; pcks[i].data; i++)
> - if (offset < (pcks[i].len & (unsigned)~0x7))
> - {
> - ri[i ] = dvbcsa_load_le32(pcks[i].data + offset);
> - ri[i + BS_BATCH_SIZE] = dvbcsa_load_le32(pcks[i].data + offset + 4);
> - }
> -
> - for (j = 0; j < 64; j += 32)
> - for (i = 0; i < 16; i += 8)
> - for (k = 0; k < 8; k++)
> - {
> - dvbcsa_bs_word_t *r = out + j + i + k;
> - dvbcsa_bs_word_t t, b;
> -
> - t = r[0];
> - b = r[16];
> - r[0] = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
> - r[16] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
> - }
> -
> - for (j = 0; j < 64; j += 16)
> - for (k = 0; k < 8; k++)
> - {
> - dvbcsa_bs_word_t *r = out + j + k;
> - dvbcsa_bs_word_t t, b;
> -
> - t = r[0];
> - b = r[8];
> - r[0] = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
> - r[8] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
> - }
> -}
> -
> -void dvbcsa_bs_block_transpose_out (dvbcsa_bs_word_t *in,
> - const struct dvbcsa_bs_batch_s *pcks,
> - unsigned int offset)
> -{
> - uint32_t *ri = (uint32_t *) in;
> - unsigned int j, i, k;
> -
> - for (j = 0; j < 64; j += 16)
> - for (k = 0; k < 8; k++)
> - {
> - dvbcsa_bs_word_t *r = in + j + k;
> - dvbcsa_bs_word_t t, b;
> -
> - t = r[0];
> - b = r[8];
> - r[0] = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
> - r[8] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
> - }
> -
> - for (j = 0; j < 64; j += 32)
> - for (i = 0; i < 16; i += 8)
> - for (k = 0; k < 8; k++)
> - {
> - dvbcsa_bs_word_t *r = in + j + i + k;
> - dvbcsa_bs_word_t t, b;
> -
> - t = r[0];
> - b = r[16];
> - r[0] = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
> - r[16] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
> - }
> -
> - for (i = 0; pcks[i].data; i++)
> - if (offset < (pcks[i].len & (unsigned)~0x7))
> - {
> - dvbcsa_store_le32(pcks[i].data + offset , ri[i ]);
> - dvbcsa_store_le32(pcks[i].data + offset + 4, ri[i + BS_BATCH_SIZE]);
> - }
> -}
> -
> diff --git a/src/dvbcsa_bs_transpose.h b/src/dvbcsa_bs_transpose.h
> new file mode 100644
> index 0000000..56ecf47
> --- /dev/null
> +++ b/src/dvbcsa_bs_transpose.h
> @@ -0,0 +1,71 @@
> +#ifndef DVBCSA_BS_TRANSPOSE_H_
> +#define DVBCSA_BS_TRANSPOSE_H_
> +
> +#include "dvbcsa_bs.h"
> +
> +/*
> + 2x2 matrix transpose swap operation:
> +
> + t = |a b c d| => |f b h d|
> + b = |e f g h| => |e a g c|
> +
> + 'a' and 'e' are MSB of dvbcsa_bs_word_t.
> + (little-endian transpose)
> +
> + tmp = (b ^ (t>>j)) & m;
> + b = b ^ tmp;
> + t = t ^ (tmp<<j);
> +*/
> +
> +#define BS_SWAP_BITS_LE(t, b, shift, mask) \
> + { \
> + dvbcsa_bs_word_t tmp; \
> + tmp = BS_AND(BS_XOR(BS_SHR(t, shift), b), mask); \
> + (b) = BS_XOR((b), tmp); \
> + (t) = BS_XOR((t), BS_SHL(tmp, shift)); \
> + }
> +
> +#ifndef BS_SWAP32_LE
> +#define BS_SWAP32_LE(t, b) BS_SWAP_BITS_LE(t, b, 32, BS_VAL64(00000000ffffffff))
> +#endif
> +
> +#ifndef BS_SWAP16_LE
> +#define BS_SWAP16_LE(t, b) BS_SWAP_BITS_LE(t, b, 16, BS_VAL32(0000ffff))
> +#endif
> +
> +#ifndef BS_SWAP8_LE
> +#define BS_SWAP8_LE(t, b) BS_SWAP_BITS_LE(t, b, 8, BS_VAL16(00ff))
> +#endif
> +
> +#ifndef BS_SWAP4_LE
> +#define BS_SWAP4_LE(t, b) BS_SWAP_BITS_LE(t, b, 4, BS_VAL8(0f))
> +#endif
> +
> +#ifndef BS_SWAP2_LE
> +#define BS_SWAP2_LE(t, b) BS_SWAP_BITS_LE(t, b, 2, BS_VAL8(33))
> +#endif
> +
> +#ifndef BS_SWAP1_LE
> +#define BS_SWAP1_LE(t, b) BS_SWAP_BITS_LE(t, b, 1, BS_VAL8(55))
> +#endif
> +
> +#define BS_SWAP4(t, b) BS_SWAP4_LE(t, b)
> +#define BS_SWAP2(t, b) BS_SWAP2_LE(t, b)
> +#define BS_SWAP1(t, b) BS_SWAP1_LE(t, b)
> +
> +#ifdef DVBCSA_ENDIAN_LITTLE
> +
> +#define BS_SWAP32(t, b) BS_SWAP32_LE(t, b)
> +#define BS_SWAP16(t, b) BS_SWAP16_LE(t, b)
> +#define BS_SWAP8(t, b) BS_SWAP8_LE(t, b)
> +
> +#else
> +
> +#define BS_SWAP32(t, b) BS_SWAP32_LE(b, t)
> +#define BS_SWAP16(t, b) BS_SWAP16_LE(b, t)
> +#define BS_SWAP8(t, b) BS_SWAP8_LE(b, t)
> +
> +#endif
> +
> +#endif
> +
> diff --git a/src/dvbcsa_bs_transpose128.c b/src/dvbcsa_bs_transpose128.c
> deleted file mode 100644
> index 8a75d09..0000000
> --- a/src/dvbcsa_bs_transpose128.c
> +++ /dev/null
> @@ -1,209 +0,0 @@
> -/*
> -
> - This file is part of libdvbcsa.
> -
> - libdvbcsa is free software; you can redistribute it and/or modify
> - it under the terms of the GNU General Public License as published
> - by the Free Software Foundation; either version 2 of the License,
> - or (at your option) any later version.
> -
> - libdvbcsa is distributed in the hope that it will be useful, but
> - WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - General Public License for more details.
> -
> - You should have received a copy of the GNU General Public License
> - along with libdvbcsa; if not, write to the Free Software
> - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
> - 02111-1307 USA
> -
> - Based on FFdecsa, Copyright (C) 2003-2004 fatih89r
> -
> - (c) 2006-2008 Alexandre Becoulet <alexandre.becoulet at free.fr>
> -
> -*/
> -
> -#include "dvbcsa/dvbcsa.h"
> -#include "dvbcsa_bs.h"
> -
> -/***********************************************************************
> - Stream cipher transpose
> - */
> -
> -/* 64 rows of 64 bits transposition (bytes transp. - 8x8 rotate counterclockwise)*/
> -
> -void dvbcsa_bs_stream_transpose_in(const struct dvbcsa_bs_batch_s *pcks, dvbcsa_bs_word_t *row)
> -{
> - int i, j;
> -
> - for (i = 0; pcks->data; i++)
> - {
> - uint64_t t, b;
> -
> - if (pcks->data)
> - {
> - if (pcks->len >= 8)
> - t = dvbcsa_load_le64(pcks->data);
> - pcks++;
> - }
> -
> - if (pcks->data)
> - {
> - if (pcks->len >= 8)
> - b = dvbcsa_load_le64(pcks->data);
> - pcks++;
> - }
> -
> - row[i] = BS_VAL(b, t);
> - }
> -
> - for (i = 0; i < 32; i++)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - t = row[i];
> - b = row[32 + i];
> - row[i] = BS_OR(BS_AND(t, BS_VAL64(00000000ffffffff)), BS_SHL8(BS_AND(b, BS_VAL64(00000000ffffffff)), 4));
> - row[32 + i] = BS_OR(BS_AND(b, BS_VAL64(ffffffff00000000)), BS_SHR8(BS_AND(t, BS_VAL64(ffffffff00000000)), 4));
> - }
> -
> - for (j = 0; j < 64; j += 32)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - for (i = 0; i < 16; i++)
> - {
> - t = row[j + i];
> - b = row[j + 16 + i];
> - row[j + i] = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
> - row[j + 16 + i] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
> - }
> - }
> -
> - for (j = 0; j < 64; j += 16)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - for (i = 0; i < 8; i++)
> - {
> - t = row[j + i];
> - b = row[j + 8 + i];
> - row[j + i] = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
> - row[j + 8 + i] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
> - }
> - }
> -
> - for (j = 0; j < 64; j += 8)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - for (i = 0; i < 4; i++)
> - {
> - b = row[j + i];
> - t = row[j + 4 + i];
> - row[j + i] = BS_OR(BS_AND(b, BS_VAL8(0f)), BS_SHL(BS_AND(t, BS_VAL8(0f)), 4));
> - row[j + 4 + i] = BS_OR(BS_AND(t, BS_VAL8(f0)), BS_SHR(BS_AND(b, BS_VAL8(f0)), 4));
> - }
> - }
> -
> - for (j = 0; j < 64; j += 4)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - for (i = 0; i < 2; i++)
> - {
> - b = row[j + i];
> - t = row[j + 2 + i];
> - row[j + i] = BS_OR(BS_AND(b, BS_VAL8(33)), BS_SHL(BS_AND(t, BS_VAL8(33)), 2));
> - row[j + 2 + i] = BS_OR(BS_AND(t, BS_VAL8(cc)), BS_SHR(BS_AND(b, BS_VAL8(cc)), 2));
> - }
> - }
> -
> - for (j = 0; j < 64; j += 2)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - b = row[j];
> - t = row[j + 1];
> - row[j] = BS_OR(BS_AND(b, BS_VAL8(55)), BS_SHL(BS_AND(t, BS_VAL8(55)), 1));
> - row[j + 1] = BS_OR(BS_AND(t, BS_VAL8(aa)), BS_SHR(BS_AND(b, BS_VAL8(aa)), 1));
> - }
> -}
> -
> -/* 8 rows of 64 bits transposition (bytes transp. - 8x8 rotate clockwise)*/
> -
> -void dvbcsa_bs_stream_transpose_out(const struct dvbcsa_bs_batch_s *pcks,
> - unsigned int index, dvbcsa_bs_word_t *row)
> -{
> - int i, j;
> -
> - for (i = 0; i < 4; i++)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - t = row[i];
> - b = row[4 + i];
> - row[i] = BS_OR(BS_AND(t, BS_VAL64(00000000ffffffff)), BS_SHL8(BS_AND(b, BS_VAL64(00000000ffffffff)), 4));
> - row[4 + i] = BS_OR(BS_AND(b, BS_VAL64(ffffffff00000000)), BS_SHR8(BS_AND(t, BS_VAL64(ffffffff00000000)), 4));
> - }
> -
> - for (j = 0; j < 8; j += 4)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - for (i = 0; i < 2; i++)
> - {
> - t = row[j + i];
> - b = row[j + 2 + i];
> - row[j + i] = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
> - row[j + 2 + i] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
> - }
> - }
> -
> - for (j = 0; j < 8; j += 2)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - t = row[j];
> - b = row[j + 1];
> - row[j] = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
> - row[j + 1] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
> - }
> -
> - for (j = 0; j < 8; j++)
> - {
> - dvbcsa_bs_word_t t;
> -
> - t = row[j];
> -
> - t = BS_OR( BS_AND(t, BS_VAL64(f0f0f0f00f0f0f0f)),
> - BS_OR(BS_SHR(BS_AND(t, BS_VAL64(0f0f0f0f00000000)), 28),
> - BS_SHL(BS_AND(t, BS_VAL64(00000000f0f0f0f0)), 28)));
> -
> - t = BS_OR( BS_AND(t, BS_VAL32( cccc3333)),
> - BS_OR(BS_SHR(BS_AND(t, BS_VAL32( 33330000)), 14),
> - BS_SHL(BS_AND(t, BS_VAL32( 0000cccc)), 14)));
> -
> - t = BS_OR( BS_AND(t, BS_VAL16( aa55)),
> - BS_OR(BS_SHR(BS_AND(t, BS_VAL16( 5500)), 7 ),
> - BS_SHL(BS_AND(t, BS_VAL16( 00aa)), 7 )));
> -
> - for (i = 0; i < BS_BATCH_BYTES; i++)
> - {
> - static const unsigned int p[16] =
> - {
> - 0 , 8, 1, 9, 2, 10, 3, 11,
> - 4, 12, 5, 13, 6, 14, 7, 15
> - };
> -
> - unsigned int k = j * BS_BATCH_BYTES + i;
> -
> - if (!pcks[k].data)
> - return;
> -
> - if (index < pcks[k].len)
> - pcks[k].data[index] ^= BS_EXTRACT8(t, p[i]);
> - }
> - }
> -}
> -
> diff --git a/src/dvbcsa_bs_transpose32.c b/src/dvbcsa_bs_transpose32.c
> deleted file mode 100644
> index 2cfff7a..0000000
> --- a/src/dvbcsa_bs_transpose32.c
> +++ /dev/null
> @@ -1,185 +0,0 @@
> -/*
> -
> - This file is part of libdvbcsa.
> -
> - libdvbcsa is free software; you can redistribute it and/or modify
> - it under the terms of the GNU General Public License as published
> - by the Free Software Foundation; either version 2 of the License,
> - or (at your option) any later version.
> -
> - libdvbcsa is distributed in the hope that it will be useful, but
> - WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - General Public License for more details.
> -
> - You should have received a copy of the GNU General Public License
> - along with libdvbcsa; if not, write to the Free Software
> - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
> - 02111-1307 USA
> -
> - Based on FFdecsa, Copyright (C) 2003-2004 fatih89r
> -
> - (c) 2006-2008 Alexandre Becoulet <alexandre.becoulet at free.fr>
> -
> -*/
> -
> -#include "dvbcsa/dvbcsa.h"
> -#include "dvbcsa_bs.h"
> -
> -/***********************************************************************
> - Stream cipher transpose
> - */
> -
> -/* 64 rows of 32 bits transposition (bytes transp. - 8x8 rotate counterclockwise)*/
> -
> -void dvbcsa_bs_stream_transpose_in(const struct dvbcsa_bs_batch_s *pcks, dvbcsa_bs_word_t *row)
> -{
> - int i, j;
> -
> - for (i = 0; pcks[i].data; i++)
> - if (pcks[i].len >= 8)
> - {
> - row[i ] = BS_VAL(dvbcsa_load_le32(pcks[i].data ));
> - row[i + 32] = BS_VAL(dvbcsa_load_le32(pcks[i].data + 4));
> - }
> -
> - for (j = 0; j < 64; j += 32)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - for (i = 0; i < 16; i++)
> - {
> - t = row[j + i];
> - b = row[j + 16 + i];
> - row[j + i ] = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
> - row[j + 16 + i] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
> - }
> - }
> -
> - for (j = 0; j < 64; j += 16)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - for (i = 0; i < 8; i++)
> - {
> - t = row[j + i];
> - b = row[j + 8 + i];
> - row[j + i ] = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
> - row[j + 8 + i] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
> - }
> - }
> -
> - for (j = 0; j < 64; j += 8)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - for (i = 0; i < 4; i++)
> - {
> - t = row[j + i];
> - b = row[j + 4 + i];
> - row[j + i ] = BS_OR(BS_AND(b, BS_VAL8(0f)), BS_SHL(BS_AND(t, BS_VAL8(0f)), 4));
> - row[j + 4 + i] = BS_OR(BS_AND(t, BS_VAL8(f0)), BS_SHR(BS_AND(b, BS_VAL8(f0)), 4));
> - }
> - }
> -
> - for (j = 0; j < 64; j += 4)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - for (i = 0; i < 2; i++)
> - {
> - t = row[j + i];
> - b = row[j + 2 + i];
> - row[j + i ] = BS_OR(BS_AND(b, BS_VAL8(33)), BS_SHL(BS_AND(t, BS_VAL8(33)), 2));
> - row[j + 2 + i] = BS_OR(BS_AND(t, BS_VAL8(cc)), BS_SHR(BS_AND(b, BS_VAL8(cc)), 2));
> - }
> - }
> -
> - for (j = 0; j < 64; j += 2)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - t = row[j];
> - b = row[j + 1];
> - row[j ] = BS_OR(BS_AND(b, BS_VAL8(55)), BS_SHL(BS_AND(t, BS_VAL8(55)), 1));
> - row[j + 1] = BS_OR(BS_AND(t, BS_VAL8(aa)), BS_SHR(BS_AND(b, BS_VAL8(aa)), 1));
> - }
> -}
> -
> -/* 64 rows of 32 bits transposition (bytes transp. - 8x8 rotate clockwise)*/
> -
> -void dvbcsa_bs_stream_transpose_out(const struct dvbcsa_bs_batch_s *pcks,
> - unsigned int index, dvbcsa_bs_word_t *row)
> -{
> - int i, j;
> -
> - for (i = 0; i < 4; i++)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - t = row[i];
> - b = row[4 + i];
> - row[i ] = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
> - row[4 + i] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
> - }
> -
> - for (j = 0; j < 8; j += 4)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - for (i = 0; i < 2; i++)
> - {
> - t = row[j + i];
> - b = row[j + 2 + i];
> - row[j + i ] = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
> - row[j + i + 2] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
> - }
> - }
> -
> - for (j = 0; j < 8; j += 2)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - t = row[j];
> - b = row[j + 1];
> - row[j + 1] = BS_OR(BS_AND(t, BS_VAL8(0f)), BS_SHL(BS_AND(b, BS_VAL8(0f)), 4)); //(t & 0x0f0f0f0f) | ((b & 0x0f0f0f0f) << 4);
> - row[j ] = BS_OR(BS_AND(b, BS_VAL8(f0)), BS_SHR(BS_AND(t, BS_VAL8(f0)), 4));//((t & 0xf0f0f0f0) >> 4) | (b & 0xf0f0f0f0);
> - }
> -
> - for (j = 0; j < 8; j++)
> - {
> - dvbcsa_bs_word_t t;
> -
> - t = row[j];
> -
> - t = BS_OR( BS_AND(t, BS_VAL32(cccc3333)),
> - BS_OR(BS_SHR(BS_AND(t, BS_VAL32(33330000)), 14),
> - BS_SHL(BS_AND(t, BS_VAL32(0000cccc)), 14)));
> -
> - t = BS_OR( BS_AND(t, BS_VAL16( aa55)),
> - BS_OR(BS_SHR(BS_AND(t, BS_VAL16( 5500)), 7 ),
> - BS_SHL(BS_AND(t, BS_VAL16( 00aa)), 7 )));
> -
> - t = BS_OR( BS_AND(t, BS_VAL8 ( 81)),
> -
> - BS_OR(BS_SHR(BS_AND(t, BS_VAL8 ( 10)), 3 ),
> - BS_OR(BS_SHR(BS_AND(t, BS_VAL8 ( 20)), 2 ),
> - BS_OR(BS_SHR(BS_AND(t, BS_VAL8 ( 40)), 1 ),
> -
> - BS_OR(BS_SHL(BS_AND(t, BS_VAL8 ( 02)), 1 ),
> - BS_OR(BS_SHL(BS_AND(t, BS_VAL8 ( 04)), 2 ),
> - BS_SHL(BS_AND(t, BS_VAL8 ( 08)), 3 )))))));
> -
> - for (i = 0; i < 4; i++)
> - {
> - unsigned int k = j * 4 + i;
> -
> - if (!pcks[k].data)
> - return;
> -
> - if (index < pcks[k].len)
> - pcks[k].data[index] ^= BS_EXTRACT8(t, 3 - i);
> - }
> - }
> -}
> -
> diff --git a/src/dvbcsa_bs_transpose64.c b/src/dvbcsa_bs_transpose64.c
> deleted file mode 100644
> index c75127b..0000000
> --- a/src/dvbcsa_bs_transpose64.c
> +++ /dev/null
> @@ -1,186 +0,0 @@
> -/*
> -
> - This file is part of libdvbcsa.
> -
> - libdvbcsa is free software; you can redistribute it and/or modify
> - it under the terms of the GNU General Public License as published
> - by the Free Software Foundation; either version 2 of the License,
> - or (at your option) any later version.
> -
> - libdvbcsa is distributed in the hope that it will be useful, but
> - WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - General Public License for more details.
> -
> - You should have received a copy of the GNU General Public License
> - along with libdvbcsa; if not, write to the Free Software
> - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
> - 02111-1307 USA
> -
> - Based on FFdecsa, Copyright (C) 2003-2004 fatih89r
> -
> - (c) 2006-2008 Alexandre Becoulet <alexandre.becoulet at free.fr>
> -
> -*/
> -
> -#include "dvbcsa/dvbcsa.h"
> -#include "dvbcsa_bs.h"
> -
> -/***********************************************************************
> - Stream cipher transpose
> - */
> -
> -/* 64 rows of 64 bits transposition (bytes transp. - 8x8 rotate counterclockwise)*/
> -
> -void dvbcsa_bs_stream_transpose_in(const struct dvbcsa_bs_batch_s *pcks, dvbcsa_bs_word_t *row)
> -{
> - int i, j;
> -
> - for (i = 0; pcks[i].data; i++)
> - if (pcks[i].len >= 8)
> - row[i] = BS_VAL(dvbcsa_load_le64(pcks[i].data));
> -
> - for (i = 0; i < 32; i++)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - t = row[i];
> - b = row[32 + i];
> - row[i] = BS_OR(BS_AND(t, BS_VAL64(00000000ffffffff)), BS_SHL8(b, 4));
> - row[32 + i] = BS_OR(BS_AND(b, BS_VAL64(ffffffff00000000)), BS_SHR8(t, 4));
> - }
> -
> - for (j = 0; j < 64; j += 32)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - for (i = 0; i < 16; i++)
> - {
> - t = row[j + i];
> - b = row[j + 16 + i];
> - row[j + i] = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
> - row[j + 16 + i] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
> - }
> - }
> -
> - for (j = 0; j < 64; j += 16)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - for (i = 0; i < 8; i++)
> - {
> - t = row[j + i];
> - b = row[j + 8 + i];
> - row[j + i] = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
> - row[j + 8 + i] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
> - }
> - }
> -
> - for (j = 0; j < 64; j += 8)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - for (i = 0; i < 4; i++)
> - {
> - b = row[j + i];
> - t = row[j + 4 + i];
> - row[j + i] = BS_OR(BS_AND(b, BS_VAL8(0f)), BS_SHL(BS_AND(t, BS_VAL8(0f)), 4));
> - row[j + 4 + i] = BS_OR(BS_AND(t, BS_VAL8(f0)), BS_SHR(BS_AND(b, BS_VAL8(f0)), 4));
> - }
> - }
> -
> - for (j = 0; j < 64; j += 4)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - for (i = 0; i < 2; i++)
> - {
> - b = row[j + i];
> - t = row[j + 2 + i];
> - row[j + i] = BS_OR(BS_AND(b, BS_VAL8(33)), BS_SHL(BS_AND(t, BS_VAL8(33)), 2));
> - row[j + 2 + i] = BS_OR(BS_AND(t, BS_VAL8(cc)), BS_SHR(BS_AND(b, BS_VAL8(cc)), 2));
> - }
> - }
> -
> - for (j = 0; j < 64; j += 2)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - b = row[j];
> - t = row[j + 1];
> - row[j] = BS_OR(BS_AND(b, BS_VAL8(55)), BS_SHL(BS_AND(t, BS_VAL8(55)), 1));
> - row[j + 1] = BS_OR(BS_AND(t, BS_VAL8(aa)), BS_SHR(BS_AND(b, BS_VAL8(aa)), 1));
> - }
> -}
> -
> -/* 8 rows of 64 bits transposition (bytes transp. - 8x8 rotate clockwise)*/
> -
> -void dvbcsa_bs_stream_transpose_out(const struct dvbcsa_bs_batch_s *pcks,
> - unsigned int index, dvbcsa_bs_word_t *row)
> -{
> - int i, j;
> -
> - for (i = 0; i < 4; i++)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - t = row[i];
> - b = row[4 + i];
> - row[i] = BS_OR(BS_AND(t, BS_VAL64(00000000ffffffff)), BS_SHL8(b, 4));
> - row[4 + i] = BS_OR(BS_AND(b, BS_VAL64(ffffffff00000000)), BS_SHR8(t, 4));
> - }
> -
> - for (j = 0; j < 8; j += 4)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - for (i = 0; i < 2; i++)
> - {
> - t = row[j + i];
> - b = row[j + 2 + i];
> - row[j + i] = BS_OR(BS_AND(t, BS_VAL32(0000ffff)), BS_SHL8(BS_AND(b, BS_VAL32(0000ffff)), 2));
> - row[j + 2 + i] = BS_OR(BS_AND(b, BS_VAL32(ffff0000)), BS_SHR8(BS_AND(t, BS_VAL32(ffff0000)), 2));
> - }
> - }
> -
> - for (j = 0; j < 8; j += 2)
> - {
> - dvbcsa_bs_word_t t, b;
> -
> - t = row[j];
> - b = row[j + 1];
> - row[j] = BS_OR(BS_AND(t, BS_VAL16(00ff)), BS_SHL8(BS_AND(b, BS_VAL16(00ff)), 1));
> - row[j + 1] = BS_OR(BS_AND(b, BS_VAL16(ff00)), BS_SHR8(BS_AND(t, BS_VAL16(ff00)), 1));
> - }
> -
> - for (j = 0; j < 8; j++)
> - {
> - dvbcsa_bs_word_t t;
> -
> - t = row[j];
> -
> - t = BS_OR( BS_AND(t, BS_VAL64(f0f0f0f00f0f0f0f)),
> - BS_OR(BS_SHR(BS_AND(t, BS_VAL64(0f0f0f0f00000000)), 28),
> - BS_SHL(BS_AND(t, BS_VAL64(00000000f0f0f0f0)), 28)));
> -
> - t = BS_OR( BS_AND(t, BS_VAL32( cccc3333)),
> - BS_OR(BS_SHR(BS_AND(t, BS_VAL32( 33330000)), 14),
> - BS_SHL(BS_AND(t, BS_VAL32( 0000cccc)), 14)));
> -
> - t = BS_OR( BS_AND(t, BS_VAL16( aa55)),
> - BS_OR(BS_SHR(BS_AND(t, BS_VAL16( 5500)), 7 ),
> - BS_SHL(BS_AND(t, BS_VAL16( 00aa)), 7 )));
> -
> - for (i = 0; i < BS_BATCH_BYTES; i++)
> - {
> - unsigned int k = j * BS_BATCH_BYTES + i;
> -
> - if (!pcks[k].data)
> - return;
> -
> - if (index < pcks[k].len)
> - pcks[k].data[index] ^= BS_EXTRACT8(t, i);
> - }
> - }
> -}
> -
> diff --git a/src/dvbcsa_bs_transpose_block.c b/src/dvbcsa_bs_transpose_block.c
> new file mode 100644
> index 0000000..5dc4472
> --- /dev/null
> +++ b/src/dvbcsa_bs_transpose_block.c
> @@ -0,0 +1,97 @@
> +/*
> +
> + This file is part of libdvbcsa.
> +
> + libdvbcsa is free software; you can redistribute it and/or modify
> + it under the terms of the GNU General Public License as published
> + by the Free Software Foundation; either version 2 of the License,
> + or (at your option) any later version.
> +
> + libdvbcsa is distributed in the hope that it will be useful, but
> + WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + General Public License for more details.
> +
> + You should have received a copy of the GNU General Public License
> + along with libdvbcsa; if not, write to the Free Software
> + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
> + 02111-1307 USA
> +
> + Based on FFdecsa, Copyright (C) 2003-2004 fatih89r
> +
> + (c) 2006-2008 Alexandre Becoulet <alexandre.becoulet at free.fr>
> +
> +*/
> +
> +#include "dvbcsa/dvbcsa.h"
> +#include "dvbcsa_bs.h"
> +#include "dvbcsa_bs_transpose.h"
> +
> +/*
> + Block cipher transpose
> +*/
> +
> +DVBCSA_INLINE static inline void
> +dvbcsa_bs_matrix_transpose_block(dvbcsa_bs_word_t *row)
> +{
> + int j;
> +
> + for (j = 0; j < 64; j += 32)
> + {
> + int i;
> + for (i = 0; i < 8; i++)
> + {
> + dvbcsa_bs_word_t a0, a1, a2, a3;
> +
> + a0 = row[j + i + 0];
> + a1 = row[j + i + 8];
> + a2 = row[j + i + 16];
> + a3 = row[j + i + 24];
> +
> + BS_SWAP8(a0, a1);
> + BS_SWAP8(a2, a3);
> + BS_SWAP16(a0, a2);
> + BS_SWAP16(a1, a3);
> +
> + row[j + i + 0] = a0;
> + row[j + i + 8] = a1;
> + row[j + i + 16] = a2;
> + row[j + i + 24] = a3;
> + }
> + }
> +}
> +
> +void dvbcsa_bs_block_transpose_in (dvbcsa_bs_word_t *out,
> + const struct dvbcsa_bs_batch_s *pcks,
> + unsigned int offset)
> +{
> + uint32_t *ri = (uint32_t *)out;
> + unsigned int i;
> +
> + for (i = 0; pcks[i].data; i++)
> + if (offset < (pcks[i].len & (unsigned)~0x7))
> + {
> + dvbcsa_copy_32((uint8_t *)(ri + i), pcks[i].data + offset);
> + dvbcsa_copy_32((uint8_t *)(ri + i + BS_BATCH_SIZE), pcks[i].data + offset + 4);
> + }
> +
> + dvbcsa_bs_matrix_transpose_block(out);
> +}
> +
> +void dvbcsa_bs_block_transpose_out (dvbcsa_bs_word_t *in,
> + const struct dvbcsa_bs_batch_s *pcks,
> + unsigned int offset)
> +{
> + uint32_t *ri = (uint32_t *) in;
> + unsigned int i;
> +
> + dvbcsa_bs_matrix_transpose_block(in);
> +
> + for (i = 0; pcks[i].data; i++)
> + if (offset < (pcks[i].len & (unsigned)~0x7))
> + {
> + dvbcsa_copy_32(pcks[i].data + offset, (uint8_t *)(ri + i));
> + dvbcsa_copy_32(pcks[i].data + offset + 4, (uint8_t *)(ri + i + BS_BATCH_SIZE));
> + }
> +}
> +
> diff --git a/src/dvbcsa_bs_transpose_stream.c b/src/dvbcsa_bs_transpose_stream.c
> new file mode 100644
> index 0000000..8fc1a2d
> --- /dev/null
> +++ b/src/dvbcsa_bs_transpose_stream.c
> @@ -0,0 +1,231 @@
> +/*
> +
> + This file is part of libdvbcsa.
> +
> + libdvbcsa is free software; you can redistribute it and/or modify
> + it under the terms of the GNU General Public License as published
> + by the Free Software Foundation; either version 2 of the License,
> + or (at your option) any later version.
> +
> + libdvbcsa is distributed in the hope that it will be useful, but
> + WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + General Public License for more details.
> +
> + You should have received a copy of the GNU General Public License
> + along with libdvbcsa; if not, write to the Free Software
> + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
> + 02111-1307 USA
> +
> + Based on FFdecsa, Copyright (C) 2003-2004 fatih89r
> +
> + (c) 2006-2008 Alexandre Becoulet <alexandre.becoulet at free.fr>
> +
> +*/
> +
> +#include "dvbcsa/dvbcsa.h"
> +#include "dvbcsa_bs.h"
> +#include "dvbcsa_bs_transpose.h"
> +
> +/*
> + Stream cipher transpose for dvbcsa_bs_word_t sizes = 64, 128, 256, ... bits.
> +*/
> +/* Reorders row[0..63] in place; every BS_SWAPn call pairs two rows n apart (n = 1, 2, 4, 8, 16, 32). BS_SWAPn is defined outside this file. */
> +static void dvbcsa_bs_matrix_transpose_64x(dvbcsa_bs_word_t *row)
> +{
> + int j;
> +
> +#if defined(__i386__)
> +
> + /* short of registers: four rows per step; stages 16 and 32 first, then 4 and 8, then 1 and 2 */
> +
> + for (j = 0; j < 16; j++)
> + {
> + dvbcsa_bs_word_t a0, a1, a2, a3;
> +
> + a0 = row[j + 0];
> + a1 = row[j + 16];
> + a2 = row[j + 32];
> + a3 = row[j + 48];
> +
> + BS_SWAP16(a0, a1);
> + BS_SWAP16(a2, a3);
> + BS_SWAP32(a0, a2);
> + BS_SWAP32(a1, a3);
> +
> + row[j + 0] = a0;
> + row[j + 16] = a1;
> + row[j + 32] = a2;
> + row[j + 48] = a3;
> + }
> +
> + for (j = 0; j < 64; j += 16)
> + {
> + int i;
> + for (i = 0; i < 4; i++)
> + {
> + dvbcsa_bs_word_t a0, a1, a2, a3;
> +
> + a0 = row[j + i + 0];
> + a1 = row[j + i + 4];
> + a2 = row[j + i + 8];
> + a3 = row[j + i + 12];
> +
> + BS_SWAP4(a0, a1);
> + BS_SWAP4(a2, a3);
> + BS_SWAP8(a0, a2);
> + BS_SWAP8(a1, a3);
> +
> + row[j + i + 0] = a0;
> + row[j + i + 4] = a1;
> + row[j + i + 8] = a2;
> + row[j + i + 12] = a3;
> + }
> + }
> +
> + for (j = 0; j < 64; j += 4)
> + {
> + dvbcsa_bs_word_t a0, a1, a2, a3;
> +
> + a0 = row[j + 0];
> + a1 = row[j + 1];
> + a2 = row[j + 2];
> + a3 = row[j + 3];
> +
> + BS_SWAP1(a0, a1);
> + BS_SWAP1(a2, a3);
> + BS_SWAP2(a0, a2);
> + BS_SWAP2(a1, a3);
> +
> + row[j + 0] = a0;
> + row[j + 1] = a1;
> + row[j + 2] = a2;
> + row[j + 3] = a3;
> + }
> +
> +#else
> + /* other targets: eight rows per step; stages 1, 2 and 4 on adjacent rows first, then 8, 16 and 32 on rows 8 apart */
> + for (j = 0; j < 64; j += 8)
> + {
> + dvbcsa_bs_word_t a0, a1, a2, a3, b0, b1, b2, b3;
> +
> + a0 = row[j + 0];
> + a1 = row[j + 1];
> + a2 = row[j + 2];
> + a3 = row[j + 3];
> +
> + b0 = row[j + 4];
> + b1 = row[j + 5];
> + b2 = row[j + 6];
> + b3 = row[j + 7];
> +
> + BS_SWAP1(a0, a1);
> + BS_SWAP1(a2, a3);
> + BS_SWAP1(b0, b1);
> + BS_SWAP1(b2, b3);
> +
> + BS_SWAP2(a0, a2);
> + BS_SWAP2(a1, a3);
> + BS_SWAP2(b0, b2);
> + BS_SWAP2(b1, b3);
> +
> + BS_SWAP4(a0, b0);
> + BS_SWAP4(a1, b1);
> + BS_SWAP4(a2, b2);
> + BS_SWAP4(a3, b3);
> +
> + row[j + 0] = a0;
> + row[j + 1] = a1;
> + row[j + 2] = a2;
> + row[j + 3] = a3;
> +
> + row[j + 4] = b0;
> + row[j + 5] = b1;
> + row[j + 6] = b2;
> + row[j + 7] = b3;
> + }
> +
> + for (j = 0; j < 8; j++)
> + {
> + dvbcsa_bs_word_t a0, a1, a2, a3, b0, b1, b2, b3;
> +
> + a0 = row[j + 0];
> + a1 = row[j + 8];
> + a2 = row[j + 16];
> + a3 = row[j + 24];
> +
> + b0 = row[j + 32];
> + b1 = row[j + 40];
> + b2 = row[j + 48];
> + b3 = row[j + 56];
> +
> + BS_SWAP8(a0, a1);
> + BS_SWAP8(a2, a3);
> + BS_SWAP8(b0, b1);
> + BS_SWAP8(b2, b3);
> +
> + BS_SWAP16(a0, a2);
> + BS_SWAP16(a1, a3);
> + BS_SWAP16(b0, b2);
> + BS_SWAP16(b1, b3);
> +
> + BS_SWAP32(a0, b0);
> + BS_SWAP32(a1, b1);
> + BS_SWAP32(a2, b2);
> + BS_SWAP32(a3, b3);
> +
> + row[j + 0] = a0;
> + row[j + 8] = a1;
> + row[j + 16] = a2;
> + row[j + 24] = a3;
> +
> + row[j + 32] = b0;
> + row[j + 40] = b1;
> + row[j + 48] = b2;
> + row[j + 56] = b3;
> + }
> +
> +#endif
> +}
> +
> +
> +/* 64 rows of 64 bits transposition (bytes transp. - 8x8 rotate counterclockwise) */
> +/* Copies the first 8 bytes of each packet into successive 64-bit slots of row, then transposes row in place. */
> +void dvbcsa_bs_stream_transpose_in(const struct dvbcsa_bs_batch_s *pcks, dvbcsa_bs_word_t *row)
> +{
> + uint64_t *p;
> + /* The slot pointer advances for every packet, so a packet shorter than 8 bytes leaves its slot unwritten. */
> + for (p = (uint64_t *)row; pcks->data; p++, pcks++) /* the entry whose data pointer is null ends the list */
> + {
> + if (pcks->len >= 8)
> + dvbcsa_copy_64((uint8_t *)p, pcks->data);
> + }
> +
> + dvbcsa_bs_matrix_transpose_64x(row);
> +}
> +
> +/* 8 rows of 64 bits transposition (bytes transp. - 8x8 rotate clockwise) */
> +/* Transposes row in place, then XORs 8 bytes per packet into pcks->data at index. NOTE(review): dvbcsa_bs_matrix_transpose_64x walks all 64 rows - confirm the row count stated above. */
> +void dvbcsa_bs_stream_transpose_out(const struct dvbcsa_bs_batch_s *pcks,
> + unsigned int index, dvbcsa_bs_word_t *row)
> +{
> + int i, j;
> + uint8_t *p;
> +
> + dvbcsa_bs_matrix_transpose_64x(row);
> + /* p walks row as bytes, 8 bytes per packet, whether or not the packet had room for all of them. */
> + for (p = (uint8_t *)row; pcks->data; pcks++)
> + {
> + if (index + 8 <= pcks->len)
> + {
> + dvbcsa_xor_64(pcks->data + index, p);
> + }
> + else
> + {
> + for (j = 0, i = index; i < pcks->len; i++, j++) /* tail: XOR only the bytes from index up to len; NOTE(review): i is a signed int assigned from unsigned index */
> + pcks->data[i] ^= p[j];
> + }
> + p += 8;
> + }
> +}
> +
> diff --git a/src/dvbcsa_bs_transpose_stream32.c b/src/dvbcsa_bs_transpose_stream32.c
> new file mode 100644
> index 0000000..fc2f847
> --- /dev/null
> +++ b/src/dvbcsa_bs_transpose_stream32.c
> @@ -0,0 +1,150 @@
> +/*
> +
> + This file is part of libdvbcsa.
> +
> + libdvbcsa is free software; you can redistribute it and/or modify
> + it under the terms of the GNU General Public License as published
> + by the Free Software Foundation; either version 2 of the License,
> + or (at your option) any later version.
> +
> + libdvbcsa is distributed in the hope that it will be useful, but
> + WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + General Public License for more details.
> +
> + You should have received a copy of the GNU General Public License
> + along with libdvbcsa; if not, write to the Free Software
> + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
> + 02111-1307 USA
> +
> + Based on FFdecsa, Copyright (C) 2003-2004 fatih89r
> +
> + (c) 2006-2008 Alexandre Becoulet <alexandre.becoulet at free.fr>
> +
> +*/
> +
> +#include "dvbcsa/dvbcsa.h"
> +#include "dvbcsa_bs.h"
> +#include "dvbcsa_bs_transpose.h"
> +
> +/*
> + Stream cipher transpose for dvbcsa_bs_word_t size = 32 bits.
> +*/
> +/* Reorders row[0..63] in place, four rows per step; every BS_SWAPn call pairs two rows n apart (n = 16, then 4 and 8, then 1 and 2). BS_SWAPn is defined outside this file. */
> +static void dvbcsa_bs_matrix_transpose_64x32(dvbcsa_bs_word_t *row)
> +{
> + int j;
> +
> + for (j = 0; j < 16; j++)
> + {
> + dvbcsa_bs_word_t a0, a1, a2, a3;
> +
> + a0 = row[j + 0];
> + a1 = row[j + 16];
> + a2 = row[j + 32];
> + a3 = row[j + 48];
> + /* only the 16 stage here: rows 0..31 and rows 32..63 are never exchanged with each other in this routine */
> + BS_SWAP16(a0, a1);
> + BS_SWAP16(a2, a3);
> +
> + row[j + 0] = a0;
> + row[j + 16] = a1;
> + row[j + 32] = a2;
> + row[j + 48] = a3;
> + }
> +
> + for (j = 0; j < 64; j += 16)
> + {
> + int i;
> + for (i = 0; i < 4; i++)
> + {
> + dvbcsa_bs_word_t a0, a1, a2, a3;
> +
> + a0 = row[j + i + 0];
> + a1 = row[j + i + 4];
> + a2 = row[j + i + 8];
> + a3 = row[j + i + 12];
> +
> + BS_SWAP4(a0, a1);
> + BS_SWAP4(a2, a3);
> + BS_SWAP8(a0, a2);
> + BS_SWAP8(a1, a3);
> +
> + row[j + i + 0] = a0;
> + row[j + i + 4] = a1;
> + row[j + i + 8] = a2;
> + row[j + i + 12] = a3;
> + }
> + }
> +
> + for (j = 0; j < 64; j += 4)
> + {
> + dvbcsa_bs_word_t a0, a1, a2, a3;
> +
> + a0 = row[j + 0];
> + a1 = row[j + 1];
> + a2 = row[j + 2];
> + a3 = row[j + 3];
> +
> + BS_SWAP1(a0, a1);
> + BS_SWAP1(a2, a3);
> + BS_SWAP2(a0, a2);
> + BS_SWAP2(a1, a3);
> +
> + row[j + 0] = a0;
> + row[j + 1] = a1;
> + row[j + 2] = a2;
> + row[j + 3] = a3;
> + }
> +}
> +
> +void dvbcsa_bs_stream_transpose_in(const struct dvbcsa_bs_batch_s *pcks, dvbcsa_bs_word_t *row)
> +{
> + int i;
> + /* Packet i: bytes 0-3 are copied to row[i] and bytes 4-7 to row[i + 32]; packets shorter than 8 bytes are skipped. Then row is transposed in place. */
> + for (i = 0; pcks[i].data; i++) /* the entry whose data pointer is null ends the list */
> + if (pcks[i].len >= 8)
> + {
> + dvbcsa_copy_32((uint8_t *)(row + i), pcks[i].data);
> + dvbcsa_copy_32((uint8_t *)(row + i + 32), pcks[i].data + 4); /* NOTE(review): literal 32 here, BS_BATCH_SIZE in transpose_out - presumably equal for 32-bit words; confirm */
> + }
> +
> + dvbcsa_bs_matrix_transpose_64x32(row);
> +}
> +
> +void dvbcsa_bs_stream_transpose_out(const struct dvbcsa_bs_batch_s *pcks,
> + unsigned int index, dvbcsa_bs_word_t *row)
> +{
> + int i, j;
> + uint8_t *p1, *p2;
> + /* Transposes row in place, then XORs up to 8 bytes per packet into pcks->data starting at index. */
> + dvbcsa_bs_matrix_transpose_64x32(row);
> + /* p1 starts at row[0] and p2 at row[BS_BATCH_SIZE]; both advance 4 bytes per packet. p1 feeds bytes index..index+3, p2 feeds index+4..index+7. */
> + p1 = (uint8_t *)row;
> + p2 = (uint8_t *)(row + BS_BATCH_SIZE);
> + for (; pcks->data; pcks++, p1 += 4, p2 += 4)
> + {
> + if (index + 4 <= pcks->len)
> + {
> + dvbcsa_xor_32(pcks->data + index, p1);
> + }
> + else
> + {
> + for (j = 0, i = index; i < pcks->len; i++, j++) /* tail: XOR only the bytes from index up to len */
> + pcks->data[i] ^= p1[j];
> + continue; /* the packet ends inside the first 4 bytes, so the p2 half is skipped */
> + }
> + if (index + 8 <= pcks->len)
> + {
> + dvbcsa_xor_32(pcks->data + index + 4, p2);
> + }
> + else
> + {
> + for (j = 0, i = index + 4; i < pcks->len; i++, j++) /* tail of the second half, from index + 4 up to len */
> + pcks->data[i] ^= p2[j];
> + continue; /* NOTE(review): already the last statement of the loop body, so this continue changes nothing */
> + }
> + }
> +
> +}
> +
> diff --git a/src/dvbcsa_pv.h b/src/dvbcsa_pv.h
> index d92bc98..d0ce8eb 100644
> --- a/src/dvbcsa_pv.h
> +++ b/src/dvbcsa_pv.h
> @@ -83,12 +83,26 @@ void dvbcsa_stream_xor (const dvbcsa_cw_t cw, const dvbcsa_block_t iv,
>
> void dvbcsa_key_schedule_block(const dvbcsa_cw_t cw, uint8_t * kk);
>
> +/* Target support for 32 and 64 bit unaligned memory access; the inline helpers below test these macros with #if defined / #ifdef. */
> +
> +#if defined(__i386__) || defined(__x86_64__)
> +#define DVBCSA_UNALIGNED_ACCESS_32 1
> +#define DVBCSA_UNALIGNED_ACCESS_64 1
> +#endif
> +
> +#if defined(__arm__) && defined(__ARM_FEATURE_UNALIGNED)
> +/* only 32 bit unaligned access is allowed for armv6, armv7, armv8, so DVBCSA_UNALIGNED_ACCESS_64 is left undefined here */
> +#define DVBCSA_UNALIGNED_ACCESS_32 1
> +#endif
> +
> DVBCSA_INLINE static inline void
> dvbcsa_xor_64 (uint8_t *b, const uint8_t *a)
> {
> -#if defined(__i386__) || defined(__x86_64__)
> - /* target support non aligned memory access */
> +#if defined(DVBCSA_UNALIGNED_ACCESS_64)
> *(uint64_t*)b ^= *(uint64_t*)a;
> +#elif defined(DVBCSA_UNALIGNED_ACCESS_32)
> + ((uint32_t *)b)[0] ^= ((uint32_t *)a)[0];
> + ((uint32_t *)b)[1] ^= ((uint32_t *)a)[1];
> #else
> unsigned int i;
>
> @@ -97,11 +111,55 @@ dvbcsa_xor_64 (uint8_t *b, const uint8_t *a)
> #endif
> }
>
> +DVBCSA_INLINE static inline void
> +dvbcsa_xor_32 (uint8_t *b, const uint8_t *a) /* XORs the 4 bytes at a into the 4 bytes at b */
> +{
> +#ifdef DVBCSA_UNALIGNED_ACCESS_32
> + /* target supports non aligned memory access: one 32-bit read-modify-write */
> + *(uint32_t*)b ^= *(uint32_t*)a; /* NOTE(review): the cast drops const from a and type-puns uint8_t storage as uint32_t; a memcpy-based form would avoid both */
> +#else
> + unsigned int i;
> + /* fallback: byte by byte, no alignment requirement */
> + for (i = 0; i < 4; i++)
> + b[i] ^= a[i];
> +#endif
> +}
> +
> +DVBCSA_INLINE static inline void
> +dvbcsa_copy_64 (uint8_t *b, const uint8_t *a) /* copies the 8 bytes at a to b, in memory order */
> +{
> +#if defined(DVBCSA_UNALIGNED_ACCESS_64)
> + *(uint64_t*)b = *(uint64_t*)a; /* one 64-bit access; NOTE(review): the cast drops const from a and type-puns uint8_t storage */
> +#elif defined(DVBCSA_UNALIGNED_ACCESS_32)
> + ((uint32_t *)b)[0] = ((uint32_t *)a)[0]; /* two 32-bit accesses when only 32-bit unaligned access is available */
> + ((uint32_t *)b)[1] = ((uint32_t *)a)[1];
> +#else
> + unsigned int i;
> + /* fallback: byte by byte, no alignment requirement */
> + for (i = 0; i < 8; i++)
> + b[i] = a[i];
> +#endif
> +}
> +
> +DVBCSA_INLINE static inline void
> +dvbcsa_copy_32 (uint8_t *b, const uint8_t *a) /* copies the 4 bytes at a to b, in memory order */
> +{
> +#ifdef DVBCSA_UNALIGNED_ACCESS_32
> + /* target supports non aligned memory access: one 32-bit load and store */
> + *(uint32_t*)b = *(uint32_t*)a; /* NOTE(review): the cast drops const from a and type-puns uint8_t storage as uint32_t; a memcpy-based form would avoid both */
> +#else
> + unsigned int i;
> + /* fallback: byte by byte, no alignment requirement */
> + for (i = 0; i < 4; i++)
> + b[i] = a[i];
> +#endif
> +}
> +
> DVBCSA_INLINE static inline uint32_t
> dvbcsa_load_le32(const uint8_t *p)
> {
> -#if defined(__i386__) || defined(__x86_64__)
> - /* target support non aligned le memory access */
> +#if defined(DVBCSA_UNALIGNED_ACCESS_32) && defined(DVBCSA_ENDIAN_LITTLE)
> + /* target supports non aligned le memory access */
> return *(uint32_t*)p;
> #else
> return ((uint32_t)p[3] << 24) |
> @@ -114,8 +172,8 @@ dvbcsa_load_le32(const uint8_t *p)
> DVBCSA_INLINE static inline uint64_t
> dvbcsa_load_le64(const uint8_t *p)
> {
> -#if defined(__i386__) || defined(__x86_64__)
> - /* target support non aligned le memory access */
> +#if defined(DVBCSA_UNALIGNED_ACCESS_64) && defined(DVBCSA_ENDIAN_LITTLE)
> + /* target supports non aligned le memory access */
> return *(uint64_t*)p;
> #else
> return (uint64_t)( ((uint64_t)p[7] << 56) |
> @@ -133,8 +191,8 @@ dvbcsa_load_le64(const uint8_t *p)
> DVBCSA_INLINE static inline void
> dvbcsa_store_le32(uint8_t *p, const uint32_t w)
> {
> -#if defined(__i386__) || defined(__x86_64__)
> - /* target support non aligned le memory access */
> +#if defined(DVBCSA_UNALIGNED_ACCESS_32) && defined(DVBCSA_ENDIAN_LITTLE)
> + /* target supports non aligned le memory access */
> *(uint32_t*)p = w;
> #else
> p[3] = (w >> 24);
> @@ -147,8 +205,8 @@ dvbcsa_store_le32(uint8_t *p, const uint32_t w)
> DVBCSA_INLINE static inline void
> dvbcsa_store_le64(uint8_t *p, const uint64_t w)
> {
> -#if defined(__i386__) || defined(__x86_64__)
> - /* target support non aligned le memory access */
> +#if defined(DVBCSA_UNALIGNED_ACCESS_64) && defined(DVBCSA_ENDIAN_LITTLE)
> + /* target supports non aligned le memory access */
> *(uint64_t*)p = w;
> #else
> p[7] = (w >> 56);
> --
> 1.9.1
>
More information about the vlc-devel
mailing list