Commit 4460a860 authored by J.R. Mauro's avatar J.R. Mauro Committed by Greg Kroah-Hartman

Staging: Lindent the echo driver

Lindent drivers/staging/echo*

Signed-off by: J.R. Mauro <jrm8005@gmail.com>
Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@suse.de>
parent 786ed801
......@@ -36,14 +36,15 @@
\return The bit number of the highest set bit, or -1 if the word is zero. */
static __inline__ int top_bit(unsigned int bits)
{
int res;
__asm__ (" xorl %[res],%[res];\n"
" decl %[res];\n"
" bsrl %[bits],%[res]\n"
: [res] "=&r" (res)
: [bits] "rm" (bits));
return res;
int res;
__asm__(" xorl %[res],%[res];\n"
" decl %[res];\n"
" bsrl %[bits],%[res]\n"
:[res] "=&r" (res)
:[bits] "rm"(bits)
);
return res;
}
/*! \brief Find the bit position of the lowest set bit in a word
......@@ -51,84 +52,75 @@ static __inline__ int top_bit(unsigned int bits)
\return The bit number of the lowest set bit, or -1 if the word is zero. */
static __inline__ int bottom_bit(unsigned int bits)
{
int res;
__asm__ (" xorl %[res],%[res];\n"
" decl %[res];\n"
" bsfl %[bits],%[res]\n"
: [res] "=&r" (res)
: [bits] "rm" (bits));
return res;
int res;
__asm__(" xorl %[res],%[res];\n"
" decl %[res];\n"
" bsfl %[bits],%[res]\n"
:[res] "=&r" (res)
:[bits] "rm"(bits)
);
return res;
}
#else
static __inline__ int top_bit(unsigned int bits)
{
int i;
if (bits == 0)
return -1;
i = 0;
if (bits & 0xFFFF0000)
{
bits &= 0xFFFF0000;
i += 16;
}
if (bits & 0xFF00FF00)
{
bits &= 0xFF00FF00;
i += 8;
}
if (bits & 0xF0F0F0F0)
{
bits &= 0xF0F0F0F0;
i += 4;
}
if (bits & 0xCCCCCCCC)
{
bits &= 0xCCCCCCCC;
i += 2;
}
if (bits & 0xAAAAAAAA)
{
bits &= 0xAAAAAAAA;
i += 1;
}
return i;
int i;
if (bits == 0)
return -1;
i = 0;
if (bits & 0xFFFF0000) {
bits &= 0xFFFF0000;
i += 16;
}
if (bits & 0xFF00FF00) {
bits &= 0xFF00FF00;
i += 8;
}
if (bits & 0xF0F0F0F0) {
bits &= 0xF0F0F0F0;
i += 4;
}
if (bits & 0xCCCCCCCC) {
bits &= 0xCCCCCCCC;
i += 2;
}
if (bits & 0xAAAAAAAA) {
bits &= 0xAAAAAAAA;
i += 1;
}
return i;
}
static __inline__ int bottom_bit(unsigned int bits)
{
int i;
if (bits == 0)
return -1;
i = 32;
if (bits & 0x0000FFFF)
{
bits &= 0x0000FFFF;
i -= 16;
}
if (bits & 0x00FF00FF)
{
bits &= 0x00FF00FF;
i -= 8;
}
if (bits & 0x0F0F0F0F)
{
bits &= 0x0F0F0F0F;
i -= 4;
}
if (bits & 0x33333333)
{
bits &= 0x33333333;
i -= 2;
}
if (bits & 0x55555555)
{
bits &= 0x55555555;
i -= 1;
}
return i;
int i;
if (bits == 0)
return -1;
i = 32;
if (bits & 0x0000FFFF) {
bits &= 0x0000FFFF;
i -= 16;
}
if (bits & 0x00FF00FF) {
bits &= 0x00FF00FF;
i -= 8;
}
if (bits & 0x0F0F0F0F) {
bits &= 0x0F0F0F0F;
i -= 4;
}
if (bits & 0x33333333) {
bits &= 0x33333333;
i -= 2;
}
if (bits & 0x55555555) {
bits &= 0x55555555;
i -= 1;
}
return i;
}
#endif
......@@ -138,13 +130,14 @@ static __inline__ int bottom_bit(unsigned int bits)
static __inline__ uint8_t bit_reverse8(uint8_t x)
{
#if defined(__i386__) || defined(__x86_64__)
/* If multiply is fast */
return ((x*0x0802U & 0x22110U) | (x*0x8020U & 0x88440U))*0x10101U >> 16;
/* If multiply is fast */
return ((x * 0x0802U & 0x22110U) | (x * 0x8020U & 0x88440U)) *
0x10101U >> 16;
#else
/* If multiply is slow, but we have a barrel shifter */
x = (x >> 4) | (x << 4);
x = ((x & 0xCC) >> 2) | ((x & 0x33) << 2);
return ((x & 0xAA) >> 1) | ((x & 0x55) << 1);
/* If multiply is slow, but we have a barrel shifter */
x = (x >> 4) | (x << 4);
x = ((x & 0xCC) >> 2) | ((x & 0x33) << 2);
return ((x & 0xAA) >> 1) | ((x & 0x55) << 1);
#endif
}
......@@ -184,7 +177,7 @@ uint16_t make_mask16(uint16_t x);
\return The word with the single set bit. */
static __inline__ uint32_t least_significant_one32(uint32_t x)
{
return (x & (-(int32_t) x));
return (x & (-(int32_t) x));
}
/*! \brief Find the most significant one in a word, and return a word
......@@ -194,10 +187,10 @@ static __inline__ uint32_t least_significant_one32(uint32_t x)
static __inline__ uint32_t most_significant_one32(uint32_t x)
{
#if defined(__i386__) || defined(__x86_64__)
return 1 << top_bit(x);
return 1 << top_bit(x);
#else
x = make_mask32(x);
return (x ^ (x >> 1));
x = make_mask32(x);
return (x ^ (x >> 1));
#endif
}
......@@ -206,8 +199,8 @@ static __inline__ uint32_t most_significant_one32(uint32_t x)
\return 1 for odd, or 0 for even. */
static __inline__ int parity8(uint8_t x)
{
x = (x ^ (x >> 4)) & 0x0F;
return (0x6996 >> x) & 1;
x = (x ^ (x >> 4)) & 0x0F;
return (0x6996 >> x) & 1;
}
/*! \brief Find the parity of a 16 bit word.
......@@ -215,9 +208,9 @@ static __inline__ int parity8(uint8_t x)
\return 1 for odd, or 0 for even. */
static __inline__ int parity16(uint16_t x)
{
x ^= (x >> 8);
x = (x ^ (x >> 4)) & 0x0F;
return (0x6996 >> x) & 1;
x ^= (x >> 8);
x = (x ^ (x >> 4)) & 0x0F;
return (0x6996 >> x) & 1;
}
/*! \brief Find the parity of a 32 bit word.
......@@ -225,10 +218,10 @@ static __inline__ int parity16(uint16_t x)
\return 1 for odd, or 0 for even. */
static __inline__ int parity32(uint32_t x)
{
x ^= (x >> 16);
x ^= (x >> 8);
x = (x ^ (x >> 4)) & 0x0F;
return (0x6996 >> x) & 1;
x ^= (x >> 16);
x ^= (x >> 8);
x = (x ^ (x >> 4)) & 0x0F;
return (0x6996 >> x) & 1;
}
#endif
......
......@@ -74,7 +74,6 @@
Steve also has some nice notes on echo cancellers in echo.h
References:
[1] Ochiai, Areseki, and Ogihara, "Echo Canceller with Two Echo
......@@ -105,7 +104,7 @@
Mark, Pawel, and Pavel.
*/
#include <linux/kernel.h> /* We're doing kernel work */
#include <linux/kernel.h> /* We're doing kernel work */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
......@@ -115,8 +114,8 @@
#define MIN_TX_POWER_FOR_ADAPTION 64
#define MIN_RX_POWER_FOR_ADAPTION 64
#define DTD_HANGOVER 600 /* 600 samples, or 75ms */
#define DC_LOG2BETA 3 /* log2() of DC filter Beta */
#define DTD_HANGOVER 600 /* 600 samples, or 75ms */
#define DC_LOG2BETA 3 /* log2() of DC filter Beta */
/*-----------------------------------------------------------------------*\
FUNCTIONS
......@@ -124,59 +123,58 @@
/* adapting coeffs using the traditional stochastic descent (N)LMS algorithm */
#ifdef __bfin__
static void __inline__ lms_adapt_bg(struct oslec_state *ec, int clean, int shift)
static void __inline__ lms_adapt_bg(struct oslec_state *ec, int clean,
int shift)
{
int i, j;
int offset1;
int offset2;
int factor;
int exp;
int16_t *phist;
int n;
if (shift > 0)
factor = clean << shift;
else
factor = clean >> -shift;
/* Update the FIR taps */
offset2 = ec->curr_pos;
offset1 = ec->taps - offset2;
phist = &ec->fir_state_bg.history[offset2];
/* st: and en: help us locate the assembler in echo.s */
//asm("st:");
n = ec->taps;
for (i = 0, j = offset2; i < n; i++, j++)
{
exp = *phist++ * factor;
ec->fir_taps16[1][i] += (int16_t) ((exp+(1<<14)) >> 15);
}
//asm("en:");
/* Note the asm for the inner loop above generated by Blackfin gcc
4.1.1 is pretty good (note even parallel instructions used):
R0 = W [P0++] (X);
R0 *= R2;
R0 = R0 + R3 (NS) ||
R1 = W [P1] (X) ||
nop;
R0 >>>= 15;
R0 = R0 + R1;
W [P1++] = R0;
A block based update algorithm would be much faster but the
above can't be improved on much. Every instruction saved in
the loop above is 2 MIPs/ch! The for loop above is where the
Blackfin spends most of it's time - about 17 MIPs/ch measured
with speedtest.c with 256 taps (32ms). Write-back and
Write-through cache gave about the same performance.
*/
int i, j;
int offset1;
int offset2;
int factor;
int exp;
int16_t *phist;
int n;
if (shift > 0)
factor = clean << shift;
else
factor = clean >> -shift;
/* Update the FIR taps */
offset2 = ec->curr_pos;
offset1 = ec->taps - offset2;
phist = &ec->fir_state_bg.history[offset2];
/* st: and en: help us locate the assembler in echo.s */
//asm("st:");
n = ec->taps;
for (i = 0, j = offset2; i < n; i++, j++) {
exp = *phist++ * factor;
ec->fir_taps16[1][i] += (int16_t) ((exp + (1 << 14)) >> 15);
}
//asm("en:");
/* Note the asm for the inner loop above generated by Blackfin gcc
4.1.1 is pretty good (note even parallel instructions used):
R0 = W [P0++] (X);
R0 *= R2;
R0 = R0 + R3 (NS) ||
R1 = W [P1] (X) ||
nop;
R0 >>>= 15;
R0 = R0 + R1;
W [P1++] = R0;
A block based update algorithm would be much faster but the
above can't be improved on much. Every instruction saved in
the loop above is 2 MIPs/ch! The for loop above is where the
Blackfin spends most of it's time - about 17 MIPs/ch measured
with speedtest.c with 256 taps (32ms). Write-back and
Write-through cache gave about the same performance.
*/
}
/*
......@@ -198,94 +196,90 @@ static void __inline__ lms_adapt_bg(struct oslec_state *ec, int clean, int shift
*/
#else
static __inline__ void lms_adapt_bg(struct oslec_state *ec, int clean, int shift)
static __inline__ void lms_adapt_bg(struct oslec_state *ec, int clean,
int shift)
{
int i;
int offset1;
int offset2;
int factor;
int exp;
if (shift > 0)
factor = clean << shift;
else
factor = clean >> -shift;
/* Update the FIR taps */
offset2 = ec->curr_pos;
offset1 = ec->taps - offset2;
for (i = ec->taps - 1; i >= offset1; i--)
{
exp = (ec->fir_state_bg.history[i - offset1]*factor);
ec->fir_taps16[1][i] += (int16_t) ((exp+(1<<14)) >> 15);
}
for ( ; i >= 0; i--)
{
exp = (ec->fir_state_bg.history[i + offset2]*factor);
ec->fir_taps16[1][i] += (int16_t) ((exp+(1<<14)) >> 15);
}
int i;
int offset1;
int offset2;
int factor;
int exp;
if (shift > 0)
factor = clean << shift;
else
factor = clean >> -shift;
/* Update the FIR taps */
offset2 = ec->curr_pos;
offset1 = ec->taps - offset2;
for (i = ec->taps - 1; i >= offset1; i--) {
exp = (ec->fir_state_bg.history[i - offset1] * factor);
ec->fir_taps16[1][i] += (int16_t) ((exp + (1 << 14)) >> 15);
}
for (; i >= 0; i--) {
exp = (ec->fir_state_bg.history[i + offset2] * factor);
ec->fir_taps16[1][i] += (int16_t) ((exp + (1 << 14)) >> 15);
}
}
#endif
struct oslec_state *oslec_create(int len, int adaption_mode)
{
struct oslec_state *ec;
int i;
ec = kzalloc(sizeof(*ec), GFP_KERNEL);
if (!ec)
return NULL;
ec->taps = len;
ec->log2taps = top_bit(len);
ec->curr_pos = ec->taps - 1;
for (i = 0; i < 2; i++) {
ec->fir_taps16[i] = kcalloc(ec->taps, sizeof(int16_t), GFP_KERNEL);
if (!ec->fir_taps16[i])
goto error_oom;
}
fir16_create(&ec->fir_state,
ec->fir_taps16[0],
ec->taps);
fir16_create(&ec->fir_state_bg,
ec->fir_taps16[1],
ec->taps);
for(i=0; i<5; i++) {
ec->xvtx[i] = ec->yvtx[i] = ec->xvrx[i] = ec->yvrx[i] = 0;
}
ec->cng_level = 1000;
oslec_adaption_mode(ec, adaption_mode);
ec->snapshot = kcalloc(ec->taps, sizeof(int16_t), GFP_KERNEL);
if (!ec->snapshot)
goto error_oom;
ec->cond_met = 0;
ec->Pstates = 0;
ec->Ltxacc = ec->Lrxacc = ec->Lcleanacc = ec->Lclean_bgacc = 0;
ec->Ltx = ec->Lrx = ec->Lclean = ec->Lclean_bg = 0;
ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0;
ec->Lbgn = ec->Lbgn_acc = 0;
ec->Lbgn_upper = 200;
ec->Lbgn_upper_acc = ec->Lbgn_upper << 13;
return ec;
error_oom:
for (i = 0; i < 2; i++)
kfree(ec->fir_taps16[i]);
kfree(ec);
return NULL;
struct oslec_state *ec;
int i;
ec = kzalloc(sizeof(*ec), GFP_KERNEL);
if (!ec)
return NULL;
ec->taps = len;
ec->log2taps = top_bit(len);
ec->curr_pos = ec->taps - 1;
for (i = 0; i < 2; i++) {
ec->fir_taps16[i] =
kcalloc(ec->taps, sizeof(int16_t), GFP_KERNEL);
if (!ec->fir_taps16[i])
goto error_oom;
}
fir16_create(&ec->fir_state, ec->fir_taps16[0], ec->taps);
fir16_create(&ec->fir_state_bg, ec->fir_taps16[1], ec->taps);
for (i = 0; i < 5; i++) {
ec->xvtx[i] = ec->yvtx[i] = ec->xvrx[i] = ec->yvrx[i] = 0;
}
ec->cng_level = 1000;
oslec_adaption_mode(ec, adaption_mode);
ec->snapshot = kcalloc(ec->taps, sizeof(int16_t), GFP_KERNEL);
if (!ec->snapshot)
goto error_oom;
ec->cond_met = 0;
ec->Pstates = 0;
ec->Ltxacc = ec->Lrxacc = ec->Lcleanacc = ec->Lclean_bgacc = 0;
ec->Ltx = ec->Lrx = ec->Lclean = ec->Lclean_bg = 0;
ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0;
ec->Lbgn = ec->Lbgn_acc = 0;
ec->Lbgn_upper = 200;
ec->Lbgn_upper_acc = ec->Lbgn_upper << 13;
return ec;
error_oom:
for (i = 0; i < 2; i++)
kfree(ec->fir_taps16[i]);
kfree(ec);
return NULL;
}
EXPORT_SYMBOL_GPL(oslec_create);
void oslec_free(struct oslec_state *ec)
......@@ -294,293 +288,300 @@ void oslec_free(struct oslec_state *ec)
fir16_free(&ec->fir_state);
fir16_free(&ec->fir_state_bg);
for (i = 0; i < 2; i++)
for (i = 0; i < 2; i++)
kfree(ec->fir_taps16[i]);
kfree(ec->snapshot);
kfree(ec);
}
EXPORT_SYMBOL_GPL(oslec_free);
void oslec_adaption_mode(struct oslec_state *ec, int adaption_mode)
{
ec->adaption_mode = adaption_mode;
ec->adaption_mode = adaption_mode;
}
EXPORT_SYMBOL_GPL(oslec_adaption_mode);
void oslec_flush(struct oslec_state *ec)
{
int i;
int i;
ec->Ltxacc = ec->Lrxacc = ec->Lcleanacc = ec->Lclean_bgacc = 0;
ec->Ltx = ec->Lrx = ec->Lclean = ec->Lclean_bg = 0;
ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0;
ec->Ltxacc = ec->Lrxacc = ec->Lcleanacc = ec->Lclean_bgacc = 0;
ec->Ltx = ec->Lrx = ec->Lclean = ec->Lclean_bg = 0;
ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0;
ec->Lbgn = ec->Lbgn_acc = 0;
ec->Lbgn_upper = 200;
ec->Lbgn_upper_acc = ec->Lbgn_upper << 13;
ec->Lbgn = ec->Lbgn_acc = 0;
ec->Lbgn_upper = 200;
ec->Lbgn_upper_acc = ec->Lbgn_upper << 13;
ec->nonupdate_dwell = 0;
ec->nonupdate_dwell = 0;
fir16_flush(&ec->fir_state);
fir16_flush(&ec->fir_state_bg);
ec->fir_state.curr_pos = ec->taps - 1;
ec->fir_state_bg.curr_pos = ec->taps - 1;
for (i = 0; i < 2; i++)
memset(ec->fir_taps16[i], 0, ec->taps*sizeof(int16_t));
fir16_flush(&ec->fir_state);
fir16_flush(&ec->fir_state_bg);
ec->fir_state.curr_pos = ec->taps - 1;
ec->fir_state_bg.curr_pos = ec->taps - 1;
for (i = 0; i < 2; i++)
memset(ec->fir_taps16[i], 0, ec->taps * sizeof(int16_t));
ec->curr_pos = ec->taps - 1;
ec->Pstates = 0;
ec->curr_pos = ec->taps - 1;
ec->Pstates = 0;
}
EXPORT_SYMBOL_GPL(oslec_flush);
void oslec_snapshot(struct oslec_state *ec) {
memcpy(ec->snapshot, ec->fir_taps16[0], ec->taps*sizeof(int16_t));
void oslec_snapshot(struct oslec_state *ec)
{
memcpy(ec->snapshot, ec->fir_taps16[0], ec->taps * sizeof(int16_t));
}
EXPORT_SYMBOL_GPL(oslec_snapshot);
/* Dual Path Echo Canceller ------------------------------------------------*/
int16_t oslec_update(struct oslec_state *ec, int16_t tx, int16_t rx)
{
int32_t echo_value;
int clean_bg;
int tmp, tmp1;
/* Input scaling was found be required to prevent problems when tx
starts clipping. Another possible way to handle this would be the
filter coefficent scaling. */
ec->tx = tx; ec->rx = rx;
tx >>=1;
rx >>=1;
/*
Filter DC, 3dB point is 160Hz (I think), note 32 bit precision required
otherwise values do not track down to 0. Zero at DC, Pole at (1-Beta)
only real axis. Some chip sets (like Si labs) don't need
this, but something like a $10 X100P card does. Any DC really slows
down convergence.
Note: removes some low frequency from the signal, this reduces
the speech quality when listening to samples through headphones
but may not be obvious through a telephone handset.
Note that the 3dB frequency in radians is approx Beta, e.g. for
Beta = 2^(-3) = 0.125, 3dB freq is 0.125 rads = 159Hz.
*/
if (ec->adaption_mode & ECHO_CAN_USE_RX_HPF) {
tmp = rx << 15;
int32_t echo_value;
int clean_bg;
int tmp, tmp1;
/* Input scaling was found be required to prevent problems when tx
starts clipping. Another possible way to handle this would be the
filter coefficent scaling. */
ec->tx = tx;
ec->rx = rx;
tx >>= 1;
rx >>= 1;
/*
Filter DC, 3dB point is 160Hz (I think), note 32 bit precision required
otherwise values do not track down to 0. Zero at DC, Pole at (1-Beta)
only real axis. Some chip sets (like Si labs) don't need
this, but something like a $10 X100P card does. Any DC really slows
down convergence.
Note: removes some low frequency from the signal, this reduces
the speech quality when listening to samples through headphones
but may not be obvious through a telephone handset.
Note that the 3dB frequency in radians is approx Beta, e.g. for
Beta = 2^(-3) = 0.125, 3dB freq is 0.125 rads = 159Hz.
*/
if (ec->adaption_mode & ECHO_CAN_USE_RX_HPF) {
tmp = rx << 15;
#if 1
/* Make sure the gain of the HPF is 1.0. This can still saturate a little under
impulse conditions, and it might roll to 32768 and need clipping on sustained peak
level signals. However, the scale of such clipping is small, and the error due to
any saturation should not markedly affect the downstream processing. */
tmp -= (tmp >> 4);
/* Make sure the gain of the HPF is 1.0. This can still saturate a little under
impulse conditions, and it might roll to 32768 and need clipping on sustained peak
level signals. However, the scale of such clipping is small, and the error due to
any saturation should not markedly affect the downstream processing. */
tmp -= (tmp >> 4);
#endif
ec->rx_1 += -(ec->rx_1>>DC_LOG2BETA) + tmp - ec->rx_2;
ec->rx_1 += -(ec->rx_1 >> DC_LOG2BETA) + tmp - ec->rx_2;
/* hard limit filter to prevent clipping. Note that at this stage
rx should be limited to +/- 16383 due to right shift above */
tmp1 = ec->rx_1 >> 15;
if (tmp1 > 16383)
tmp1 = 16383;
if (tmp1 < -16383)
tmp1 = -16383;
rx = tmp1;
ec->rx_2 = tmp;
}
/* hard limit filter to prevent clipping. Note that at this stage
rx should be limited to +/- 16383 due to right shift above */
tmp1 = ec->rx_1 >> 15;
if (tmp1 > 16383) tmp1 = 16383;
if (tmp1 < -16383) tmp1 = -16383;
rx = tmp1;
ec->rx_2 = tmp;
}
/* Block average of power in the filter states. Used for
adaption power calculation. */
/* Block average of power in the filter states. Used for
adaption power calculation. */
{
int new, old;
/* efficient "out with the old and in with the new" algorithm so
we don't have to recalculate over the whole block of
samples. */
new = (int)tx *(int)tx;
old = (int)ec->fir_state.history[ec->fir_state.curr_pos] *
(int)ec->fir_state.history[ec->fir_state.curr_pos];
ec->Pstates +=
((new - old) + (1 << ec->log2taps)) >> ec->log2taps;
if (ec->Pstates < 0)
ec->Pstates = 0;
}
{
int new, old;
/* Calculate short term average levels using simple single pole IIRs */
/* efficient "out with the old and in with the new" algorithm so
we don't have to recalculate over the whole block of
samples. */
new = (int)tx * (int)tx;
old = (int)ec->fir_state.history[ec->fir_state.curr_pos] *
(int)ec->fir_state.history[ec->fir_state.curr_pos];
ec->Pstates += ((new - old) + (1<<ec->log2taps)) >> ec->log2taps;
if (ec->Pstates < 0) ec->Pstates = 0;
}
/* Calculate short term average levels using simple single pole IIRs */
ec->Ltxacc += abs(tx) - ec->Ltx;
ec->Ltx = (ec->Ltxacc + (1<<4)) >> 5;
ec->Lrxacc += abs(rx) - ec->Lrx;
ec->Lrx = (ec->Lrxacc + (1<<4)) >> 5;
/* Foreground filter ---------------------------------------------------*/
ec->fir_state.coeffs = ec->fir_taps16[0];
echo_value = fir16(&ec->fir_state, tx);
ec->clean = rx - echo_value;
ec->Lcleanacc += abs(ec->clean) - ec->Lclean;
ec->Lclean = (ec->Lcleanacc + (1<<4)) >> 5;
/* Background filter ---------------------------------------------------*/
echo_value = fir16(&ec->fir_state_bg, tx);
clean_bg = rx - echo_value;
ec->Lclean_bgacc += abs(clean_bg) - ec->Lclean_bg;
ec->Lclean_bg = (ec->Lclean_bgacc + (1<<4)) >> 5;
/* Background Filter adaption -----------------------------------------*/
/* Almost always adap bg filter, just simple DT and energy
detection to minimise adaption in cases of strong double talk.
However this is not critical for the dual path algorithm.
*/
ec->factor = 0;
ec->shift = 0;
if ((ec->nonupdate_dwell == 0)) {
int P, logP, shift;
/* Determine:
f = Beta * clean_bg_rx/P ------ (1)
where P is the total power in the filter states.
The Boffins have shown that if we obey (1) we converge
quickly and avoid instability.
The correct factor f must be in Q30, as this is the fixed
point format required by the lms_adapt_bg() function,
therefore the scaled version of (1) is:
(2^30) * f = (2^30) * Beta * clean_bg_rx/P
factor = (2^30) * Beta * clean_bg_rx/P ----- (2)
We have chosen Beta = 0.25 by experiment, so:
factor = (2^30) * (2^-2) * clean_bg_rx/P
(30 - 2 - log2(P))
factor = clean_bg_rx 2 ----- (3)
To avoid a divide we approximate log2(P) as top_bit(P),
which returns the position of the highest non-zero bit in
P. This approximation introduces an error as large as a
factor of 2, but the algorithm seems to handle it OK.
Come to think of it a divide may not be a big deal on a
modern DSP, so its probably worth checking out the cycles
for a divide versus a top_bit() implementation.
*/
P = MIN_TX_POWER_FOR_ADAPTION + ec->Pstates;
logP = top_bit(P) + ec->log2taps;
shift = 30 - 2 - logP;
ec->shift = shift;
lms_adapt_bg(ec, clean_bg, shift);
}
/* very simple DTD to make sure we dont try and adapt with strong
near end speech */
ec->adapt = 0;
if ((ec->Lrx > MIN_RX_POWER_FOR_ADAPTION) && (ec->Lrx > ec->Ltx))
ec->nonupdate_dwell = DTD_HANGOVER;
if (ec->nonupdate_dwell)
ec->nonupdate_dwell--;
ec->Ltxacc += abs(tx) - ec->Ltx;
ec->Ltx = (ec->Ltxacc + (1 << 4)) >> 5;
ec->Lrxacc += abs(rx) - ec->Lrx;
ec->Lrx = (ec->Lrxacc + (1 << 4)) >> 5;
/* Transfer logic ------------------------------------------------------*/
/* Foreground filter --------------------------------------------------- */
/* These conditions are from the dual path paper [1], I messed with
them a bit to improve performance. */
ec->fir_state.coeffs = ec->fir_taps16[0];
echo_value = fir16(&ec->fir_state, tx);
ec->clean = rx - echo_value;
ec->Lcleanacc += abs(ec->clean) - ec->Lclean;
ec->Lclean = (ec->Lcleanacc + (1 << 4)) >> 5;
if ((ec->adaption_mode & ECHO_CAN_USE_ADAPTION) &&
(ec->nonupdate_dwell == 0) &&
(8*ec->Lclean_bg < 7*ec->Lclean) /* (ec->Lclean_bg < 0.875*ec->Lclean) */ &&
(8*ec->Lclean_bg < ec->Ltx) /* (ec->Lclean_bg < 0.125*ec->Ltx) */ )
{
if (ec->cond_met == 6) {
/* BG filter has had better results for 6 consecutive samples */
ec->adapt = 1;
memcpy(ec->fir_taps16[0], ec->fir_taps16[1], ec->taps*sizeof(int16_t));
}
else
ec->cond_met++;
}
else
ec->cond_met = 0;
/* Background filter --------------------------------------------------- */
/* Non-Linear Processing ---------------------------------------------------*/
echo_value = fir16(&ec->fir_state_bg, tx);
clean_bg = rx - echo_value;
ec->Lclean_bgacc += abs(clean_bg) - ec->Lclean_bg;
ec->Lclean_bg = (ec->Lclean_bgacc + (1 << 4)) >> 5;
ec->clean_nlp = ec->clean;
if (ec->adaption_mode & ECHO_CAN_USE_NLP)
{
/* Non-linear processor - a fancy way to say "zap small signals, to avoid
residual echo due to (uLaw/ALaw) non-linearity in the channel.". */
/* Background Filter adaption ----------------------------------------- */
if ((16*ec->Lclean < ec->Ltx))
{
/* Our e/c has improved echo by at least 24 dB (each factor of 2 is 6dB,
so 2*2*2*2=16 is the same as 6+6+6+6=24dB) */
if (ec->adaption_mode & ECHO_CAN_USE_CNG)
{
ec->cng_level = ec->Lbgn;
/* Very elementary comfort noise generation. Just random
numbers rolled off very vaguely Hoth-like. DR: This
noise doesn't sound quite right to me - I suspect there
are some overlfow issues in the filtering as it's too
"crackly". TODO: debug this, maybe just play noise at
high level or look at spectrum.
*/
ec->cng_rndnum = 1664525U*ec->cng_rndnum + 1013904223U;
ec->cng_filter = ((ec->cng_rndnum & 0xFFFF) - 32768 + 5*ec->cng_filter) >> 3;
ec->clean_nlp = (ec->cng_filter*ec->cng_level*8) >> 14;
}
else if (ec->adaption_mode & ECHO_CAN_USE_CLIP)
{
/* This sounds much better than CNG */
if (ec->clean_nlp > ec->Lbgn)
ec->clean_nlp = ec->Lbgn;
if (ec->clean_nlp < -ec->Lbgn)
ec->clean_nlp = -ec->Lbgn;
/* Almost always adap bg filter, just simple DT and energy
detection to minimise adaption in cases of strong double talk.
However this is not critical for the dual path algorithm.
*/
ec->factor = 0;
ec->shift = 0;
if ((ec->nonupdate_dwell == 0)) {
int P, logP, shift;
/* Determine:
f = Beta * clean_bg_rx/P ------ (1)
where P is the total power in the filter states.
The Boffins have shown that if we obey (1) we converge
quickly and avoid instability.
The correct factor f must be in Q30, as this is the fixed
point format required by the lms_adapt_bg() function,
therefore the scaled version of (1) is:
(2^30) * f = (2^30) * Beta * clean_bg_rx/P
factor = (2^30) * Beta * clean_bg_rx/P ----- (2)
We have chosen Beta = 0.25 by experiment, so:
factor = (2^30) * (2^-2) * clean_bg_rx/P
(30 - 2 - log2(P))
factor = clean_bg_rx 2 ----- (3)
To avoid a divide we approximate log2(P) as top_bit(P),
which returns the position of the highest non-zero bit in
P. This approximation introduces an error as large as a
factor of 2, but the algorithm seems to handle it OK.
Come to think of it a divide may not be a big deal on a
modern DSP, so its probably worth checking out the cycles
for a divide versus a top_bit() implementation.
*/
P = MIN_TX_POWER_FOR_ADAPTION + ec->Pstates;
logP = top_bit(P) + ec->log2taps;
shift = 30 - 2 - logP;
ec->shift = shift;
lms_adapt_bg(ec, clean_bg, shift);
}
else
{
/* just mute the residual, doesn't sound very good, used mainly
in G168 tests */
ec->clean_nlp = 0;
}
}
else {
/* Background noise estimator. I tried a few algorithms
here without much luck. This very simple one seems to
work best, we just average the level using a slow (1 sec
time const) filter if the current level is less than a
(experimentally derived) constant. This means we dont
include high level signals like near end speech. When
combined with CNG or especially CLIP seems to work OK.
*/
if (ec->Lclean < 40) {
ec->Lbgn_acc += abs(ec->clean) - ec->Lbgn;
ec->Lbgn = (ec->Lbgn_acc + (1<<11)) >> 12;
}
}
}
/* Roll around the taps buffer */
if (ec->curr_pos <= 0)
ec->curr_pos = ec->taps;
ec->curr_pos--;
if (ec->adaption_mode & ECHO_CAN_DISABLE)
ec->clean_nlp = rx;
/* Output scaled back up again to match input scaling */
return (int16_t) ec->clean_nlp << 1;
/* very simple DTD to make sure we dont try and adapt with strong
near end speech */
ec->adapt = 0;
if ((ec->Lrx > MIN_RX_POWER_FOR_ADAPTION) && (ec->Lrx > ec->Ltx))
ec->nonupdate_dwell = DTD_HANGOVER;
if (ec->nonupdate_dwell)
ec->nonupdate_dwell--;
/* Transfer logic ------------------------------------------------------ */
/* These conditions are from the dual path paper [1], I messed with
them a bit to improve performance. */
if ((ec->adaption_mode & ECHO_CAN_USE_ADAPTION) &&
(ec->nonupdate_dwell == 0) &&
(8 * ec->Lclean_bg <
7 * ec->Lclean) /* (ec->Lclean_bg < 0.875*ec->Lclean) */ &&
(8 * ec->Lclean_bg <
ec->Ltx) /* (ec->Lclean_bg < 0.125*ec->Ltx) */ ) {
if (ec->cond_met == 6) {
/* BG filter has had better results for 6 consecutive samples */
ec->adapt = 1;
memcpy(ec->fir_taps16[0], ec->fir_taps16[1],
ec->taps * sizeof(int16_t));
} else
ec->cond_met++;
} else
ec->cond_met = 0;
/* Non-Linear Processing --------------------------------------------------- */
ec->clean_nlp = ec->clean;
if (ec->adaption_mode & ECHO_CAN_USE_NLP) {
/* Non-linear processor - a fancy way to say "zap small signals, to avoid
residual echo due to (uLaw/ALaw) non-linearity in the channel.". */
if ((16 * ec->Lclean < ec->Ltx)) {
/* Our e/c has improved echo by at least 24 dB (each factor of 2 is 6dB,
so 2*2*2*2=16 is the same as 6+6+6+6=24dB) */
if (ec->adaption_mode & ECHO_CAN_USE_CNG) {
ec->cng_level = ec->Lbgn;
/* Very elementary comfort noise generation. Just random
numbers rolled off very vaguely Hoth-like. DR: This
noise doesn't sound quite right to me - I suspect there
are some overlfow issues in the filtering as it's too
"crackly". TODO: debug this, maybe just play noise at
high level or look at spectrum.
*/
ec->cng_rndnum =
1664525U * ec->cng_rndnum + 1013904223U;
ec->cng_filter =
((ec->cng_rndnum & 0xFFFF) - 32768 +
5 * ec->cng_filter) >> 3;
ec->clean_nlp =
(ec->cng_filter * ec->cng_level * 8) >> 14;
} else if (ec->adaption_mode & ECHO_CAN_USE_CLIP) {
/* This sounds much better than CNG */
if (ec->clean_nlp > ec->Lbgn)
ec->clean_nlp = ec->Lbgn;
if (ec->clean_nlp < -ec->Lbgn)
ec->clean_nlp = -ec->Lbgn;
} else {
/* just mute the residual, doesn't sound very good, used mainly
in G168 tests */
ec->clean_nlp = 0;
}
} else {
/* Background noise estimator. I tried a few algorithms
here without much luck. This very simple one seems to
work best, we just average the level using a slow (1 sec
time const) filter if the current level is less than a
(experimentally derived) constant. This means we dont
include high level signals like near end speech. When
combined with CNG or especially CLIP seems to work OK.
*/
if (ec->Lclean < 40) {
ec->Lbgn_acc += abs(ec->clean) - ec->Lbgn;
ec->Lbgn = (ec->Lbgn_acc + (1 << 11)) >> 12;
}
}
}
/* Roll around the taps buffer */
if (ec->curr_pos <= 0)
ec->curr_pos = ec->taps;
ec->curr_pos--;
if (ec->adaption_mode & ECHO_CAN_DISABLE)
ec->clean_nlp = rx;
/* Output scaled back up again to match input scaling */
return (int16_t) ec->clean_nlp << 1;
}
EXPORT_SYMBOL_GPL(oslec_update);
/* This function is seperated from the echo canceller is it is usually called
......@@ -604,28 +605,32 @@ EXPORT_SYMBOL_GPL(oslec_update);
precision, which noise shapes things, giving very clean DC removal.
*/
int16_t oslec_hpf_tx(struct oslec_state *ec, int16_t tx) {
int tmp, tmp1;
int16_t oslec_hpf_tx(struct oslec_state * ec, int16_t tx)
{
int tmp, tmp1;
if (ec->adaption_mode & ECHO_CAN_USE_TX_HPF) {
tmp = tx << 15;
if (ec->adaption_mode & ECHO_CAN_USE_TX_HPF) {
tmp = tx << 15;
#if 1
/* Make sure the gain of the HPF is 1.0. The first can still saturate a little under
impulse conditions, and it might roll to 32768 and need clipping on sustained peak
level signals. However, the scale of such clipping is small, and the error due to
any saturation should not markedly affect the downstream processing. */
tmp -= (tmp >> 4);
/* Make sure the gain of the HPF is 1.0. The first can still saturate a little under
impulse conditions, and it might roll to 32768 and need clipping on sustained peak
level signals. However, the scale of such clipping is small, and the error due to
any saturation should not markedly affect the downstream processing. */
tmp -= (tmp >> 4);
#endif
ec->tx_1 += -(ec->tx_1>>DC_LOG2BETA) + tmp - ec->tx_2;
tmp1 = ec->tx_1 >> 15;
if (tmp1 > 32767) tmp1 = 32767;
if (tmp1 < -32767) tmp1 = -32767;
tx = tmp1;
ec->tx_2 = tmp;
}
return tx;
ec->tx_1 += -(ec->tx_1 >> DC_LOG2BETA) + tmp - ec->tx_2;
tmp1 = ec->tx_1 >> 15;
if (tmp1 > 32767)
tmp1 = 32767;
if (tmp1 < -32767)
tmp1 = -32767;
tx = tmp1;
ec->tx_2 = tmp;
}
return tx;
}
EXPORT_SYMBOL_GPL(oslec_hpf_tx);
MODULE_LICENSE("GPL");
......
......@@ -124,9 +124,8 @@ a minor burden.
G.168 echo canceller descriptor. This defines the working state for a line
echo canceller.
*/
struct oslec_state
{
int16_t tx,rx;
struct oslec_state {
int16_t tx, rx;
int16_t clean;
int16_t clean_nlp;
......@@ -170,4 +169,4 @@ struct oslec_state
int16_t *snapshot;
};
#endif /* __ECHO_H */
#endif /* __ECHO_H */
......@@ -72,8 +72,7 @@
16 bit integer FIR descriptor. This defines the working state for a single
instance of an FIR filter using 16 bit integer coefficients.
*/
typedef struct
{
typedef struct {
int taps;
int curr_pos;
const int16_t *coeffs;
......@@ -85,8 +84,7 @@ typedef struct
instance of an FIR filter using 32 bit integer coefficients, and filtering
16 bit integer data.
*/
typedef struct
{
typedef struct {
int taps;
int curr_pos;
const int32_t *coeffs;
......@@ -97,39 +95,37 @@ typedef struct
Floating point FIR descriptor. This defines the working state for a single
instance of an FIR filter using floating point coefficients and data.
*/
typedef struct
{
typedef struct {
int taps;
int curr_pos;
const float *coeffs;
float *history;
} fir_float_state_t;
static __inline__ const int16_t *fir16_create(fir16_state_t *fir,
const int16_t *coeffs,
int taps)
static __inline__ const int16_t *fir16_create(fir16_state_t * fir,
const int16_t * coeffs, int taps)
{
fir->taps = taps;
fir->curr_pos = taps - 1;
fir->coeffs = coeffs;
#if defined(USE_MMX) || defined(USE_SSE2) || defined(__bfin__)
fir->history = kcalloc(2*taps, sizeof(int16_t), GFP_KERNEL);
fir->history = kcalloc(2 * taps, sizeof(int16_t), GFP_KERNEL);
#else
fir->history = kcalloc(taps, sizeof(int16_t), GFP_KERNEL);
#endif
return fir->history;
}
static __inline__ void fir16_flush(fir16_state_t *fir)
static __inline__ void fir16_flush(fir16_state_t * fir)
{
#if defined(USE_MMX) || defined(USE_SSE2) || defined(__bfin__)
memset(fir->history, 0, 2*fir->taps*sizeof(int16_t));
memset(fir->history, 0, 2 * fir->taps * sizeof(int16_t));
#else
memset(fir->history, 0, fir->taps*sizeof(int16_t));
memset(fir->history, 0, fir->taps * sizeof(int16_t));
#endif
}
static __inline__ void fir16_free(fir16_state_t *fir)
static __inline__ void fir16_free(fir16_state_t * fir)
{
kfree(fir->history);
}
......@@ -137,166 +133,162 @@ static __inline__ void fir16_free(fir16_state_t *fir)
#ifdef __bfin__
static inline int32_t dot_asm(short *x, short *y, int len)
{
int dot;
len--;
__asm__
(
"I0 = %1;\n\t"
"I1 = %2;\n\t"
"A0 = 0;\n\t"
"R0.L = W[I0++] || R1.L = W[I1++];\n\t"
"LOOP dot%= LC0 = %3;\n\t"
"LOOP_BEGIN dot%=;\n\t"
"A0 += R0.L * R1.L (IS) || R0.L = W[I0++] || R1.L = W[I1++];\n\t"
"LOOP_END dot%=;\n\t"
"A0 += R0.L*R1.L (IS);\n\t"
"R0 = A0;\n\t"
"%0 = R0;\n\t"
: "=&d" (dot)
: "a" (x), "a" (y), "a" (len)
: "I0", "I1", "A1", "A0", "R0", "R1"
);
return dot;
int dot;
len--;
__asm__("I0 = %1;\n\t"
"I1 = %2;\n\t"
"A0 = 0;\n\t"
"R0.L = W[I0++] || R1.L = W[I1++];\n\t"
"LOOP dot%= LC0 = %3;\n\t"
"LOOP_BEGIN dot%=;\n\t"
"A0 += R0.L * R1.L (IS) || R0.L = W[I0++] || R1.L = W[I1++];\n\t"
"LOOP_END dot%=;\n\t"
"A0 += R0.L*R1.L (IS);\n\t"
"R0 = A0;\n\t"
"%0 = R0;\n\t"
:"=&d"(dot)
:"a"(x), "a"(y), "a"(len)
:"I0", "I1", "A1", "A0", "R0", "R1"
);
return dot;
}
#endif
static __inline__ int16_t fir16(fir16_state_t *fir, int16_t sample)
static __inline__ int16_t fir16(fir16_state_t * fir, int16_t sample)
{
int32_t y;
int32_t y;
#if defined(USE_MMX)
int i;
mmx_t *mmx_coeffs;
mmx_t *mmx_hist;
fir->history[fir->curr_pos] = sample;
fir->history[fir->curr_pos + fir->taps] = sample;
mmx_coeffs = (mmx_t *) fir->coeffs;
mmx_hist = (mmx_t *) &fir->history[fir->curr_pos];
i = fir->taps;
pxor_r2r(mm4, mm4);
/* 8 samples per iteration, so the filter must be a multiple of 8 long. */
while (i > 0)
{
movq_m2r(mmx_coeffs[0], mm0);
movq_m2r(mmx_coeffs[1], mm2);
movq_m2r(mmx_hist[0], mm1);
movq_m2r(mmx_hist[1], mm3);
mmx_coeffs += 2;
mmx_hist += 2;
pmaddwd_r2r(mm1, mm0);
pmaddwd_r2r(mm3, mm2);
paddd_r2r(mm0, mm4);
paddd_r2r(mm2, mm4);
i -= 8;
}
movq_r2r(mm4, mm0);
psrlq_i2r(32, mm0);
paddd_r2r(mm0, mm4);
movd_r2m(mm4, y);
emms();
int i;
mmx_t *mmx_coeffs;
mmx_t *mmx_hist;
fir->history[fir->curr_pos] = sample;
fir->history[fir->curr_pos + fir->taps] = sample;
mmx_coeffs = (mmx_t *) fir->coeffs;
mmx_hist = (mmx_t *) & fir->history[fir->curr_pos];
i = fir->taps;
pxor_r2r(mm4, mm4);
/* 8 samples per iteration, so the filter must be a multiple of 8 long. */
while (i > 0) {
movq_m2r(mmx_coeffs[0], mm0);
movq_m2r(mmx_coeffs[1], mm2);
movq_m2r(mmx_hist[0], mm1);
movq_m2r(mmx_hist[1], mm3);
mmx_coeffs += 2;
mmx_hist += 2;
pmaddwd_r2r(mm1, mm0);
pmaddwd_r2r(mm3, mm2);
paddd_r2r(mm0, mm4);
paddd_r2r(mm2, mm4);
i -= 8;
}
movq_r2r(mm4, mm0);
psrlq_i2r(32, mm0);
paddd_r2r(mm0, mm4);
movd_r2m(mm4, y);
emms();
#elif defined(USE_SSE2)
int i;
xmm_t *xmm_coeffs;
xmm_t *xmm_hist;
fir->history[fir->curr_pos] = sample;
fir->history[fir->curr_pos + fir->taps] = sample;
xmm_coeffs = (xmm_t *) fir->coeffs;
xmm_hist = (xmm_t *) &fir->history[fir->curr_pos];
i = fir->taps;
pxor_r2r(xmm4, xmm4);
/* 16 samples per iteration, so the filter must be a multiple of 16 long. */
while (i > 0)
{
movdqu_m2r(xmm_coeffs[0], xmm0);
movdqu_m2r(xmm_coeffs[1], xmm2);
movdqu_m2r(xmm_hist[0], xmm1);
movdqu_m2r(xmm_hist[1], xmm3);
xmm_coeffs += 2;
xmm_hist += 2;
pmaddwd_r2r(xmm1, xmm0);
pmaddwd_r2r(xmm3, xmm2);
paddd_r2r(xmm0, xmm4);
paddd_r2r(xmm2, xmm4);
i -= 16;
}
movdqa_r2r(xmm4, xmm0);
psrldq_i2r(8, xmm0);
paddd_r2r(xmm0, xmm4);
movdqa_r2r(xmm4, xmm0);
psrldq_i2r(4, xmm0);
paddd_r2r(xmm0, xmm4);
movd_r2m(xmm4, y);
int i;
xmm_t *xmm_coeffs;
xmm_t *xmm_hist;
fir->history[fir->curr_pos] = sample;
fir->history[fir->curr_pos + fir->taps] = sample;
xmm_coeffs = (xmm_t *) fir->coeffs;
xmm_hist = (xmm_t *) & fir->history[fir->curr_pos];
i = fir->taps;
pxor_r2r(xmm4, xmm4);
/* 16 samples per iteration, so the filter must be a multiple of 16 long. */
while (i > 0) {
movdqu_m2r(xmm_coeffs[0], xmm0);
movdqu_m2r(xmm_coeffs[1], xmm2);
movdqu_m2r(xmm_hist[0], xmm1);
movdqu_m2r(xmm_hist[1], xmm3);
xmm_coeffs += 2;
xmm_hist += 2;
pmaddwd_r2r(xmm1, xmm0);
pmaddwd_r2r(xmm3, xmm2);
paddd_r2r(xmm0, xmm4);
paddd_r2r(xmm2, xmm4);
i -= 16;
}
movdqa_r2r(xmm4, xmm0);
psrldq_i2r(8, xmm0);
paddd_r2r(xmm0, xmm4);
movdqa_r2r(xmm4, xmm0);
psrldq_i2r(4, xmm0);
paddd_r2r(xmm0, xmm4);
movd_r2m(xmm4, y);
#elif defined(__bfin__)
fir->history[fir->curr_pos] = sample;
fir->history[fir->curr_pos + fir->taps] = sample;
y = dot_asm((int16_t*)fir->coeffs, &fir->history[fir->curr_pos], fir->taps);
fir->history[fir->curr_pos] = sample;
fir->history[fir->curr_pos + fir->taps] = sample;
y = dot_asm((int16_t *) fir->coeffs, &fir->history[fir->curr_pos],
fir->taps);
#else
int i;
int offset1;
int offset2;
fir->history[fir->curr_pos] = sample;
offset2 = fir->curr_pos;
offset1 = fir->taps - offset2;
y = 0;
for (i = fir->taps - 1; i >= offset1; i--)
y += fir->coeffs[i]*fir->history[i - offset1];
for ( ; i >= 0; i--)
y += fir->coeffs[i]*fir->history[i + offset2];
int i;
int offset1;
int offset2;
fir->history[fir->curr_pos] = sample;
offset2 = fir->curr_pos;
offset1 = fir->taps - offset2;
y = 0;
for (i = fir->taps - 1; i >= offset1; i--)
y += fir->coeffs[i] * fir->history[i - offset1];
for (; i >= 0; i--)
y += fir->coeffs[i] * fir->history[i + offset2];
#endif
if (fir->curr_pos <= 0)
fir->curr_pos = fir->taps;
fir->curr_pos--;
return (int16_t) (y >> 15);
if (fir->curr_pos <= 0)
fir->curr_pos = fir->taps;
fir->curr_pos--;
return (int16_t) (y >> 15);
}
static __inline__ const int16_t *fir32_create(fir32_state_t *fir,
const int32_t *coeffs,
int taps)
static __inline__ const int16_t *fir32_create(fir32_state_t * fir,
const int32_t * coeffs, int taps)
{
fir->taps = taps;
fir->curr_pos = taps - 1;
fir->coeffs = coeffs;
fir->history = kcalloc(taps, sizeof(int16_t), GFP_KERNEL);
return fir->history;
fir->taps = taps;
fir->curr_pos = taps - 1;
fir->coeffs = coeffs;
fir->history = kcalloc(taps, sizeof(int16_t), GFP_KERNEL);
return fir->history;
}
static __inline__ void fir32_flush(fir32_state_t *fir)
static __inline__ void fir32_flush(fir32_state_t * fir)
{
memset(fir->history, 0, fir->taps*sizeof(int16_t));
memset(fir->history, 0, fir->taps * sizeof(int16_t));
}
static __inline__ void fir32_free(fir32_state_t *fir)
static __inline__ void fir32_free(fir32_state_t * fir)
{
kfree(fir->history);
kfree(fir->history);
}
static __inline__ int16_t fir32(fir32_state_t *fir, int16_t sample)
static __inline__ int16_t fir32(fir32_state_t * fir, int16_t sample)
{
int i;
int32_t y;
int offset1;
int offset2;
fir->history[fir->curr_pos] = sample;
offset2 = fir->curr_pos;
offset1 = fir->taps - offset2;
y = 0;
for (i = fir->taps - 1; i >= offset1; i--)
y += fir->coeffs[i]*fir->history[i - offset1];
for ( ; i >= 0; i--)
y += fir->coeffs[i]*fir->history[i + offset2];
if (fir->curr_pos <= 0)
fir->curr_pos = fir->taps;
fir->curr_pos--;
return (int16_t) (y >> 15);
int i;
int32_t y;
int offset1;
int offset2;
fir->history[fir->curr_pos] = sample;
offset2 = fir->curr_pos;
offset1 = fir->taps - offset2;
y = 0;
for (i = fir->taps - 1; i >= offset1; i--)
y += fir->coeffs[i] * fir->history[i - offset1];
for (; i >= 0; i--)
y += fir->coeffs[i] * fir->history[i + offset2];
if (fir->curr_pos <= 0)
fir->curr_pos = fir->taps;
fir->curr_pos--;
return (int16_t) (y >> 15);
}
#endif
......
......@@ -27,24 +27,23 @@
* values by ULL, lest they be truncated by the compiler)
*/
typedef union {
long long q; /* Quadword (64-bit) value */
unsigned long long uq; /* Unsigned Quadword */
int d[2]; /* 2 Doubleword (32-bit) values */
unsigned int ud[2]; /* 2 Unsigned Doubleword */
short w[4]; /* 4 Word (16-bit) values */
unsigned short uw[4]; /* 4 Unsigned Word */
char b[8]; /* 8 Byte (8-bit) values */
unsigned char ub[8]; /* 8 Unsigned Byte */
float s[2]; /* Single-precision (32-bit) value */
} mmx_t; /* On an 8-byte (64-bit) boundary */
typedef union {
long long q; /* Quadword (64-bit) value */
unsigned long long uq; /* Unsigned Quadword */
int d[2]; /* 2 Doubleword (32-bit) values */
unsigned int ud[2]; /* 2 Unsigned Doubleword */
short w[4]; /* 4 Word (16-bit) values */
unsigned short uw[4]; /* 4 Unsigned Word */
char b[8]; /* 8 Byte (8-bit) values */
unsigned char ub[8]; /* 8 Unsigned Byte */
float s[2]; /* Single-precision (32-bit) value */
} mmx_t; /* On an 8-byte (64-bit) boundary */
/* SSE registers */
typedef union {
char b[16];
} xmm_t;
#define mmx_i2r(op,imm,reg) \
__asm__ __volatile__ (#op " %0, %%" #reg \
: /* nothing */ \
......@@ -63,7 +62,6 @@ typedef union {
#define mmx_r2r(op,regs,regd) \
__asm__ __volatile__ (#op " %" #regs ", %" #regd)
#define emms() __asm__ __volatile__ ("emms")
#define movd_m2r(var,reg) mmx_m2r (movd, var, reg)
......@@ -192,16 +190,13 @@ typedef union {
#define pxor_m2r(var,reg) mmx_m2r (pxor, var, reg)
#define pxor_r2r(regs,regd) mmx_r2r (pxor, regs, regd)
/* 3DNOW extensions */
#define pavgusb_m2r(var,reg) mmx_m2r (pavgusb, var, reg)
#define pavgusb_r2r(regs,regd) mmx_r2r (pavgusb, regs, regd)
/* AMD MMX extensions - also available in intel SSE */
#define mmx_m2ri(op,mem,reg,imm) \
__asm__ __volatile__ (#op " %1, %0, %%" #reg \
: /* nothing */ \
......@@ -216,7 +211,6 @@ typedef union {
: /* nothing */ \
: "m" (mem))
#define maskmovq(regs,maskreg) mmx_r2ri (maskmovq, regs, maskreg)
#define movntq_r2m(mmreg,var) mmx_r2m (movntq, mmreg, var)
......@@ -284,5 +278,4 @@ typedef union {
#define punpcklqdq_r2r(regs,regd) mmx_r2r (punpcklqdq, regs, regd)
#define punpckhqdq_r2r(regs,regd) mmx_r2r (punpckhqdq, regs, regd)
#endif /* AVCODEC_I386MMX_H */
......@@ -83,4 +83,4 @@ int16_t oslec_update(struct oslec_state *ec, int16_t tx, int16_t rx);
*/
int16_t oslec_hpf_tx(struct oslec_state *ec, int16_t tx);
#endif /* __OSLEC_H */
#endif /* __OSLEC_H */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment