Commit 4460a860 authored by J.R. Mauro, committed by Greg Kroah-Hartman

Staging: Lindent the echo driver

Lindent drivers/staging/echo*

Signed-off-by: J.R. Mauro <jrm8005@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
parent 786ed801
...@@ -36,14 +36,15 @@ ...@@ -36,14 +36,15 @@
\return The bit number of the highest set bit, or -1 if the word is zero. */ \return The bit number of the highest set bit, or -1 if the word is zero. */
static __inline__ int top_bit(unsigned int bits) static __inline__ int top_bit(unsigned int bits)
{ {
int res; int res;
__asm__ (" xorl %[res],%[res];\n" __asm__(" xorl %[res],%[res];\n"
" decl %[res];\n" " decl %[res];\n"
" bsrl %[bits],%[res]\n" " bsrl %[bits],%[res]\n"
: [res] "=&r" (res) :[res] "=&r" (res)
: [bits] "rm" (bits)); :[bits] "rm"(bits)
return res; );
return res;
} }
/*! \brief Find the bit position of the lowest set bit in a word /*! \brief Find the bit position of the lowest set bit in a word
...@@ -51,84 +52,75 @@ static __inline__ int top_bit(unsigned int bits) ...@@ -51,84 +52,75 @@ static __inline__ int top_bit(unsigned int bits)
\return The bit number of the lowest set bit, or -1 if the word is zero. */ \return The bit number of the lowest set bit, or -1 if the word is zero. */
static __inline__ int bottom_bit(unsigned int bits) static __inline__ int bottom_bit(unsigned int bits)
{ {
int res; int res;
__asm__ (" xorl %[res],%[res];\n" __asm__(" xorl %[res],%[res];\n"
" decl %[res];\n" " decl %[res];\n"
" bsfl %[bits],%[res]\n" " bsfl %[bits],%[res]\n"
: [res] "=&r" (res) :[res] "=&r" (res)
: [bits] "rm" (bits)); :[bits] "rm"(bits)
return res; );
return res;
} }
#else #else
static __inline__ int top_bit(unsigned int bits) static __inline__ int top_bit(unsigned int bits)
{ {
int i; int i;
if (bits == 0) if (bits == 0)
return -1; return -1;
i = 0; i = 0;
if (bits & 0xFFFF0000) if (bits & 0xFFFF0000) {
{ bits &= 0xFFFF0000;
bits &= 0xFFFF0000; i += 16;
i += 16; }
} if (bits & 0xFF00FF00) {
if (bits & 0xFF00FF00) bits &= 0xFF00FF00;
{ i += 8;
bits &= 0xFF00FF00; }
i += 8; if (bits & 0xF0F0F0F0) {
} bits &= 0xF0F0F0F0;
if (bits & 0xF0F0F0F0) i += 4;
{ }
bits &= 0xF0F0F0F0; if (bits & 0xCCCCCCCC) {
i += 4; bits &= 0xCCCCCCCC;
} i += 2;
if (bits & 0xCCCCCCCC) }
{ if (bits & 0xAAAAAAAA) {
bits &= 0xCCCCCCCC; bits &= 0xAAAAAAAA;
i += 2; i += 1;
} }
if (bits & 0xAAAAAAAA) return i;
{
bits &= 0xAAAAAAAA;
i += 1;
}
return i;
} }
static __inline__ int bottom_bit(unsigned int bits) static __inline__ int bottom_bit(unsigned int bits)
{ {
int i; int i;
if (bits == 0) if (bits == 0)
return -1; return -1;
i = 32; i = 32;
if (bits & 0x0000FFFF) if (bits & 0x0000FFFF) {
{ bits &= 0x0000FFFF;
bits &= 0x0000FFFF; i -= 16;
i -= 16; }
} if (bits & 0x00FF00FF) {
if (bits & 0x00FF00FF) bits &= 0x00FF00FF;
{ i -= 8;
bits &= 0x00FF00FF; }
i -= 8; if (bits & 0x0F0F0F0F) {
} bits &= 0x0F0F0F0F;
if (bits & 0x0F0F0F0F) i -= 4;
{ }
bits &= 0x0F0F0F0F; if (bits & 0x33333333) {
i -= 4; bits &= 0x33333333;
} i -= 2;
if (bits & 0x33333333) }
{ if (bits & 0x55555555) {
bits &= 0x33333333; bits &= 0x55555555;
i -= 2; i -= 1;
} }
if (bits & 0x55555555) return i;
{
bits &= 0x55555555;
i -= 1;
}
return i;
} }
#endif #endif
...@@ -138,13 +130,14 @@ static __inline__ int bottom_bit(unsigned int bits) ...@@ -138,13 +130,14 @@ static __inline__ int bottom_bit(unsigned int bits)
static __inline__ uint8_t bit_reverse8(uint8_t x) static __inline__ uint8_t bit_reverse8(uint8_t x)
{ {
#if defined(__i386__) || defined(__x86_64__) #if defined(__i386__) || defined(__x86_64__)
/* If multiply is fast */ /* If multiply is fast */
return ((x*0x0802U & 0x22110U) | (x*0x8020U & 0x88440U))*0x10101U >> 16; return ((x * 0x0802U & 0x22110U) | (x * 0x8020U & 0x88440U)) *
0x10101U >> 16;
#else #else
/* If multiply is slow, but we have a barrel shifter */ /* If multiply is slow, but we have a barrel shifter */
x = (x >> 4) | (x << 4); x = (x >> 4) | (x << 4);
x = ((x & 0xCC) >> 2) | ((x & 0x33) << 2); x = ((x & 0xCC) >> 2) | ((x & 0x33) << 2);
return ((x & 0xAA) >> 1) | ((x & 0x55) << 1); return ((x & 0xAA) >> 1) | ((x & 0x55) << 1);
#endif #endif
} }
...@@ -184,7 +177,7 @@ uint16_t make_mask16(uint16_t x); ...@@ -184,7 +177,7 @@ uint16_t make_mask16(uint16_t x);
\return The word with the single set bit. */ \return The word with the single set bit. */
static __inline__ uint32_t least_significant_one32(uint32_t x) static __inline__ uint32_t least_significant_one32(uint32_t x)
{ {
return (x & (-(int32_t) x)); return (x & (-(int32_t) x));
} }
/*! \brief Find the most significant one in a word, and return a word /*! \brief Find the most significant one in a word, and return a word
...@@ -194,10 +187,10 @@ static __inline__ uint32_t least_significant_one32(uint32_t x) ...@@ -194,10 +187,10 @@ static __inline__ uint32_t least_significant_one32(uint32_t x)
static __inline__ uint32_t most_significant_one32(uint32_t x) static __inline__ uint32_t most_significant_one32(uint32_t x)
{ {
#if defined(__i386__) || defined(__x86_64__) #if defined(__i386__) || defined(__x86_64__)
return 1 << top_bit(x); return 1 << top_bit(x);
#else #else
x = make_mask32(x); x = make_mask32(x);
return (x ^ (x >> 1)); return (x ^ (x >> 1));
#endif #endif
} }
...@@ -206,8 +199,8 @@ static __inline__ uint32_t most_significant_one32(uint32_t x) ...@@ -206,8 +199,8 @@ static __inline__ uint32_t most_significant_one32(uint32_t x)
\return 1 for odd, or 0 for even. */ \return 1 for odd, or 0 for even. */
static __inline__ int parity8(uint8_t x) static __inline__ int parity8(uint8_t x)
{ {
x = (x ^ (x >> 4)) & 0x0F; x = (x ^ (x >> 4)) & 0x0F;
return (0x6996 >> x) & 1; return (0x6996 >> x) & 1;
} }
/*! \brief Find the parity of a 16 bit word. /*! \brief Find the parity of a 16 bit word.
...@@ -215,9 +208,9 @@ static __inline__ int parity8(uint8_t x) ...@@ -215,9 +208,9 @@ static __inline__ int parity8(uint8_t x)
\return 1 for odd, or 0 for even. */ \return 1 for odd, or 0 for even. */
static __inline__ int parity16(uint16_t x) static __inline__ int parity16(uint16_t x)
{ {
x ^= (x >> 8); x ^= (x >> 8);
x = (x ^ (x >> 4)) & 0x0F; x = (x ^ (x >> 4)) & 0x0F;
return (0x6996 >> x) & 1; return (0x6996 >> x) & 1;
} }
/*! \brief Find the parity of a 32 bit word. /*! \brief Find the parity of a 32 bit word.
...@@ -225,10 +218,10 @@ static __inline__ int parity16(uint16_t x) ...@@ -225,10 +218,10 @@ static __inline__ int parity16(uint16_t x)
\return 1 for odd, or 0 for even. */ \return 1 for odd, or 0 for even. */
static __inline__ int parity32(uint32_t x) static __inline__ int parity32(uint32_t x)
{ {
x ^= (x >> 16); x ^= (x >> 16);
x ^= (x >> 8); x ^= (x >> 8);
x = (x ^ (x >> 4)) & 0x0F; x = (x ^ (x >> 4)) & 0x0F;
return (0x6996 >> x) & 1; return (0x6996 >> x) & 1;
} }
#endif #endif
......
...@@ -74,7 +74,6 @@ ...@@ -74,7 +74,6 @@
Steve also has some nice notes on echo cancellers in echo.h Steve also has some nice notes on echo cancellers in echo.h
References: References:
[1] Ochiai, Araseki, and Ogihara, "Echo Canceller with Two Echo [1] Ochiai, Araseki, and Ogihara, "Echo Canceller with Two Echo
...@@ -105,7 +104,7 @@ ...@@ -105,7 +104,7 @@
Mark, Pawel, and Pavel. Mark, Pawel, and Pavel.
*/ */
#include <linux/kernel.h> /* We're doing kernel work */ #include <linux/kernel.h> /* We're doing kernel work */
#include <linux/module.h> #include <linux/module.h>
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/slab.h> #include <linux/slab.h>
...@@ -115,8 +114,8 @@ ...@@ -115,8 +114,8 @@
#define MIN_TX_POWER_FOR_ADAPTION 64 #define MIN_TX_POWER_FOR_ADAPTION 64
#define MIN_RX_POWER_FOR_ADAPTION 64 #define MIN_RX_POWER_FOR_ADAPTION 64
#define DTD_HANGOVER 600 /* 600 samples, or 75ms */ #define DTD_HANGOVER 600 /* 600 samples, or 75ms */
#define DC_LOG2BETA 3 /* log2() of DC filter Beta */ #define DC_LOG2BETA 3 /* log2() of DC filter Beta */
/*-----------------------------------------------------------------------*\ /*-----------------------------------------------------------------------*\
FUNCTIONS FUNCTIONS
...@@ -124,59 +123,58 @@ ...@@ -124,59 +123,58 @@
/* adapting coeffs using the traditional stochastic descent (N)LMS algorithm */ /* adapting coeffs using the traditional stochastic descent (N)LMS algorithm */
#ifdef __bfin__ #ifdef __bfin__
static void __inline__ lms_adapt_bg(struct oslec_state *ec, int clean, int shift) static void __inline__ lms_adapt_bg(struct oslec_state *ec, int clean,
int shift)
{ {
int i, j; int i, j;
int offset1; int offset1;
int offset2; int offset2;
int factor; int factor;
int exp; int exp;
int16_t *phist; int16_t *phist;
int n; int n;
if (shift > 0) if (shift > 0)
factor = clean << shift; factor = clean << shift;
else else
factor = clean >> -shift; factor = clean >> -shift;
/* Update the FIR taps */ /* Update the FIR taps */
offset2 = ec->curr_pos; offset2 = ec->curr_pos;
offset1 = ec->taps - offset2; offset1 = ec->taps - offset2;
phist = &ec->fir_state_bg.history[offset2]; phist = &ec->fir_state_bg.history[offset2];
/* st: and en: help us locate the assembler in echo.s */ /* st: and en: help us locate the assembler in echo.s */
//asm("st:"); //asm("st:");
n = ec->taps; n = ec->taps;
for (i = 0, j = offset2; i < n; i++, j++) for (i = 0, j = offset2; i < n; i++, j++) {
{ exp = *phist++ * factor;
exp = *phist++ * factor; ec->fir_taps16[1][i] += (int16_t) ((exp + (1 << 14)) >> 15);
ec->fir_taps16[1][i] += (int16_t) ((exp+(1<<14)) >> 15); }
} //asm("en:");
//asm("en:");
/* Note the asm for the inner loop above generated by Blackfin gcc
/* Note the asm for the inner loop above generated by Blackfin gcc 4.1.1 is pretty good (note even parallel instructions used):
4.1.1 is pretty good (note even parallel instructions used):
R0 = W [P0++] (X);
R0 = W [P0++] (X); R0 *= R2;
R0 *= R2; R0 = R0 + R3 (NS) ||
R0 = R0 + R3 (NS) || R1 = W [P1] (X) ||
R1 = W [P1] (X) || nop;
nop; R0 >>>= 15;
R0 >>>= 15; R0 = R0 + R1;
R0 = R0 + R1; W [P1++] = R0;
W [P1++] = R0;
A block based update algorithm would be much faster but the
A block based update algorithm would be much faster but the above can't be improved on much. Every instruction saved in
above can't be improved on much. Every instruction saved in the loop above is 2 MIPs/ch! The for loop above is where the
the loop above is 2 MIPs/ch! The for loop above is where the Blackfin spends most of its time - about 17 MIPs/ch measured
Blackfin spends most of its time - about 17 MIPs/ch measured with speedtest.c with 256 taps (32ms). Write-back and
with speedtest.c with 256 taps (32ms). Write-back and Write-through cache gave about the same performance.
Write-through cache gave about the same performance. */
*/
} }
/* /*
...@@ -198,94 +196,90 @@ static void __inline__ lms_adapt_bg(struct oslec_state *ec, int clean, int shift ...@@ -198,94 +196,90 @@ static void __inline__ lms_adapt_bg(struct oslec_state *ec, int clean, int shift
*/ */
#else #else
static __inline__ void lms_adapt_bg(struct oslec_state *ec, int clean, int shift) static __inline__ void lms_adapt_bg(struct oslec_state *ec, int clean,
int shift)
{ {
int i; int i;
int offset1; int offset1;
int offset2; int offset2;
int factor; int factor;
int exp; int exp;
if (shift > 0) if (shift > 0)
factor = clean << shift; factor = clean << shift;
else else
factor = clean >> -shift; factor = clean >> -shift;
/* Update the FIR taps */ /* Update the FIR taps */
offset2 = ec->curr_pos; offset2 = ec->curr_pos;
offset1 = ec->taps - offset2; offset1 = ec->taps - offset2;
for (i = ec->taps - 1; i >= offset1; i--) for (i = ec->taps - 1; i >= offset1; i--) {
{ exp = (ec->fir_state_bg.history[i - offset1] * factor);
exp = (ec->fir_state_bg.history[i - offset1]*factor); ec->fir_taps16[1][i] += (int16_t) ((exp + (1 << 14)) >> 15);
ec->fir_taps16[1][i] += (int16_t) ((exp+(1<<14)) >> 15); }
} for (; i >= 0; i--) {
for ( ; i >= 0; i--) exp = (ec->fir_state_bg.history[i + offset2] * factor);
{ ec->fir_taps16[1][i] += (int16_t) ((exp + (1 << 14)) >> 15);
exp = (ec->fir_state_bg.history[i + offset2]*factor); }
ec->fir_taps16[1][i] += (int16_t) ((exp+(1<<14)) >> 15);
}
} }
#endif #endif
struct oslec_state *oslec_create(int len, int adaption_mode) struct oslec_state *oslec_create(int len, int adaption_mode)
{ {
struct oslec_state *ec; struct oslec_state *ec;
int i; int i;
ec = kzalloc(sizeof(*ec), GFP_KERNEL); ec = kzalloc(sizeof(*ec), GFP_KERNEL);
if (!ec) if (!ec)
return NULL; return NULL;
ec->taps = len; ec->taps = len;
ec->log2taps = top_bit(len); ec->log2taps = top_bit(len);
ec->curr_pos = ec->taps - 1; ec->curr_pos = ec->taps - 1;
for (i = 0; i < 2; i++) { for (i = 0; i < 2; i++) {
ec->fir_taps16[i] = kcalloc(ec->taps, sizeof(int16_t), GFP_KERNEL); ec->fir_taps16[i] =
if (!ec->fir_taps16[i]) kcalloc(ec->taps, sizeof(int16_t), GFP_KERNEL);
goto error_oom; if (!ec->fir_taps16[i])
} goto error_oom;
}
fir16_create(&ec->fir_state,
ec->fir_taps16[0], fir16_create(&ec->fir_state, ec->fir_taps16[0], ec->taps);
ec->taps); fir16_create(&ec->fir_state_bg, ec->fir_taps16[1], ec->taps);
fir16_create(&ec->fir_state_bg,
ec->fir_taps16[1], for (i = 0; i < 5; i++) {
ec->taps); ec->xvtx[i] = ec->yvtx[i] = ec->xvrx[i] = ec->yvrx[i] = 0;
}
for(i=0; i<5; i++) {
ec->xvtx[i] = ec->yvtx[i] = ec->xvrx[i] = ec->yvrx[i] = 0; ec->cng_level = 1000;
} oslec_adaption_mode(ec, adaption_mode);
ec->cng_level = 1000; ec->snapshot = kcalloc(ec->taps, sizeof(int16_t), GFP_KERNEL);
oslec_adaption_mode(ec, adaption_mode); if (!ec->snapshot)
goto error_oom;
ec->snapshot = kcalloc(ec->taps, sizeof(int16_t), GFP_KERNEL);
if (!ec->snapshot) ec->cond_met = 0;
goto error_oom; ec->Pstates = 0;
ec->Ltxacc = ec->Lrxacc = ec->Lcleanacc = ec->Lclean_bgacc = 0;
ec->cond_met = 0; ec->Ltx = ec->Lrx = ec->Lclean = ec->Lclean_bg = 0;
ec->Pstates = 0; ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0;
ec->Ltxacc = ec->Lrxacc = ec->Lcleanacc = ec->Lclean_bgacc = 0; ec->Lbgn = ec->Lbgn_acc = 0;
ec->Ltx = ec->Lrx = ec->Lclean = ec->Lclean_bg = 0; ec->Lbgn_upper = 200;
ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0; ec->Lbgn_upper_acc = ec->Lbgn_upper << 13;
ec->Lbgn = ec->Lbgn_acc = 0;
ec->Lbgn_upper = 200; return ec;
ec->Lbgn_upper_acc = ec->Lbgn_upper << 13;
error_oom:
return ec; for (i = 0; i < 2; i++)
kfree(ec->fir_taps16[i]);
error_oom:
for (i = 0; i < 2; i++) kfree(ec);
kfree(ec->fir_taps16[i]); return NULL;
kfree(ec);
return NULL;
} }
EXPORT_SYMBOL_GPL(oslec_create); EXPORT_SYMBOL_GPL(oslec_create);
void oslec_free(struct oslec_state *ec) void oslec_free(struct oslec_state *ec)
...@@ -294,293 +288,300 @@ void oslec_free(struct oslec_state *ec) ...@@ -294,293 +288,300 @@ void oslec_free(struct oslec_state *ec)
fir16_free(&ec->fir_state); fir16_free(&ec->fir_state);
fir16_free(&ec->fir_state_bg); fir16_free(&ec->fir_state_bg);
for (i = 0; i < 2; i++) for (i = 0; i < 2; i++)
kfree(ec->fir_taps16[i]); kfree(ec->fir_taps16[i]);
kfree(ec->snapshot); kfree(ec->snapshot);
kfree(ec); kfree(ec);
} }
EXPORT_SYMBOL_GPL(oslec_free); EXPORT_SYMBOL_GPL(oslec_free);
void oslec_adaption_mode(struct oslec_state *ec, int adaption_mode) void oslec_adaption_mode(struct oslec_state *ec, int adaption_mode)
{ {
ec->adaption_mode = adaption_mode; ec->adaption_mode = adaption_mode;
} }
EXPORT_SYMBOL_GPL(oslec_adaption_mode); EXPORT_SYMBOL_GPL(oslec_adaption_mode);
void oslec_flush(struct oslec_state *ec) void oslec_flush(struct oslec_state *ec)
{ {
int i; int i;
ec->Ltxacc = ec->Lrxacc = ec->Lcleanacc = ec->Lclean_bgacc = 0; ec->Ltxacc = ec->Lrxacc = ec->Lcleanacc = ec->Lclean_bgacc = 0;
ec->Ltx = ec->Lrx = ec->Lclean = ec->Lclean_bg = 0; ec->Ltx = ec->Lrx = ec->Lclean = ec->Lclean_bg = 0;
ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0; ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0;
ec->Lbgn = ec->Lbgn_acc = 0; ec->Lbgn = ec->Lbgn_acc = 0;
ec->Lbgn_upper = 200; ec->Lbgn_upper = 200;
ec->Lbgn_upper_acc = ec->Lbgn_upper << 13; ec->Lbgn_upper_acc = ec->Lbgn_upper << 13;
ec->nonupdate_dwell = 0; ec->nonupdate_dwell = 0;
fir16_flush(&ec->fir_state); fir16_flush(&ec->fir_state);
fir16_flush(&ec->fir_state_bg); fir16_flush(&ec->fir_state_bg);
ec->fir_state.curr_pos = ec->taps - 1; ec->fir_state.curr_pos = ec->taps - 1;
ec->fir_state_bg.curr_pos = ec->taps - 1; ec->fir_state_bg.curr_pos = ec->taps - 1;
for (i = 0; i < 2; i++) for (i = 0; i < 2; i++)
memset(ec->fir_taps16[i], 0, ec->taps*sizeof(int16_t)); memset(ec->fir_taps16[i], 0, ec->taps * sizeof(int16_t));
ec->curr_pos = ec->taps - 1; ec->curr_pos = ec->taps - 1;
ec->Pstates = 0; ec->Pstates = 0;
} }
EXPORT_SYMBOL_GPL(oslec_flush); EXPORT_SYMBOL_GPL(oslec_flush);
void oslec_snapshot(struct oslec_state *ec) { void oslec_snapshot(struct oslec_state *ec)
memcpy(ec->snapshot, ec->fir_taps16[0], ec->taps*sizeof(int16_t)); {
memcpy(ec->snapshot, ec->fir_taps16[0], ec->taps * sizeof(int16_t));
} }
EXPORT_SYMBOL_GPL(oslec_snapshot); EXPORT_SYMBOL_GPL(oslec_snapshot);
/* Dual Path Echo Canceller ------------------------------------------------*/ /* Dual Path Echo Canceller ------------------------------------------------*/
int16_t oslec_update(struct oslec_state *ec, int16_t tx, int16_t rx) int16_t oslec_update(struct oslec_state *ec, int16_t tx, int16_t rx)
{ {
int32_t echo_value; int32_t echo_value;
int clean_bg; int clean_bg;
int tmp, tmp1; int tmp, tmp1;
/* Input scaling was found to be required to prevent problems when tx /* Input scaling was found to be required to prevent problems when tx
starts clipping. Another possible way to handle this would be the starts clipping. Another possible way to handle this would be the
filter coefficient scaling. */ filter coefficient scaling. */
ec->tx = tx; ec->rx = rx; ec->tx = tx;
tx >>=1; ec->rx = rx;
rx >>=1; tx >>= 1;
rx >>= 1;
/*
Filter DC, 3dB point is 160Hz (I think), note 32 bit precision required /*
otherwise values do not track down to 0. Zero at DC, Pole at (1-Beta) Filter DC, 3dB point is 160Hz (I think), note 32 bit precision required
only real axis. Some chip sets (like Si labs) don't need otherwise values do not track down to 0. Zero at DC, Pole at (1-Beta)
this, but something like a $10 X100P card does. Any DC really slows only real axis. Some chip sets (like Si labs) don't need
down convergence. this, but something like a $10 X100P card does. Any DC really slows
down convergence.
Note: removes some low frequency from the signal, this reduces
the speech quality when listening to samples through headphones Note: removes some low frequency from the signal, this reduces
but may not be obvious through a telephone handset. the speech quality when listening to samples through headphones
but may not be obvious through a telephone handset.
Note that the 3dB frequency in radians is approx Beta, e.g. for
Beta = 2^(-3) = 0.125, 3dB freq is 0.125 rads = 159Hz. Note that the 3dB frequency in radians is approx Beta, e.g. for
*/ Beta = 2^(-3) = 0.125, 3dB freq is 0.125 rads = 159Hz.
*/
if (ec->adaption_mode & ECHO_CAN_USE_RX_HPF) {
tmp = rx << 15; if (ec->adaption_mode & ECHO_CAN_USE_RX_HPF) {
tmp = rx << 15;
#if 1 #if 1
/* Make sure the gain of the HPF is 1.0. This can still saturate a little under /* Make sure the gain of the HPF is 1.0. This can still saturate a little under
impulse conditions, and it might roll to 32768 and need clipping on sustained peak impulse conditions, and it might roll to 32768 and need clipping on sustained peak
level signals. However, the scale of such clipping is small, and the error due to level signals. However, the scale of such clipping is small, and the error due to
any saturation should not markedly affect the downstream processing. */ any saturation should not markedly affect the downstream processing. */
tmp -= (tmp >> 4); tmp -= (tmp >> 4);
#endif #endif
ec->rx_1 += -(ec->rx_1>>DC_LOG2BETA) + tmp - ec->rx_2; ec->rx_1 += -(ec->rx_1 >> DC_LOG2BETA) + tmp - ec->rx_2;
/* hard limit filter to prevent clipping. Note that at this stage
rx should be limited to +/- 16383 due to right shift above */
tmp1 = ec->rx_1 >> 15;
if (tmp1 > 16383)
tmp1 = 16383;
if (tmp1 < -16383)
tmp1 = -16383;
rx = tmp1;
ec->rx_2 = tmp;
}
/* hard limit filter to prevent clipping. Note that at this stage /* Block average of power in the filter states. Used for
rx should be limited to +/- 16383 due to right shift above */ adaption power calculation. */
tmp1 = ec->rx_1 >> 15;
if (tmp1 > 16383) tmp1 = 16383;
if (tmp1 < -16383) tmp1 = -16383;
rx = tmp1;
ec->rx_2 = tmp;
}
/* Block average of power in the filter states. Used for {
adaption power calculation. */ int new, old;
/* efficient "out with the old and in with the new" algorithm so
we don't have to recalculate over the whole block of
samples. */
new = (int)tx *(int)tx;
old = (int)ec->fir_state.history[ec->fir_state.curr_pos] *
(int)ec->fir_state.history[ec->fir_state.curr_pos];
ec->Pstates +=
((new - old) + (1 << ec->log2taps)) >> ec->log2taps;
if (ec->Pstates < 0)
ec->Pstates = 0;
}
{ /* Calculate short term average levels using simple single pole IIRs */
int new, old;
/* efficient "out with the old and in with the new" algorithm so ec->Ltxacc += abs(tx) - ec->Ltx;
we don't have to recalculate over the whole block of ec->Ltx = (ec->Ltxacc + (1 << 4)) >> 5;
samples. */ ec->Lrxacc += abs(rx) - ec->Lrx;
new = (int)tx * (int)tx; ec->Lrx = (ec->Lrxacc + (1 << 4)) >> 5;
old = (int)ec->fir_state.history[ec->fir_state.curr_pos] *
(int)ec->fir_state.history[ec->fir_state.curr_pos];
ec->Pstates += ((new - old) + (1<<ec->log2taps)) >> ec->log2taps;
if (ec->Pstates < 0) ec->Pstates = 0;
}
/* Calculate short term average levels using simple single pole IIRs */
ec->Ltxacc += abs(tx) - ec->Ltx;
ec->Ltx = (ec->Ltxacc + (1<<4)) >> 5;
ec->Lrxacc += abs(rx) - ec->Lrx;
ec->Lrx = (ec->Lrxacc + (1<<4)) >> 5;
/* Foreground filter ---------------------------------------------------*/
ec->fir_state.coeffs = ec->fir_taps16[0];
echo_value = fir16(&ec->fir_state, tx);
ec->clean = rx - echo_value;
ec->Lcleanacc += abs(ec->clean) - ec->Lclean;
ec->Lclean = (ec->Lcleanacc + (1<<4)) >> 5;
/* Background filter ---------------------------------------------------*/
echo_value = fir16(&ec->fir_state_bg, tx);
clean_bg = rx - echo_value;
ec->Lclean_bgacc += abs(clean_bg) - ec->Lclean_bg;
ec->Lclean_bg = (ec->Lclean_bgacc + (1<<4)) >> 5;
/* Background Filter adaption -----------------------------------------*/
/* Almost always adap bg filter, just simple DT and energy
detection to minimise adaption in cases of strong double talk.
However this is not critical for the dual path algorithm.
*/
ec->factor = 0;
ec->shift = 0;
if ((ec->nonupdate_dwell == 0)) {
int P, logP, shift;
/* Determine:
f = Beta * clean_bg_rx/P ------ (1)
where P is the total power in the filter states.
The Boffins have shown that if we obey (1) we converge
quickly and avoid instability.
The correct factor f must be in Q30, as this is the fixed
point format required by the lms_adapt_bg() function,
therefore the scaled version of (1) is:
(2^30) * f = (2^30) * Beta * clean_bg_rx/P
factor = (2^30) * Beta * clean_bg_rx/P ----- (2)
We have chosen Beta = 0.25 by experiment, so:
factor = (2^30) * (2^-2) * clean_bg_rx/P
(30 - 2 - log2(P))
factor = clean_bg_rx 2 ----- (3)
To avoid a divide we approximate log2(P) as top_bit(P),
which returns the position of the highest non-zero bit in
P. This approximation introduces an error as large as a
factor of 2, but the algorithm seems to handle it OK.
Come to think of it a divide may not be a big deal on a
modern DSP, so its probably worth checking out the cycles
for a divide versus a top_bit() implementation.
*/
P = MIN_TX_POWER_FOR_ADAPTION + ec->Pstates;
logP = top_bit(P) + ec->log2taps;
shift = 30 - 2 - logP;
ec->shift = shift;
lms_adapt_bg(ec, clean_bg, shift);
}
/* very simple DTD to make sure we dont try and adapt with strong
near end speech */
ec->adapt = 0;
if ((ec->Lrx > MIN_RX_POWER_FOR_ADAPTION) && (ec->Lrx > ec->Ltx))
ec->nonupdate_dwell = DTD_HANGOVER;
if (ec->nonupdate_dwell)
ec->nonupdate_dwell--;
/* Transfer logic ------------------------------------------------------*/ /* Foreground filter --------------------------------------------------- */
/* These conditions are from the dual path paper [1], I messed with ec->fir_state.coeffs = ec->fir_taps16[0];
them a bit to improve performance. */ echo_value = fir16(&ec->fir_state, tx);
ec->clean = rx - echo_value;
ec->Lcleanacc += abs(ec->clean) - ec->Lclean;
ec->Lclean = (ec->Lcleanacc + (1 << 4)) >> 5;
if ((ec->adaption_mode & ECHO_CAN_USE_ADAPTION) && /* Background filter --------------------------------------------------- */
(ec->nonupdate_dwell == 0) &&
(8*ec->Lclean_bg < 7*ec->Lclean) /* (ec->Lclean_bg < 0.875*ec->Lclean) */ &&
(8*ec->Lclean_bg < ec->Ltx) /* (ec->Lclean_bg < 0.125*ec->Ltx) */ )
{
if (ec->cond_met == 6) {
/* BG filter has had better results for 6 consecutive samples */
ec->adapt = 1;
memcpy(ec->fir_taps16[0], ec->fir_taps16[1], ec->taps*sizeof(int16_t));
}
else
ec->cond_met++;
}
else
ec->cond_met = 0;
/* Non-Linear Processing ---------------------------------------------------*/ echo_value = fir16(&ec->fir_state_bg, tx);
clean_bg = rx - echo_value;
ec->Lclean_bgacc += abs(clean_bg) - ec->Lclean_bg;
ec->Lclean_bg = (ec->Lclean_bgacc + (1 << 4)) >> 5;
ec->clean_nlp = ec->clean; /* Background Filter adaption ----------------------------------------- */
if (ec->adaption_mode & ECHO_CAN_USE_NLP)
{
/* Non-linear processor - a fancy way to say "zap small signals, to avoid
residual echo due to (uLaw/ALaw) non-linearity in the channel.". */
if ((16*ec->Lclean < ec->Ltx)) /* Almost always adap bg filter, just simple DT and energy
{ detection to minimise adaption in cases of strong double talk.
/* Our e/c has improved echo by at least 24 dB (each factor of 2 is 6dB, However this is not critical for the dual path algorithm.
so 2*2*2*2=16 is the same as 6+6+6+6=24dB) */ */
if (ec->adaption_mode & ECHO_CAN_USE_CNG) ec->factor = 0;
{ ec->shift = 0;
ec->cng_level = ec->Lbgn; if ((ec->nonupdate_dwell == 0)) {
int P, logP, shift;
/* Very elementary comfort noise generation. Just random
numbers rolled off very vaguely Hoth-like. DR: This /* Determine:
noise doesn't sound quite right to me - I suspect there
are some overflow issues in the filtering as it's too f = Beta * clean_bg_rx/P ------ (1)
"crackly". TODO: debug this, maybe just play noise at
high level or look at spectrum. where P is the total power in the filter states.
*/
The Boffins have shown that if we obey (1) we converge
ec->cng_rndnum = 1664525U*ec->cng_rndnum + 1013904223U; quickly and avoid instability.
ec->cng_filter = ((ec->cng_rndnum & 0xFFFF) - 32768 + 5*ec->cng_filter) >> 3;
ec->clean_nlp = (ec->cng_filter*ec->cng_level*8) >> 14; The correct factor f must be in Q30, as this is the fixed
point format required by the lms_adapt_bg() function,
} therefore the scaled version of (1) is:
else if (ec->adaption_mode & ECHO_CAN_USE_CLIP)
{ (2^30) * f = (2^30) * Beta * clean_bg_rx/P
/* This sounds much better than CNG */ factor = (2^30) * Beta * clean_bg_rx/P ----- (2)
if (ec->clean_nlp > ec->Lbgn)
ec->clean_nlp = ec->Lbgn; We have chosen Beta = 0.25 by experiment, so:
if (ec->clean_nlp < -ec->Lbgn)
ec->clean_nlp = -ec->Lbgn; factor = (2^30) * (2^-2) * clean_bg_rx/P
(30 - 2 - log2(P))
factor = clean_bg_rx 2 ----- (3)
To avoid a divide we approximate log2(P) as top_bit(P),
which returns the position of the highest non-zero bit in
P. This approximation introduces an error as large as a
factor of 2, but the algorithm seems to handle it OK.
Come to think of it a divide may not be a big deal on a
modern DSP, so its probably worth checking out the cycles
for a divide versus a top_bit() implementation.
*/
P = MIN_TX_POWER_FOR_ADAPTION + ec->Pstates;
logP = top_bit(P) + ec->log2taps;
shift = 30 - 2 - logP;
ec->shift = shift;
lms_adapt_bg(ec, clean_bg, shift);
} }
else
{ /* very simple DTD to make sure we dont try and adapt with strong
/* just mute the residual, doesn't sound very good, used mainly near end speech */
in G168 tests */
ec->clean_nlp = 0; ec->adapt = 0;
} if ((ec->Lrx > MIN_RX_POWER_FOR_ADAPTION) && (ec->Lrx > ec->Ltx))
} ec->nonupdate_dwell = DTD_HANGOVER;
else { if (ec->nonupdate_dwell)
/* Background noise estimator. I tried a few algorithms ec->nonupdate_dwell--;
here without much luck. This very simple one seems to
work best, we just average the level using a slow (1 sec /* Transfer logic ------------------------------------------------------ */
time const) filter if the current level is less than a
(experimentally derived) constant. This means we dont /* These conditions are from the dual path paper [1], I messed with
include high level signals like near end speech. When them a bit to improve performance. */
combined with CNG or especially CLIP seems to work OK.
*/ if ((ec->adaption_mode & ECHO_CAN_USE_ADAPTION) &&
if (ec->Lclean < 40) { (ec->nonupdate_dwell == 0) &&
ec->Lbgn_acc += abs(ec->clean) - ec->Lbgn; (8 * ec->Lclean_bg <
ec->Lbgn = (ec->Lbgn_acc + (1<<11)) >> 12; 7 * ec->Lclean) /* (ec->Lclean_bg < 0.875*ec->Lclean) */ &&
} (8 * ec->Lclean_bg <
} ec->Ltx) /* (ec->Lclean_bg < 0.125*ec->Ltx) */ ) {
} if (ec->cond_met == 6) {
/* BG filter has had better results for 6 consecutive samples */
/* Roll around the taps buffer */ ec->adapt = 1;
if (ec->curr_pos <= 0) memcpy(ec->fir_taps16[0], ec->fir_taps16[1],
ec->curr_pos = ec->taps; ec->taps * sizeof(int16_t));
ec->curr_pos--; } else
ec->cond_met++;
if (ec->adaption_mode & ECHO_CAN_DISABLE) } else
ec->clean_nlp = rx; ec->cond_met = 0;
/* Output scaled back up again to match input scaling */ /* Non-Linear Processing --------------------------------------------------- */
return (int16_t) ec->clean_nlp << 1; ec->clean_nlp = ec->clean;
if (ec->adaption_mode & ECHO_CAN_USE_NLP) {
/* Non-linear processor - a fancy way to say "zap small signals, to avoid
residual echo due to (uLaw/ALaw) non-linearity in the channel.". */
if ((16 * ec->Lclean < ec->Ltx)) {
/* Our e/c has improved echo by at least 24 dB (each factor of 2 is 6dB,
so 2*2*2*2=16 is the same as 6+6+6+6=24dB) */
if (ec->adaption_mode & ECHO_CAN_USE_CNG) {
ec->cng_level = ec->Lbgn;
/* Very elementary comfort noise generation. Just random
numbers rolled off very vaguely Hoth-like. DR: This
noise doesn't sound quite right to me - I suspect there
are some overflow issues in the filtering as it's too
"crackly". TODO: debug this, maybe just play noise at
high level or look at spectrum.
*/
ec->cng_rndnum =
1664525U * ec->cng_rndnum + 1013904223U;
ec->cng_filter =
((ec->cng_rndnum & 0xFFFF) - 32768 +
5 * ec->cng_filter) >> 3;
ec->clean_nlp =
(ec->cng_filter * ec->cng_level * 8) >> 14;
} else if (ec->adaption_mode & ECHO_CAN_USE_CLIP) {
/* This sounds much better than CNG */
if (ec->clean_nlp > ec->Lbgn)
ec->clean_nlp = ec->Lbgn;
if (ec->clean_nlp < -ec->Lbgn)
ec->clean_nlp = -ec->Lbgn;
} else {
/* just mute the residual, doesn't sound very good, used mainly
in G168 tests */
ec->clean_nlp = 0;
}
} else {
/* Background noise estimator. I tried a few algorithms
here without much luck. This very simple one seems to
work best, we just average the level using a slow (1 sec
time const) filter if the current level is less than a
(experimentally derived) constant. This means we dont
include high level signals like near end speech. When
combined with CNG or especially CLIP seems to work OK.
*/
if (ec->Lclean < 40) {
ec->Lbgn_acc += abs(ec->clean) - ec->Lbgn;
ec->Lbgn = (ec->Lbgn_acc + (1 << 11)) >> 12;
}
}
}
/* Roll around the taps buffer */
if (ec->curr_pos <= 0)
ec->curr_pos = ec->taps;
ec->curr_pos--;
if (ec->adaption_mode & ECHO_CAN_DISABLE)
ec->clean_nlp = rx;
/* Output scaled back up again to match input scaling */
return (int16_t) ec->clean_nlp << 1;
} }
EXPORT_SYMBOL_GPL(oslec_update); EXPORT_SYMBOL_GPL(oslec_update);
/* This function is separated from the echo canceller as it is usually called /* This function is separated from the echo canceller as it is usually called
...@@ -604,28 +605,32 @@ EXPORT_SYMBOL_GPL(oslec_update); ...@@ -604,28 +605,32 @@ EXPORT_SYMBOL_GPL(oslec_update);
precision, which noise shapes things, giving very clean DC removal. precision, which noise shapes things, giving very clean DC removal.
*/ */
/*
 * Optional DC-removal high-pass filter for a transmit sample.
 *
 * When ECHO_CAN_USE_TX_HPF is set in ec->adaption_mode the sample is scaled
 * up by 2^15, passed through a recursive filter whose state is kept in
 * ec->tx_1 (running accumulator) and ec->tx_2 (previous scaled input),
 * scaled back down and clipped to +/-32767.  When the flag is clear the
 * sample is returned untouched and no filter state is modified.
 *
 * @ec: echo canceller state; tx_1 and tx_2 are updated in place.
 * @tx: transmit sample to filter.
 *
 * Returns the filtered sample, or @tx unchanged if the filter is disabled.
 */
int16_t oslec_hpf_tx(struct oslec_state *ec, int16_t tx)
{
	int tmp, tmp1;

	if (ec->adaption_mode & ECHO_CAN_USE_TX_HPF) {
		/*
		 * Scale by 2^15.  A multiply is used instead of "tx << 15"
		 * because left-shifting a negative value is undefined in
		 * ISO C; the numeric result is the same and cannot overflow
		 * for a 16 bit input.
		 */
		tmp = tx * (1 << 15);
#if 1
		/* Make sure the gain of the HPF is 1.0. The first can still saturate a little under
		   impulse conditions, and it might roll to 32768 and need clipping on sustained peak
		   level signals. However, the scale of such clipping is small, and the error due to
		   any saturation should not markedly affect the downstream processing. */
		tmp -= (tmp >> 4);
#endif
		/* Leak a fraction of the accumulator, add the input difference. */
		ec->tx_1 += -(ec->tx_1 >> DC_LOG2BETA) + tmp - ec->tx_2;

		/* Back to sample scale, then clip (note: symmetric +/-32767). */
		tmp1 = ec->tx_1 >> 15;
		if (tmp1 > 32767)
			tmp1 = 32767;
		if (tmp1 < -32767)
			tmp1 = -32767;
		tx = tmp1;

		/* Remember the scaled input for the next call. */
		ec->tx_2 = tmp;
	}

	return tx;
}
EXPORT_SYMBOL_GPL(oslec_hpf_tx); EXPORT_SYMBOL_GPL(oslec_hpf_tx);
MODULE_LICENSE("GPL"); MODULE_LICENSE("GPL");
......
...@@ -124,9 +124,8 @@ a minor burden. ...@@ -124,9 +124,8 @@ a minor burden.
G.168 echo canceller descriptor. This defines the working state for a line G.168 echo canceller descriptor. This defines the working state for a line
echo canceller. echo canceller.
*/ */
struct oslec_state struct oslec_state {
{ int16_t tx, rx;
int16_t tx,rx;
int16_t clean; int16_t clean;
int16_t clean_nlp; int16_t clean_nlp;
...@@ -170,4 +169,4 @@ struct oslec_state ...@@ -170,4 +169,4 @@ struct oslec_state
int16_t *snapshot; int16_t *snapshot;
}; };
#endif /* __ECHO_H */ #endif /* __ECHO_H */
...@@ -72,8 +72,7 @@ ...@@ -72,8 +72,7 @@
16 bit integer FIR descriptor. This defines the working state for a single 16 bit integer FIR descriptor. This defines the working state for a single
instance of an FIR filter using 16 bit integer coefficients. instance of an FIR filter using 16 bit integer coefficients.
*/ */
typedef struct typedef struct {
{
int taps; int taps;
int curr_pos; int curr_pos;
const int16_t *coeffs; const int16_t *coeffs;
...@@ -85,8 +84,7 @@ typedef struct ...@@ -85,8 +84,7 @@ typedef struct
instance of an FIR filter using 32 bit integer coefficients, and filtering instance of an FIR filter using 32 bit integer coefficients, and filtering
16 bit integer data. 16 bit integer data.
*/ */
typedef struct typedef struct {
{
int taps; int taps;
int curr_pos; int curr_pos;
const int32_t *coeffs; const int32_t *coeffs;
...@@ -97,39 +95,37 @@ typedef struct ...@@ -97,39 +95,37 @@ typedef struct
Floating point FIR descriptor. This defines the working state for a single Floating point FIR descriptor. This defines the working state for a single
instance of an FIR filter using floating point coefficients and data. instance of an FIR filter using floating point coefficients and data.
*/ */
/* Working state for one floating point FIR filter instance. */
typedef struct {
	int taps;		/* number of filter taps */
	int curr_pos;		/* current position in the history buffer */
	const float *coeffs;	/* filter coefficients (not modified) */
	float *history;		/* sample history buffer */
} fir_float_state_t;
/*
 * Initialise a 16 bit FIR filter instance.
 *
 * Records the tap count and coefficient pointer, positions curr_pos on the
 * last history slot (taps - 1) and allocates a zero-filled history buffer
 * with kcalloc().
 *
 * On MMX, SSE2 and Blackfin builds the buffer holds 2 * taps samples: on
 * those builds fir16() stores every sample at both curr_pos and
 * curr_pos + taps, so that the taps samples starting at curr_pos are always
 * contiguous in memory.
 *
 * @fir:    filter state to initialise.
 * @coeffs: coefficient table; only the pointer is stored, not a copy.
 * @taps:   number of taps.
 *
 * Returns the history buffer pointer, i.e. NULL when the allocation failed.
 * Callers must check the result before using the filter.
 */
static __inline__ const int16_t *fir16_create(fir16_state_t *fir,
					      const int16_t *coeffs, int taps)
{
	fir->taps = taps;
	fir->curr_pos = taps - 1;
	fir->coeffs = coeffs;
#if defined(USE_MMX) || defined(USE_SSE2) || defined(__bfin__)
	fir->history = kcalloc(2 * taps, sizeof(int16_t), GFP_KERNEL);
#else
	fir->history = kcalloc(taps, sizeof(int16_t), GFP_KERNEL);
#endif
	return fir->history;
}
/*
 * Clear the sample history of a 16 bit FIR filter.
 *
 * Zeroes the whole history buffer; the size matches what fir16_create()
 * allocates (2 * taps samples on MMX, SSE2 and Blackfin builds, taps
 * otherwise).  curr_pos, taps and coeffs are left untouched.
 */
static __inline__ void fir16_flush(fir16_state_t *fir)
{
#if defined(USE_MMX) || defined(USE_SSE2) || defined(__bfin__)
	memset(fir->history, 0, 2 * fir->taps * sizeof(int16_t));
#else
	memset(fir->history, 0, fir->taps * sizeof(int16_t));
#endif
}
/*
 * Release the history buffer of a 16 bit FIR filter.
 *
 * Only fir->history is freed; the coefficient table is not owned by the
 * filter state (fir16_create() stores the caller's pointer) and is left
 * alone.  The history pointer is not reset, so the state must not be used
 * again without calling fir16_create().
 */
static __inline__ void fir16_free(fir16_state_t *fir)
{
	kfree(fir->history);
}
...@@ -137,166 +133,162 @@ static __inline__ void fir16_free(fir16_state_t *fir) ...@@ -137,166 +133,162 @@ static __inline__ void fir16_free(fir16_state_t *fir)
#ifdef __bfin__
/*
 * Blackfin assembly dot product of two arrays of 16 bit samples.
 *
 * The first pair of elements is loaded up front; a hardware loop then runs
 * len - 1 times, each pass accumulating the current product into A0 while
 * loading the next pair, and a final multiply-accumulate after the loop
 * consumes the last pair.  In total len products are summed.
 *
 * @x:   first operand array.
 * @y:   second operand array.
 * @len: number of elements to multiply and sum.
 *       NOTE(review): the loop count is len - 1, so this presumably expects
 *       len >= 2 - confirm against callers.
 *
 * Returns the accumulated sum as read back from A0.
 */
static inline int32_t dot_asm(short *x, short *y, int len)
{
	int dot;

	/* One product is handled outside the loop. */
	len--;

	/*
	 * The asm reads *x and *y through pointer operands only, so a
	 * "memory" clobber is listed to make the compiler complete any
	 * pending stores to those arrays before the asm executes.
	 */
	__asm__("I0 = %1;\n\t"
		"I1 = %2;\n\t"
		"A0 = 0;\n\t"
		"R0.L = W[I0++] || R1.L = W[I1++];\n\t"
		"LOOP dot%= LC0 = %3;\n\t"
		"LOOP_BEGIN dot%=;\n\t"
		"A0 += R0.L * R1.L (IS) || R0.L = W[I0++] || R1.L = W[I1++];\n\t"
		"LOOP_END dot%=;\n\t"
		"A0 += R0.L*R1.L (IS);\n\t"
		"R0 = A0;\n\t"
		"%0 = R0;\n\t"
		: "=&d" (dot)
		: "a" (x), "a" (y), "a" (len)
		: "I0", "I1", "A1", "A0", "R0", "R1", "memory"
	    );

	return dot;
}
#endif
/*
 * Push one sample into a 16 bit FIR filter and return the new output.
 *
 * The sample is stored at fir->curr_pos, the sum of coeffs[k] * (sample
 * from k calls ago) is formed over all taps, curr_pos is stepped backwards
 * (wrapping from 0 to taps - 1), and the 32 bit sum is shifted right by 15
 * and truncated to 16 bits.  No saturation is applied to the result.
 *
 * Four build variants compute the sum:
 *  - USE_MMX:  8 samples per loop pass; taps must be a multiple of 8.
 *  - USE_SSE2: 16 samples per loop pass; taps must be a multiple of 16.
 *  - __bfin__: delegates to dot_asm().
 *  - default:  plain C over a circular history buffer.
 * The first three store each sample twice (at curr_pos and at
 * curr_pos + taps) so that the taps samples starting at curr_pos are
 * contiguous; they therefore need the 2 * taps history that fir16_create()
 * allocates on those builds.
 *
 * @fir:    filter state set up by fir16_create().
 * @sample: new input sample.
 */
static __inline__ int16_t fir16(fir16_state_t *fir, int16_t sample)
{
	int32_t y;
#if defined(USE_MMX)
	int i;
	mmx_t *mmx_coeffs;
	mmx_t *mmx_hist;

	fir->history[fir->curr_pos] = sample;
	fir->history[fir->curr_pos + fir->taps] = sample;

	mmx_coeffs = (mmx_t *) fir->coeffs;
	mmx_hist = (mmx_t *) &fir->history[fir->curr_pos];
	i = fir->taps;
	/* mm4 is the running pair of 32 bit partial sums. */
	pxor_r2r(mm4, mm4);
	/* 8 samples per iteration, so the filter must be a multiple of 8 long. */
	while (i > 0) {
		movq_m2r(mmx_coeffs[0], mm0);
		movq_m2r(mmx_coeffs[1], mm2);
		movq_m2r(mmx_hist[0], mm1);
		movq_m2r(mmx_hist[1], mm3);
		mmx_coeffs += 2;
		mmx_hist += 2;
		pmaddwd_r2r(mm1, mm0);
		pmaddwd_r2r(mm3, mm2);
		paddd_r2r(mm0, mm4);
		paddd_r2r(mm2, mm4);
		i -= 8;
	}
	/* Fold the upper 32 bit lane onto the lower one and store it in y. */
	movq_r2r(mm4, mm0);
	psrlq_i2r(32, mm0);
	paddd_r2r(mm0, mm4);
	movd_r2m(mm4, y);
	emms();
#elif defined(USE_SSE2)
	int i;
	xmm_t *xmm_coeffs;
	xmm_t *xmm_hist;

	fir->history[fir->curr_pos] = sample;
	fir->history[fir->curr_pos + fir->taps] = sample;

	xmm_coeffs = (xmm_t *) fir->coeffs;
	xmm_hist = (xmm_t *) &fir->history[fir->curr_pos];
	i = fir->taps;
	/* xmm4 is the running set of four 32 bit partial sums. */
	pxor_r2r(xmm4, xmm4);
	/* 16 samples per iteration, so the filter must be a multiple of 16 long. */
	while (i > 0) {
		movdqu_m2r(xmm_coeffs[0], xmm0);
		movdqu_m2r(xmm_coeffs[1], xmm2);
		movdqu_m2r(xmm_hist[0], xmm1);
		movdqu_m2r(xmm_hist[1], xmm3);
		xmm_coeffs += 2;
		xmm_hist += 2;
		pmaddwd_r2r(xmm1, xmm0);
		pmaddwd_r2r(xmm3, xmm2);
		paddd_r2r(xmm0, xmm4);
		paddd_r2r(xmm2, xmm4);
		i -= 16;
	}
	/* Fold the four lanes down to one: by 8 bytes, then by 4 bytes. */
	movdqa_r2r(xmm4, xmm0);
	psrldq_i2r(8, xmm0);
	paddd_r2r(xmm0, xmm4);
	movdqa_r2r(xmm4, xmm0);
	psrldq_i2r(4, xmm0);
	paddd_r2r(xmm0, xmm4);
	movd_r2m(xmm4, y);
#elif defined(__bfin__)
	fir->history[fir->curr_pos] = sample;
	fir->history[fir->curr_pos + fir->taps] = sample;
	y = dot_asm((int16_t *) fir->coeffs, &fir->history[fir->curr_pos],
		    fir->taps);
#else
	int i;
	int offset1;
	int offset2;

	fir->history[fir->curr_pos] = sample;

	/*
	 * The history is circular: coeffs[0] pairs with history[curr_pos]
	 * (the newest sample).  The first loop covers the coefficients whose
	 * samples have wrapped round to the start of the buffer, the second
	 * loop the ones from curr_pos up to the end of the buffer.
	 */
	offset2 = fir->curr_pos;
	offset1 = fir->taps - offset2;
	y = 0;
	for (i = fir->taps - 1; i >= offset1; i--)
		y += fir->coeffs[i] * fir->history[i - offset1];
	for (; i >= 0; i--)
		y += fir->coeffs[i] * fir->history[i + offset2];
#endif
	/* Step backwards through the history, wrapping at the start. */
	if (fir->curr_pos <= 0)
		fir->curr_pos = fir->taps;
	fir->curr_pos--;
	return (int16_t) (y >> 15);
}
/*
 * Initialise a FIR filter instance that uses 32 bit coefficients on 16 bit
 * samples.
 *
 * Records the tap count and coefficient pointer, positions curr_pos on the
 * last history slot (taps - 1) and allocates a zero-filled history buffer
 * of taps 16 bit samples with kcalloc().
 *
 * @fir:    filter state to initialise.
 * @coeffs: coefficient table; only the pointer is stored, not a copy.
 * @taps:   number of taps.
 *
 * Returns the history buffer pointer, i.e. NULL when the allocation failed.
 * Callers must check the result before using the filter.
 */
static __inline__ const int16_t *fir32_create(fir32_state_t *fir,
					      const int32_t *coeffs, int taps)
{
	fir->taps = taps;
	fir->curr_pos = taps - 1;
	fir->coeffs = coeffs;
	fir->history = kcalloc(taps, sizeof(int16_t), GFP_KERNEL);
	return fir->history;
}
/*
 * Clear the sample history of a 32 bit coefficient FIR filter.
 *
 * Zeroes all taps entries of the history buffer; curr_pos, taps and coeffs
 * are left untouched.
 */
static __inline__ void fir32_flush(fir32_state_t *fir)
{
	memset(fir->history, 0, fir->taps * sizeof(int16_t));
}
/*
 * Release the history buffer of a 32 bit coefficient FIR filter.
 *
 * Only fir->history is freed; the coefficient table is not owned by the
 * filter state (fir32_create() stores the caller's pointer) and is left
 * alone.  The history pointer is not reset, so the state must not be used
 * again without calling fir32_create().
 */
static __inline__ void fir32_free(fir32_state_t *fir)
{
	kfree(fir->history);
}
/*
 * Push one sample into a 32 bit coefficient FIR filter and return the new
 * output.
 *
 * The sample is stored at fir->curr_pos, the sum of coeffs[k] * (sample
 * from k calls ago) is formed over all taps in a 32 bit accumulator,
 * curr_pos is stepped backwards (wrapping from 0 to taps - 1), and the sum
 * is shifted right by 15 and truncated to 16 bits.  Neither the
 * accumulation nor the final narrowing saturates.
 *
 * @fir:    filter state set up by fir32_create().
 * @sample: new input sample.
 */
static __inline__ int16_t fir32(fir32_state_t *fir, int16_t sample)
{
	int i;
	int32_t y;
	int offset1;
	int offset2;

	fir->history[fir->curr_pos] = sample;

	/*
	 * The history is circular: coeffs[0] pairs with history[curr_pos]
	 * (the newest sample).  The first loop covers the coefficients whose
	 * samples have wrapped round to the start of the buffer, the second
	 * loop the ones from curr_pos up to the end of the buffer.
	 */
	offset2 = fir->curr_pos;
	offset1 = fir->taps - offset2;
	y = 0;
	for (i = fir->taps - 1; i >= offset1; i--)
		y += fir->coeffs[i] * fir->history[i - offset1];
	for (; i >= 0; i--)
		y += fir->coeffs[i] * fir->history[i + offset2];

	/* Step backwards through the history, wrapping at the start. */
	if (fir->curr_pos <= 0)
		fir->curr_pos = fir->taps;
	fir->curr_pos--;
	return (int16_t) (y >> 15);
}
#endif #endif
......
...@@ -27,24 +27,23 @@ ...@@ -27,24 +27,23 @@
* values by ULL, lest they be truncated by the compiler) * values by ULL, lest they be truncated by the compiler)
*/ */
/*
 * Overlay of the ways a 64 bit MMX register value can be viewed: one
 * quadword, two doublewords, four words, eight bytes or two floats.
 * All members share the same storage.
 */
typedef union {
	long long q;		/* Quadword (64-bit) value */
	unsigned long long uq;	/* Unsigned Quadword */
	int d[2];		/* 2 Doubleword (32-bit) values */
	unsigned int ud[2];	/* 2 Unsigned Doubleword */
	short w[4];		/* 4 Word (16-bit) values */
	unsigned short uw[4];	/* 4 Unsigned Word */
	char b[8];		/* 8 Byte (8-bit) values */
	unsigned char ub[8];	/* 8 Unsigned Byte */
	float s[2];		/* 2 Single-precision (32-bit) values */
} mmx_t;			/* On an 8-byte (64-bit) boundary */
/* SSE registers: a 128 bit XMM register value viewed as 16 raw bytes. */
typedef union {
	char b[16];
} xmm_t;
#define mmx_i2r(op,imm,reg) \ #define mmx_i2r(op,imm,reg) \
__asm__ __volatile__ (#op " %0, %%" #reg \ __asm__ __volatile__ (#op " %0, %%" #reg \
: /* nothing */ \ : /* nothing */ \
...@@ -63,7 +62,6 @@ typedef union { ...@@ -63,7 +62,6 @@ typedef union {
#define mmx_r2r(op,regs,regd) \ #define mmx_r2r(op,regs,regd) \
__asm__ __volatile__ (#op " %" #regs ", %" #regd) __asm__ __volatile__ (#op " %" #regs ", %" #regd)
#define emms() __asm__ __volatile__ ("emms") #define emms() __asm__ __volatile__ ("emms")
#define movd_m2r(var,reg) mmx_m2r (movd, var, reg) #define movd_m2r(var,reg) mmx_m2r (movd, var, reg)
...@@ -192,16 +190,13 @@ typedef union { ...@@ -192,16 +190,13 @@ typedef union {
#define pxor_m2r(var,reg) mmx_m2r (pxor, var, reg) #define pxor_m2r(var,reg) mmx_m2r (pxor, var, reg)
#define pxor_r2r(regs,regd) mmx_r2r (pxor, regs, regd) #define pxor_r2r(regs,regd) mmx_r2r (pxor, regs, regd)
/* 3DNOW extensions */ /* 3DNOW extensions */
#define pavgusb_m2r(var,reg) mmx_m2r (pavgusb, var, reg) #define pavgusb_m2r(var,reg) mmx_m2r (pavgusb, var, reg)
#define pavgusb_r2r(regs,regd) mmx_r2r (pavgusb, regs, regd) #define pavgusb_r2r(regs,regd) mmx_r2r (pavgusb, regs, regd)
/* AMD MMX extensions - also available in intel SSE */ /* AMD MMX extensions - also available in intel SSE */
#define mmx_m2ri(op,mem,reg,imm) \ #define mmx_m2ri(op,mem,reg,imm) \
__asm__ __volatile__ (#op " %1, %0, %%" #reg \ __asm__ __volatile__ (#op " %1, %0, %%" #reg \
: /* nothing */ \ : /* nothing */ \
...@@ -216,7 +211,6 @@ typedef union { ...@@ -216,7 +211,6 @@ typedef union {
: /* nothing */ \ : /* nothing */ \
: "m" (mem)) : "m" (mem))
#define maskmovq(regs,maskreg) mmx_r2ri (maskmovq, regs, maskreg) #define maskmovq(regs,maskreg) mmx_r2ri (maskmovq, regs, maskreg)
#define movntq_r2m(mmreg,var) mmx_r2m (movntq, mmreg, var) #define movntq_r2m(mmreg,var) mmx_r2m (movntq, mmreg, var)
...@@ -284,5 +278,4 @@ typedef union { ...@@ -284,5 +278,4 @@ typedef union {
#define punpcklqdq_r2r(regs,regd) mmx_r2r (punpcklqdq, regs, regd) #define punpcklqdq_r2r(regs,regd) mmx_r2r (punpcklqdq, regs, regd)
#define punpckhqdq_r2r(regs,regd) mmx_r2r (punpckhqdq, regs, regd) #define punpckhqdq_r2r(regs,regd) mmx_r2r (punpckhqdq, regs, regd)
#endif /* AVCODEC_I386MMX_H */ #endif /* AVCODEC_I386MMX_H */
...@@ -83,4 +83,4 @@ int16_t oslec_update(struct oslec_state *ec, int16_t tx, int16_t rx); ...@@ -83,4 +83,4 @@ int16_t oslec_update(struct oslec_state *ec, int16_t tx, int16_t rx);
*/ */
int16_t oslec_hpf_tx(struct oslec_state *ec, int16_t tx); int16_t oslec_hpf_tx(struct oslec_state *ec, int16_t tx);
#endif /* __OSLEC_H */ #endif /* __OSLEC_H */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment