Commit c646b61c authored by David S. Miller

Merge nuts.davemloft.net:/disk1/BK/network-2.6

into nuts.davemloft.net:/disk1/BK/net-2.6
parents 6c4cd043 51c44d07
......@@ -7,6 +7,11 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
*
* Changes:
*
* Harald Welte: <laforge@gnumonks.org>
* - Add neighbour cache statistics like rtstat
*/
/* The following flags & states are exported to user space,
......@@ -90,12 +95,25 @@ struct neigh_parms
/*
 * Counters kept for a neighbour table.  Fields are bumped through
 * NEIGH_CACHE_STAT_INC(), which picks the copy that belongs to the
 * CPU executing the increment.
 */
struct neigh_statistics
{
unsigned long allocs;
unsigned long res_failed;
unsigned long rcv_probes_mcast;
unsigned long rcv_probes_ucast;
unsigned long allocs; /* number of allocated neighs */
unsigned long destroys; /* number of destroyed neighs */
unsigned long hash_grows; /* number of hash resizes */
unsigned long res_failed; /* number of failed resolutions */
unsigned long lookups; /* number of lookups */
unsigned long hits; /* number of hits (among lookups) */
unsigned long rcv_probes_mcast; /* number of received mcast ipv6 */
unsigned long rcv_probes_ucast; /* number of received ucast ipv6 */
unsigned long periodic_gc_runs; /* number of periodic GC runs */
unsigned long forced_gc_runs; /* number of forced GC runs */
};
/*
 * Post-increment counter @field in the statistics copy of table @tbl
 * selected by smp_processor_id().  Expands to a single expression.
 * NOTE(review): nothing in the macro keeps the caller on that CPU while
 * the increment happens -- confirm every call site is non-preemptible.
 */
#define NEIGH_CACHE_STAT_INC(tbl, field) \
(per_cpu_ptr((tbl)->stats, smp_processor_id())->field++)
struct neighbour
{
struct neighbour *next;
......@@ -172,12 +190,15 @@ struct neigh_table
unsigned long last_rand;
struct neigh_parms *parms_list;
kmem_cache_t *kmem_cachep;
struct neigh_statistics stats;
struct neigh_statistics *stats;
struct neighbour **hash_buckets;
unsigned int hash_mask;
__u32 hash_rnd;
unsigned int hash_chain_gc;
struct pneigh_entry **phash_buckets;
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *pde;
#endif
};
/* flags for neigh_update() */
......
......@@ -1180,7 +1180,8 @@ struct tcp_skb_cb {
__u16 urg_ptr; /* Valid w/URG flags is set. */
__u32 ack_seq; /* Sequence number ACK'd */
__u32 tso_factor;
__u16 tso_factor; /* If > 1, TSO frame */
__u16 tso_mss; /* MSS that FACTOR's in terms of*/
};
#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
......
......@@ -12,6 +12,7 @@
*
* Fixes:
* Vitaly E. Lavrov releasing NULL neighbor in neigh_add.
* Harald Welte Add neighbour cache statistics like rtstat
*/
#include <linux/config.h>
......@@ -21,6 +22,7 @@
#include <linux/socket.h>
#include <linux/sched.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
......@@ -59,6 +61,7 @@ void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev);
static int neigh_glbl_allocs;
static struct neigh_table *neigh_tables;
static struct file_operations neigh_stat_seq_fops;
/*
Neighbour hash table buckets are protected with rwlock tbl->lock.
......@@ -116,6 +119,8 @@ static int neigh_forced_gc(struct neigh_table *tbl)
int shrunk = 0;
int i;
NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
write_lock_bh(&tbl->lock);
for (i = 0; i <= tbl->hash_mask; i++) {
struct neighbour *n, **np;
......@@ -273,7 +278,8 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
init_timer(&n->timer);
n->timer.function = neigh_timer_handler;
n->timer.data = (unsigned long)n;
tbl->stats.allocs++;
NEIGH_CACHE_STAT_INC(tbl, allocs);
neigh_glbl_allocs++;
tbl->entries++;
n->tbl = tbl;
......@@ -315,6 +321,8 @@ static void neigh_hash_grow(struct neigh_table *tbl, unsigned long new_entries)
struct neighbour **new_hash, **old_hash;
unsigned int i, new_hash_mask, old_entries;
NEIGH_CACHE_STAT_INC(tbl, hash_grows);
BUG_ON(new_entries & (new_entries - 1));
new_hash = neigh_hash_alloc(new_entries);
if (!new_hash)
......@@ -351,10 +359,13 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
int key_len = tbl->key_len;
u32 hash_val = tbl->hash(pkey, dev) & tbl->hash_mask;
NEIGH_CACHE_STAT_INC(tbl, lookups);
read_lock_bh(&tbl->lock);
for (n = tbl->hash_buckets[hash_val]; n; n = n->next) {
if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
neigh_hold(n);
NEIGH_CACHE_STAT_INC(tbl, hits);
break;
}
}
......@@ -368,10 +379,13 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, const void *pkey)
int key_len = tbl->key_len;
u32 hash_val = tbl->hash(pkey, NULL) & tbl->hash_mask;
NEIGH_CACHE_STAT_INC(tbl, lookups);
read_lock_bh(&tbl->lock);
for (n = tbl->hash_buckets[hash_val]; n; n = n->next) {
if (!memcmp(n->primary_key, pkey, key_len)) {
neigh_hold(n);
NEIGH_CACHE_STAT_INC(tbl, hits);
break;
}
}
......@@ -556,6 +570,8 @@ void neigh_destroy(struct neighbour *neigh)
{
struct hh_cache *hh;
NEIGH_CACHE_STAT_INC(neigh->tbl, destroys);
if (!neigh->dead) {
printk(KERN_WARNING
"Destroying alive neighbour %p\n", neigh);
......@@ -631,6 +647,8 @@ static void neigh_periodic_timer(unsigned long arg)
struct neighbour *n, **np;
unsigned long expire, now = jiffies;
NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
write_lock(&tbl->lock);
/*
......@@ -762,7 +780,7 @@ static void neigh_timer_handler(unsigned long arg)
neigh->nud_state = NUD_FAILED;
notify = 1;
neigh->tbl->stats.res_failed++;
NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed);
NEIGH_PRINTK2("neigh %p is failed.\n", neigh);
/* It is very thin place. report_unreachable is very complicated
......@@ -1311,6 +1329,29 @@ void neigh_table_init(struct neigh_table *tbl)
if (!tbl->kmem_cachep)
panic("cannot create neighbour cache");
tbl->stats = alloc_percpu(struct neigh_statistics);
if (!tbl->stats)
panic("cannot create neighbour cache statistics");
#ifdef CONFIG_PROC_FS
#define NC_STAT_SUFFIX "_stat"
{
char *proc_stat_name;
proc_stat_name = kmalloc(strlen(tbl->id) +
strlen(NC_STAT_SUFFIX) + 1, GFP_KERNEL);
if (!proc_stat_name)
panic("cannot allocate neighbour cache proc name buffer");
strcpy(proc_stat_name, tbl->id);
strcat(proc_stat_name, NC_STAT_SUFFIX);
tbl->pde = create_proc_entry(proc_stat_name, 0, proc_net);
if (!tbl->pde)
panic("cannot create neighbour proc dir entry");
tbl->pde->proc_fops = &neigh_stat_seq_fops;
tbl->pde->data = tbl;
}
#endif
tbl->hash_mask = 1;
tbl->hash_buckets = neigh_hash_alloc(tbl->hash_mask + 1);
......@@ -1857,6 +1898,106 @@ void neigh_seq_stop(struct seq_file *seq, void *v)
}
EXPORT_SYMBOL(neigh_seq_stop);
/* statistics via seq_file */
/*
 * seq_file ->start: position 0 yields SEQ_START_TOKEN (the header row);
 * position N > 0 yields the statistics of the first possible CPU whose
 * index is >= N - 1, and *pos is rewritten to that CPU index + 1.
 * Returns NULL once no possible CPU is left.
 */
static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct proc_dir_entry *entry = seq->private;
	struct neigh_table *table = entry->data;
	int idx;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	idx = *pos - 1;
	while (idx < NR_CPUS) {
		if (cpu_possible(idx)) {
			*pos = idx + 1;
			return per_cpu_ptr(table->stats, idx);
		}
		idx++;
	}

	return NULL;
}
/*
 * seq_file ->next: scan forward from CPU index *pos for the next possible
 * CPU.  On success *pos becomes that index + 1 and the CPU's statistics
 * are returned; otherwise NULL is returned and *pos is left untouched.
 */
static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct proc_dir_entry *entry = seq->private;
	struct neigh_table *table = entry->data;
	int idx = *pos;

	/* Skip over holes in the possible-CPU map. */
	while (idx < NR_CPUS && !cpu_possible(idx))
		idx++;

	if (idx >= NR_CPUS)
		return NULL;

	*pos = idx + 1;
	return per_cpu_ptr(table->stats, idx);
}
/*
 * seq_file ->stop: intentionally empty -- the start/next handlers of this
 * iterator neither take a lock nor allocate anything that needs undoing.
 */
static void neigh_stat_seq_stop(struct seq_file *seq, void *v)
{
}
/*
 * seq_file ->show: emit one line of the statistics file.
 *
 * @v is either SEQ_START_TOKEN, for which the column-name header is
 * printed, or a pointer to one CPU's struct neigh_statistics, for which a
 * row of zero-padded hexadecimal values is printed.  The first column is
 * the table-wide tbl->entries; the rest come from the per-CPU counters.
 * Always returns 0.
 *
 * The header names exactly the eleven columns each row carries.  It used
 * to advertise a trailing "forced_gc_goal_miss" column that no row ever
 * supplied, leaving header and data out of step.
 */
static int neigh_stat_seq_show(struct seq_file *seq, void *v)
{
	struct proc_dir_entry *pde = seq->private;
	struct neigh_table *tbl = pde->data;
	struct neigh_statistics *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs\n");
		return 0;
	}

	seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx "
			"%08lx %08lx %08lx %08lx\n",
		   tbl->entries,

		   st->allocs,
		   st->destroys,
		   st->hash_grows,

		   st->lookups,
		   st->hits,

		   st->res_failed,

		   st->rcv_probes_mcast,
		   st->rcv_probes_ucast,

		   st->periodic_gc_runs,
		   st->forced_gc_runs
		   );

	return 0;
}
/* Iterator callbacks handed to seq_open() for the statistics file. */
static struct seq_operations neigh_stat_seq_ops = {
	.start	= neigh_stat_seq_start,
	.next	= neigh_stat_seq_next,
	.stop	= neigh_stat_seq_stop,
	.show	= neigh_stat_seq_show,
};
/*
 * ->open for the statistics file: set up seq_file iteration with
 * neigh_stat_seq_ops and, if that succeeded, stash the inode's
 * proc_dir_entry in seq->private, where the iterator callbacks read it
 * back to reach the neigh_table.
 *
 * Returns the result of seq_open() (0 on success).
 */
static int neigh_stat_seq_open(struct inode *inode, struct file *file)
{
	int ret = seq_open(file, &neigh_stat_seq_ops);

	if (!ret) {
		/* On success the seq_file is found in file->private_data. */
		struct seq_file *sf = file->private_data;

		sf->private = PDE(inode);
	}
	return ret;
}
/*
 * File operations for the statistics file: a custom open handler, with
 * read, llseek and release delegated to the seq_file helpers.
 */
static struct file_operations neigh_stat_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = neigh_stat_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_ARPD
......
......@@ -802,10 +802,10 @@ __u32 tcp_init_cwnd(struct tcp_opt *tp, struct dst_entry *dst)
__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
if (!cwnd) {
if (tp->mss_cache > 1460)
if (tp->mss_cache_std > 1460)
cwnd = 2;
else
cwnd = (tp->mss_cache > 1095) ? 3 : 4;
cwnd = (tp->mss_cache_std > 1095) ? 3 : 4;
}
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
......@@ -2355,6 +2355,86 @@ static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
}
}
/* Account for the leading segments of a TSO frame that tp->snd_una
 * already covers while the tail of the frame is still outstanding.
 *
 * Whole tso_mss-sized segments at the front of @skb are counted; if at
 * least one is covered, the control block's start sequence is moved
 * past them, the packet counters in @tp are decremented by that count
 * and the frame's tso_factor is reduced accordingly.
 *
 * *seq_rtt is forced to -1 when the frame carries TCPCB_RETRANS;
 * otherwise, if it is still negative, it becomes @now - scb->when.
 *
 * Returns a mask of FLAG_* bits (0 if no full segment was covered).
 *
 * There is one downside to this scheme.  Although we keep the
 * ACK clock ticking, adjusting packet counters and advancing
 * congestion window, we do not liberate socket send buffer
 * space.
 *
 * Mucking with skb->truesize and sk->sk_wmem_alloc et al.
 * then making a write space wakeup callback is a possible
 * future enhancement.  WARNING: it is not trivial to make.
 */
static int tcp_tso_acked(struct tcp_opt *tp, struct sk_buff *skb,
			 __u32 now, __s32 *seq_rtt)
{
	struct tcp_skb_cb *cb = TCP_SKB_CB(skb);
	__u32 seg_size = cb->tso_mss;
	__u32 una = tp->snd_una;
	__u32 new_seq = cb->seq;
	__u32 nr_acked = 0;
	__u8 sacked;
	int flag;

	/* If we get here, the whole TSO packet has not been
	 * acked.
	 */
	BUG_ON(!after(cb->end_seq, una));

	/* Count segments that end at or before snd_una. */
	for (; !after(new_seq + seg_size, una); new_seq += seg_size)
		nr_acked++;

	if (!nr_acked)
		return 0;

	sacked = cb->sacked;

	/* We adjust cb->seq but we do not pskb_pull() the
	 * SKB.  We let tcp_retransmit_skb() handle this case
	 * by checking skb->len against the data sequence span.
	 * This way, we avoid the pskb_pull() work unless we
	 * actually need to retransmit the SKB.
	 */
	cb->seq = new_seq;

	flag = FLAG_DATA_ACKED;

	/* With no bits set in sacked every flag test below is false,
	 * so only the RTT sample in the else-branch can fire.
	 */
	if (sacked & TCPCB_RETRANS) {
		if (sacked & TCPCB_SACKED_RETRANS)
			tcp_dec_pcount_explicit(&tp->retrans_out, nr_acked);
		flag |= FLAG_RETRANS_DATA_ACKED;
		*seq_rtt = -1;
	} else if (*seq_rtt < 0) {
		*seq_rtt = now - cb->when;
	}

	if (sacked & TCPCB_SACKED_ACKED)
		tcp_dec_pcount_explicit(&tp->sacked_out, nr_acked);

	if (sacked & TCPCB_LOST)
		tcp_dec_pcount_explicit(&tp->lost_out, nr_acked);

	if ((sacked & TCPCB_URG) && tp->urg_mode &&
	    !before(cb->seq, tp->snd_up))
		tp->urg_mode = 0;

	/* fackets_out is lowered by at most its current value. */
	if (tcp_get_pcount(&tp->fackets_out)) {
		__u32 dval = min(tcp_get_pcount(&tp->fackets_out),
				 nr_acked);

		tcp_dec_pcount_explicit(&tp->fackets_out, dval);
	}

	tcp_dec_pcount_explicit(&tp->packets_out, nr_acked);
	cb->tso_factor -= nr_acked;

	/* Part of the frame must remain outstanding. */
	BUG_ON(cb->tso_factor == 0);
	BUG_ON(!before(cb->seq, cb->end_seq));

	return flag;
}
/* Remove acknowledged frames from the retransmission queue. */
static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
{
......@@ -2373,8 +2453,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
* discard it as it's confirmed to have arrived at
* the other end.
*/
if (after(scb->end_seq, tp->snd_una))
if (after(scb->end_seq, tp->snd_una)) {
if (scb->tso_factor > 1)
acked |= tcp_tso_acked(tp, skb,
now, &seq_rtt);
break;
}
/* Initial outgoing SYN's get put onto the write_queue
* just like anything else we transmit. It is not
......
......@@ -436,6 +436,7 @@ void tcp_set_skb_tso_factor(struct sk_buff *skb, unsigned int mss_std)
factor /= mss_std;
TCP_SKB_CB(skb)->tso_factor = factor;
}
TCP_SKB_CB(skb)->tso_mss = mss_std;
}
/* Function to create two new TCP segments. Shrinks the given segment
......@@ -552,7 +553,7 @@ unsigned char * __pskb_trim_head(struct sk_buff *skb, int len)
return skb->tail;
}
static int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
static int __tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
if (skb_cloned(skb) &&
pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
......@@ -565,11 +566,20 @@ static int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
return -ENOMEM;
}
TCP_SKB_CB(skb)->seq += len;
skb->ip_summed = CHECKSUM_HW;
return 0;
}
/* Call __tcp_trim_head() for @len bytes and, only if it succeeded,
 * advance the skb's starting sequence number by @len.
 * Returns 0 on success or the error reported by __tcp_trim_head().
 */
static inline int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
	int rc = __tcp_trim_head(sk, skb, len);

	if (rc)
		return rc;

	TCP_SKB_CB(skb)->seq += len;
	return 0;
}
/* This function synchronize snd mss to current pmtu/exthdr set.
tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
......@@ -949,6 +959,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
struct tcp_opt *tp = tcp_sk(sk);
unsigned int cur_mss = tcp_current_mss(sk, 0);
__u32 data_seq, data_end_seq;
int err;
/* Do not sent more than we queued. 1/4 is reserved for possible
......@@ -958,6 +969,24 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
return -EAGAIN;
/* What is going on here? When TSO packets are partially ACK'd,
* we adjust the TCP_SKB_CB(skb)->seq value forward but we do
* not adjust the data area of the SKB. We defer that to here
* so that we can avoid the work unless we really retransmit
* the packet.
*/
data_seq = TCP_SKB_CB(skb)->seq;
data_end_seq = TCP_SKB_CB(skb)->end_seq;
if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
data_end_seq--;
if (skb->len > (data_end_seq - data_seq)) {
u32 to_trim = skb->len - (data_end_seq - data_seq);
if (__tcp_trim_head(sk, skb, to_trim))
return -ENOMEM;
}
if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
BUG();
......@@ -1191,6 +1220,7 @@ void tcp_send_fin(struct sock *sk)
TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
TCP_SKB_CB(skb)->sacked = 0;
TCP_SKB_CB(skb)->tso_factor = 1;
TCP_SKB_CB(skb)->tso_mss = tp->mss_cache_std;
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
TCP_SKB_CB(skb)->seq = tp->write_seq;
......@@ -1223,6 +1253,7 @@ void tcp_send_active_reset(struct sock *sk, int priority)
TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
TCP_SKB_CB(skb)->sacked = 0;
TCP_SKB_CB(skb)->tso_factor = 1;
TCP_SKB_CB(skb)->tso_mss = tp->mss_cache_std;
/* Send it off. */
TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
......@@ -1304,6 +1335,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
TCP_SKB_CB(skb)->sacked = 0;
TCP_SKB_CB(skb)->tso_factor = 1;
TCP_SKB_CB(skb)->tso_mss = tp->mss_cache_std;
th->seq = htonl(TCP_SKB_CB(skb)->seq);
th->ack_seq = htonl(req->rcv_isn + 1);
if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
......@@ -1406,6 +1438,7 @@ int tcp_connect(struct sock *sk)
TCP_ECN_send_syn(sk, tp, buff);
TCP_SKB_CB(buff)->sacked = 0;
TCP_SKB_CB(buff)->tso_factor = 1;
TCP_SKB_CB(buff)->tso_mss = tp->mss_cache_std;
buff->csum = 0;
TCP_SKB_CB(buff)->seq = tp->write_seq++;
TCP_SKB_CB(buff)->end_seq = tp->write_seq;
......@@ -1506,6 +1539,7 @@ void tcp_send_ack(struct sock *sk)
TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
TCP_SKB_CB(buff)->sacked = 0;
TCP_SKB_CB(buff)->tso_factor = 1;
TCP_SKB_CB(buff)->tso_mss = tp->mss_cache_std;
/* Send it off, this clears delayed acks for us. */
TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
......@@ -1541,6 +1575,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
TCP_SKB_CB(skb)->sacked = urgent;
TCP_SKB_CB(skb)->tso_factor = 1;
TCP_SKB_CB(skb)->tso_mss = tp->mss_cache_std;
/* Use a previous sequence. This should cause the other
* end to send an ack. Don't queue or clone SKB, just
......
......@@ -802,9 +802,9 @@ static void ndisc_recv_ns(struct sk_buff *skb)
}
if (inc)
nd_tbl.stats.rcv_probes_mcast++;
NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_mcast);
else
nd_tbl.stats.rcv_probes_ucast++;
NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_ucast);
/*
* update / create cache entry
......
......@@ -1220,7 +1220,6 @@ MODULE_ALIAS_NETPROTO(PF_NETLINK);
EXPORT_SYMBOL(netlink_ack);
EXPORT_SYMBOL(netlink_broadcast);
EXPORT_SYMBOL(netlink_broadcast_deliver);
EXPORT_SYMBOL(netlink_dump_start);
EXPORT_SYMBOL(netlink_kernel_create);
EXPORT_SYMBOL(netlink_register_notifier);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment