android_kernel_motorola_sm6225/net/ipv4/inet_fragment.c
Nikolay Aleksandrov 24b9bf43e9 net: fix for a race condition in the inet frag code
I stumbled upon this very serious bug while hunting for another one,
it's a very subtle race condition between inet_frag_evictor,
inet_frag_intern and the IPv4/6 frag_queue and expire functions
(basically the users of inet_frag_kill/inet_frag_put).

What happens is that after a fragment has been added to the hash chain
but before it's been added to the lru_list (inet_frag_lru_add) in
inet_frag_intern, it may get deleted (either by an expired timer if
the system load is high or the timer sufficiently low, or by the
fraq_queue function for different reasons) before it's added to the
lru_list, then after it gets added it's a matter of time for the
evictor to get to a piece of memory which has been freed leading to a
number of different bugs depending on what's left there.

I've been able to trigger this on both IPv4 and IPv6 (which is normal
as the frag code is the same), but it's been much more difficult to
trigger on IPv4 due to the protocol differences about how fragments
are treated.

The setup I used to reproduce this is: 2 machines with 4 x 10G bonded
in a RR bond, so the same flow can be seen on multiple cards at the
same time. Then I used multiple instances of ping/ping6 to generate
fragmented packets and flood the machines with them while running
other processes to load the attacked machine.

*It is very important to have the _same flow_ coming in on multiple CPUs
concurrently. Usually the attacked machine would die in less than 30
minutes, if configured properly to have many evictor calls and timeouts
it could happen in 10 minutes or so.

An important point to make is that any caller (frag_queue or timer) of
inet_frag_kill will remove both the timer refcount and the
original/guarding refcount thus removing everything that's keeping the
frag from being freed at the next inet_frag_put.  All of this could
happen before the frag was ever added to the LRU list, then it gets
added and the evictor uses a freed fragment.

An example for IPv6 would be if a fragment is being added and is at
the stage of being inserted in the hash after the hash lock is
released, but before inet_frag_lru_add executes (or is able to obtain
the lru lock) another overlapping fragment for the same flow arrives
at a different CPU which finds it in the hash, but since it's
overlapping it drops it invoking inet_frag_kill and thus removing all
guarding refcounts, and afterwards freeing it by invoking
inet_frag_put which removes the last refcount added previously by
inet_frag_find, then inet_frag_lru_add gets executed by
inet_frag_intern and we have a freed fragment in the lru_list.

The fix is simple, just move the lru_add under the hash chain locked
region so when a removing function is called it'll have to wait for
the fragment to be added to the lru_list, and then it'll remove it (it
works because the hash chain removal is done before the lru_list one
and there's no window between the two list adds when the frag can get
dropped). With this fix applied I couldn't kill the same machine in 24
hours with the same setup.

Fixes: 3ef0eb0db4 ("net: frag, move LRU list maintenance outside of
rwlock")

CC: Florian Westphal <fw@strlen.de>
CC: Jesper Dangaard Brouer <brouer@redhat.com>
CC: David S. Miller <davem@davemloft.net>

Signed-off-by: Nikolay Aleksandrov <nikolay@redhat.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-05 20:31:42 -05:00

361 lines
8.6 KiB
C

/*
* inet fragments management
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Pavel Emelyanov <xemul@openvz.org>
* Started as consolidation of ipv4/ip_fragment.c,
* ipv6/reassembly. and ipv6 nf conntrack reassembly
*/
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>
/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
* Value : 0xff if frame should be dropped.
* 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
*/
const u8 ip_frag_ecn_table[16] = {
/* at least one fragment had CE, and others ECT_0 or ECT_1 */
[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE,
[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
/* invalid combinations : drop frame */
[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};
EXPORT_SYMBOL(ip_frag_ecn_table);
static void inet_frag_secret_rebuild(unsigned long dummy)
{
struct inet_frags *f = (struct inet_frags *)dummy;
unsigned long now = jiffies;
int i;
/* Per bucket lock NOT needed here, due to write lock protection */
write_lock(&f->lock);
get_random_bytes(&f->rnd, sizeof(u32));
for (i = 0; i < INETFRAGS_HASHSZ; i++) {
struct inet_frag_bucket *hb;
struct inet_frag_queue *q;
struct hlist_node *n;
hb = &f->hash[i];
hlist_for_each_entry_safe(q, n, &hb->chain, list) {
unsigned int hval = f->hashfn(q);
if (hval != i) {
struct inet_frag_bucket *hb_dest;
hlist_del(&q->list);
/* Relink to new hash chain. */
hb_dest = &f->hash[hval];
hlist_add_head(&q->list, &hb_dest->chain);
}
}
}
write_unlock(&f->lock);
mod_timer(&f->secret_timer, now + f->secret_interval);
}
void inet_frags_init(struct inet_frags *f)
{
int i;
for (i = 0; i < INETFRAGS_HASHSZ; i++) {
struct inet_frag_bucket *hb = &f->hash[i];
spin_lock_init(&hb->chain_lock);
INIT_HLIST_HEAD(&hb->chain);
}
rwlock_init(&f->lock);
setup_timer(&f->secret_timer, inet_frag_secret_rebuild,
(unsigned long)f);
f->secret_timer.expires = jiffies + f->secret_interval;
add_timer(&f->secret_timer);
}
EXPORT_SYMBOL(inet_frags_init);
void inet_frags_init_net(struct netns_frags *nf)
{
nf->nqueues = 0;
init_frag_mem_limit(nf);
INIT_LIST_HEAD(&nf->lru_list);
spin_lock_init(&nf->lru_lock);
}
EXPORT_SYMBOL(inet_frags_init_net);
void inet_frags_fini(struct inet_frags *f)
{
del_timer(&f->secret_timer);
}
EXPORT_SYMBOL(inet_frags_fini);
void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
{
nf->low_thresh = 0;
local_bh_disable();
inet_frag_evictor(nf, f, true);
local_bh_enable();
percpu_counter_destroy(&nf->mem);
}
EXPORT_SYMBOL(inet_frags_exit_net);
static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
{
struct inet_frag_bucket *hb;
unsigned int hash;
read_lock(&f->lock);
hash = f->hashfn(fq);
hb = &f->hash[hash];
spin_lock(&hb->chain_lock);
hlist_del(&fq->list);
spin_unlock(&hb->chain_lock);
read_unlock(&f->lock);
inet_frag_lru_del(fq);
}
void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
{
if (del_timer(&fq->timer))
atomic_dec(&fq->refcnt);
if (!(fq->last_in & INET_FRAG_COMPLETE)) {
fq_unlink(fq, f);
atomic_dec(&fq->refcnt);
fq->last_in |= INET_FRAG_COMPLETE;
}
}
EXPORT_SYMBOL(inet_frag_kill);
static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
struct sk_buff *skb)
{
if (f->skb_free)
f->skb_free(skb);
kfree_skb(skb);
}
void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
int *work)
{
struct sk_buff *fp;
struct netns_frags *nf;
unsigned int sum, sum_truesize = 0;
WARN_ON(!(q->last_in & INET_FRAG_COMPLETE));
WARN_ON(del_timer(&q->timer) != 0);
/* Release all fragment data. */
fp = q->fragments;
nf = q->net;
while (fp) {
struct sk_buff *xp = fp->next;
sum_truesize += fp->truesize;
frag_kfree_skb(nf, f, fp);
fp = xp;
}
sum = sum_truesize + f->qsize;
if (work)
*work -= sum;
sub_frag_mem_limit(q, sum);
if (f->destructor)
f->destructor(q);
kfree(q);
}
EXPORT_SYMBOL(inet_frag_destroy);
int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
{
struct inet_frag_queue *q;
int work, evicted = 0;
if (!force) {
if (frag_mem_limit(nf) <= nf->high_thresh)
return 0;
}
work = frag_mem_limit(nf) - nf->low_thresh;
while (work > 0) {
spin_lock(&nf->lru_lock);
if (list_empty(&nf->lru_list)) {
spin_unlock(&nf->lru_lock);
break;
}
q = list_first_entry(&nf->lru_list,
struct inet_frag_queue, lru_list);
atomic_inc(&q->refcnt);
/* Remove q from list to avoid several CPUs grabbing it */
list_del_init(&q->lru_list);
spin_unlock(&nf->lru_lock);
spin_lock(&q->lock);
if (!(q->last_in & INET_FRAG_COMPLETE))
inet_frag_kill(q, f);
spin_unlock(&q->lock);
if (atomic_dec_and_test(&q->refcnt))
inet_frag_destroy(q, f, &work);
evicted++;
}
return evicted;
}
EXPORT_SYMBOL(inet_frag_evictor);
static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
struct inet_frag_queue *qp_in, struct inet_frags *f,
void *arg)
{
struct inet_frag_bucket *hb;
struct inet_frag_queue *qp;
unsigned int hash;
read_lock(&f->lock); /* Protects against hash rebuild */
/*
* While we stayed w/o the lock other CPU could update
* the rnd seed, so we need to re-calculate the hash
* chain. Fortunatelly the qp_in can be used to get one.
*/
hash = f->hashfn(qp_in);
hb = &f->hash[hash];
spin_lock(&hb->chain_lock);
#ifdef CONFIG_SMP
/* With SMP race we have to recheck hash table, because
* such entry could be created on other cpu, while we
* released the hash bucket lock.
*/
hlist_for_each_entry(qp, &hb->chain, list) {
if (qp->net == nf && f->match(qp, arg)) {
atomic_inc(&qp->refcnt);
spin_unlock(&hb->chain_lock);
read_unlock(&f->lock);
qp_in->last_in |= INET_FRAG_COMPLETE;
inet_frag_put(qp_in, f);
return qp;
}
}
#endif
qp = qp_in;
if (!mod_timer(&qp->timer, jiffies + nf->timeout))
atomic_inc(&qp->refcnt);
atomic_inc(&qp->refcnt);
hlist_add_head(&qp->list, &hb->chain);
inet_frag_lru_add(nf, qp);
spin_unlock(&hb->chain_lock);
read_unlock(&f->lock);
return qp;
}
static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
struct inet_frags *f, void *arg)
{
struct inet_frag_queue *q;
q = kzalloc(f->qsize, GFP_ATOMIC);
if (q == NULL)
return NULL;
q->net = nf;
f->constructor(q, arg);
add_frag_mem_limit(q, f->qsize);
setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
spin_lock_init(&q->lock);
atomic_set(&q->refcnt, 1);
INIT_LIST_HEAD(&q->lru_list);
return q;
}
static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
struct inet_frags *f, void *arg)
{
struct inet_frag_queue *q;
q = inet_frag_alloc(nf, f, arg);
if (q == NULL)
return NULL;
return inet_frag_intern(nf, q, f, arg);
}
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
struct inet_frags *f, void *key, unsigned int hash)
__releases(&f->lock)
{
struct inet_frag_bucket *hb;
struct inet_frag_queue *q;
int depth = 0;
hb = &f->hash[hash];
spin_lock(&hb->chain_lock);
hlist_for_each_entry(q, &hb->chain, list) {
if (q->net == nf && f->match(q, key)) {
atomic_inc(&q->refcnt);
spin_unlock(&hb->chain_lock);
read_unlock(&f->lock);
return q;
}
depth++;
}
spin_unlock(&hb->chain_lock);
read_unlock(&f->lock);
if (depth <= INETFRAGS_MAXDEPTH)
return inet_frag_create(nf, f, key);
else
return ERR_PTR(-ENOBUFS);
}
EXPORT_SYMBOL(inet_frag_find);
void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
const char *prefix)
{
static const char msg[] = "inet_frag_find: Fragment hash bucket"
" list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
". Dropping fragment.\n";
if (PTR_ERR(q) == -ENOBUFS)
LIMIT_NETDEBUG(KERN_WARNING "%s%s", prefix, msg);
}
EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);