net: alloc_pskb

tun and virtio_net both allocate large skbs.  Begin formalizing this with
common helpers: alloc_pskb() and trim_alloced_pskb().
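
A rough sketch of the intended calling pattern (hypothetical driver-side
code, not part of this patch; hdr_len, max_payload and len are made-up
names):

	struct sk_buff *skb;

	/* Linear room for the header, page frags for the worst-case payload. */
	skb = alloc_pskb(hdr_len, max_payload, GFP_KERNEL);
	if (!skb)
		return -ENOMEM;

	/* ... copy the actual packet ('len' bytes of payload) into the skb ... */

	/* Hand wholly unused tail pages back to the cache and fix up the lengths. */
	trim_alloced_pskb(skb, hdr_len + len);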

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/skbuff.h |    5 +
 net/core/skbuff.c      |  140 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 145 insertions(+)

diff -r eb74b7bb117b include/linux/skbuff.h
--- a/include/linux/skbuff.h	Tue Oct 21 08:57:56 2008 +1100
+++ b/include/linux/skbuff.h	Tue Oct 21 09:01:09 2008 +1100
@@ -376,6 +376,11 @@ static inline struct sk_buff *alloc_skb(
 {
 	return __alloc_skb(size, priority, 0, -1);
 }
+
+extern struct sk_buff *alloc_pskb(unsigned int linear, unsigned int nonlinear,
+				  gfp_t priority);
+
+extern void trim_alloced_pskb(struct sk_buff *skb, unsigned int totlen);
 
 static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 					       gfp_t priority)
diff -r eb74b7bb117b net/core/skbuff.c
--- a/net/core/skbuff.c	Tue Oct 21 08:57:56 2008 +1100
+++ b/net/core/skbuff.c	Tue Oct 21 09:01:09 2008 +1100
@@ -69,6 +69,10 @@
 
 static struct kmem_cache *skbuff_head_cache __read_mostly;
 static struct kmem_cache *skbuff_fclone_cache __read_mostly;
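+/* Cache of free pages for alloc_pskb(), chained via page->private. */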
+static DEFINE_SPINLOCK(pskb_pages_lock);
+static unsigned int pskb_pages_count;
+static struct page *pskb_pages_cache;
 
 static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
 				  struct pipe_buffer *buf)
@@ -235,6 +238,142 @@ nodata:
 	skb = NULL;
 	goto out;
 }
+
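+/* Give a page back to the pskb page cache. */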
+static void give_pskb_page(struct page *p)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pskb_pages_lock, flags);
+	p->private = (unsigned long)pskb_pages_cache;
+	pskb_pages_cache = p;
+	pskb_pages_count++;
+	spin_unlock_irqrestore(&pskb_pages_lock, flags);
+}
+
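+/* Take a page from the cache, falling back to alloc_page() if it is empty. */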
+static struct page *get_pskb_page(gfp_t priority)
+{
+	unsigned long flags;
+	struct page *p;
+
+	spin_lock_irqsave(&pskb_pages_lock, flags);
+	p = pskb_pages_cache;
+	if (p) {
+		pskb_pages_cache = (struct page *)(p->private);
+		pskb_pages_count--;
+	}
+	spin_unlock_irqrestore(&pskb_pages_lock, flags);
+
+	if (!p)
+		p = alloc_page(priority);
+	return p;
+}
+
+/**
+ * alloc_pskb - allocate and skb_put a paged skb
+ * @linear: bytes of linear data
+ * @nonlinear: bytes of nonlinear data
+ * @priority: GFP_KERNEL/GFP_ATOMIC etc.
+ *
+ * This allocates an skb with frags, and sets up skb->data_len: this
+ * allows you to allocate large (nonlinear) skbs, because allocating
+ * large linear skbs requires contiguous memory and is quite likely to
+ * fail.
+ *
+ * Because page allocation is slow, it is recommended that
+ * trim_alloced_pskb() be called on the returned skb if it is only
+ * partially used.  This returns any unused pages to the internal
+ * page cache.
+ */
+struct sk_buff *alloc_pskb(unsigned int linear, unsigned int nonlinear,
+			   gfp_t priority)
+{
+	struct sk_buff *skb = alloc_skb(linear, priority);
+	unsigned int i;
+
+	if (unlikely(!skb))
+		return NULL;
+
+	skb_put(skb, linear);
+	BUG_ON(nonlinear > MAX_SKB_FRAGS * PAGE_SIZE);
+
+	for (i = 0; i < MAX_SKB_FRAGS; i++) {
+		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
+		f->page = get_pskb_page(priority);
+		if (unlikely(!f->page)) {
+			kfree_skb(skb);
+			return NULL;
+		}
+
+		f->page_offset = 0;
+		f->size = PAGE_SIZE;
+
+		skb->data_len += PAGE_SIZE;
+		skb->len += PAGE_SIZE;
+		skb->truesize += PAGE_SIZE;
+
+		skb_shinfo(skb)->nr_frags++;
+		if (skb->len >= linear + nonlinear)
+			break;
+	}
+	return skb;
+}
+
+/**
+ * trim_alloced_pskb - trim an alloc_pskb()-allocated skb to length
+ * @skb: the skb
+ * @totlen: the total length (linear and nonlinear)
+ *
+ * This returns any unused pages from the skb frags to the internal
+ * cache, and sets skb->data_len etc. appropriately.  This must be done
+ * before the packet is cloned or otherwise referenced.
+ */
+void trim_alloced_pskb(struct sk_buff *skb, unsigned int totlen)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+	while (skb->len - PAGE_SIZE >= totlen && shinfo->nr_frags) {
+		give_pskb_page(shinfo->frags[--shinfo->nr_frags].page);
+		skb->data_len -= PAGE_SIZE;
+		skb->len -= PAGE_SIZE;
+		skb->truesize -= PAGE_SIZE;
+	}
+	pskb_trim_unique(skb, totlen);
+}
+
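+/* Shrinker callback: release cached pages under memory pressure. */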
+static int shrink_alloc_pskb_pages(int nr_to_scan, gfp_t gfp_mask)
+{
+	printk(KERN_DEBUG "Shrinker says to drop %i pages (have %i)\n",
+	       nr_to_scan, pskb_pages_count);
+	/* We get called with 0 a fair amount: don't even bother locking. */
+	if (nr_to_scan) {
+		spin_lock_irq(&pskb_pages_lock);
+		while (nr_to_scan && pskb_pages_count) {
+			struct page *p = pskb_pages_cache;
+			pskb_pages_cache = (struct page *)(p->private);
+			pskb_pages_count--;
+			nr_to_scan--;
+			__free_page(p);
+		}
+		printk(KERN_DEBUG "Now %i pages left\n", pskb_pages_count);
+		spin_unlock_irq(&pskb_pages_lock);
+	}
+	return pskb_pages_count;
+}
+
+static struct shrinker alloc_pskb_shrinker = {
+	.shrink = shrink_alloc_pskb_pages,
+	.seeks = DEFAULT_SEEKS,
+};
+
+static int setup_shrinker(void)
+{
+	register_shrinker(&alloc_pskb_shrinker);
+	return 0;
+}
+core_initcall(setup_shrinker);
 
 /**
  *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
