Merge branch 'for-2.6.33' of git://git.kernel.dk/linux-2.6-block

* 'for-2.6.33' of git://git.kernel.dk/linux-2.6-block: (113 commits) cfq-iosched: Do not access cfqq after freeing it block: include linux/err.h to use ERR_PTR cfq-iosched: use call_rcu() instead of doing grace period stall on queue exit blkio: Allow CFQ group IO scheduling even when CFQ is a module blkio: Implement dynamic io controlling policy registration blkio: Export some symbols from blkio as its user CFQ can be a module block: Fix io_context leak after failure of clone with CLONE_IO block: Fix io_context leak after clone with CLONE_IO cfq-iosched: make nonrot check logic consistent io controller: quick fix for blk-cgroup and modular CFQ cfq-iosched: move IO controller declerations to a header file cfq-iosched: fix compile problem with !CONFIG_CGROUP blkio: Documentation blkio: Wait on sync-noidle queue even if rq_noidle = 1 blkio: Implement group_isolation tunable blkio: Determine async workload length based on total number of queues blkio: Wait for cfq queue to get backlogged if group is empty blkio: Propagate cgroup weight updation to cfq groups blkio: Drop the reference to queue once the task changes cgroup blkio: Provide some isolation between groups ...

Merge branch 'for-2.6.33' of git://git.kernel.dk/linux-2.6-block
* 'for-2.6.33' of git://git.kernel.dk/linux-2.6-block: (113 commits) cfq-iosched: Do not access cfqq after freeing it block: include linux/err.h to use ERR_PTR cfq-iosched: use call_rcu() instead of doing grace period stall on queue exit blkio: Allow CFQ group IO scheduling even when CFQ is a module blkio: Implement dynamic io controlling policy registration blkio: Export some symbols from blkio as its user CFQ can be a module block: Fix io_context leak after failure of clone with CLONE_IO block: Fix io_context leak after clone with CLONE_IO cfq-iosched: make nonrot check logic consistent io controller: quick fix for blk-cgroup and modular CFQ cfq-iosched: move IO controller declerations to a header file cfq-iosched: fix compile problem with !CONFIG_CGROUP blkio: Documentation blkio: Wait on sync-noidle queue even if rq_noidle = 1 blkio: Implement group_isolation tunable blkio: Determine async workload length based on total number of queues blkio: Wait for cfq queue to get backlogged if group is empty blkio: Propagate cgroup weight updation to cfq groups blkio: Drop the reference to queue once the task changes cgroup blkio: Provide some isolation between groups ...
6035ccd8 · Linus Torvalds · 23eb3b64 · 878eaddd · 6035ccd8 · 6035ccd8
Commit 6035ccd8 authored Dec 08, 2009 by Linus Torvalds
107 changed files
--- a/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg
+++ b/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg
--- a/Documentation/blockdev/drbd/DRBD-data-packets.svg
+++ b/Documentation/blockdev/drbd/DRBD-data-packets.svg
--- a/Documentation/blockdev/drbd/README.txt
+++ b/Documentation/blockdev/drbd/README.txt
+Description
+
+  DRBD is a shared-nothing, synchronously replicated block device. It
+  is designed to serve as a building block for high availability
+  clusters and in this context, is a "drop-in" replacement for shared
+  storage. Simplistically, you could see it as a network RAID 1.
+
+  Please visit http://www.drbd.org to find out more.
+
+The here included files are intended to help understand the implementation
+
+DRBD-8.3-data-packets.svg, DRBD-data-packets.svg  
+  relates some functions, and write packets.
+
+conn-states-8.dot, disk-states-8.dot, node-states-8.dot
+  The sub graphs of DRBD's state transitions
--- a/Documentation/blockdev/drbd/conn-states-8.dot
+++ b/Documentation/blockdev/drbd/conn-states-8.dot
+digraph conn_states {
+	StandAllone  -> WFConnection   [ label = "ioctl_set_net()" ]
+	WFConnection -> Unconnected    [ label = "unable to bind()" ]
+	WFConnection -> WFReportParams [ label = "in connect() after accept" ]
+	WFReportParams -> StandAllone  [ label = "checks in receive_param()" ]
+	WFReportParams -> Connected    [ label = "in receive_param()" ]
+	WFReportParams -> WFBitMapS    [ label = "sync_handshake()" ]
+	WFReportParams -> WFBitMapT    [ label = "sync_handshake()" ]
+	WFBitMapS -> SyncSource        [ label = "receive_bitmap()" ]
+	WFBitMapT -> SyncTarget        [ label = "receive_bitmap()" ]
+	SyncSource -> Connected
+	SyncTarget -> Connected
+	SyncSource -> PausedSyncS
+	SyncTarget -> PausedSyncT
+	PausedSyncS -> SyncSource
+	PausedSyncT -> SyncTarget
+	Connected   -> WFConnection    [ label = "* on network error" ]
+}
--- a/Documentation/blockdev/drbd/disk-states-8.dot
+++ b/Documentation/blockdev/drbd/disk-states-8.dot
+digraph disk_states {
+	Diskless -> Inconsistent       [ label = "ioctl_set_disk()" ]
+	Diskless -> Consistent         [ label = "ioctl_set_disk()" ]
+	Diskless -> Outdated           [ label = "ioctl_set_disk()" ]
+	Consistent -> Outdated         [ label = "receive_param()" ]
+	Consistent -> UpToDate         [ label = "receive_param()" ]
+	Consistent -> Inconsistent     [ label = "start resync" ]
+	Outdated   -> Inconsistent     [ label = "start resync" ]
+	UpToDate   -> Inconsistent     [ label = "ioctl_replicate" ]
+	Inconsistent -> UpToDate       [ label = "resync completed" ]
+	Consistent -> Failed           [ label = "io completion error" ]
+	Outdated   -> Failed           [ label = "io completion error" ]
+	UpToDate   -> Failed           [ label = "io completion error" ]
+	Inconsistent -> Failed         [ label = "io completion error" ]
+	Failed -> Diskless             [ label = "sending notify to peer" ]
+}
--- a/Documentation/blockdev/drbd/drbd-connection-state-overview.dot
+++ b/Documentation/blockdev/drbd/drbd-connection-state-overview.dot
+// vim: set sw=2 sts=2 :
+digraph {
+  rankdir=BT
+  bgcolor=white
+
+  node [shape=plaintext]
+  node [fontcolor=black]
+
+  StandAlone     [ style=filled,fillcolor=gray,label=StandAlone ]
+
+  node [fontcolor=lightgray]
+
+  Unconnected    [ label=Unconnected ]
+
+  CommTrouble [ shape=record,
+    label="{communication loss|{Timeout|BrokenPipe|NetworkFailure}}" ]
+
+  node [fontcolor=gray]
+
+  subgraph cluster_try_connect {
+    label="try to connect, handshake"
+    rank=max
+    WFConnection   [ label=WFConnection ]
+    WFReportParams [ label=WFReportParams ]
+  }
+
+  TearDown       [ label=TearDown ]
+
+  Connected      [ label=Connected,style=filled,fillcolor=green,fontcolor=black ]
+
+  node [fontcolor=lightblue]
+
+  StartingSyncS  [ label=StartingSyncS ]
+  StartingSyncT  [ label=StartingSyncT ]
+
+  subgraph cluster_bitmap_exchange {
+    node [fontcolor=red]
+    fontcolor=red
+    label="new application (WRITE?) requests blocked\lwhile bitmap is exchanged"
+
+    WFBitMapT      [ label=WFBitMapT ]
+    WFSyncUUID     [ label=WFSyncUUID ]
+    WFBitMapS      [ label=WFBitMapS ]
+  }
+
+  node [fontcolor=blue]
+
+  cluster_resync [ shape=record,label="{<any>resynchronisation process running\l'concurrent' application requests allowed|{{<T>PausedSyncT\nSyncTarget}|{<S>PausedSyncS\nSyncSource}}}" ]
+
+  node [shape=box,fontcolor=black]
+
+  // drbdadm [label="drbdadm connect"]
+  // handshake [label="drbd_connect()\ndrbd_do_handshake\ndrbd_sync_handshake() etc."]
+  // comm_error [label="communication trouble"]
+
+  //
+  // edges
+  // --------------------------------------
+
+  StandAlone -> Unconnected [ label="drbdadm connect" ]
+  Unconnected -> StandAlone  [ label="drbdadm disconnect\lor serious communication trouble" ]
+  Unconnected -> WFConnection [ label="receiver thread is started" ]
+  WFConnection -> WFReportParams [ headlabel="accept()\land/or                        \lconnect()\l" ]
+
+  WFReportParams -> StandAlone [ label="during handshake\lpeers do not agree\labout something essential" ]
+  WFReportParams -> Connected [ label="data identical\lno sync needed",color=green,fontcolor=green ]
+
+    WFReportParams -> WFBitMapS
+    WFReportParams -> WFBitMapT
+    WFBitMapT -> WFSyncUUID [minlen=0.1,constraint=false]
+
+      WFBitMapS -> cluster_resync:S
+      WFSyncUUID -> cluster_resync:T
+
+  edge [color=green]
+  cluster_resync:any -> Connected [ label="resnyc done",fontcolor=green ]
+
+  edge [color=red]
+  WFReportParams -> CommTrouble
+  Connected -> CommTrouble
+  cluster_resync:any -> CommTrouble
+  edge [color=black]
+  CommTrouble -> Unconnected [label="receiver thread is stopped" ]
+
+}
--- a/Documentation/blockdev/drbd/node-states-8.dot
+++ b/Documentation/blockdev/drbd/node-states-8.dot
+digraph node_states {
+	Secondary -> Primary           [ label = "ioctl_set_state()" ]
+	Primary   -> Secondary 	       [ label = "ioctl_set_state()" ]
+}
+
+digraph peer_states {
+	Secondary -> Primary           [ label = "recv state packet" ]
+	Primary   -> Secondary 	       [ label = "recv state packet" ]
+	Primary   -> Unknown 	       [ label = "connection lost" ]
+	Secondary  -> Unknown  	       [ label = "connection lost" ]
+	Unknown   -> Primary           [ label = "connected" ]
+	Unknown   -> Secondary         [ label = "connected" ]
+}
+
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
+				Block IO Controller
+				===================
+Overview
+========
+cgroup subsys "blkio" implements the block io controller. There seems to be
+a need of various kinds of IO control policies (like proportional BW, max BW)
+both at leaf nodes as well as at intermediate nodes in a storage hierarchy.
+Plan is to use the same cgroup based management interface for blkio controller
+and based on user options switch IO policies in the background.
+
+In the first phase, this patchset implements proportional weight time based
+division of disk policy. It is implemented in CFQ. Hence this policy takes
+effect only on leaf nodes when CFQ is being used.
+
+HOWTO
+=====
+You can do a very simple testing of running two dd threads in two different
+cgroups. Here is what you can do.
+
+- Enable group scheduling in CFQ
+	CONFIG_CFQ_GROUP_IOSCHED=y
+
+- Compile and boot into kernel and mount IO controller (blkio).
+
+	mount -t cgroup -o blkio none /cgroup
+
+- Create two cgroups
+	mkdir -p /cgroup/test1/ /cgroup/test2
+
+- Set weights of group test1 and test2
+	echo 1000 > /cgroup/test1/blkio.weight
+	echo 500 > /cgroup/test2/blkio.weight
+
+- Create two same size files (say 512MB each) on same disk (file1, file2) and
+  launch two dd threads in different cgroup to read those files.
+
+	sync
+	echo 3 > /proc/sys/vm/drop_caches
+
+	dd if=/mnt/sdb/zerofile1 of=/dev/null &
+	echo $! > /cgroup/test1/tasks
+	cat /cgroup/test1/tasks
+
+	dd if=/mnt/sdb/zerofile2 of=/dev/null &
+	echo $! > /cgroup/test2/tasks
+	cat /cgroup/test2/tasks
+
+- At macro level, first dd should finish first. To get more precise data, keep
+  on looking at (with the help of script), at blkio.disk_time and
+  blkio.disk_sectors files of both test1 and test2 groups. This will tell how
+  much disk time (in milli seconds), each group got and how many secotors each
+  group dispatched to the disk. We provide fairness in terms of disk time, so
+  ideally io.disk_time of cgroups should be in proportion to the weight.
+
+Various user visible config options
+===================================
+CONFIG_CFQ_GROUP_IOSCHED
+	- Enables group scheduling in CFQ. Currently only 1 level of group
+	  creation is allowed.
+
+CONFIG_DEBUG_CFQ_IOSCHED
+	- Enables some debugging messages in blktrace. Also creates extra
+	  cgroup file blkio.dequeue.
+
+Config options selected automatically
+=====================================
+These config options are not user visible and are selected/deselected
+automatically based on IO scheduler configuration.
+
+CONFIG_BLK_CGROUP
+	- Block IO controller. Selected by CONFIG_CFQ_GROUP_IOSCHED.
+
+CONFIG_DEBUG_BLK_CGROUP
+	- Debug help. Selected by CONFIG_DEBUG_CFQ_IOSCHED.
+
+Details of cgroup files
+=======================
+- blkio.weight
+	- Specifies per cgroup weight.
+
+	  Currently allowed range of weights is from 100 to 1000.
+
+- blkio.time
+	- disk time allocated to cgroup per device in milliseconds. First
+	  two fields specify the major and minor number of the device and
+	  third field specifies the disk time allocated to group in
+	  milliseconds.
+
+- blkio.sectors
+	- number of sectors transferred to/from disk by the group. First
+	  two fields specify the major and minor number of the device and
+	  third field specifies the number of sectors transferred by the
+	  group to/from the device.
+
+- blkio.dequeue
+	- Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This
+	  gives the statistics about how many a times a group was dequeued
+	  from service tree of the device. First two fields specify the major
+	  and minor number of the device and third field specifies the number
+	  of times a group was dequeued from a particular device.
+
+CFQ sysfs tunable
+=================
+/sys/block/<disk>/queue/iosched/group_isolation
+
+If group_isolation=1, it provides stronger isolation between groups at the
+expense of throughput. By default group_isolation is 0. In general that
+means that if group_isolation=0, expect fairness for sequential workload
+only. Set group_isolation=1 to see fairness for random IO workload also.
+
+Generally CFQ will put random seeky workload in sync-noidle category. CFQ
+will disable idling on these queues and it does a collective idling on group
+of such queues. Generally these are slow moving queues and if there is a
+sync-noidle service tree in each group, that group gets exclusive access to
+disk for certain period. That means it will bring the throughput down if
+group does not have enough IO to drive deeper queue depths and utilize disk
+capacity to the fullest in the slice allocated to it. But the flip side is
+that even a random reader should get better latencies and overall throughput
+if there are lots of sequential readers/sync-idle workload running in the
+system.
+
+If group_isolation=0, then CFQ automatically moves all the random seeky queues
+in the root group. That means there will be no service differentiation for
+that kind of workload. This leads to better throughput as we do collective
+idling on root sync-noidle tree.
+
+By default one should run with group_isolation=0. If that is not sufficient
+and one wants stronger isolation between groups, then set group_isolation=1
+but this will come at cost of reduced throughput.
+
+What works
+==========
+- Currently only sync IO queues are support. All the buffered writes are
+  still system wide and not per group. Hence we will not see service
+  differentiation between buffered writes between groups.
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1848,6 +1848,19 @@ S:	Maintained
 F:	drivers/scsi/dpt*
 F:	drivers/scsi/dpt/

+DRBD DRIVER
+P:     Philipp Reisner
+P:     Lars Ellenberg
+M:     drbd-dev@lists.linbit.com
+L:     drbd-user@lists.linbit.com
+W:     http://www.drbd.org
+T:     git git://git.drbd.org/linux-2.6-drbd.git drbd
+T:     git git://git.drbd.org/drbd-8.3.git
+S:     Supported
+F:     drivers/block/drbd/
+F:     lib/lru_cache.c
+F:     Documentation/blockdev/drbd/
+
 DRIVER CORE, KOBJECTS, AND SYSFS
 M:	Greg Kroah-Hartman <gregkh@suse.de>
 T:	quilt kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/

--- a/arch/alpha/include/asm/cacheflush.h
+++ b/arch/alpha/include/asm/cacheflush.h
@@ -9,6 +9,7 @@
 #define flush_cache_dup_mm(mm)			do { } while (0)
 #define flush_cache_range(vma, start, end)	do { } while (0)
 #define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do { } while (0)
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)

--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -418,6 +418,7 @@ extern void flush_ptrace_access(struct vm_area_struct *vma, struct page *page,
 * about to change to user space.  This is the same method as used on SPARC64.
 * See update_mmu_cache for the user space part.
 */
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 extern void flush_dcache_page(struct page *);

 static inline void __flush_icache_all(void)

--- a/arch/avr32/include/asm/cacheflush.h
+++ b/arch/avr32/include/asm/cacheflush.h
@@ -107,6 +107,7 @@ extern void flush_icache_page(struct vm_area_struct *vma, struct page *page);
 * do something here, but only for certain configurations.  No such
 * configurations exist at this time.
 */
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do { } while (0)
 #define flush_dcache_mmap_lock(page)		do { } while (0)
 #define flush_dcache_mmap_unlock(page)		do { } while (0)

--- a/arch/blackfin/include/asm/cacheflush.h
+++ b/arch/blackfin/include/asm/cacheflush.h
@@ -68,9 +68,11 @@ do { memcpy(dst, src, len);						\
 #endif
 #if defined(CONFIG_BFIN_EXTMEM_WRITEBACK) || defined(CONFIG_BFIN_L2_WRITEBACK)
 # define flush_dcache_range(start,end)		blackfin_dcache_flush_range((start), (end))
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 # define flush_dcache_page(page)		blackfin_dflush_page(page_address(page))
 #else
 # define flush_dcache_range(start,end)		do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 # define flush_dcache_page(page)		do { } while (0)
 #endif


--- a/arch/cris/include/asm/cacheflush.h
+++ b/arch/cris/include/asm/cacheflush.h
@@ -12,6 +12,7 @@
 #define flush_cache_dup_mm(mm)			do { } while (0)
 #define flush_cache_range(vma, start, end)	do { } while (0)
 #define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do { } while (0)
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)

--- a/arch/frv/include/asm/cacheflush.h
+++ b/arch/frv/include/asm/cacheflush.h
@@ -47,6 +47,7 @@ static inline void __flush_cache_all(void)
 }

 /* dcache/icache coherency... */
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 #ifdef CONFIG_MMU
 extern void flush_dcache_page(struct page *page);
 #else

--- a/arch/h8300/include/asm/cacheflush.h
+++ b/arch/h8300/include/asm/cacheflush.h
@@ -15,6 +15,7 @@
 #define	flush_cache_dup_mm(mm)		do { } while (0)
 #define	flush_cache_range(vma,a,b)
 #define	flush_cache_page(vma,p,pfn)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define	flush_dcache_page(page)
 #define	flush_dcache_mmap_lock(mapping)
 #define	flush_dcache_mmap_unlock(mapping)

--- a/arch/ia64/include/asm/cacheflush.h
+++ b/arch/ia64/include/asm/cacheflush.h
@@ -25,6 +25,7 @@
 #define flush_cache_vmap(start, end)		do { } while (0)
 #define flush_cache_vunmap(start, end)		do { } while (0)

+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 #define flush_dcache_page(page)			\
 do {						\
 	clear_bit(PG_arch_1, &(page)->flags);	\

--- a/arch/m32r/include/asm/cacheflush.h
+++ b/arch/m32r/include/asm/cacheflush.h
@@ -12,6 +12,7 @@ extern void _flush_cache_copyback_all(void);
 #define flush_cache_dup_mm(mm)			do { } while (0)
 #define flush_cache_range(vma, start, end)	do { } while (0)
 #define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do { } while (0)
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)
@@ -33,6 +34,7 @@ extern void smp_flush_cache_all(void);
 #define flush_cache_dup_mm(mm)			do { } while (0)
 #define flush_cache_range(vma, start, end)	do { } while (0)
 #define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do { } while (0)
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)
@@ -46,6 +48,7 @@ extern void smp_flush_cache_all(void);
 #define flush_cache_dup_mm(mm)			do { } while (0)
 #define flush_cache_range(vma, start, end)	do { } while (0)
 #define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do { } while (0)
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)

--- a/arch/m68k/include/asm/cacheflush_mm.h
+++ b/arch/m68k/include/asm/cacheflush_mm.h
@@ -128,6 +128,7 @@ static inline void __flush_page_to_ram(void *vaddr)
 	}
 }

+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 #define flush_dcache_page(page)		__flush_page_to_ram(page_address(page))
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)

--- a/arch/m68k/include/asm/cacheflush_no.h
+++ b/arch/m68k/include/asm/cacheflush_no.h
@@ -12,6 +12,7 @@
 #define flush_cache_range(vma, start, end)	__flush_cache_all()
 #define flush_cache_page(vma, vmaddr)		do { } while (0)
 #define flush_dcache_range(start,len)		__flush_cache_all()
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do { } while (0)
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)

--- a/arch/microblaze/include/asm/cacheflush.h
+++ b/arch/microblaze/include/asm/cacheflush.h
@@ -37,6 +37,7 @@
 #define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)

 #define flush_dcache_range(start, end)	__invalidate_dcache_range(start, end)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do { } while (0)
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)

--- a/arch/mips/include/asm/cacheflush.h
+++ b/arch/mips/include/asm/cacheflush.h
@@ -38,6 +38,7 @@ extern void (*flush_cache_range)(struct vm_area_struct *vma,
 extern void (*flush_cache_page)(struct vm_area_struct *vma, unsigned long page, unsigned long pfn);
 extern void __flush_dcache_page(struct page *page);

+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 static inline void flush_dcache_page(struct page *page)
 {
 	if (cpu_has_dc_aliases || !cpu_has_ic_fills_f_dc)

--- a/arch/mn10300/include/asm/cacheflush.h
+++ b/arch/mn10300/include/asm/cacheflush.h
@@ -26,6 +26,7 @@
 #define flush_cache_page(vma, vmaddr, pfn)	do {} while (0)
 #define flush_cache_vmap(start, end)		do {} while (0)
 #define flush_cache_vunmap(start, end)		do {} while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do {} while (0)
 #define flush_dcache_mmap_lock(mapping)		do {} while (0)
 #define flush_dcache_mmap_unlock(mapping)	do {} while (0)

--- a/arch/parisc/include/asm/cacheflush.h
+++ b/arch/parisc/include/asm/cacheflush.h
@@ -42,6 +42,7 @@ void flush_cache_mm(struct mm_struct *mm);
 #define flush_cache_vmap(start, end)		flush_cache_all()
 #define flush_cache_vunmap(start, end)		flush_cache_all()

+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 extern void flush_dcache_page(struct page *page);

 #define flush_dcache_mmap_lock(mapping) \

--- a/arch/powerpc/include/asm/cacheflush.h
+++ b/arch/powerpc/include/asm/cacheflush.h
@@ -25,6 +25,7 @@
 #define flush_cache_vmap(start, end)		do { } while (0)
 #define flush_cache_vunmap(start, end)		do { } while (0)

+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 extern void flush_dcache_page(struct page *page);
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)

--- a/arch/s390/include/asm/cacheflush.h
+++ b/arch/s390/include/asm/cacheflush.h
@@ -10,6 +10,7 @@
 #define flush_cache_dup_mm(mm)			do { } while (0)
 #define flush_cache_range(vma, start, end)	do { } while (0)
 #define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do { } while (0)
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)

--- a/arch/score/include/asm/cacheflush.h
+++ b/arch/score/include/asm/cacheflush.h
@@ -16,6 +16,7 @@ extern void flush_icache_range(unsigned long start, unsigned long end);
 extern void flush_dcache_range(unsigned long start, unsigned long end);

 #define flush_cache_dup_mm(mm)			do {} while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do {} while (0)
 #define flush_dcache_mmap_lock(mapping)		do {} while (0)
 #define flush_dcache_mmap_unlock(mapping)	do {} while (0)

--- a/arch/sh/include/asm/cacheflush.h
+++ b/arch/sh/include/asm/cacheflush.h
@@ -42,6 +42,7 @@ extern void flush_cache_page(struct vm_area_struct *vma,
 				unsigned long addr, unsigned long pfn);
 extern void flush_cache_range(struct vm_area_struct *vma,
 				 unsigned long start, unsigned long end);
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 extern void flush_dcache_page(struct page *page);
 extern void flush_icache_range(unsigned long start, unsigned long end);
 extern void flush_icache_page(struct vm_area_struct *vma,

--- a/arch/sparc/include/asm/cacheflush_32.h
+++ b/arch/sparc/include/asm/cacheflush_32.h
@@ -75,6 +75,7 @@ BTFIXUPDEF_CALL(void, flush_sig_insns, struct mm_struct *, unsigned long)

 extern void sparc_flush_page_to_ram(struct page *page);

+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 #define flush_dcache_page(page)			sparc_flush_page_to_ram(page)
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)

--- a/arch/sparc/include/asm/cacheflush_64.h
+++ b/arch/sparc/include/asm/cacheflush_64.h
@@ -37,6 +37,7 @@ extern void flush_dcache_page_all(struct mm_struct *mm, struct page *page);
 #endif

 extern void __flush_dcache_range(unsigned long start, unsigned long end);
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 extern void flush_dcache_page(struct page *page);

 #define flush_icache_page(vma, pg)	do { } while(0)

--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -12,6 +12,7 @@ static inline void flush_cache_range(struct vm_area_struct *vma,
 				     unsigned long start, unsigned long end) { }
 static inline void flush_cache_page(struct vm_area_struct *vma,
 				    unsigned long vmaddr, unsigned long pfn) { }
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 static inline void flush_dcache_page(struct page *page) { }
 static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
 static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }

--- a/arch/xtensa/include/asm/cacheflush.h
+++ b/arch/xtensa/include/asm/cacheflush.h
@@ -101,6 +101,7 @@ static inline void __invalidate_icache_page_alias(unsigned long virt,
 #define flush_cache_vmap(start,end)	flush_cache_all()
 #define flush_cache_vunmap(start,end)	flush_cache_all()

+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 extern void flush_dcache_page(struct page*);
 extern void flush_cache_range(struct vm_area_struct*, ulong, ulong);
 extern void flush_cache_page(struct vm_area_struct*, unsigned long, unsigned long);

--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,6 +77,28 @@ config BLK_DEV_INTEGRITY
 	T10/SCSI Data Integrity Field or the T13/ATA External Path
 	Protection.  If in doubt, say N.

+config BLK_CGROUP
+	bool
+	depends on CGROUPS
+	default n
+	---help---
+	Generic block IO controller cgroup interface. This is the common
+	cgroup interface which should be used by various IO controlling
+	policies.
+
+	Currently, CFQ IO scheduler uses it to recognize task groups and
+	control disk bandwidth allocation (proportional time slice allocation)
+	to such task groups.
+
+config DEBUG_BLK_CGROUP
+	bool
+	depends on BLK_CGROUP
+	default n
+	---help---
+	Enable some debugging help. Currently it stores the cgroup path
+	in the blk group which can be used by cfq for tracing various
+	group related activity.
+
 endif # BLOCK

 config BLOCK_COMPAT

--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -12,24 +12,14 @@ config IOSCHED_NOOP
 	  that do their own scheduling and require only minimal assistance from
 	  the kernel.

-config IOSCHED_AS
-	tristate "Anticipatory I/O scheduler"
-	default y
-	---help---
-	  The anticipatory I/O scheduler is generally a good choice for most
-	  environments, but is quite large and complex when compared to the
-	  deadline I/O scheduler, it can also be slower in some cases
-	  especially some database loads.
-
 config IOSCHED_DEADLINE
 	tristate "Deadline I/O scheduler"
 	default y
 	---help---
-	  The deadline I/O scheduler is simple and compact, and is often as
-	  good as the anticipatory I/O scheduler, and in some database
-	  workloads, better. In the case of a single process performing I/O to
-	  a disk at any one time, its behaviour is almost identical to the
-	  anticipatory I/O scheduler and so is a good choice.
+	  The deadline I/O scheduler is simple and compact. It will provide
+	  CSCAN service with FIFO expiration of requests, switching to
+	  a new point in the service tree and doing a batch of IO from there
+	  in case of expiry.

 config IOSCHED_CFQ
 	tristate "CFQ I/O scheduler"
@@ -37,9 +27,28 @@ config IOSCHED_CFQ
 	---help---
 	  The CFQ I/O scheduler tries to distribute bandwidth equally
 	  among all processes in the system. It should provide a fair
-	  working environment, suitable for desktop systems.
+	  and low latency working environment, suitable for both desktop
+	  and server systems.
+
 	  This is the default I/O scheduler.

+config CFQ_GROUP_IOSCHED
+	bool "CFQ Group Scheduling support"
+	depends on IOSCHED_CFQ && CGROUPS
+	select BLK_CGROUP
+	default n
+	---help---
+	  Enable group IO scheduling in CFQ.
+
+config DEBUG_CFQ_IOSCHED
+	bool "Debug CFQ Scheduling"
+	depends on CFQ_GROUP_IOSCHED
+	select DEBUG_BLK_CGROUP
+	default n
+	---help---
+	  Enable CFQ IO scheduling debugging in CFQ. Currently it makes
+	  blktrace output more verbose.
+
 choice
 	prompt "Default I/O scheduler"
 	default DEFAULT_CFQ
@@ -47,9 +56,6 @@ choice
 	  Select the I/O scheduler which will be used by default for all
 	  block devices.

-	config DEFAULT_AS
-		bool "Anticipatory" if IOSCHED_AS=y
-
 	config DEFAULT_DEADLINE
 		bool "Deadline" if IOSCHED_DEADLINE=y

@@ -63,7 +69,6 @@ endchoice

 config DEFAULT_IOSCHED
 	string
-	default "anticipatory" if DEFAULT_AS
 	default "deadline" if DEFAULT_DEADLINE
 	default "cfq" if DEFAULT_CFQ
 	default "noop" if DEFAULT_NOOP

--- a/block/Makefile
+++ b/block/Makefile
@@ -8,8 +8,8 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o

 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
+obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
-obj-$(CONFIG_IOSCHED_AS)	+= as-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o


--- a/block/as-iosched.c
+++ b/block/as-iosched.c
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
+/*
+ * Common Block IO controller cgroup interface
+ *
+ * Based on ideas and code from CFQ, CFS and BFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
+ * 	              Nauman Rafique <nauman@google.com>
+ */
+#include <linux/ioprio.h>
+#include <linux/seq_file.h>
+#include <linux/kdev_t.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include "blk-cgroup.h"
+
+static DEFINE_SPINLOCK(blkio_list_lock);
+static LIST_HEAD(blkio_list);
+
+struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
+EXPORT_SYMBOL_GPL(blkio_root_cgroup);
+
+bool blkiocg_css_tryget(struct blkio_cgroup *blkcg)
+{
+	if (!css_tryget(&blkcg->css))
+		return false;
+	return true;
+}
+EXPORT_SYMBOL_GPL(blkiocg_css_tryget);
+
+void blkiocg_css_put(struct blkio_cgroup *blkcg)
+{
+	css_put(&blkcg->css);
+}
+EXPORT_SYMBOL_GPL(blkiocg_css_put);
+
+struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
+{
+	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
+			    struct blkio_cgroup, css);
+}
+EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
+
+void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
+			unsigned long time, unsigned long sectors)
+{
+	blkg->time += time;
+	blkg->sectors += sectors;
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats);
+
+void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
+			struct blkio_group *blkg, void *key, dev_t dev)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&blkcg->lock, flags);
+	rcu_assign_pointer(blkg->key, key);
+	blkg->blkcg_id = css_id(&blkcg->css);
+	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
+	spin_unlock_irqrestore(&blkcg->lock, flags);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+	/* Need to take css reference ? */
+	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
+#endif
+	blkg->dev = dev;
+}
+EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
+
+static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
+{
+	hlist_del_init_rcu(&blkg->blkcg_node);
+	blkg->blkcg_id = 0;
+}
+
+/*
+ * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
+ * indicating that blk_group was unhashed by the time we got to it.
+ */
+int blkiocg_del_blkio_group(struct blkio_group *blkg)
+{
+	struct blkio_cgroup *blkcg;
+	unsigned long flags;
+	struct cgroup_subsys_state *css;
+	int ret = 1;
+
+	rcu_read_lock();
+	css = css_lookup(&blkio_subsys, blkg->blkcg_id);
+	if (!css)
+		goto out;
+
+	blkcg = container_of(css, struct blkio_cgroup, css);
+	spin_lock_irqsave(&blkcg->lock, flags);
+	if (!hlist_unhashed(&blkg->blkcg_node)) {
+		__blkiocg_del_blkio_group(blkg);
+		ret = 0;
+	}
+	spin_unlock_irqrestore(&blkcg->lock, flags);
+out:
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
+
+/* called under rcu_read_lock(). */
+struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
+{
+	struct blkio_group *blkg;
+	struct hlist_node *n;
+	void *__key;
+
+	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
+		__key = blkg->key;
+		if (__key == key)
+			return blkg;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
+
+#define SHOW_FUNCTION(__VAR)						\
+static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
+				       struct cftype *cftype)		\
+{									\
+	struct blkio_cgroup *blkcg;					\
+									\
+	blkcg = cgroup_to_blkio_cgroup(cgroup);				\
+	return (u64)blkcg->__VAR;					\
+}
+
+SHOW_FUNCTION(weight);
+#undef SHOW_FUNCTION
+
+static int
+blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
+{
+	struct blkio_cgroup *blkcg;
+	struct blkio_group *blkg;
+	struct hlist_node *n;
+	struct blkio_policy_type *blkiop;
+
+	if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
+		return -EINVAL;
+
+	blkcg = cgroup_to_blkio_cgroup(cgroup);
+	spin_lock_irq(&blkcg->lock);
+	blkcg->weight = (unsigned int)val;
+	hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+		spin_lock(&blkio_list_lock);
+		list_for_each_entry(blkiop, &blkio_list, list)
+			blkiop->ops.blkio_update_group_weight_fn(blkg,
+					blkcg->weight);
+		spin_unlock(&blkio_list_lock);
+	}
+	spin_unlock_irq(&blkcg->lock);
+	return 0;
+}
+
+#define SHOW_FUNCTION_PER_GROUP(__VAR)					\
+static int blkiocg_##__VAR##_read(struct cgroup *cgroup,		\
+			struct cftype *cftype, struct seq_file *m)	\
+{									\
+	struct blkio_cgroup *blkcg;					\
+	struct blkio_group *blkg;					\
+	struct hlist_node *n;						\
+									\
+	if (!cgroup_lock_live_group(cgroup))				\
+		return -ENODEV;						\
+									\
+	blkcg = cgroup_to_blkio_cgroup(cgroup);				\
+	rcu_read_lock();						\
+	hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
+		if (blkg->dev)						\
+			seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev),	\
+				 MINOR(blkg->dev), blkg->__VAR);	\
+	}								\
+	rcu_read_unlock();						\
+	cgroup_unlock();						\
+	return 0;							\
+}
+
+SHOW_FUNCTION_PER_GROUP(time);
+SHOW_FUNCTION_PER_GROUP(sectors);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+SHOW_FUNCTION_PER_GROUP(dequeue);
+#endif
+#undef SHOW_FUNCTION_PER_GROUP
+
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
+			unsigned long dequeue)
+{
+	blkg->dequeue += dequeue;
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
+#endif
+
+struct cftype blkio_files[] = {
+	{
+		.name = "weight",
+		.read_u64 = blkiocg_weight_read,
+		.write_u64 = blkiocg_weight_write,
+	},
+	{
+		.name = "time",
+		.read_seq_string = blkiocg_time_read,
+	},
+	{
+		.name = "sectors",
+		.read_seq_string = blkiocg_sectors_read,
+	},
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+       {
+		.name = "dequeue",
+		.read_seq_string = blkiocg_dequeue_read,
+       },
+#endif
+};
+
+static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
+{
+	return cgroup_add_files(cgroup, subsys, blkio_files,
+				ARRAY_SIZE(blkio_files));
+}
+
+static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
+{
+	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+	unsigned long flags;
+	struct blkio_group *blkg;
+	void *key;
+	struct blkio_policy_type *blkiop;
+
+	rcu_read_lock();
+remove_entry:
+	spin_lock_irqsave(&blkcg->lock, flags);
+
+	if (hlist_empty(&blkcg->blkg_list)) {
+		spin_unlock_irqrestore(&blkcg->lock, flags);
+		goto done;
+	}
+
+	blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
+				blkcg_node);
+	key = rcu_dereference(blkg->key);
+	__blkiocg_del_blkio_group(blkg);
+
+	spin_unlock_irqrestore(&blkcg->lock, flags);
+
+	/*
+	 * This blkio_group is being unlinked as associated cgroup is going
+	 * away. Let all the IO controlling policies know about this event.
+	 *
+	 * Currently this is static call to one io controlling policy. Once
+	 * we have more policies in place, we need some dynamic registration
+	 * of callback function.
+	 */
+	spin_lock(&blkio_list_lock);
+	list_for_each_entry(blkiop, &blkio_list, list)
+		blkiop->ops.blkio_unlink_group_fn(key, blkg);
+	spin_unlock(&blkio_list_lock);
+	goto remove_entry;
+done:
+	free_css_id(&blkio_subsys, &blkcg->css);
+	rcu_read_unlock();
+	kfree(blkcg);
+}
+
+static struct cgroup_subsys_state *
+blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
+{
+	struct blkio_cgroup *blkcg, *parent_blkcg;
+
+	if (!cgroup->parent) {
+		blkcg = &blkio_root_cgroup;
+		goto done;
+	}
+
+	/* Currently we do not support hierarchy deeper than two level (0,1) */
+	parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
+	if (css_depth(&parent_blkcg->css) > 0)
+		return ERR_PTR(-EINVAL);
+
+	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
+	if (!blkcg)
+		return ERR_PTR(-ENOMEM);
+
+	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
+done:
+	spin_lock_init(&blkcg->lock);
+	INIT_HLIST_HEAD(&blkcg->blkg_list);
+
+	return &blkcg->css;
+}
+
+/*
+ * We cannot support shared io contexts, as we have no mean to support
+ * two tasks with the same ioc in two different groups without major rework
+ * of the main cic data structures.  For now we allow a task to change
+ * its cgroup only if it's the only owner of its ioc.
+ */
+static int blkiocg_can_attach(struct cgroup_subsys *subsys,
+				struct cgroup *cgroup, struct task_struct *tsk,
+				bool threadgroup)
+{
+	struct io_context *ioc;
+	int ret = 0;
+
+	/* task_lock() is needed to avoid races with exit_io_context() */
+	task_lock(tsk);
+	ioc = tsk->io_context;
+	if (ioc && atomic_read(&ioc->nr_tasks) > 1)
+		ret = -EINVAL;
+	task_unlock(tsk);
+
+	return ret;
+}
+
+static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
+				struct cgroup *prev, struct task_struct *tsk,
+				bool threadgroup)
+{
+	struct io_context *ioc;
+
+	task_lock(tsk);
+	ioc = tsk->io_context;
+	if (ioc)
+		ioc->cgroup_changed = 1;
+	task_unlock(tsk);
+}
+
+struct cgroup_subsys blkio_subsys = {
+	.name = "blkio",
+	.create = blkiocg_create,
+	.can_attach = blkiocg_can_attach,
+	.attach = blkiocg_attach,
+	.destroy = blkiocg_destroy,
+	.populate = blkiocg_populate,
+	.subsys_id = blkio_subsys_id,
+	.use_id = 1,
+};
+
+void blkio_policy_register(struct blkio_policy_type *blkiop)
+{
+	spin_lock(&blkio_list_lock);
+	list_add_tail(&blkiop->list, &blkio_list);
+	spin_unlock(&blkio_list_lock);
+}
+EXPORT_SYMBOL_GPL(blkio_policy_register);
+
+void blkio_policy_unregister(struct blkio_policy_type *blkiop)
+{
+	spin_lock(&blkio_list_lock);
+	list_del_init(&blkiop->list);
+	spin_unlock(&blkio_list_lock);
+}
+EXPORT_SYMBOL_GPL(blkio_policy_unregister);
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
+#ifndef _BLK_CGROUP_H
+#define _BLK_CGROUP_H
+/*
+ * Common Block IO controller cgroup interface
+ *
+ * Based on ideas and code from CFQ, CFS and BFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
+ * 	              Nauman Rafique <nauman@google.com>
+ */
+
+#include <linux/cgroup.h>
+
+#ifdef CONFIG_BLK_CGROUP
+
+struct blkio_cgroup {
+	struct cgroup_subsys_state css;
+	unsigned int weight;
+	spinlock_t lock;
+	struct hlist_head blkg_list;
+};
+
+struct blkio_group {
+	/* An rcu protected unique identifier for the group */
+	void *key;
+	struct hlist_node blkcg_node;
+	unsigned short blkcg_id;
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+	/* Store cgroup path */
+	char path[128];
+	/* How many times this group has been removed from service tree */
+	unsigned long dequeue;
+#endif
+	/* The device MKDEV(major, minor), this group has been created for */
+	dev_t   dev;
+
+	/* total disk time and nr sectors dispatched by this group */
+	unsigned long time;
+	unsigned long sectors;
+};
+
+extern bool blkiocg_css_tryget(struct blkio_cgroup *blkcg);
+extern void blkiocg_css_put(struct blkio_cgroup *blkcg);
+
+typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
+typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
+						unsigned int weight);
+
+struct blkio_policy_ops {
+	blkio_unlink_group_fn *blkio_unlink_group_fn;
+	blkio_update_group_weight_fn *blkio_update_group_weight_fn;
+};
+
+struct blkio_policy_type {
+	struct list_head list;
+	struct blkio_policy_ops ops;
+};
+
+/* Blkio controller policy registration */
+extern void blkio_policy_register(struct blkio_policy_type *);
+extern void blkio_policy_unregister(struct blkio_policy_type *);
+
+#else
+
+struct blkio_group {
+};
+
+struct blkio_policy_type {
+};
+
+static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
+static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
+
+#endif
+
+#define BLKIO_WEIGHT_MIN	100
+#define BLKIO_WEIGHT_MAX	1000
+#define BLKIO_WEIGHT_DEFAULT	500
+
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+static inline char *blkg_path(struct blkio_group *blkg)
+{
+	return blkg->path;
+}
+void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
+				unsigned long dequeue);
+#else
+static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
+static inline void blkiocg_update_blkio_group_dequeue_stats(
+			struct blkio_group *blkg, unsigned long dequeue) {}
+#endif
+
+#ifdef CONFIG_BLK_CGROUP
+extern struct blkio_cgroup blkio_root_cgroup;
+extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
+extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
+			struct blkio_group *blkg, void *key, dev_t dev);
+extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
+extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
+						void *key);
+void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
+			unsigned long time, unsigned long sectors);
+#else
+struct cgroup;
+static inline struct blkio_cgroup *
+cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
+
+static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
+			struct blkio_group *blkg, void *key, dev_t dev)
+{
+}
+
+static inline int
+blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
+
+static inline struct blkio_group *
+blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
+static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
+			unsigned long time, unsigned long sectors)
+{
+}
+#endif
+#endif /* _BLK_CGROUP_H */
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2358,6 +2358,25 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		rq->rq_disk = bio->bi_bdev->bd_disk;
 }

+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+/**
+ * rq_flush_dcache_pages - Helper function to flush all pages in a request
+ * @rq: the request to be flushed
+ *
+ * Description:
+ *     Flush all pages in @rq.
+ */
+void rq_flush_dcache_pages(struct request *rq)
+{
+	struct req_iterator iter;
+	struct bio_vec *bvec;
+
+	rq_for_each_segment(bvec, rq, iter)
+		flush_dcache_page(bvec->bv_page);
+}
+EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
+#endif
+
 /**
 * blk_lld_busy - Check if underlying low-level drivers of a device are busy
 * @q : the queue of the device being checked

--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -66,22 +66,22 @@ static void cfq_exit(struct io_context *ioc)
 }

 /* Called by the exitting task */
-void exit_io_context(void)
+void exit_io_context(struct task_struct *task)
 {
 	struct io_context *ioc;

-	task_lock(current);
-	ioc = current->io_context;
-	current->io_context = NULL;
-	task_unlock(current);
+	task_lock(task);
+	ioc = task->io_context;
+	task->io_context = NULL;
+	task_unlock(task);

 	if (atomic_dec_and_test(&ioc->nr_tasks)) {
 		if (ioc->aic && ioc->aic->exit)
 			ioc->aic->exit(ioc->aic);
 		cfq_exit(ioc);

-		put_io_context(ioc);
 	}
+	put_io_context(ioc);
 }

 struct io_context *alloc_io_context(gfp_t gfp_flags, int node)

--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -8,6 +8,7 @@
 #include <linux/blkdev.h>
 #include <linux/bootmem.h>	/* for max_pfn/max_low_pfn */
 #include <linux/gcd.h>
+#include <linux/jiffies.h>

 #include "blk.h"

@@ -96,7 +97,11 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->max_segment_size = MAX_SEGMENT_SIZE;
 	lim->max_sectors = BLK_DEF_MAX_SECTORS;
 	lim->max_hw_sectors = INT_MAX;
-	lim->max_discard_sectors = SAFE_MAX_SECTORS;
+	lim->max_discard_sectors = 0;
+	lim->discard_granularity = 0;
+	lim->discard_alignment = 0;
+	lim->discard_misaligned = 0;
+	lim->discard_zeroes_data = -1;
 	lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
 	lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
 	lim->alignment_offset = 0;
@@ -141,7 +146,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
 	q->nr_batching = BLK_BATCH_REQ;

 	q->unplug_thresh = 4;		/* hmm */
-	q->unplug_delay = (3 * HZ) / 1000;	/* 3 milliseconds */
+	q->unplug_delay = msecs_to_jiffies(3);	/* 3 milliseconds */
 	if (q->unplug_delay == 0)
 		q->unplug_delay = 1;

@@ -488,6 +493,16 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
 }
 EXPORT_SYMBOL(blk_queue_stack_limits);

+static unsigned int lcm(unsigned int a, unsigned int b)
+{
+	if (a && b)
+		return (a * b) / gcd(a, b);
+	else if (b)
+		return b;
+
+	return a;
+}
+
 /**
 * blk_stack_limits - adjust queue_limits for stacked devices
 * @t:	the stacking driver limits (top)
@@ -502,6 +517,10 @@ EXPORT_SYMBOL(blk_queue_stack_limits);
 int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 		     sector_t offset)
 {
+	int ret;
+
+	ret = 0;
+
 	t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
 	t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
 	t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
@@ -526,12 +545,19 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,

 	t->io_min = max(t->io_min, b->io_min);
 	t->no_cluster |= b->no_cluster;
+	t->discard_zeroes_data &= b->discard_zeroes_data;

 	/* Bottom device offset aligned? */
 	if (offset &&
 	    (offset & (b->physical_block_size - 1)) != b->alignment_offset) {
 		t->misaligned = 1;
-		return -1;
+		ret = -1;
+	}
+
+	if (offset &&
+	    (offset & (b->discard_granularity - 1)) != b->discard_alignment) {
+		t->discard_misaligned = 1;
+		ret = -1;
 	}

 	/* If top has no alignment offset, inherit from bottom */
@@ -539,23 +565,26 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 		t->alignment_offset =
 			b->alignment_offset & (b->physical_block_size - 1);

+	if (!t->discard_alignment)
+		t->discard_alignment =
+			b->discard_alignment & (b->discard_granularity - 1);
+
 	/* Top device aligned on logical block boundary? */
 	if (t->alignment_offset & (t->logical_block_size - 1)) {
 		t->misaligned = 1;
-		return -1;
+		ret = -1;
 	}

-	/* Find lcm() of optimal I/O size */
-	if (t->io_opt && b->io_opt)
-		t->io_opt = (t->io_opt * b->io_opt) / gcd(t->io_opt, b->io_opt);
-	else if (b->io_opt)
-		t->io_opt = b->io_opt;
+	/* Find lcm() of optimal I/O size and granularity */
+	t->io_opt = lcm(t->io_opt, b->io_opt);
+	t->discard_granularity = lcm(t->discard_granularity,
+				     b->discard_granularity);

 	/* Verify that optimal I/O size is a multiple of io_min */
 	if (t->io_min && t->io_opt % t->io_min)
-		return -1;
+		ret = -1;

-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL(blk_stack_limits);


--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -126,6 +126,21 @@ static ssize_t queue_io_opt_show(struct request_queue *q, char *page)
 	return queue_var_show(queue_io_opt(q), page);
 }

+static ssize_t queue_discard_granularity_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(q->limits.discard_granularity, page);
+}
+
+static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(q->limits.max_discard_sectors << 9, page);
+}
+
+static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(queue_discard_zeroes_data(q), page);
+}
+
 static ssize_t
 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 {
@@ -293,6 +308,21 @@ static struct queue_sysfs_entry queue_io_opt_entry = {
 	.show = queue_io_opt_show,
 };

+static struct queue_sysfs_entry queue_discard_granularity_entry = {
+	.attr = {.name = "discard_granularity", .mode = S_IRUGO },
+	.show = queue_discard_granularity_show,
+};
+
+static struct queue_sysfs_entry queue_discard_max_entry = {
+	.attr = {.name = "discard_max_bytes", .mode = S_IRUGO },
+	.show = queue_discard_max_show,
+};
+
+static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
+	.attr = {.name = "discard_zeroes_data", .mode = S_IRUGO },
+	.show = queue_discard_zeroes_data_show,
+};
+
 static struct queue_sysfs_entry queue_nonrot_entry = {
 	.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_nonrot_show,
@@ -328,6 +358,9 @@ static struct attribute *default_attrs[] = {
 	&queue_physical_block_size_entry.attr,
 	&queue_io_min_entry.attr,
 	&queue_io_opt_entry.attr,
+	&queue_discard_granularity_entry.attr,
+	&queue_discard_max_entry.attr,
+	&queue_discard_zeroes_data_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_nomerges_entry.attr,
 	&queue_rq_affinity_entry.attr,

--- a/block/bsg.c
+++ b/block/bsg.c
@@ -15,6 +15,7 @@
 #include <linux/blkdev.h>
 #include <linux/poll.h>
 #include <linux/cdev.h>
+#include <linux/jiffies.h>
 #include <linux/percpu.h>
 #include <linux/uio.h>
 #include <linux/idr.h>
@@ -197,7 +198,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
 	rq->cmd_len = hdr->request_len;
 	rq->cmd_type = REQ_TYPE_BLOCK_PC;

-	rq->timeout = (hdr->timeout * HZ) / 1000;
+	rq->timeout = msecs_to_jiffies(hdr->timeout);
 	if (!rq->timeout)
 		rq->timeout = q->sg_timeout;
 	if (!rq->timeout)

--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -747,6 +747,8 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 		return compat_put_uint(arg, bdev_io_opt(bdev));
 	case BLKALIGNOFF:
 		return compat_put_int(arg, bdev_alignment_offset(bdev));
+	case BLKDISCARDZEROES:
+		return compat_put_uint(arg, bdev_discard_zeroes_data(bdev));
 	case BLKFLSBUF:
 	case BLKROSET:
 	case BLKDISCARD:

--- a/block/elevator.c
+++ b/block/elevator.c
@@ -154,10 +154,7 @@ static struct elevator_type *elevator_get(const char *name)

 		spin_unlock(&elv_list_lock);

-		if (!strcmp(name, "anticipatory"))
-			sprintf(elv, "as-iosched");
-		else
-			sprintf(elv, "%s-iosched", name);
+		sprintf(elv, "%s-iosched", name);

 		request_module("%s", elv);
 		spin_lock(&elv_list_lock);
@@ -193,10 +190,7 @@ static int __init elevator_setup(char *str)
 	 * Be backwards-compatible with previous kernels, so users
 	 * won't get the wrong elevator.
 	 */
-	if (!strcmp(str, "as"))
-		strcpy(chosen_elevator, "anticipatory");
-	else
-		strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
+	strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
 	return 1;
 }


--- a/block/genhd.c
+++ b/block/genhd.c
@@ -861,12 +861,23 @@ static ssize_t disk_alignment_offset_show(struct device *dev,
 	return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
 }

+static ssize_t disk_discard_alignment_show(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+
+	return sprintf(buf, "%u\n", queue_discard_alignment(disk->queue));
+}
+
 static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
 static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
 static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
 static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
+static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
+		   NULL);
 static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
@@ -887,6 +898,7 @@ static struct attribute *disk_attrs[] = {
 	&dev_attr_ro.attr,
 	&dev_attr_size.attr,
 	&dev_attr_alignment_offset.attr,
+	&dev_attr_discard_alignment.attr,
 	&dev_attr_capability.attr,
 	&dev_attr_stat.attr,
 	&dev_attr_inflight.attr,

--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -280,6 +280,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 		return put_uint(arg, bdev_io_opt(bdev));
 	case BLKALIGNOFF:
 		return put_int(arg, bdev_alignment_offset(bdev));
+	case BLKDISCARDZEROES:
+		return put_uint(arg, bdev_discard_zeroes_data(bdev));
 	case BLKSECTGET:
 		return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
 	case BLKRASET:

--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -35,7 +35,9 @@
 struct blk_cmd_filter {
 	unsigned long read_ok[BLK_SCSI_CMD_PER_LONG];
 	unsigned long write_ok[BLK_SCSI_CMD_PER_LONG];
-} blk_default_cmd_filter;
+};
+
+static struct blk_cmd_filter blk_default_cmd_filter;

 /* Command group 3 is reserved and should never be used.  */
 const unsigned char scsi_command_size_tbl[8] =
@@ -675,7 +677,7 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod
 }
 EXPORT_SYMBOL(scsi_cmd_ioctl);

-int __init blk_scsi_ioctl_init(void)
+static int __init blk_scsi_ioctl_init(void)
 {
 	blk_set_cmd_filter_defaults(&blk_default_cmd_filter);
 	return 0;

--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -271,6 +271,8 @@ config BLK_DEV_CRYPTOLOOP
 	  instead, which can be configured to be on-disk compatible with the
 	  cryptoloop device.

+source "drivers/block/drbd/Kconfig"
+
 config BLK_DEV_NBD
 	tristate "Network block device support"
 	depends on NET

--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -36,5 +36,6 @@ obj-$(CONFIG_BLK_DEV_UB)	+= ub.o
 obj-$(CONFIG_BLK_DEV_HD)	+= hd.o

 obj-$(CONFIG_XEN_BLKDEV_FRONTEND)	+= xen-blkfront.o
+obj-$(CONFIG_BLK_DEV_DRBD)     += drbd/

 swim_mod-objs	:= swim.o swim_asm.o
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -55,7 +55,13 @@ typedef struct _drive_info_struct
 	char device_initialized;     /* indicates whether dev is initialized */
 } drive_info_struct;

-struct ctlr_info 
+struct Cmd_sg_list {
+	SGDescriptor_struct	*sgchain;
+	dma_addr_t		sg_chain_dma;
+	int			chain_block_size;
+};
+
+struct ctlr_info
 {
 	int	ctlr;
 	char	devname[8];
@@ -75,6 +81,16 @@ struct ctlr_info
 	int	num_luns;
 	int 	highest_lun;
 	int	usage_count;  /* number of opens all all minor devices */
+	/* Need space for temp sg list
+	 * number of scatter/gathers supported
+	 * number of scatter/gathers in chained block
+	 */
+	struct	scatterlist **scatter_list;
+	int	maxsgentries;
+	int	chainsize;
+	int	max_cmd_sgentries;
+	struct Cmd_sg_list **cmd_sg_list;
+
 #	define DOORBELL_INT	0
 #	define PERF_MODE_INT	1
 #	define SIMPLE_MODE_INT	2

--- a/drivers/block/cciss_cmd.h
+++ b/drivers/block/cciss_cmd.h
@@ -7,7 +7,8 @@

 //general boundary defintions
 #define SENSEINFOBYTES          32//note that this value may vary between host implementations
-#define MAXSGENTRIES            31
+#define MAXSGENTRIES            32
+#define CCISS_SG_CHAIN          0x80000000
 #define MAXREPLYQS              256

 //Command Status value
@@ -319,6 +320,10 @@ typedef struct _CfgTable_struct {
  BYTE             ServerName[16];
  DWORD            HeartBeat;
  DWORD            SCSI_Prefetch;
+  DWORD            MaxSGElements;
+  DWORD            MaxLogicalUnits;
+  DWORD            MaxPhysicalDrives;
+  DWORD            MaxPhysicalDrivesPerLogicalUnit;
 } CfgTable_struct;
 #pragma pack()	 
 #endif // CCISS_CMD_H
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -755,7 +755,7 @@ complete_scsi_command( CommandList_struct *cp, int timeout, __u32 tag)
                        			cp,  
 						ei->ScsiStatus); 
 #endif
-					cmd->result |= (ei->ScsiStatus < 1);
+					cmd->result |= (ei->ScsiStatus << 1);
                		}
 				else {  /* scsi status is zero??? How??? */
 					
@@ -1547,7 +1547,7 @@ cciss_engage_scsi(int ctlr)
 	if (sa->registered) {
 		printk("cciss%d: SCSI subsystem already engaged.\n", ctlr);
 		spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
-		return ENXIO;
+		return -ENXIO;
 	}
 	sa->registered = 1;
 	spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);

--- a/drivers/block/drbd/Kconfig
+++ b/drivers/block/drbd/Kconfig
+#
+# DRBD device driver configuration
+#
+
+comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected"
+	depends on !PROC_FS || !INET || !CONNECTOR
+
+config BLK_DEV_DRBD
+	tristate "DRBD Distributed Replicated Block Device support"
+	depends on PROC_FS && INET && CONNECTOR
+	select LRU_CACHE
+	default n
+	help
+
+	  NOTE: In order to authenticate connections you have to select
+	  CRYPTO_HMAC and a hash function as well.
+
+	  DRBD is a shared-nothing, synchronously replicated block device. It
+	  is designed to serve as a building block for high availability
+	  clusters and in this context, is a "drop-in" replacement for shared
+	  storage. Simplistically, you could see it as a network RAID 1.
+
+	  Each minor device has a role, which can be 'primary' or 'secondary'.
+	  On the node with the primary device the application is supposed to
+	  run and to access the device (/dev/drbdX). Every write is sent to
+	  the local 'lower level block device' and, across the network, to the
+	  node with the device in 'secondary' state.  The secondary device
+	  simply writes the data to its lower level block device.
+
+	  DRBD can also be used in dual-Primary mode (device writable on both
+	  nodes), which means it can exhibit shared disk semantics in a
+	  shared-nothing cluster.  Needless to say, on top of dual-Primary
+	  DRBD utilizing a cluster file system is necessary to maintain for
+	  cache coherency.
+
+	  For automatic failover you need a cluster manager (e.g. heartbeat).
+	  See also: http://www.drbd.org/, http://www.linux-ha.org
+
+	  If unsure, say N.
+
+config DRBD_FAULT_INJECTION
+	bool "DRBD fault injection"
+	depends on BLK_DEV_DRBD
+	help
+
+	  Say Y here if you want to simulate IO errors, in order to test DRBD's
+	  behavior.
+
+	  The actual simulation of IO errors is done by writing 3 values to
+	  /sys/module/drbd/parameters/
+
+	  enable_faults: bitmask of...
+	  1	meta data write
+	  2               read
+	  4	resync data write
+	  8	            read
+	  16	data write
+	  32	data read
+	  64	read ahead
+	  128	kmalloc of bitmap
+	  256	allocation of EE (epoch_entries)
+
+	  fault_devs: bitmask of minor numbers
+	  fault_rate: frequency in percent
+
+	  Example: Simulate data write errors on /dev/drbd0 with a probability of 5%.
+		echo 16 > /sys/module/drbd/parameters/enable_faults
+		echo 1 > /sys/module/drbd/parameters/fault_devs
+		echo 5 > /sys/module/drbd/parameters/fault_rate
+
+	  If unsure, say N.
--- a/drivers/block/drbd/Makefile
+++ b/drivers/block/drbd/Makefile
+drbd-y := drbd_bitmap.o drbd_proc.o
+drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
+drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
+
+obj-$(CONFIG_BLK_DEV_DRBD)     += drbd.o
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
--- a/drivers/block/drbd/drbd_strings.c
+++ b/drivers/block/drbd/drbd_strings.c
--- a/drivers/block/drbd/drbd_vli.h
+++ b/drivers/block/drbd/drbd_vli.h
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
--- a/drivers/block/drbd/drbd_wrappers.h
+++ b/drivers/block/drbd/drbd_wrappers.h
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -123,7 +123,15 @@ static int ps3vram_notifier_wait(struct ps3_system_bus_device *dev,
 {
 	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 	u32 *notify = ps3vram_get_notifier(priv->reports, NOTIFIER);
-	unsigned long timeout = jiffies + msecs_to_jiffies(timeout_ms);
+	unsigned long timeout;
+
+	for (timeout = 20; timeout; timeout--) {
+		if (!notify[3])
+			return 0;
+		udelay(10);
+	}
+
+	timeout = jiffies + msecs_to_jiffies(timeout_ms);

 	do {
 		if (!notify[3])

--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -59,12 +59,14 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 		for (; nsect > 0; nsect--, block++, buf += tr->blksize)
 			if (tr->readsect(dev, block, buf))
 				return -EIO;
+		rq_flush_dcache_pages(req);
 		return 0;

 	case WRITE:
 		if (!tr->writesect)
 			return -EIO;

+		rq_flush_dcache_pages(req);
 		for (; nsect > 0; nsect--, block++, buf += tr->blksize)
 			if (tr->writesect(dev, block, buf))
 				return -EIO;

--- a/drivers/staging/pohmelfs/inode.c
+++ b/drivers/staging/pohmelfs/inode.c
@@ -143,7 +143,6 @@ static int pohmelfs_writepages(struct address_space *mapping, struct writeback_c
 	struct inode *inode = mapping->host;
 	struct pohmelfs_inode *pi = POHMELFS_I(inode);
 	struct pohmelfs_sb *psb = POHMELFS_SB(inode->i_sb);
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	int err = 0;
 	int done = 0;
 	int nr_pages;
@@ -152,11 +151,6 @@ static int pohmelfs_writepages(struct address_space *mapping, struct writeback_c
 	int scanned = 0;
 	int range_whole = 0;

-	if (wbc->nonblocking && bdi_write_congested(bdi)) {
-		wbc->encountered_congestion = 1;
-		return 0;
-	}
-
 	if (wbc->range_cyclic) {
 		index = mapping->writeback_index; /* Start from prev offset */
 		end = -1;
@@ -248,10 +242,6 @@ static int pohmelfs_writepages(struct address_space *mapping, struct writeback_c

 			if (wbc->nr_to_write <= 0)
 				done = 1;
-			if (wbc->nonblocking && bdi_write_congested(bdi)) {
-				wbc->encountered_congestion = 1;
-				done = 1;
-			}

 			continue;
 out_continue:

--- a/fs/aio.c
+++ b/fs/aio.c
--- a/fs/bio.c
+++ b/fs/bio.c
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -178,7 +178,7 @@ static int wb_priority(struct writeback_control *wbc)
 {
 	if (wbc->for_reclaim)
 		return FLUSH_HIGHPRI | FLUSH_STABLE;
-	if (wbc->for_kupdate)
+	if (wbc->for_kupdate || wbc->for_background)
 		return FLUSH_LOWPRI;
 	return 0;
 }

--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
--- a/fs/partitions/efi.h
+++ b/fs/partitions/efi.h
--- a/fs/read_write.c
+++ b/fs/read_write.c
--- a/fs/splice.c
+++ b/fs/splice.c
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
--- a/include/asm-generic/cacheflush.h
+++ b/include/asm-generic/cacheflush.h
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
--- a/include/linux/connector.h
+++ b/include/linux/connector.h
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
--- a/include/linux/drbd_nl.h
+++ b/include/linux/drbd_nl.h
--- a/include/linux/drbd_tag_magic.h
+++ b/include/linux/drbd_tag_magic.h
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
--- a/include/linux/lru_cache.h
+++ b/include/linux/lru_cache.h
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
--- a/kernel/exit.c
+++ b/kernel/exit.c
--- a/kernel/fork.c
+++ b/kernel/fork.c
--- a/lib/Kconfig
+++ b/lib/Kconfig
--- a/lib/Makefile
+++ b/lib/Makefile
--- a/lib/lru_cache.c
+++ b/lib/lru_cache.c
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c