Merge branch 'master' into for-2.6.33
Jens Axboe [Thu, 3 Dec 2009 12:49:39 +0000 (13:49 +0100)]
92 files changed:
Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg [new file with mode: 0644]
Documentation/blockdev/drbd/DRBD-data-packets.svg [new file with mode: 0644]
Documentation/blockdev/drbd/README.txt [new file with mode: 0644]
Documentation/blockdev/drbd/conn-states-8.dot [new file with mode: 0644]
Documentation/blockdev/drbd/disk-states-8.dot [new file with mode: 0644]
Documentation/blockdev/drbd/drbd-connection-state-overview.dot [new file with mode: 0644]
Documentation/blockdev/drbd/node-states-8.dot [new file with mode: 0644]
MAINTAINERS
arch/alpha/include/asm/cacheflush.h
arch/arm/include/asm/cacheflush.h
arch/avr32/include/asm/cacheflush.h
arch/blackfin/include/asm/cacheflush.h
arch/cris/include/asm/cacheflush.h
arch/frv/include/asm/cacheflush.h
arch/h8300/include/asm/cacheflush.h
arch/ia64/include/asm/cacheflush.h
arch/m32r/include/asm/cacheflush.h
arch/m68k/include/asm/cacheflush_mm.h
arch/m68k/include/asm/cacheflush_no.h
arch/microblaze/include/asm/cacheflush.h
arch/mips/include/asm/cacheflush.h
arch/mn10300/include/asm/cacheflush.h
arch/parisc/include/asm/cacheflush.h
arch/powerpc/include/asm/cacheflush.h
arch/s390/include/asm/cacheflush.h
arch/score/include/asm/cacheflush.h
arch/sh/include/asm/cacheflush.h
arch/sparc/include/asm/cacheflush_32.h
arch/sparc/include/asm/cacheflush_64.h
arch/x86/include/asm/cacheflush.h
arch/xtensa/include/asm/cacheflush.h
block/Kconfig.iosched
block/Makefile
block/as-iosched.c [deleted file]
block/blk-core.c
block/blk-settings.c
block/blk-sysfs.c
block/bsg.c
block/cfq-iosched.c
block/compat_ioctl.c
block/elevator.c
block/genhd.c
block/ioctl.c
block/scsi_ioctl.c
drivers/block/Kconfig
drivers/block/Makefile
drivers/block/cciss.c
drivers/block/cciss.h
drivers/block/cciss_cmd.h
drivers/block/cciss_scsi.c
drivers/block/drbd/Kconfig [new file with mode: 0644]
drivers/block/drbd/Makefile [new file with mode: 0644]
drivers/block/drbd/drbd_actlog.c [new file with mode: 0644]
drivers/block/drbd/drbd_bitmap.c [new file with mode: 0644]
drivers/block/drbd/drbd_int.h [new file with mode: 0644]
drivers/block/drbd/drbd_main.c [new file with mode: 0644]
drivers/block/drbd/drbd_nl.c [new file with mode: 0644]
drivers/block/drbd/drbd_proc.c [new file with mode: 0644]
drivers/block/drbd/drbd_receiver.c [new file with mode: 0644]
drivers/block/drbd/drbd_req.c [new file with mode: 0644]
drivers/block/drbd/drbd_req.h [new file with mode: 0644]
drivers/block/drbd/drbd_strings.c [new file with mode: 0644]
drivers/block/drbd/drbd_vli.h [new file with mode: 0644]
drivers/block/drbd/drbd_worker.c [new file with mode: 0644]
drivers/block/drbd/drbd_wrappers.h [new file with mode: 0644]
drivers/block/ps3vram.c
drivers/mtd/mtd_blkdevs.c
fs/aio.c
fs/bio.c
fs/block_dev.c
fs/direct-io.c
fs/partitions/check.c
fs/partitions/efi.c
fs/partitions/efi.h
fs/read_write.c
fs/splice.c
include/asm-generic/cacheflush.h
include/linux/backing-dev.h
include/linux/bio.h
include/linux/blkdev.h
include/linux/connector.h
include/linux/drbd.h [new file with mode: 0644]
include/linux/drbd_limits.h [new file with mode: 0644]
include/linux/drbd_nl.h [new file with mode: 0644]
include/linux/drbd_tag_magic.h [new file with mode: 0644]
include/linux/fs.h
include/linux/genhd.h
include/linux/iocontext.h
include/linux/lru_cache.h [new file with mode: 0644]
lib/Kconfig
lib/Makefile
lib/lru_cache.c [new file with mode: 0644]

diff --git a/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg b/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg
new file mode 100644
index 0000000..f87cfa0
--- /dev/null
@@ -0,0 +1,588 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+<svg
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   version="1.0"
+   width="210mm"
+   height="297mm"
+   viewBox="0 0 21000 29700"
+   id="svg2"
+   style="fill-rule:evenodd">
+  <defs
+     id="defs4" />
+  <g
+     id="Default"
+     style="visibility:visible">
+    <desc
+       id="desc180">Master slide</desc>
+  </g>
+  <path
+     d="M 11999,8601 L 11899,8301 L 12099,8301 L 11999,8601 z"
+     id="path193"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 11999,7801 L 11999,8361"
+     id="path197"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <path
+     d="M 7999,10401 L 7899,10101 L 8099,10101 L 7999,10401 z"
+     id="path209"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 7999,9601 L 7999,10161"
+     id="path213"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <path
+     d="M 11999,7801 L 11685,7840 L 11724,7644 L 11999,7801 z"
+     id="path225"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 7999,7001 L 11764,7754"
+     id="path229"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <g
+     transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,-1244.4792,1416.5139)"
+     id="g245"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <text
+       id="text247">
+      <tspan
+         x="9139 9368 9579 9808 9986 10075 10252 10481 10659 10837 10909"
+         y="9284"
+         id="tspan249">RSDataReply</tspan>
+    </text>
+  </g>
+  <path
+     d="M 7999,9601 L 8281,9458 L 8311,9655 L 7999,9601 z"
+     id="path259"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 11999,9001 L 8236,9565"
+     id="path263"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <g
+     transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,1620.9382,-1639.4947)"
+     id="g279"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <text
+       id="text281">
+      <tspan
+         x="8743 8972 9132 9310 9573 9801 10013 10242 10419 10597 10775 10953 11114"
+         y="7023"
+         id="tspan283">CsumRSRequest</tspan>
+    </text>
+  </g>
+  <text
+     id="text297"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4034 4263 4440 4703 4881 5042 5219 5397 5503 5681 5842 6003 6180 6341 6519 6625 6803 6980 7158 7336 7497 7586 7692"
+       y="5707"
+       id="tspan299">w_make_resync_request()</tspan>
+  </text>
+  <text
+     id="text313"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12199 12305 12483 12644 12821 12893 13054 13232 13410 13638 13816 13905 14083 14311 14489 14667 14845 15023 15184 15272 15378"
+       y="7806"
+       id="tspan315">receive_DataRequest()</tspan>
+  </text>
+  <text
+     id="text329"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12199 12377 12483 12660 12838 13016 13194 13372 13549 13621 13799 13977 14083 14261 14438 14616 14794 14955 15133 15294 15399"
+       y="8606"
+       id="tspan331">drbd_endio_read_sec()</tspan>
+  </text>
+  <text
+     id="text345"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12191 12420 12597 12775 12953 13131 13309 13486 13664 13825 13986 14164 14426 14604 14710 14871 15049 15154 15332 15510 15616"
+       y="9007"
+       id="tspan347">w_e_end_csum_rs_req()</tspan>
+  </text>
+  <text
+     id="text361"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4444 4550 4728 4889 5066 5138 5299 5477 5655 5883 6095 6324 6501 6590 6768 6997 7175 7352 7424 7585 7691"
+       y="9507"
+       id="tspan363">receive_RSDataReply()</tspan>
+  </text>
+  <text
+     id="text377"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4457 4635 4741 4918 5096 5274 5452 5630 5807 5879 6057 6235 6464 6569 6641 6730 6908 7086 7247 7425 7585 7691"
+       y="10407"
+       id="tspan379">drbd_endio_write_sec()</tspan>
+  </text>
+  <text
+     id="text393"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4647 4825 5003 5180 5358 5536 5714 5820 5997 6158 6319 6497 6658 6836 7013 7085 7263 7424 7585 7691"
+       y="10907"
+       id="tspan395">e_end_resync_block()</tspan>
+  </text>
+  <path
+     d="M 11999,11601 L 11685,11640 L 11724,11444 L 11999,11601 z"
+     id="path405"
+     style="fill:#000080;visibility:visible" />
+  <path
+     d="M 7999,10801 L 11764,11554"
+     id="path409"
+     style="fill:none;stroke:#000080;visibility:visible" />
+  <g
+     transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,2434.7562,-1674.649)"
+     id="g425"
+     style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
+    <text
+       id="text427">
+      <tspan
+         x="9320 9621 9726 9798 9887 10065 10277 10438"
+         y="10943"
+         id="tspan429">WriteAck</tspan>
+    </text>
+  </g>
+  <text
+     id="text443"
+     style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12199 12377 12555 12644 12821 13033 13105 13283 13444 13604 13816 13977 14138 14244"
+       y="11559"
+       id="tspan445">got_BlockAck()</tspan>
+  </text>
+  <text
+     id="text459"
+     style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="7999 8304 8541 8778 8990 9201 9413 9650 10001 10120 10357 10594 10806 11043 11280 11398 11703 11940 12152 12364 12601 12812 12931 13049 13261 13498 13710 13947 14065 14302 14540 14658 14777 14870 15107 15225 15437 15649 15886"
+       y="4877"
+       id="tspan461">Checksum based Resync, case not in sync</tspan>
+  </text>
+  <text
+     id="text475"
+     style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="6961 7266 7571 7854 8159 8299 8536 8654 8891 9010 9247 9484 9603 9840 9958 10077 10170 10407"
+       y="2806"
+       id="tspan477">DRBD-8.3 data flow</tspan>
+  </text>
+  <text
+     id="text491"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="5190 5419 5596 5774 5952 6113 6291 6468 6646 6824 6985 7146 7324 7586 7692"
+       y="7005"
+       id="tspan493">w_e_send_csum()</tspan>
+  </text>
+  <path
+     d="M 11999,17601 L 11899,17301 L 12099,17301 L 11999,17601 z"
+     id="path503"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 11999,16801 L 11999,17361"
+     id="path507"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <path
+     d="M 11999,16801 L 11685,16840 L 11724,16644 L 11999,16801 z"
+     id="path519"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 7999,16001 L 11764,16754"
+     id="path523"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <g
+     transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,-2539.5806,1529.3491)"
+     id="g539"
+     style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
+    <text
+       id="text541">
+      <tspan
+         x="9269 9498 9709 9798 9959 10048 10226 10437 10598 10776"
+         y="18265"
+         id="tspan543">RSIsInSync</tspan>
+    </text>
+  </g>
+  <path
+     d="M 7999,18601 L 8281,18458 L 8311,18655 L 7999,18601 z"
+     id="path553"
+     style="fill:#000080;visibility:visible" />
+  <path
+     d="M 11999,18001 L 8236,18565"
+     id="path557"
+     style="fill:none;stroke:#000080;visibility:visible" />
+  <g
+     transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,3461.4027,-1449.3012)"
+     id="g573"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <text
+       id="text575">
+      <tspan
+         x="8743 8972 9132 9310 9573 9801 10013 10242 10419 10597 10775 10953 11114"
+         y="16023"
+         id="tspan577">CsumRSRequest</tspan>
+    </text>
+  </g>
+  <text
+     id="text591"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12199 12305 12483 12644 12821 12893 13054 13232 13410 13638 13816 13905 14083 14311 14489 14667 14845 15023 15184 15272 15378"
+       y="16806"
+       id="tspan593">receive_DataRequest()</tspan>
+  </text>
+  <text
+     id="text607"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12199 12377 12483 12660 12838 13016 13194 13372 13549 13621 13799 13977 14083 14261 14438 14616 14794 14955 15133 15294 15399"
+       y="17606"
+       id="tspan609">drbd_endio_read_sec()</tspan>
+  </text>
+  <text
+     id="text623"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12191 12420 12597 12775 12953 13131 13309 13486 13664 13825 13986 14164 14426 14604 14710 14871 15049 15154 15332 15510 15616"
+       y="18007"
+       id="tspan625">w_e_end_csum_rs_req()</tspan>
+  </text>
+  <text
+     id="text639"
+     style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="5735 5913 6091 6180 6357 6446 6607 6696 6874 7085 7246 7424 7585 7691"
+       y="18507"
+       id="tspan641">got_IsInSync()</tspan>
+  </text>
+  <text
+     id="text655"
+     style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="7999 8304 8541 8778 8990 9201 9413 9650 10001 10120 10357 10594 10806 11043 11280 11398 11703 11940 12152 12364 12601 12812 12931 13049 13261 13498 13710 13947 14065 14159 14396 14514 14726 14937 15175"
+       y="13877"
+       id="tspan657">Checksum based Resync, case in sync</tspan>
+  </text>
+  <path
+     d="M 12000,24601 L 11900,24301 L 12100,24301 L 12000,24601 z"
+     id="path667"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 12000,23801 L 12000,24361"
+     id="path671"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <path
+     d="M 8000,26401 L 7900,26101 L 8100,26101 L 8000,26401 z"
+     id="path683"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 8000,25601 L 8000,26161"
+     id="path687"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <path
+     d="M 12000,23801 L 11686,23840 L 11725,23644 L 12000,23801 z"
+     id="path699"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 8000,23001 L 11765,23754"
+     id="path703"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <g
+     transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,-3543.8452,1630.5143)"
+     id="g719"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <text
+       id="text721">
+      <tspan
+         x="9464 9710 9921 10150 10328 10505 10577"
+         y="25236"
+         id="tspan723">OVReply</tspan>
+    </text>
+  </g>
+  <path
+     d="M 8000,25601 L 8282,25458 L 8312,25655 L 8000,25601 z"
+     id="path733"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 12000,25001 L 8237,25565"
+     id="path737"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <g
+     transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,4918.2801,-1381.2128)"
+     id="g753"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <text
+       id="text755">
+      <tspan
+         x="9142 9388 9599 9828 10006 10183 10361 10539 10700"
+         y="23106"
+         id="tspan757">OVRequest</tspan>
+    </text>
+  </g>
+  <text
+     id="text771"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12200 12306 12484 12645 12822 12894 13055 13233 13411 13656 13868 14097 14274 14452 14630 14808 14969 15058 15163"
+       y="23806"
+       id="tspan773">receive_OVRequest()</tspan>
+  </text>
+  <text
+     id="text787"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12200 12378 12484 12661 12839 13017 13195 13373 13550 13622 13800 13978 14084 14262 14439 14617 14795 14956 15134 15295 15400"
+       y="24606"
+       id="tspan789">drbd_endio_read_sec()</tspan>
+  </text>
+  <text
+     id="text803"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12192 12421 12598 12776 12954 13132 13310 13487 13665 13843 14004 14182 14288 14465 14643 14749"
+       y="25007"
+       id="tspan805">w_e_end_ov_req()</tspan>
+  </text>
+  <text
+     id="text819"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="5101 5207 5385 5546 5723 5795 5956 6134 6312 6557 6769 6998 7175 7353 7425 7586 7692"
+       y="25507"
+       id="tspan821">receive_OVReply()</tspan>
+  </text>
+  <text
+     id="text835"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4492 4670 4776 4953 5131 5309 5487 5665 5842 5914 6092 6270 6376 6554 6731 6909 7087 7248 7426 7587 7692"
+       y="26407"
+       id="tspan837">drbd_endio_read_sec()</tspan>
+  </text>
+  <text
+     id="text851"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4902 5131 5308 5486 5664 5842 6020 6197 6375 6553 6714 6892 6998 7175 7353 7425 7586 7692"
+       y="26907"
+       id="tspan853">w_e_end_ov_reply()</tspan>
+  </text>
+  <path
+     d="M 12000,27601 L 11686,27640 L 11725,27444 L 12000,27601 z"
+     id="path863"
+     style="fill:#000080;visibility:visible" />
+  <path
+     d="M 8000,26801 L 11765,27554"
+     id="path867"
+     style="fill:none;stroke:#000080;visibility:visible" />
+  <g
+     transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,5704.1907,-1328.312)"
+     id="g883"
+     style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
+    <text
+       id="text885">
+      <tspan
+         x="9279 9525 9736 9965 10143 10303 10481 10553"
+         y="26935"
+         id="tspan887">OVResult</tspan>
+    </text>
+  </g>
+  <text
+     id="text901"
+     style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12200 12378 12556 12645 12822 13068 13280 13508 13686 13847 14025 14097 14185 14291"
+       y="27559"
+       id="tspan903">got_OVResult()</tspan>
+  </text>
+  <text
+     id="text917"
+     style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="8000 8330 8567 8660 8754 8991 9228 9346 9558 9795 9935 10028 10146"
+       y="21877"
+       id="tspan919">Online verify</tspan>
+  </text>
+  <text
+     id="text933"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4641 4870 5047 5310 5488 5649 5826 6004 6182 6343 6521 6626 6804 6982 7160 7338 7499 7587 7693"
+       y="23005"
+       id="tspan935">w_make_ov_request()</tspan>
+  </text>
+  <path
+     d="M 8000,6500 L 7900,6200 L 8100,6200 L 8000,6500 z"
+     id="path945"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 8000,5700 L 8000,6260"
+     id="path949"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <path
+     d="M 3900,5500 L 3700,5500 L 3700,11000 L 3900,11000"
+     id="path961"
+     style="fill:none;stroke:#000000;visibility:visible" />
+  <path
+     d="M 3900,14500 L 3700,14500 L 3700,18600 L 3900,18600"
+     id="path973"
+     style="fill:none;stroke:#000000;visibility:visible" />
+  <path
+     d="M 3900,22800 L 3700,22800 L 3700,26900 L 3900,26900"
+     id="path985"
+     style="fill:none;stroke:#000000;visibility:visible" />
+  <text
+     id="text1001"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4492 4670 4776 4953 5131 5309 5487 5665 5842 5914 6092 6270 6376 6554 6731 6909 7087 7248 7426 7587 7692"
+       y="6506"
+       id="tspan1003">drbd_endio_read_sec()</tspan>
+  </text>
+  <text
+     id="text1017"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4034 4263 4440 4703 4881 5042 5219 5397 5503 5681 5842 6003 6180 6341 6519 6625 6803 6980 7158 7336 7497 7586 7692"
+       y="14708"
+       id="tspan1019">w_make_resync_request()</tspan>
+  </text>
+  <text
+     id="text1033"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="5190 5419 5596 5774 5952 6113 6291 6468 6646 6824 6985 7146 7324 7586 7692"
+       y="16006"
+       id="tspan1035">w_e_send_csum()</tspan>
+  </text>
+  <path
+     d="M 8000,15501 L 7900,15201 L 8100,15201 L 8000,15501 z"
+     id="path1045"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 8000,14701 L 8000,15261"
+     id="path1049"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <text
+     id="text1065"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4492 4670 4776 4953 5131 5309 5487 5665 5842 5914 6092 6270 6376 6554 6731 6909 7087 7248 7426 7587 7692"
+       y="15507"
+       id="tspan1067">drbd_endio_read_sec()</tspan>
+  </text>
+  <path
+     d="M 16100,9000 L 16300,9000 L 16300,7500 L 16100,7500"
+     id="path1077"
+     style="fill:none;stroke:#000000;visibility:visible" />
+  <path
+     d="M 16100,18000 L 16300,18000 L 16300,16500 L 16100,16500"
+     id="path1089"
+     style="fill:none;stroke:#000000;visibility:visible" />
+  <path
+     d="M 16100,25000 L 16300,25000 L 16300,23500 L 16100,23500"
+     id="path1101"
+     style="fill:none;stroke:#000000;visibility:visible" />
+  <text
+     id="text1117"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="2026 2132 2293 2471 2648 2826 3004 3076 3254 3431 3503 3681 3787"
+       y="5402"
+       id="tspan1119">rs_begin_io()</tspan>
+  </text>
+  <text
+     id="text1133"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="2027 2133 2294 2472 2649 2827 3005 3077 3255 3432 3504 3682 3788"
+       y="14402"
+       id="tspan1135">rs_begin_io()</tspan>
+  </text>
+  <text
+     id="text1149"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="2026 2132 2293 2471 2648 2826 3004 3076 3254 3431 3503 3681 3787"
+       y="22602"
+       id="tspan1151">rs_begin_io()</tspan>
+  </text>
+  <text
+     id="text1165"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="1426 1532 1693 1871 2031 2209 2472 2649 2721 2899 2988 3166 3344 3416 3593 3699"
+       y="11302"
+       id="tspan1167">rs_complete_io()</tspan>
+  </text>
+  <text
+     id="text1181"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="1526 1632 1793 1971 2131 2309 2572 2749 2821 2999 3088 3266 3444 3516 3693 3799"
+       y="18931"
+       id="tspan1183">rs_complete_io()</tspan>
+  </text>
+  <text
+     id="text1197"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="1526 1632 1793 1971 2131 2309 2572 2749 2821 2999 3088 3266 3444 3516 3693 3799"
+       y="27231"
+       id="tspan1199">rs_complete_io()</tspan>
+  </text>
+  <text
+     id="text1213"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="16126 16232 16393 16571 16748 16926 17104 17176 17354 17531 17603 17781 17887"
+       y="7402"
+       id="tspan1215">rs_begin_io()</tspan>
+  </text>
+  <text
+     id="text1229"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="16127 16233 16394 16572 16749 16927 17105 17177 17355 17532 17604 17782 17888"
+       y="16331"
+       id="tspan1231">rs_begin_io()</tspan>
+  </text>
+  <text
+     id="text1245"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="16127 16233 16394 16572 16749 16927 17105 17177 17355 17532 17604 17782 17888"
+       y="23302"
+       id="tspan1247">rs_begin_io()</tspan>
+  </text>
+  <text
+     id="text1261"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="16115 16221 16382 16560 16720 16898 17161 17338 17410 17588 17677 17855 18033 18105 18282 18388"
+       y="9302"
+       id="tspan1263">rs_complete_io()</tspan>
+  </text>
+  <text
+     id="text1277"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="16115 16221 16382 16560 16720 16898 17161 17338 17410 17588 17677 17855 18033 18105 18282 18388"
+       y="18331"
+       id="tspan1279">rs_complete_io()</tspan>
+  </text>
+  <text
+     id="text1293"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="16126 16232 16393 16571 16731 16909 17172 17349 17421 17599 17688 17866 18044 18116 18293 18399"
+       y="25302"
+       id="tspan1295">rs_complete_io()</tspan>
+  </text>
+</svg>
diff --git a/Documentation/blockdev/drbd/DRBD-data-packets.svg b/Documentation/blockdev/drbd/DRBD-data-packets.svg
new file mode 100644
index 0000000..48a1e21
--- /dev/null
@@ -0,0 +1,459 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+<svg
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   version="1.0"
+   width="210mm"
+   height="297mm"
+   viewBox="0 0 21000 29700"
+   id="svg2"
+   style="fill-rule:evenodd">
+  <defs
+     id="defs4" />
+  <g
+     id="Default"
+     style="visibility:visible">
+    <desc
+       id="desc176">Master slide</desc>
+  </g>
+  <path
+     d="M 11999,19601 L 11899,19301 L 12099,19301 L 11999,19601 z"
+     id="path189"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 11999,18801 L 11999,19361"
+     id="path193"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <path
+     d="M 7999,21401 L 7899,21101 L 8099,21101 L 7999,21401 z"
+     id="path205"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 7999,20601 L 7999,21161"
+     id="path209"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <path
+     d="M 11999,18801 L 11685,18840 L 11724,18644 L 11999,18801 z"
+     id="path221"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 7999,18001 L 11764,18754"
+     id="path225"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <text
+     x="-3023.845"
+     y="1106.8124"
+     transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,0,0)"
+     id="text243"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="6115.1553 6344.1553 6555.1553 6784.1553 6962.1553 7051.1553 7228.1553 7457.1553 7635.1553 7813.1553 7885.1553"
+       y="21390.812"
+       id="tspan245">RSDataReply</tspan>
+  </text>
+  <path
+     d="M 7999,20601 L 8281,20458 L 8311,20655 L 7999,20601 z"
+     id="path255"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 11999,20001 L 8236,20565"
+     id="path259"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <text
+     x="3502.5356"
+     y="-2184.6621"
+     transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
+     id="text277"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12321.536 12550.536 12761.536 12990.536 13168.536 13257.536 13434.536 13663.536 13841.536 14019.536 14196.536 14374.536 14535.536"
+       y="15854.338"
+       id="tspan279">RSDataRequest</tspan>
+  </text>
+  <text
+     id="text293"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4034 4263 4440 4703 4881 5042 5219 5397 5503 5681 5842 6003 6180 6341 6519 6625 6803 6980 7158 7336 7497 7586 7692"
+       y="17807"
+       id="tspan295">w_make_resync_request()</tspan>
+  </text>
+  <text
+     id="text309"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12199 12305 12483 12644 12821 12893 13054 13232 13410 13638 13816 13905 14083 14311 14489 14667 14845 15023 15184 15272 15378"
+       y="18806"
+       id="tspan311">receive_DataRequest()</tspan>
+  </text>
+  <text
+     id="text325"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12199 12377 12483 12660 12838 13016 13194 13372 13549 13621 13799 13977 14083 14261 14438 14616 14794 14955 15133 15294 15399"
+       y="19606"
+       id="tspan327">drbd_endio_read_sec()</tspan>
+  </text>
+  <text
+     id="text341"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12191 12420 12597 12775 12953 13131 13309 13486 13664 13770 13931 14109 14287 14375 14553 14731 14837 15015 15192 15298"
+       y="20007"
+       id="tspan343">w_e_end_rsdata_req()</tspan>
+  </text>
+  <text
+     id="text357"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4444 4550 4728 4889 5066 5138 5299 5477 5655 5883 6095 6324 6501 6590 6768 6997 7175 7352 7424 7585 7691"
+       y="20507"
+       id="tspan359">receive_RSDataReply()</tspan>
+  </text>
+  <text
+     id="text373"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4457 4635 4741 4918 5096 5274 5452 5630 5807 5879 6057 6235 6464 6569 6641 6730 6908 7086 7247 7425 7585 7691"
+       y="21407"
+       id="tspan375">drbd_endio_write_sec()</tspan>
+  </text>
+  <text
+     id="text389"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4647 4825 5003 5180 5358 5536 5714 5820 5997 6158 6319 6497 6658 6836 7013 7085 7263 7424 7585 7691"
+       y="21907"
+       id="tspan391">e_end_resync_block()</tspan>
+  </text>
+  <path
+     d="M 11999,22601 L 11685,22640 L 11724,22444 L 11999,22601 z"
+     id="path401"
+     style="fill:#000080;visibility:visible" />
+  <path
+     d="M 7999,21801 L 11764,22554"
+     id="path405"
+     style="fill:none;stroke:#000080;visibility:visible" />
+  <text
+     x="4290.3008"
+     y="-2369.6162"
+     transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
+     id="text423"
+     style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="13610.301 13911.301 14016.301 14088.301 14177.301 14355.301 14567.301 14728.301"
+       y="19573.385"
+       id="tspan425">WriteAck</tspan>
+  </text>
+  <text
+     id="text439"
+     style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12199 12377 12555 12644 12821 13033 13105 13283 13444 13604 13816 13977 14138 14244"
+       y="22559"
+       id="tspan441">got_BlockAck()</tspan>
+  </text>
+  <text
+     id="text455"
+     style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="7999 8304 8541 8753 8964 9201 9413 9531 9769 9862 10099 10310 10522 10734 10852 10971 11208 11348 11585 11822"
+       y="16877"
+       id="tspan457">Resync blocks, 4-32K</tspan>
+  </text>
+  <path
+     d="M 12000,7601 L 11900,7301 L 12100,7301 L 12000,7601 z"
+     id="path467"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 12000,6801 L 12000,7361"
+     id="path471"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <path
+     d="M 12000,6801 L 11686,6840 L 11725,6644 L 12000,6801 z"
+     id="path483"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 8000,6001 L 11765,6754"
+     id="path487"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <text
+     x="-1288.1796"
+     y="1279.7666"
+     transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,0,0)"
+     id="text505"
+     style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="8174.8208 8475.8203 8580.8203 8652.8203 8741.8203 8919.8203 9131.8203 9292.8203"
+       y="9516.7666"
+       id="tspan507">WriteAck</tspan>
+  </text>
+  <path
+     d="M 8000,8601 L 8282,8458 L 8312,8655 L 8000,8601 z"
+     id="path517"
+     style="fill:#000080;visibility:visible" />
+  <path
+     d="M 12000,8001 L 8237,8565"
+     id="path521"
+     style="fill:none;stroke:#000080;visibility:visible" />
+  <text
+     x="1065.6655"
+     y="-2097.7664"
+     transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
+     id="text539"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="10682.666 10911.666 11088.666 11177.666"
+       y="4107.2339"
+       id="tspan541">Data</tspan>
+  </text>
+  <text
+     id="text555"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4746 4924 5030 5207 5385 5563 5826 6003 6164 6342 6520 6626 6803 6981 7159 7337 7498 7587 7692"
+       y="5505"
+       id="tspan557">drbd_make_request()</tspan>
+  </text>
+  <text
+     id="text571"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12200 12306 12484 12645 12822 12894 13055 13233 13411 13639 13817 13906 14084 14190"
+       y="6806"
+       id="tspan573">receive_Data()</tspan>
+  </text>
+  <text
+     id="text587"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12200 12378 12484 12661 12839 13017 13195 13373 13550 13622 13800 13978 14207 14312 14384 14473 14651 14829 14990 15168 15328 15434"
+       y="7606"
+       id="tspan589">drbd_endio_write_sec()</tspan>
+  </text>
+  <text
+     id="text603"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12192 12370 12548 12725 12903 13081 13259 13437 13509 13686 13847 14008 14114"
+       y="8007"
+       id="tspan605">e_end_block()</tspan>
+  </text>
+  <text
+     id="text619"
+     style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="5647 5825 6003 6092 6269 6481 6553 6731 6892 7052 7264 7425 7586 7692"
+       y="8606"
+       id="tspan621">got_BlockAck()</tspan>
+  </text>
+  <text
+     id="text635"
+     style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="8000 8305 8542 8779 9016 9109 9346 9486 9604 9956 10049 10189 10328 10565 10705 10942 11179 11298 11603 11742 11835 11954 12191 12310 12428 12665 12902 13139 13279 13516 13753"
+       y="4877"
+       id="tspan637">Regular mirrored write, 512-32K</tspan>
+  </text>
+  <text
+     id="text651"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="5381 5610 5787 5948 6126 6304 6482 6659 6837 7015 7087 7265 7426 7587 7692"
+       y="6003"
+       id="tspan653">w_send_dblock()</tspan>
+  </text>
+  <path
+     d="M 8000,6800 L 7900,6500 L 8100,6500 L 8000,6800 z"
+     id="path663"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 8000,6000 L 8000,6560"
+     id="path667"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <text
+     id="text683"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4602 4780 4886 5063 5241 5419 5597 5775 5952 6024 6202 6380 6609 6714 6786 6875 7053 7231 7409 7515 7587 7692"
+       y="6905"
+       id="tspan685">drbd_endio_write_pri()</tspan>
+  </text>
+  <path
+     d="M 12000,13602 L 11900,13302 L 12100,13302 L 12000,13602 z"
+     id="path695"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 12000,12802 L 12000,13362"
+     id="path699"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <path
+     d="M 12000,12802 L 11686,12841 L 11725,12645 L 12000,12802 z"
+     id="path711"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 8000,12002 L 11765,12755"
+     id="path715"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <text
+     x="-2155.5266"
+     y="1201.5964"
+     transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,0,0)"
+     id="text733"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="7202.4736 7431.4736 7608.4736 7697.4736 7875.4736 8104.4736 8282.4736 8459.4736 8531.4736"
+       y="15454.597"
+       id="tspan735">DataReply</tspan>
+  </text>
+  <path
+     d="M 8000,14602 L 8282,14459 L 8312,14656 L 8000,14602 z"
+     id="path745"
+     style="fill:#008000;visibility:visible" />
+  <path
+     d="M 12000,14002 L 8237,14566"
+     id="path749"
+     style="fill:none;stroke:#008000;visibility:visible" />
+  <text
+     x="2280.3804"
+     y="-2103.2141"
+     transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
+     id="text767"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="11316.381 11545.381 11722.381 11811.381 11989.381 12218.381 12396.381 12573.381 12751.381 12929.381 13090.381"
+       y="9981.7861"
+       id="tspan769">DataRequest</tspan>
+  </text>
+  <text
+     id="text783"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="4746 4924 5030 5207 5385 5563 5826 6003 6164 6342 6520 6626 6803 6981 7159 7337 7498 7587 7692"
+       y="11506"
+       id="tspan785">drbd_make_request()</tspan>
+  </text>
+  <text
+     id="text799"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12200 12306 12484 12645 12822 12894 13055 13233 13411 13639 13817 13906 14084 14312 14490 14668 14846 15024 15185 15273 15379"
+       y="12807"
+       id="tspan801">receive_DataRequest()</tspan>
+  </text>
+  <text
+     id="text815"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12200 12378 12484 12661 12839 13017 13195 13373 13550 13622 13800 13978 14084 14262 14439 14617 14795 14956 15134 15295 15400"
+       y="13607"
+       id="tspan817">drbd_endio_read_sec()</tspan>
+  </text>
+  <text
+     id="text831"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="12192 12421 12598 12776 12954 13132 13310 13487 13665 13843 14021 14110 14288 14465 14571 14749 14927 15033"
+       y="14008"
+       id="tspan833">w_e_end_data_req()</tspan>
+  </text>
+  <g
+     id="g835"
+     style="visibility:visible">
+    <desc
+       id="desc837">Drawing</desc>
+    <text
+       id="text847"
+       style="font-size:318px;font-weight:400;fill:#008000;font-family:Helvetica embedded">
+      <tspan
+         x="4885 4991 5169 5330 5507 5579 5740 5918 6096 6324 6502 6591 6769 6997 7175 7353 7425 7586 7692"
+         y="14607"
+         id="tspan849">receive_DataReply()</tspan>
+    </text>
+  </g>
+  <text
+     id="text863"
+     style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="8000 8305 8398 8610 8821 8914 9151 9363 9575 9693 9833 10070 10307 10544 10663 10781 11018 11255 11493 11632 11869 12106"
+       y="10878"
+       id="tspan865">Diskless read, 512-32K</tspan>
+  </text>
+  <text
+     id="text879"
+     style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="5029 5258 5435 5596 5774 5952 6130 6307 6413 6591 6769 6947 7125 7230 7408 7586 7692"
+       y="12004"
+       id="tspan881">w_send_read_req()</tspan>
+  </text>
+  <text
+     id="text895"
+     style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="6961 7266 7571 7854 8159 8278 8515 8633 8870 9107 9226 9463 9581 9700 9793 10030"
+       y="2806"
+       id="tspan897">DRBD 8 data flow</tspan>
+  </text>
+  <path
+     d="M 3900,5300 L 3700,5300 L 3700,7000 L 3900,7000"
+     id="path907"
+     style="fill:none;stroke:#000000;visibility:visible" />
+  <path
+     d="M 3900,17600 L 3700,17600 L 3700,22000 L 3900,22000"
+     id="path919"
+     style="fill:none;stroke:#000000;visibility:visible" />
+  <path
+     d="M 16100,20000 L 16300,20000 L 16300,18500 L 16100,18500"
+     id="path931"
+     style="fill:none;stroke:#000000;visibility:visible" />
+  <text
+     id="text947"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="2126 2304 2376 2554 2731 2909 3087 3159 3337 3515 3587 3764 3870"
+       y="5202"
+       id="tspan949">al_begin_io()</tspan>
+  </text>
+  <text
+     id="text963"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="1632 1810 1882 2060 2220 2398 2661 2839 2910 3088 3177 3355 3533 3605 3783 3888"
+       y="7331"
+       id="tspan965">al_complete_io()</tspan>
+  </text>
+  <text
+     id="text979"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="2126 2232 2393 2571 2748 2926 3104 3176 3354 3531 3603 3781 3887"
+       y="17431"
+       id="tspan981">rs_begin_io()</tspan>
+  </text>
+  <text
+     id="text995"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="1626 1732 1893 2071 2231 2409 2672 2849 2921 3099 3188 3366 3544 3616 3793 3899"
+       y="22331"
+       id="tspan997">rs_complete_io()</tspan>
+  </text>
+  <text
+     id="text1011"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="16027 16133 16294 16472 16649 16827 17005 17077 17255 17432 17504 17682 17788"
+       y="18402"
+       id="tspan1013">rs_begin_io()</tspan>
+  </text>
+  <text
+     id="text1027"
+     style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
+    <tspan
+       x="16115 16221 16382 16560 16720 16898 17161 17338 17410 17588 17677 17855 18033 18105 18282 18388"
+       y="20331"
+       id="tspan1029">rs_complete_io()</tspan>
+  </text>
+</svg>
diff --git a/Documentation/blockdev/drbd/README.txt b/Documentation/blockdev/drbd/README.txt
new file mode 100644
index 0000000..627b0a1
--- /dev/null
@@ -0,0 +1,16 @@
+Description
+
+  DRBD is a shared-nothing, synchronously replicated block device. It
+  is designed to serve as a building block for high availability
+  clusters and, in this context, is a "drop-in" replacement for shared
+  storage. Simplistically, you could see it as a network RAID 1.
+
+  Please visit http://www.drbd.org to find out more.
+
+The files included here are intended to help understand the implementation:
+
+DRBD-8.3-data-packets.svg, DRBD-data-packets.svg
+  relate some of the functions to the data packets they send and receive.
+
+conn-states-8.dot, disk-states-8.dot, node-states-8.dot
+  The subgraphs of DRBD's state transitions
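
(Note: the .dot files listed above are plain Graphviz sources, so the state
graphs can be rendered directly; with a stock Graphviz install, for example:
dot -Tsvg conn-states-8.dot -o conn-states-8.svg)
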
diff --git a/Documentation/blockdev/drbd/conn-states-8.dot b/Documentation/blockdev/drbd/conn-states-8.dot
new file mode 100644
index 0000000..025e8cf
--- /dev/null
@@ -0,0 +1,18 @@
+digraph conn_states {
+       StandAlone   -> WFConnection   [ label = "ioctl_set_net()" ]
+       WFConnection -> Unconnected    [ label = "unable to bind()" ]
+       WFConnection -> WFReportParams [ label = "in connect() after accept" ]
+       WFReportParams -> StandAlone   [ label = "checks in receive_param()" ]
+       WFReportParams -> Connected    [ label = "in receive_param()" ]
+       WFReportParams -> WFBitMapS    [ label = "sync_handshake()" ]
+       WFReportParams -> WFBitMapT    [ label = "sync_handshake()" ]
+       WFBitMapS -> SyncSource        [ label = "receive_bitmap()" ]
+       WFBitMapT -> SyncTarget        [ label = "receive_bitmap()" ]
+       SyncSource -> Connected
+       SyncTarget -> Connected
+       SyncSource -> PausedSyncS
+       SyncTarget -> PausedSyncT
+       PausedSyncS -> SyncSource
+       PausedSyncT -> SyncTarget
+       Connected   -> WFConnection    [ label = "* on network error" ]
+}
diff --git a/Documentation/blockdev/drbd/disk-states-8.dot b/Documentation/blockdev/drbd/disk-states-8.dot
new file mode 100644
index 0000000..d06cfb4
--- /dev/null
@@ -0,0 +1,16 @@
+digraph disk_states {
+       Diskless -> Inconsistent       [ label = "ioctl_set_disk()" ]
+       Diskless -> Consistent         [ label = "ioctl_set_disk()" ]
+       Diskless -> Outdated           [ label = "ioctl_set_disk()" ]
+       Consistent -> Outdated         [ label = "receive_param()" ]
+       Consistent -> UpToDate         [ label = "receive_param()" ]
+       Consistent -> Inconsistent     [ label = "start resync" ]
+       Outdated   -> Inconsistent     [ label = "start resync" ]
+       UpToDate   -> Inconsistent     [ label = "ioctl_replicate" ]
+       Inconsistent -> UpToDate       [ label = "resync completed" ]
+       Consistent -> Failed           [ label = "io completion error" ]
+       Outdated   -> Failed           [ label = "io completion error" ]
+       UpToDate   -> Failed           [ label = "io completion error" ]
+       Inconsistent -> Failed         [ label = "io completion error" ]
+       Failed -> Diskless             [ label = "sending notify to peer" ]
+}
diff --git a/Documentation/blockdev/drbd/drbd-connection-state-overview.dot b/Documentation/blockdev/drbd/drbd-connection-state-overview.dot
new file mode 100644
index 0000000..6d9cf0a
--- /dev/null
@@ -0,0 +1,85 @@
+// vim: set sw=2 sts=2 :
+digraph {
+  rankdir=BT
+  bgcolor=white
+
+  node [shape=plaintext]
+  node [fontcolor=black]
+
+  StandAlone     [ style=filled,fillcolor=gray,label=StandAlone ]
+
+  node [fontcolor=lightgray]
+
+  Unconnected    [ label=Unconnected ]
+
+  CommTrouble [ shape=record,
+    label="{communication loss|{Timeout|BrokenPipe|NetworkFailure}}" ]
+
+  node [fontcolor=gray]
+
+  subgraph cluster_try_connect {
+    label="try to connect, handshake"
+    rank=max
+    WFConnection   [ label=WFConnection ]
+    WFReportParams [ label=WFReportParams ]
+  }
+
+  TearDown       [ label=TearDown ]
+
+  Connected      [ label=Connected,style=filled,fillcolor=green,fontcolor=black ]
+
+  node [fontcolor=lightblue]
+
+  StartingSyncS  [ label=StartingSyncS ]
+  StartingSyncT  [ label=StartingSyncT ]
+
+  subgraph cluster_bitmap_exchange {
+    node [fontcolor=red]
+    fontcolor=red
+    label="new application (WRITE?) requests blocked\lwhile bitmap is exchanged"
+
+    WFBitMapT      [ label=WFBitMapT ]
+    WFSyncUUID     [ label=WFSyncUUID ]
+    WFBitMapS      [ label=WFBitMapS ]
+  }
+
+  node [fontcolor=blue]
+
+  cluster_resync [ shape=record,label="{<any>resynchronisation process running\l'concurrent' application requests allowed|{{<T>PausedSyncT\nSyncTarget}|{<S>PausedSyncS\nSyncSource}}}" ]
+
+  node [shape=box,fontcolor=black]
+
+  // drbdadm [label="drbdadm connect"]
+  // handshake [label="drbd_connect()\ndrbd_do_handshake\ndrbd_sync_handshake() etc."]
+  // comm_error [label="communication trouble"]
+
+  //
+  // edges
+  // --------------------------------------
+
+  StandAlone -> Unconnected [ label="drbdadm connect" ]
+  Unconnected -> StandAlone  [ label="drbdadm disconnect\lor serious communication trouble" ]
+  Unconnected -> WFConnection [ label="receiver thread is started" ]
+  WFConnection -> WFReportParams [ headlabel="accept()\land/or                        \lconnect()\l" ]
+
+  WFReportParams -> StandAlone [ label="during handshake\lpeers do not agree\labout something essential" ]
+  WFReportParams -> Connected [ label="data identical\lno sync needed",color=green,fontcolor=green ]
+
+    WFReportParams -> WFBitMapS
+    WFReportParams -> WFBitMapT
+    WFBitMapT -> WFSyncUUID [minlen=0.1,constraint=false]
+
+      WFBitMapS -> cluster_resync:S
+      WFSyncUUID -> cluster_resync:T
+
+  edge [color=green]
+  cluster_resync:any -> Connected [ label="resync done",fontcolor=green ]
+
+  edge [color=red]
+  WFReportParams -> CommTrouble
+  Connected -> CommTrouble
+  cluster_resync:any -> CommTrouble
+  edge [color=black]
+  CommTrouble -> Unconnected [label="receiver thread is stopped" ]
+
+}
diff --git a/Documentation/blockdev/drbd/node-states-8.dot b/Documentation/blockdev/drbd/node-states-8.dot
new file mode 100644
index 0000000..4a2b00c
--- /dev/null
@@ -0,0 +1,14 @@
+digraph node_states {
+       Secondary -> Primary           [ label = "ioctl_set_state()" ]
+       Primary   -> Secondary         [ label = "ioctl_set_state()" ]
+}
+
+digraph peer_states {
+       Secondary -> Primary           [ label = "recv state packet" ]
+       Primary   -> Secondary         [ label = "recv state packet" ]
+       Primary   -> Unknown           [ label = "connection lost" ]
+       Secondary  -> Unknown          [ label = "connection lost" ]
+       Unknown   -> Primary           [ label = "connected" ]
+       Unknown   -> Secondary         [ label = "connected" ]
+}
+
diff --git a/MAINTAINERS b/MAINTAINERS
index 4f96ac8..6990f49 100644
@@ -1819,6 +1819,19 @@ S:       Maintained
 F:     drivers/scsi/dpt*
 F:     drivers/scsi/dpt/
 
+DRBD DRIVER
+P:     Philipp Reisner
+P:     Lars Ellenberg
+M:     drbd-dev@lists.linbit.com
+L:     drbd-user@lists.linbit.com
+W:     http://www.drbd.org
+T:     git git://git.drbd.org/linux-2.6-drbd.git drbd
+T:     git git://git.drbd.org/drbd-8.3.git
+S:     Supported
+F:     drivers/block/drbd/
+F:     lib/lru_cache.c
+F:     Documentation/blockdev/drbd/
+
 DRIVER CORE, KOBJECTS, AND SYSFS
 M:     Greg Kroah-Hartman <gregkh@suse.de>
 T:     quilt kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/
diff --git a/arch/alpha/include/asm/cacheflush.h b/arch/alpha/include/asm/cacheflush.h
index b686cc7..01d71e1 100644
@@ -9,6 +9,7 @@
 #define flush_cache_dup_mm(mm)                 do { } while (0)
 #define flush_cache_range(vma, start, end)     do { } while (0)
 #define flush_cache_page(vma, vmaddr, pfn)     do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)                        do { } while (0)
 #define flush_dcache_mmap_lock(mapping)                do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)      do { } while (0)
diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index 3d0cdd2..e8d4583 100644
@@ -408,6 +408,7 @@ extern void flush_ptrace_access(struct vm_area_struct *vma, struct page *page,
  * about to change to user space.  This is the same method as used on SPARC64.
  * See update_mmu_cache for the user space part.
  */
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 extern void flush_dcache_page(struct page *);
 
 extern void __flush_dcache_page(struct address_space *mapping, struct page *page);
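
(The ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE definitions added throughout these
cacheflush.h hunks let common code compile out per-page flushes on
architectures where flush_dcache_page() is a no-op. A minimal sketch of the
intended usage pattern, written against the 2.6.32-era block API; the helper
name flush_request_pages is illustrative and not itself part of this merge:

    #include <linux/blkdev.h>

    #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
    /* Flush every page touched by the bios of a completed request. */
    static void flush_request_pages(struct request *rq)
    {
            struct req_iterator iter;
            struct bio_vec *bvec;

            rq_for_each_segment(bvec, rq, iter)
                    flush_dcache_page(bvec->bv_page);
    }
    #else
    /* flush_dcache_page() is a no-op here, so skip the bio walk entirely. */
    static inline void flush_request_pages(struct request *rq) { }
    #endif

Drivers that fill request pages from the CPU side, such as
drivers/mtd/mtd_blkdevs.c touched by this merge, can call such a helper
without penalizing cache-coherent architectures.)
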
diff --git a/arch/avr32/include/asm/cacheflush.h b/arch/avr32/include/asm/cacheflush.h
index 6706747..96e5382 100644
@@ -107,6 +107,7 @@ extern void flush_icache_page(struct vm_area_struct *vma, struct page *page);
  * do something here, but only for certain configurations.  No such
  * configurations exist at this time.
  */
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)                        do { } while (0)
 #define flush_dcache_mmap_lock(page)           do { } while (0)
 #define flush_dcache_mmap_unlock(page)         do { } while (0)
diff --git a/arch/blackfin/include/asm/cacheflush.h b/arch/blackfin/include/asm/cacheflush.h
index af03a36..417eaac 100644
@@ -68,9 +68,11 @@ do { memcpy(dst, src, len);                                          \
 #endif
 #if defined(CONFIG_BFIN_EXTMEM_WRITEBACK) || defined(CONFIG_BFIN_L2_WRITEBACK)
 # define flush_dcache_range(start,end)         blackfin_dcache_flush_range((start), (end))
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 # define flush_dcache_page(page)               blackfin_dflush_page(page_address(page))
 #else
 # define flush_dcache_range(start,end)         do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 # define flush_dcache_page(page)               do { } while (0)
 #endif
 
diff --git a/arch/cris/include/asm/cacheflush.h b/arch/cris/include/asm/cacheflush.h
index cf60e3f..36795bc 100644
@@ -12,6 +12,7 @@
 #define flush_cache_dup_mm(mm)                 do { } while (0)
 #define flush_cache_range(vma, start, end)     do { } while (0)
 #define flush_cache_page(vma, vmaddr, pfn)     do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)                        do { } while (0)
 #define flush_dcache_mmap_lock(mapping)                do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)      do { } while (0)
diff --git a/arch/frv/include/asm/cacheflush.h b/arch/frv/include/asm/cacheflush.h
index 432a69e..edbac54 100644
@@ -47,6 +47,7 @@ static inline void __flush_cache_all(void)
 }
 
 /* dcache/icache coherency... */
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 #ifdef CONFIG_MMU
 extern void flush_dcache_page(struct page *page);
 #else
diff --git a/arch/h8300/include/asm/cacheflush.h b/arch/h8300/include/asm/cacheflush.h
index 5ffdca2..4cf2df2 100644
@@ -15,6 +15,7 @@
 #define        flush_cache_dup_mm(mm)          do { } while (0)
 #define        flush_cache_range(vma,a,b)
 #define        flush_cache_page(vma,p,pfn)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define        flush_dcache_page(page)
 #define        flush_dcache_mmap_lock(mapping)
 #define        flush_dcache_mmap_unlock(mapping)
diff --git a/arch/ia64/include/asm/cacheflush.h b/arch/ia64/include/asm/cacheflush.h
index c8ce271..429eefc 100644
@@ -25,6 +25,7 @@
 #define flush_cache_vmap(start, end)           do { } while (0)
 #define flush_cache_vunmap(start, end)         do { } while (0)
 
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 #define flush_dcache_page(page)                        \
 do {                                           \
        clear_bit(PG_arch_1, &(page)->flags);   \
diff --git a/arch/m32r/include/asm/cacheflush.h b/arch/m32r/include/asm/cacheflush.h
index 78587c9..8e8e045 100644
@@ -12,6 +12,7 @@ extern void _flush_cache_copyback_all(void);
 #define flush_cache_dup_mm(mm)                 do { } while (0)
 #define flush_cache_range(vma, start, end)     do { } while (0)
 #define flush_cache_page(vma, vmaddr, pfn)     do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)                        do { } while (0)
 #define flush_dcache_mmap_lock(mapping)                do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)      do { } while (0)
@@ -33,6 +34,7 @@ extern void smp_flush_cache_all(void);
 #define flush_cache_dup_mm(mm)                 do { } while (0)
 #define flush_cache_range(vma, start, end)     do { } while (0)
 #define flush_cache_page(vma, vmaddr, pfn)     do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)                        do { } while (0)
 #define flush_dcache_mmap_lock(mapping)                do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)      do { } while (0)
@@ -46,6 +48,7 @@ extern void smp_flush_cache_all(void);
 #define flush_cache_dup_mm(mm)                 do { } while (0)
 #define flush_cache_range(vma, start, end)     do { } while (0)
 #define flush_cache_page(vma, vmaddr, pfn)     do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)                        do { } while (0)
 #define flush_dcache_mmap_lock(mapping)                do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)      do { } while (0)
index 16bf375..73de7c8 100644 (file)
@@ -128,6 +128,7 @@ static inline void __flush_page_to_ram(void *vaddr)
        }
 }
 
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 #define flush_dcache_page(page)                __flush_page_to_ram(page_address(page))
 #define flush_dcache_mmap_lock(mapping)                do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)      do { } while (0)
index c65f00a..89f1956 100644 (file)
@@ -12,6 +12,7 @@
 #define flush_cache_range(vma, start, end)     __flush_cache_all()
 #define flush_cache_page(vma, vmaddr)          do { } while (0)
 #define flush_dcache_range(start,len)          __flush_cache_all()
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)                        do { } while (0)
 #define flush_dcache_mmap_lock(mapping)                do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)      do { } while (0)
index f989d6a..088076e 100644 (file)
@@ -37,6 +37,7 @@
 #define flush_cache_page(vma, vmaddr, pfn)     do { } while (0)
 
 #define flush_dcache_range(start, end) __invalidate_dcache_range(start, end)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)                        do { } while (0)
 #define flush_dcache_mmap_lock(mapping)                do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)      do { } while (0)
index 03b1d69..40bb9fd 100644 (file)
@@ -38,6 +38,7 @@ extern void (*flush_cache_range)(struct vm_area_struct *vma,
 extern void (*flush_cache_page)(struct vm_area_struct *vma, unsigned long page, unsigned long pfn);
 extern void __flush_dcache_page(struct page *page);
 
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 static inline void flush_dcache_page(struct page *page)
 {
        if (cpu_has_dc_aliases || !cpu_has_ic_fills_f_dc)
index 1a55d61..29e692f 100644 (file)
@@ -26,6 +26,7 @@
 #define flush_cache_page(vma, vmaddr, pfn)     do {} while (0)
 #define flush_cache_vmap(start, end)           do {} while (0)
 #define flush_cache_vunmap(start, end)         do {} while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)                        do {} while (0)
 #define flush_dcache_mmap_lock(mapping)                do {} while (0)
 #define flush_dcache_mmap_unlock(mapping)      do {} while (0)
index 7243951..7a73b61 100644 (file)
@@ -42,6 +42,7 @@ void flush_cache_mm(struct mm_struct *mm);
 #define flush_cache_vmap(start, end)           flush_cache_all()
 #define flush_cache_vunmap(start, end)         flush_cache_all()
 
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 extern void flush_dcache_page(struct page *page);
 
 #define flush_dcache_mmap_lock(mapping) \
index ba667a3..ab9e402 100644 (file)
@@ -25,6 +25,7 @@
 #define flush_cache_vmap(start, end)           do { } while (0)
 #define flush_cache_vunmap(start, end)         do { } while (0)
 
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 extern void flush_dcache_page(struct page *page);
 #define flush_dcache_mmap_lock(mapping)                do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)      do { } while (0)
index 49d5af9..405cc97 100644 (file)
@@ -10,6 +10,7 @@
 #define flush_cache_dup_mm(mm)                 do { } while (0)
 #define flush_cache_range(vma, start, end)     do { } while (0)
 #define flush_cache_page(vma, vmaddr, pfn)     do { } while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)                        do { } while (0)
 #define flush_dcache_mmap_lock(mapping)                do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)      do { } while (0)
index 07cc8fc..caaba24 100644 (file)
@@ -16,6 +16,7 @@ extern void flush_icache_range(unsigned long start, unsigned long end);
 extern void flush_dcache_range(unsigned long start, unsigned long end);
 
 #define flush_cache_dup_mm(mm)                 do {} while (0)
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)                        do {} while (0)
 #define flush_dcache_mmap_lock(mapping)                do {} while (0)
 #define flush_dcache_mmap_unlock(mapping)      do {} while (0)
index c29918f..dda96eb 100644 (file)
@@ -42,6 +42,7 @@ extern void flush_cache_page(struct vm_area_struct *vma,
                                unsigned long addr, unsigned long pfn);
 extern void flush_cache_range(struct vm_area_struct *vma,
                                 unsigned long start, unsigned long end);
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 extern void flush_dcache_page(struct page *page);
 extern void flush_icache_range(unsigned long start, unsigned long end);
 extern void flush_icache_page(struct vm_area_struct *vma,
index 68ac109..2e46877 100644 (file)
@@ -75,6 +75,7 @@ BTFIXUPDEF_CALL(void, flush_sig_insns, struct mm_struct *, unsigned long)
 
 extern void sparc_flush_page_to_ram(struct page *page);
 
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 #define flush_dcache_page(page)                        sparc_flush_page_to_ram(page)
 #define flush_dcache_mmap_lock(mapping)                do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)      do { } while (0)
index c433217..b953840 100644 (file)
@@ -37,6 +37,7 @@ extern void flush_dcache_page_all(struct mm_struct *mm, struct page *page);
 #endif
 
 extern void __flush_dcache_range(unsigned long start, unsigned long end);
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 extern void flush_dcache_page(struct page *page);
 
 #define flush_icache_page(vma, pg)     do { } while(0)
index b54f6af..9076add 100644 (file)
@@ -12,6 +12,7 @@ static inline void flush_cache_range(struct vm_area_struct *vma,
                                     unsigned long start, unsigned long end) { }
 static inline void flush_cache_page(struct vm_area_struct *vma,
                                    unsigned long vmaddr, unsigned long pfn) { }
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 static inline void flush_dcache_page(struct page *page) { }
 static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
 static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
index b7b8fbe..a508f2f 100644 (file)
@@ -101,6 +101,7 @@ static inline void __invalidate_icache_page_alias(unsigned long virt,
 #define flush_cache_vmap(start,end)    flush_cache_all()
 #define flush_cache_vunmap(start,end)  flush_cache_all()
 
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 extern void flush_dcache_page(struct page*);
 extern void flush_cache_range(struct vm_area_struct*, ulong, ulong);
 extern void flush_cache_page(struct vm_area_struct*, unsigned long, unsigned long);
index 7e803fc..8bd1051 100644 (file)
@@ -12,24 +12,14 @@ config IOSCHED_NOOP
          that do their own scheduling and require only minimal assistance from
          the kernel.
 
-config IOSCHED_AS
-       tristate "Anticipatory I/O scheduler"
-       default y
-       ---help---
-         The anticipatory I/O scheduler is generally a good choice for most
-         environments, but is quite large and complex when compared to the
-         deadline I/O scheduler; it can also be slower in some cases,
-         especially under some database loads.
-
 config IOSCHED_DEADLINE
        tristate "Deadline I/O scheduler"
        default y
        ---help---
-         The deadline I/O scheduler is simple and compact, and is often as
-         good as the anticipatory I/O scheduler, and in some database
-         workloads, better. In the case of a single process performing I/O to
-         a disk at any one time, its behaviour is almost identical to the
-         anticipatory I/O scheduler and so is a good choice.
+         The deadline I/O scheduler is simple and compact. It provides
+         CSCAN service with FIFO expiration of requests: on expiry it
+         switches to a new point in the service tree and dispatches a
+         batch of IO from there.
 
 config IOSCHED_CFQ
        tristate "CFQ I/O scheduler"
@@ -37,7 +27,9 @@ config IOSCHED_CFQ
        ---help---
          The CFQ I/O scheduler tries to distribute bandwidth equally
          among all processes in the system. It should provide a fair
-         working environment, suitable for desktop systems.
+         and low-latency working environment, suitable for both desktop
+         and server systems.
+
          This is the default I/O scheduler.
 
 choice
@@ -47,9 +39,6 @@ choice
          Select the I/O scheduler which will be used by default for all
          block devices.
 
-       config DEFAULT_AS
-               bool "Anticipatory" if IOSCHED_AS=y
-
        config DEFAULT_DEADLINE
                bool "Deadline" if IOSCHED_DEADLINE=y
 
@@ -63,7 +52,6 @@ endchoice
 
 config DEFAULT_IOSCHED
        string
-       default "anticipatory" if DEFAULT_AS
        default "deadline" if DEFAULT_DEADLINE
        default "cfq" if DEFAULT_CFQ
        default "noop" if DEFAULT_NOOP
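
The new help text describes deadline's core decision rule. As a simplified sketch (all names below are illustrative, not the real deadline-iosched.c): requests are normally served in ascending sector order, but once the request at the head of the FIFO has waited past its deadline, the scheduler jumps to it and batches IO from that point.

/* Illustrative only: "CSCAN service with FIFO expiration". */
static struct request *pick_next(struct sched_data *sd)
{
	struct request *oldest = fifo_front(sd);	/* hypothetical helper */

	/* Expired? Jump to the oldest request and batch from there. */
	if (oldest && time_after(jiffies, rq_fifo_time(oldest)))
		return oldest;

	return next_in_sector_order(sd);		/* normal CSCAN service */
}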
index ba74ca6..7914108 100644 (file)
@@ -9,7 +9,6 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 
 obj-$(CONFIG_BLK_DEV_BSG)      += bsg.o
 obj-$(CONFIG_IOSCHED_NOOP)     += noop-iosched.o
-obj-$(CONFIG_IOSCHED_AS)       += as-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)      += cfq-iosched.o
 
diff --git a/block/as-iosched.c b/block/as-iosched.c
deleted file mode 100644 (file)
index ce8ba57..0000000
+++ /dev/null
@@ -1,1520 +0,0 @@
-/*
- *  Anticipatory & deadline i/o scheduler.
- *
- *  Copyright (C) 2002 Jens Axboe <axboe@kernel.dk>
- *                     Nick Piggin <nickpiggin@yahoo.com.au>
- *
- */
-#include <linux/kernel.h>
-#include <linux/fs.h>
-#include <linux/blkdev.h>
-#include <linux/elevator.h>
-#include <linux/bio.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/compiler.h>
-#include <linux/rbtree.h>
-#include <linux/interrupt.h>
-
-/*
- * See Documentation/block/as-iosched.txt
- */
-
-/*
- * max time before a read is submitted.
- */
-#define default_read_expire (HZ / 8)
-
-/*
- * ditto for writes; these limits are not hard, even
- * if the disk is capable of satisfying them.
- */
-#define default_write_expire (HZ / 4)
-
-/*
- * read_batch_expire describes how long we will allow a stream of reads to
- * persist before looking to see whether it is time to switch over to writes.
- */
-#define default_read_batch_expire (HZ / 2)
-
-/*
- * write_batch_expire describes how long we want a stream of writes to run for.
- * This is not a hard limit, but a target we set for the auto-tuning logic.
- * See, the problem is: we can send a lot of writes to disk cache / TCQ in
- * a short amount of time...
- */
-#define default_write_batch_expire (HZ / 8)
-
-/*
- * max time we may wait to anticipate a read (default around 6ms)
- */
-#define default_antic_expire ((HZ / 150) ? HZ / 150 : 1)
-
-/*
- * Keep track of up to 20ms thinktimes. We can go as big as we like here,
- * however huge values tend to interfere and not decay fast enough. A program
- * might be in a non-io phase of operation. Waiting on user input for example,
- * or doing a lengthy computation. A small penalty can be justified there, and
- * will still catch out those processes that constantly have large thinktimes.
- */
-#define MAX_THINKTIME (HZ/50UL)
-
-/* Bits in as_io_context.state */
-enum as_io_states {
-       AS_TASK_RUNNING=0,      /* Process has not exited */
-       AS_TASK_IOSTARTED,      /* Process has started some IO */
-       AS_TASK_IORUNNING,      /* Process has completed some IO */
-};
-
-enum anticipation_status {
-       ANTIC_OFF=0,            /* Not anticipating (normal operation)  */
-       ANTIC_WAIT_REQ,         /* The last read has not yet completed  */
-       ANTIC_WAIT_NEXT,        /* Currently anticipating a request vs
-                                  last read (which has completed) */
-       ANTIC_FINISHED,         /* Anticipating but have found a candidate
-                                * or timed out */
-};
-
-struct as_data {
-       /*
-        * run time data
-        */
-
-       struct request_queue *q;        /* the "owner" queue */
-
-       /*
-        * requests (as_rq s) are present on both sort_list and fifo_list
-        */
-       struct rb_root sort_list[2];
-       struct list_head fifo_list[2];
-
-       struct request *next_rq[2];     /* next in sort order */
-       sector_t last_sector[2];        /* last SYNC & ASYNC sectors */
-
-       unsigned long exit_prob;        /* probability a task will exit while
-                                          being waited on */
-       unsigned long exit_no_coop;     /* probability an exited task will
-                                          not be part of a later cooperating
-                                          request */
-       unsigned long new_ttime_total;  /* mean thinktime on new proc */
-       unsigned long new_ttime_mean;
-       u64 new_seek_total;             /* mean seek on new proc */
-       sector_t new_seek_mean;
-
-       unsigned long current_batch_expires;
-       unsigned long last_check_fifo[2];
-       int changed_batch;              /* 1: waiting for old batch to end */
-       int new_batch;                  /* 1: waiting on first read complete */
-       int batch_data_dir;             /* current batch SYNC / ASYNC */
-       int write_batch_count;          /* max # of reqs in a write batch */
-       int current_write_count;        /* how many requests left this batch */
-       int write_batch_idled;          /* has the write batch gone idle? */
-
-       enum anticipation_status antic_status;
-       unsigned long antic_start;      /* jiffies: when it started */
-       struct timer_list antic_timer;  /* anticipatory scheduling timer */
-       struct work_struct antic_work;  /* Deferred unplugging */
-       struct io_context *io_context;  /* Identify the expected process */
-       int ioc_finished; /* IO associated with io_context is finished */
-       int nr_dispatched;
-
-       /*
-        * settings that change how the i/o scheduler behaves
-        */
-       unsigned long fifo_expire[2];
-       unsigned long batch_expire[2];
-       unsigned long antic_expire;
-};
-
-/*
- * per-request data.
- */
-enum arq_state {
-       AS_RQ_NEW=0,            /* New - not referenced and not on any lists */
-       AS_RQ_QUEUED,           /* In the request queue. It belongs to the
-                                  scheduler */
-       AS_RQ_DISPATCHED,       /* On the dispatch list. It belongs to the
-                                  driver now */
-       AS_RQ_PRESCHED,         /* Debug poisoning for requests being used */
-       AS_RQ_REMOVED,
-       AS_RQ_MERGED,
-       AS_RQ_POSTSCHED,        /* when they shouldn't be */
-};
-
-#define RQ_IOC(rq)     ((struct io_context *) (rq)->elevator_private)
-#define RQ_STATE(rq)   ((enum arq_state)(rq)->elevator_private2)
-#define RQ_SET_STATE(rq, state)        ((rq)->elevator_private2 = (void *) state)
-
-static DEFINE_PER_CPU(unsigned long, as_ioc_count);
-static struct completion *ioc_gone;
-static DEFINE_SPINLOCK(ioc_gone_lock);
-
-static void as_move_to_dispatch(struct as_data *ad, struct request *rq);
-static void as_antic_stop(struct as_data *ad);
-
-/*
- * IO Context helper functions
- */
-
-/* Called to deallocate the as_io_context */
-static void free_as_io_context(struct as_io_context *aic)
-{
-       kfree(aic);
-       elv_ioc_count_dec(as_ioc_count);
-       if (ioc_gone) {
-               /*
-                * AS scheduler is exiting, grab exit lock and check
-                * the pending io context count. If it hits zero,
-                * complete ioc_gone and set it back to NULL.
-                */
-               spin_lock(&ioc_gone_lock);
-               if (ioc_gone && !elv_ioc_count_read(as_ioc_count)) {
-                       complete(ioc_gone);
-                       ioc_gone = NULL;
-               }
-               spin_unlock(&ioc_gone_lock);
-       }
-}
-
-static void as_trim(struct io_context *ioc)
-{
-       spin_lock_irq(&ioc->lock);
-       if (ioc->aic)
-               free_as_io_context(ioc->aic);
-       ioc->aic = NULL;
-       spin_unlock_irq(&ioc->lock);
-}
-
-/* Called when the task exits */
-static void exit_as_io_context(struct as_io_context *aic)
-{
-       WARN_ON(!test_bit(AS_TASK_RUNNING, &aic->state));
-       clear_bit(AS_TASK_RUNNING, &aic->state);
-}
-
-static struct as_io_context *alloc_as_io_context(void)
-{
-       struct as_io_context *ret;
-
-       ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
-       if (ret) {
-               ret->dtor = free_as_io_context;
-               ret->exit = exit_as_io_context;
-               ret->state = 1 << AS_TASK_RUNNING;
-               atomic_set(&ret->nr_queued, 0);
-               atomic_set(&ret->nr_dispatched, 0);
-               spin_lock_init(&ret->lock);
-               ret->ttime_total = 0;
-               ret->ttime_samples = 0;
-               ret->ttime_mean = 0;
-               ret->seek_total = 0;
-               ret->seek_samples = 0;
-               ret->seek_mean = 0;
-               elv_ioc_count_inc(as_ioc_count);
-       }
-
-       return ret;
-}
-
-/*
- * If the current task has no AS IO context then create one and initialise it.
- * Then take a ref on the task's io context and return it.
- */
-static struct io_context *as_get_io_context(int node)
-{
-       struct io_context *ioc = get_io_context(GFP_ATOMIC, node);
-       if (ioc && !ioc->aic) {
-               ioc->aic = alloc_as_io_context();
-               if (!ioc->aic) {
-                       put_io_context(ioc);
-                       ioc = NULL;
-               }
-       }
-       return ioc;
-}
-
-static void as_put_io_context(struct request *rq)
-{
-       struct as_io_context *aic;
-
-       if (unlikely(!RQ_IOC(rq)))
-               return;
-
-       aic = RQ_IOC(rq)->aic;
-
-       if (rq_is_sync(rq) && aic) {
-               unsigned long flags;
-
-               spin_lock_irqsave(&aic->lock, flags);
-               set_bit(AS_TASK_IORUNNING, &aic->state);
-               aic->last_end_request = jiffies;
-               spin_unlock_irqrestore(&aic->lock, flags);
-       }
-
-       put_io_context(RQ_IOC(rq));
-}
-
-/*
- * rb tree support functions
- */
-#define RQ_RB_ROOT(ad, rq)     (&(ad)->sort_list[rq_is_sync((rq))])
-
-static void as_add_rq_rb(struct as_data *ad, struct request *rq)
-{
-       struct request *alias;
-
-       while ((unlikely(alias = elv_rb_add(RQ_RB_ROOT(ad, rq), rq)))) {
-               as_move_to_dispatch(ad, alias);
-               as_antic_stop(ad);
-       }
-}
-
-static inline void as_del_rq_rb(struct as_data *ad, struct request *rq)
-{
-       elv_rb_del(RQ_RB_ROOT(ad, rq), rq);
-}
-
-/*
- * IO Scheduler proper
- */
-
-#define MAXBACK (1024 * 1024)  /*
-                                * Maximum distance the disk will go backward
-                                * for a request.
-                                */
-
-#define BACK_PENALTY   2
-
-/*
- * as_choose_req selects the preferred one of two requests of the same data_dir
- * ignoring time - eg. timeouts, which is the job of as_dispatch_request
- */
-static struct request *
-as_choose_req(struct as_data *ad, struct request *rq1, struct request *rq2)
-{
-       int data_dir;
-       sector_t last, s1, s2, d1, d2;
-       int r1_wrap=0, r2_wrap=0;       /* requests are behind the disk head */
-       const sector_t maxback = MAXBACK;
-
-       if (rq1 == NULL || rq1 == rq2)
-               return rq2;
-       if (rq2 == NULL)
-               return rq1;
-
-       data_dir = rq_is_sync(rq1);
-
-       last = ad->last_sector[data_dir];
-       s1 = blk_rq_pos(rq1);
-       s2 = blk_rq_pos(rq2);
-
-       BUG_ON(data_dir != rq_is_sync(rq2));
-
-       /*
-        * Strict one way elevator _except_ in the case where we allow
-        * short backward seeks which are biased as twice the cost of a
-        * similar forward seek.
-        */
-       if (s1 >= last)
-               d1 = s1 - last;
-       else if (s1+maxback >= last)
-               d1 = (last - s1)*BACK_PENALTY;
-       else {
-               r1_wrap = 1;
-               d1 = 0; /* shut up, gcc */
-       }
-
-       if (s2 >= last)
-               d2 = s2 - last;
-       else if (s2+maxback >= last)
-               d2 = (last - s2)*BACK_PENALTY;
-       else {
-               r2_wrap = 1;
-               d2 = 0;
-       }
-
-       /* Found required data */
-       if (!r1_wrap && r2_wrap)
-               return rq1;
-       else if (!r2_wrap && r1_wrap)
-               return rq2;
-       else if (r1_wrap && r2_wrap) {
-               /* both behind the head */
-               if (s1 <= s2)
-                       return rq1;
-               else
-                       return rq2;
-       }
-
-       /* Both requests in front of the head */
-       if (d1 < d2)
-               return rq1;
-       else if (d2 < d1)
-               return rq2;
-       else {
-               if (s1 >= s2)
-                       return rq1;
-               else
-                       return rq2;
-       }
-}
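
Worked example of the back-seek bias above: with the head at sector 1000, a request at sector 1100 scores d = 100, while one at sector 900 scores d = (1000 - 900) * BACK_PENALTY = 200, so the forward request wins unless the backward one is much closer. Restating the metric in isolation (a sketch only, reusing the MAXBACK and BACK_PENALTY definitions above):

/* Sketch of the distance metric used by as_choose_req. */
static sector_t biased_dist(sector_t s, sector_t last)
{
	if (s >= last)
		return s - last;			/* forward: raw distance */
	if (s + MAXBACK >= last)
		return (last - s) * BACK_PENALTY;	/* short back seek: 2x cost */
	return (sector_t)-1;				/* too far back: treat as wrapped */
}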
-
-/*
- * as_find_next_rq finds the next request after @last in elevator order.
- * This, together with as_choose_req, forms the basis for how the scheduler
- * chooses what request to process next. Anticipation works on top of this.
- */
-static struct request *
-as_find_next_rq(struct as_data *ad, struct request *last)
-{
-       struct rb_node *rbnext = rb_next(&last->rb_node);
-       struct rb_node *rbprev = rb_prev(&last->rb_node);
-       struct request *next = NULL, *prev = NULL;
-
-       BUG_ON(RB_EMPTY_NODE(&last->rb_node));
-
-       if (rbprev)
-               prev = rb_entry_rq(rbprev);
-
-       if (rbnext)
-               next = rb_entry_rq(rbnext);
-       else {
-               const int data_dir = rq_is_sync(last);
-
-               rbnext = rb_first(&ad->sort_list[data_dir]);
-               if (rbnext && rbnext != &last->rb_node)
-                       next = rb_entry_rq(rbnext);
-       }
-
-       return as_choose_req(ad, next, prev);
-}
-
-/*
- * anticipatory scheduling functions follow
- */
-
-/*
- * as_antic_expired tells us when we have anticipated too long.
- * The funny "absolute difference" math on the elapsed time is to handle
- * jiffy wraps, and disks which have been idle for 0x80000000 jiffies.
- */
-static int as_antic_expired(struct as_data *ad)
-{
-       long delta_jif;
-
-       delta_jif = jiffies - ad->antic_start;
-       if (unlikely(delta_jif < 0))
-               delta_jif = -delta_jif;
-       if (delta_jif < ad->antic_expire)
-               return 0;
-
-       return 1;
-}
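
The signed absolute-difference trick above is what keeps the comparison safe across a jiffies wrap. A standalone illustration, using 32-bit types to mimic a 32-bit jiffies counter:

#include <stdio.h>

int main(void)
{
	unsigned int start = 0xfffffff0U;	/* just before the counter wraps */
	unsigned int now = 0x00000010U;		/* just after the wrap */
	int delta = (int)(now - start);		/* modular subtraction: 0x20 */

	if (delta < 0)
		delta = -delta;
	printf("elapsed jiffies: %d\n", delta);	/* prints 32, not ~4 billion */
	return 0;
}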
-
-/*
- * as_antic_waitnext starts anticipating that a nice request will soon be
- * submitted. See also as_antic_waitreq
- */
-static void as_antic_waitnext(struct as_data *ad)
-{
-       unsigned long timeout;
-
-       BUG_ON(ad->antic_status != ANTIC_OFF
-                       && ad->antic_status != ANTIC_WAIT_REQ);
-
-       timeout = ad->antic_start + ad->antic_expire;
-
-       mod_timer(&ad->antic_timer, timeout);
-
-       ad->antic_status = ANTIC_WAIT_NEXT;
-}
-
-/*
- * as_antic_waitreq starts anticipating. We don't start timing the anticipation
- * until the request that we're anticipating on has finished. This means we
- * are timing, hopefully, from when the candidate process wakes up.
- */
-static void as_antic_waitreq(struct as_data *ad)
-{
-       BUG_ON(ad->antic_status == ANTIC_FINISHED);
-       if (ad->antic_status == ANTIC_OFF) {
-               if (!ad->io_context || ad->ioc_finished)
-                       as_antic_waitnext(ad);
-               else
-                       ad->antic_status = ANTIC_WAIT_REQ;
-       }
-}
-
-/*
- * This is called directly by the functions in this file to stop anticipation.
- * We kill the timer and schedule a call to the request_fn asap.
- */
-static void as_antic_stop(struct as_data *ad)
-{
-       int status = ad->antic_status;
-
-       if (status == ANTIC_WAIT_REQ || status == ANTIC_WAIT_NEXT) {
-               if (status == ANTIC_WAIT_NEXT)
-                       del_timer(&ad->antic_timer);
-               ad->antic_status = ANTIC_FINISHED;
-               /* see as_work_handler */
-               kblockd_schedule_work(ad->q, &ad->antic_work);
-       }
-}
-
-/*
- * as_antic_timeout is the timer function set by as_antic_waitnext.
- */
-static void as_antic_timeout(unsigned long data)
-{
-       struct request_queue *q = (struct request_queue *)data;
-       struct as_data *ad = q->elevator->elevator_data;
-       unsigned long flags;
-
-       spin_lock_irqsave(q->queue_lock, flags);
-       if (ad->antic_status == ANTIC_WAIT_REQ
-                       || ad->antic_status == ANTIC_WAIT_NEXT) {
-               struct as_io_context *aic;
-               spin_lock(&ad->io_context->lock);
-               aic = ad->io_context->aic;
-
-               ad->antic_status = ANTIC_FINISHED;
-               kblockd_schedule_work(q, &ad->antic_work);
-
-               if (aic->ttime_samples == 0) {
-                       /* process anticipated on has exited or timed out */
-                       ad->exit_prob = (7*ad->exit_prob + 256)/8;
-               }
-               if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
-                       /* process not "saved" by a cooperating request */
-                       ad->exit_no_coop = (7*ad->exit_no_coop + 256)/8;
-               }
-               spin_unlock(&ad->io_context->lock);
-       }
-       spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-static void as_update_thinktime(struct as_data *ad, struct as_io_context *aic,
-                               unsigned long ttime)
-{
-       /* fixed point: 1.0 == 1<<8 */
-       if (aic->ttime_samples == 0) {
-               ad->new_ttime_total = (7*ad->new_ttime_total + 256*ttime) / 8;
-               ad->new_ttime_mean = ad->new_ttime_total / 256;
-
-               ad->exit_prob = (7*ad->exit_prob)/8;
-       }
-       aic->ttime_samples = (7*aic->ttime_samples + 256) / 8;
-       aic->ttime_total = (7*aic->ttime_total + 256*ttime) / 8;
-       aic->ttime_mean = (aic->ttime_total + 128) / aic->ttime_samples;
-}
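
The update above is an exponentially weighted moving average in 8.8 fixed point ("1.0 == 1<<8"): each step keeps 7/8 of the old value and blends in 1/8 of the new sample, with the sample count decaying toward 256. A standalone demo of the arithmetic (input values are made up):

#include <stdio.h>

int main(void)
{
	unsigned long samples = 0, total = 0, mean;
	unsigned long ttime[] = { 4, 4, 4, 12, 12 };	/* thinktimes, jiffies */
	int i;

	for (i = 0; i < 5; i++) {
		samples = (7 * samples + 256) / 8;	/* decays toward 256 */
		total = (7 * total + 256 * ttime[i]) / 8;
		mean = (total + 128) / samples;		/* rounded mean */
		printf("after sample %d: mean ~= %lu jiffies\n", i + 1, mean);
	}
	return 0;
}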
-
-static void as_update_seekdist(struct as_data *ad, struct as_io_context *aic,
-                               sector_t sdist)
-{
-       u64 total;
-
-       if (aic->seek_samples == 0) {
-               ad->new_seek_total = (7*ad->new_seek_total + 256*(u64)sdist)/8;
-               ad->new_seek_mean = ad->new_seek_total / 256;
-       }
-
-       /*
-        * Don't allow the seek distance to get too large from the
-        * odd fragment, pagein, etc
-        */
-       if (aic->seek_samples <= 60) /* second&third seek */
-               sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*1024);
-       else
-               sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*64);
-
-       aic->seek_samples = (7*aic->seek_samples + 256) / 8;
-       aic->seek_total = (7*aic->seek_total + (u64)256*sdist) / 8;
-       total = aic->seek_total + (aic->seek_samples/2);
-       do_div(total, aic->seek_samples);
-       aic->seek_mean = (sector_t)total;
-}
-
-/*
- * as_update_iohist keeps a decaying histogram of IO thinktimes, and
- * updates @aic->ttime_mean based on that. It is called when a new
- * request is queued.
- */
-static void as_update_iohist(struct as_data *ad, struct as_io_context *aic,
-                               struct request *rq)
-{
-       int data_dir = rq_is_sync(rq);
-       unsigned long thinktime = 0;
-       sector_t seek_dist;
-
-       if (aic == NULL)
-               return;
-
-       if (data_dir == BLK_RW_SYNC) {
-               unsigned long in_flight = atomic_read(&aic->nr_queued)
-                                       + atomic_read(&aic->nr_dispatched);
-               spin_lock(&aic->lock);
-               if (test_bit(AS_TASK_IORUNNING, &aic->state) ||
-                       test_bit(AS_TASK_IOSTARTED, &aic->state)) {
-                       /* Calculate read -> read thinktime */
-                       if (test_bit(AS_TASK_IORUNNING, &aic->state)
-                                                       && in_flight == 0) {
-                               thinktime = jiffies - aic->last_end_request;
-                               thinktime = min(thinktime, MAX_THINKTIME-1);
-                       }
-                       as_update_thinktime(ad, aic, thinktime);
-
-                       /* Calculate read -> read seek distance */
-                       if (aic->last_request_pos < blk_rq_pos(rq))
-                               seek_dist = blk_rq_pos(rq) -
-                                           aic->last_request_pos;
-                       else
-                               seek_dist = aic->last_request_pos -
-                                           blk_rq_pos(rq);
-                       as_update_seekdist(ad, aic, seek_dist);
-               }
-               aic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
-               set_bit(AS_TASK_IOSTARTED, &aic->state);
-               spin_unlock(&aic->lock);
-       }
-}
-
-/*
- * as_close_req decides if one request is considered "close" to the
- * previous one issued.
- */
-static int as_close_req(struct as_data *ad, struct as_io_context *aic,
-                       struct request *rq)
-{
-       unsigned long delay;    /* jiffies */
-       sector_t last = ad->last_sector[ad->batch_data_dir];
-       sector_t next = blk_rq_pos(rq);
-       sector_t delta; /* acceptable close offset (in sectors) */
-       sector_t s;
-
-       if (ad->antic_status == ANTIC_OFF || !ad->ioc_finished)
-               delay = 0;
-       else
-               delay = jiffies - ad->antic_start;
-
-       if (delay == 0)
-               delta = 8192;
-       else if (delay <= (20 * HZ / 1000) && delay <= ad->antic_expire)
-               delta = 8192 << delay;
-       else
-               return 1;
-
-       if ((last <= next + (delta>>1)) && (next <= last + delta))
-               return 1;
-
-       if (last < next)
-               s = next - last;
-       else
-               s = last - next;
-
-       if (aic->seek_samples == 0) {
-               /*
-                * Process has just started IO. Use past statistics to
-                * gauge success possibility
-                */
-               if (ad->new_seek_mean > s) {
-                       /* this request is better than what we're expecting */
-                       return 1;
-               }
-
-       } else {
-               if (aic->seek_mean > s) {
-                       /* this request is better than what we're expecting */
-                       return 1;
-               }
-       }
-
-       return 0;
-}
-
-/*
- * as_can_break_anticipation returns true if we have been anticipating this
- * request.
- *
- * It also returns true if the process against which we are anticipating
- * submits a write - that's presumably an fsync, O_SYNC write, etc. We want to
- * dispatch it ASAP, because we know that application will not be submitting
- * any new reads.
- *
- * If the task which has submitted the request has exited, break anticipation.
- *
- * If this task has queued some other IO, do not enter anticipation.
- */
-static int as_can_break_anticipation(struct as_data *ad, struct request *rq)
-{
-       struct io_context *ioc;
-       struct as_io_context *aic;
-
-       ioc = ad->io_context;
-       BUG_ON(!ioc);
-       spin_lock(&ioc->lock);
-
-       if (rq && ioc == RQ_IOC(rq)) {
-               /* request from same process */
-               spin_unlock(&ioc->lock);
-               return 1;
-       }
-
-       if (ad->ioc_finished && as_antic_expired(ad)) {
-               /*
-                * In this situation status should really be FINISHED,
-                * however the timer hasn't had the chance to run yet.
-                */
-               spin_unlock(&ioc->lock);
-               return 1;
-       }
-
-       aic = ioc->aic;
-       if (!aic) {
-               spin_unlock(&ioc->lock);
-               return 0;
-       }
-
-       if (atomic_read(&aic->nr_queued) > 0) {
-               /* process has more requests queued */
-               spin_unlock(&ioc->lock);
-               return 1;
-       }
-
-       if (atomic_read(&aic->nr_dispatched) > 0) {
-               /* process has more requests dispatched */
-               spin_unlock(&ioc->lock);
-               return 1;
-       }
-
-       if (rq && rq_is_sync(rq) && as_close_req(ad, aic, rq)) {
-               /*
-                * Found a close request that is not one of ours.
-                *
-                * This makes close requests from another process update
-                * our IO history. This is generally useful when there are
-                * two or more cooperating processes working in the same
-                * area.
-                */
-               if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
-                       if (aic->ttime_samples == 0)
-                               ad->exit_prob = (7*ad->exit_prob + 256)/8;
-
-                       ad->exit_no_coop = (7*ad->exit_no_coop)/8;
-               }
-
-               as_update_iohist(ad, aic, rq);
-               spin_unlock(&ioc->lock);
-               return 1;
-       }
-
-       if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
-               /* process anticipated on has exited */
-               if (aic->ttime_samples == 0)
-                       ad->exit_prob = (7*ad->exit_prob + 256)/8;
-
-               if (ad->exit_no_coop > 128) {
-                       spin_unlock(&ioc->lock);
-                       return 1;
-               }
-       }
-
-       if (aic->ttime_samples == 0) {
-               if (ad->new_ttime_mean > ad->antic_expire) {
-                       spin_unlock(&ioc->lock);
-                       return 1;
-               }
-               if (ad->exit_prob * ad->exit_no_coop > 128*256) {
-                       spin_unlock(&ioc->lock);
-                       return 1;
-               }
-       } else if (aic->ttime_mean > ad->antic_expire) {
-               /* the process thinks too much between requests */
-               spin_unlock(&ioc->lock);
-               return 1;
-       }
-       spin_unlock(&ioc->lock);
-       return 0;
-}
-
-/*
- * as_can_anticipate indicates whether we should either run rq
- * or keep anticipating a better request.
- */
-static int as_can_anticipate(struct as_data *ad, struct request *rq)
-{
-#if 0 /* disable for now, we need to check tag level as well */
-       /*
-        * SSD device without seek penalty, disable idling
-        */
-       if (blk_queue_nonrot(ad->q))
-               return 0;
-#endif
-
-       if (!ad->io_context)
-               /*
-                * Last request submitted was a write
-                */
-               return 0;
-
-       if (ad->antic_status == ANTIC_FINISHED)
-               /*
-                * Don't restart if we have just finished. Run the next request
-                */
-               return 0;
-
-       if (as_can_break_anticipation(ad, rq))
-               /*
-                * This request is a good candidate. Don't keep anticipating,
-                * run it.
-                */
-               return 0;
-
-       /*
-        * OK from here, we haven't finished, and don't have a decent request!
-        * Status is either ANTIC_OFF so start waiting,
-        * ANTIC_WAIT_REQ so continue waiting for request to finish
-        * or ANTIC_WAIT_NEXT so continue waiting for an acceptable request.
-        */
-
-       return 1;
-}
-
-/*
- * as_update_rq must be called whenever a request (rq) is added to
- * the sort_list. This function keeps caches up to date, and checks if the
- * request might be one we are "anticipating"
- */
-static void as_update_rq(struct as_data *ad, struct request *rq)
-{
-       const int data_dir = rq_is_sync(rq);
-
-       /* keep the next_rq cache up to date */
-       ad->next_rq[data_dir] = as_choose_req(ad, rq, ad->next_rq[data_dir]);
-
-       /*
-        * have we been anticipating this request?
-        * or does it come from the same process as the one we are anticipating
-        * for?
-        */
-       if (ad->antic_status == ANTIC_WAIT_REQ
-                       || ad->antic_status == ANTIC_WAIT_NEXT) {
-               if (as_can_break_anticipation(ad, rq))
-                       as_antic_stop(ad);
-       }
-}
-
-/*
- * Gathers timings and resizes the write batch automatically
- */
-static void update_write_batch(struct as_data *ad)
-{
-       unsigned long batch = ad->batch_expire[BLK_RW_ASYNC];
-       long write_time;
-
-       write_time = (jiffies - ad->current_batch_expires) + batch;
-       if (write_time < 0)
-               write_time = 0;
-
-       if (write_time > batch && !ad->write_batch_idled) {
-               if (write_time > batch * 3)
-                       ad->write_batch_count /= 2;
-               else
-                       ad->write_batch_count--;
-       } else if (write_time < batch && ad->current_write_count == 0) {
-               if (batch > write_time * 3)
-                       ad->write_batch_count *= 2;
-               else
-                       ad->write_batch_count++;
-       }
-
-       if (ad->write_batch_count < 1)
-               ad->write_batch_count = 1;
-}
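
Worked example of the auto-tuning above, assuming HZ=1000 so that batch_expire[BLK_RW_ASYNC] is 125 jiffies (~125 ms): a write batch that actually ran for 500 ms (more than three times the target) halves write_batch_count; one that ran 200 ms merely decrements it; a batch that consumed its whole count in under ~42 ms doubles it, while one finishing in 100 ms increments it. The count never drops below 1.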
-
-/*
- * as_completed_request is to be called when a request has completed and
- * returned something to the requesting process, be it an error or data.
- */
-static void as_completed_request(struct request_queue *q, struct request *rq)
-{
-       struct as_data *ad = q->elevator->elevator_data;
-
-       WARN_ON(!list_empty(&rq->queuelist));
-
-       if (RQ_STATE(rq) != AS_RQ_REMOVED) {
-               WARN(1, "rq->state %d\n", RQ_STATE(rq));
-               goto out;
-       }
-
-       if (ad->changed_batch && ad->nr_dispatched == 1) {
-               ad->current_batch_expires = jiffies +
-                                       ad->batch_expire[ad->batch_data_dir];
-               kblockd_schedule_work(q, &ad->antic_work);
-               ad->changed_batch = 0;
-
-               if (ad->batch_data_dir == BLK_RW_SYNC)
-                       ad->new_batch = 1;
-       }
-       WARN_ON(ad->nr_dispatched == 0);
-       ad->nr_dispatched--;
-
-       /*
-        * Start counting the batch from when a request of that direction is
-        * actually serviced. This should help devices with big TCQ windows
-        * and writeback caches
-        */
-       if (ad->new_batch && ad->batch_data_dir == rq_is_sync(rq)) {
-               update_write_batch(ad);
-               ad->current_batch_expires = jiffies +
-                               ad->batch_expire[BLK_RW_SYNC];
-               ad->new_batch = 0;
-       }
-
-       if (ad->io_context == RQ_IOC(rq) && ad->io_context) {
-               ad->antic_start = jiffies;
-               ad->ioc_finished = 1;
-               if (ad->antic_status == ANTIC_WAIT_REQ) {
-                       /*
-                        * We were waiting on this request, now anticipate
-                        * the next one
-                        */
-                       as_antic_waitnext(ad);
-               }
-       }
-
-       as_put_io_context(rq);
-out:
-       RQ_SET_STATE(rq, AS_RQ_POSTSCHED);
-}
-
-/*
- * as_remove_queued_request removes a request from the pre dispatch queue
- * without updating refcounts. It is expected the caller will drop the
- * reference unless it replaces the request at some part of the elevator
- * (ie. the dispatch queue)
- */
-static void as_remove_queued_request(struct request_queue *q,
-                                    struct request *rq)
-{
-       const int data_dir = rq_is_sync(rq);
-       struct as_data *ad = q->elevator->elevator_data;
-       struct io_context *ioc;
-
-       WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED);
-
-       ioc = RQ_IOC(rq);
-       if (ioc && ioc->aic) {
-               BUG_ON(!atomic_read(&ioc->aic->nr_queued));
-               atomic_dec(&ioc->aic->nr_queued);
-       }
-
-       /*
-        * Update the "next_rq" cache if we are about to remove its
-        * entry
-        */
-       if (ad->next_rq[data_dir] == rq)
-               ad->next_rq[data_dir] = as_find_next_rq(ad, rq);
-
-       rq_fifo_clear(rq);
-       as_del_rq_rb(ad, rq);
-}
-
-/*
- * as_fifo_expired returns 0 if there are no expired requests on the fifo,
- * 1 otherwise.  It is ratelimited so that we only perform the check once per
- * `fifo_expire' interval.  Otherwise a large number of expired requests
- * would create a hopeless seekstorm.
- *
- * See as_antic_expired comment.
- */
-static int as_fifo_expired(struct as_data *ad, int adir)
-{
-       struct request *rq;
-       long delta_jif;
-
-       delta_jif = jiffies - ad->last_check_fifo[adir];
-       if (unlikely(delta_jif < 0))
-               delta_jif = -delta_jif;
-       if (delta_jif < ad->fifo_expire[adir])
-               return 0;
-
-       ad->last_check_fifo[adir] = jiffies;
-
-       if (list_empty(&ad->fifo_list[adir]))
-               return 0;
-
-       rq = rq_entry_fifo(ad->fifo_list[adir].next);
-
-       return time_after(jiffies, rq_fifo_time(rq));
-}
-
-/*
- * as_batch_expired returns true if the current batch has expired. A batch
- * is a set of reads or a set of writes.
- */
-static inline int as_batch_expired(struct as_data *ad)
-{
-       if (ad->changed_batch || ad->new_batch)
-               return 0;
-
-       if (ad->batch_data_dir == BLK_RW_SYNC)
-               /* TODO! add a check so a complete fifo gets written? */
-               return time_after(jiffies, ad->current_batch_expires);
-
-       return time_after(jiffies, ad->current_batch_expires)
-               || ad->current_write_count == 0;
-}
-
-/*
- * move an entry to dispatch queue
- */
-static void as_move_to_dispatch(struct as_data *ad, struct request *rq)
-{
-       const int data_dir = rq_is_sync(rq);
-
-       BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
-
-       as_antic_stop(ad);
-       ad->antic_status = ANTIC_OFF;
-
-       /*
-        * This has to be set in order to be correctly updated by
-        * as_find_next_rq
-        */
-       ad->last_sector[data_dir] = blk_rq_pos(rq) + blk_rq_sectors(rq);
-
-       if (data_dir == BLK_RW_SYNC) {
-               struct io_context *ioc = RQ_IOC(rq);
-               /* In case we have to anticipate after this */
-               copy_io_context(&ad->io_context, &ioc);
-       } else {
-               if (ad->io_context) {
-                       put_io_context(ad->io_context);
-                       ad->io_context = NULL;
-               }
-
-               if (ad->current_write_count != 0)
-                       ad->current_write_count--;
-       }
-       ad->ioc_finished = 0;
-
-       ad->next_rq[data_dir] = as_find_next_rq(ad, rq);
-
-       /*
-        * take it off the sort and fifo list, add to dispatch queue
-        */
-       as_remove_queued_request(ad->q, rq);
-       WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED);
-
-       elv_dispatch_sort(ad->q, rq);
-
-       RQ_SET_STATE(rq, AS_RQ_DISPATCHED);
-       if (RQ_IOC(rq) && RQ_IOC(rq)->aic)
-               atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched);
-       ad->nr_dispatched++;
-}
-
-/*
- * as_dispatch_request selects the best request according to
- * read/write expire, batch expire, etc, and moves it to the dispatch
- * queue. Returns 1 if a request was found, 0 otherwise.
- */
-static int as_dispatch_request(struct request_queue *q, int force)
-{
-       struct as_data *ad = q->elevator->elevator_data;
-       const int reads = !list_empty(&ad->fifo_list[BLK_RW_SYNC]);
-       const int writes = !list_empty(&ad->fifo_list[BLK_RW_ASYNC]);
-       struct request *rq;
-
-       if (unlikely(force)) {
-               /*
-                * Forced dispatch, accounting is useless.  Reset
-                * accounting states and dump fifo_lists.  Note that
-                * batch_data_dir is reset to BLK_RW_SYNC to avoid
-                * screwing write batch accounting as write batch
-                * accounting occurs on W->R transition.
-                */
-               int dispatched = 0;
-
-               ad->batch_data_dir = BLK_RW_SYNC;
-               ad->changed_batch = 0;
-               ad->new_batch = 0;
-
-               while (ad->next_rq[BLK_RW_SYNC]) {
-                       as_move_to_dispatch(ad, ad->next_rq[BLK_RW_SYNC]);
-                       dispatched++;
-               }
-               ad->last_check_fifo[BLK_RW_SYNC] = jiffies;
-
-               while (ad->next_rq[BLK_RW_ASYNC]) {
-                       as_move_to_dispatch(ad, ad->next_rq[BLK_RW_ASYNC]);
-                       dispatched++;
-               }
-               ad->last_check_fifo[BLK_RW_ASYNC] = jiffies;
-
-               return dispatched;
-       }
-
-       /* Signal that the write batch was uncontended, so we can't time it */
-       if (ad->batch_data_dir == BLK_RW_ASYNC && !reads) {
-               if (ad->current_write_count == 0 || !writes)
-                       ad->write_batch_idled = 1;
-       }
-
-       if (!(reads || writes)
-               || ad->antic_status == ANTIC_WAIT_REQ
-               || ad->antic_status == ANTIC_WAIT_NEXT
-               || ad->changed_batch)
-               return 0;
-
-       if (!(reads && writes && as_batch_expired(ad))) {
-               /*
-                * batch is still running or no reads or no writes
-                */
-               rq = ad->next_rq[ad->batch_data_dir];
-
-               if (ad->batch_data_dir == BLK_RW_SYNC && ad->antic_expire) {
-                       if (as_fifo_expired(ad, BLK_RW_SYNC))
-                               goto fifo_expired;
-
-                       if (as_can_anticipate(ad, rq)) {
-                               as_antic_waitreq(ad);
-                               return 0;
-                       }
-               }
-
-               if (rq) {
-                       /* we have a "next request" */
-                       if (reads && !writes)
-                               ad->current_batch_expires =
-                                       jiffies + ad->batch_expire[BLK_RW_SYNC];
-                       goto dispatch_request;
-               }
-       }
-
-       /*
-        * at this point we are not running a batch. select the appropriate
-        * data direction (read / write)
-        */
-
-       if (reads) {
-               BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_SYNC]));
-
-               if (writes && ad->batch_data_dir == BLK_RW_SYNC)
-                       /*
-                        * Last batch was a read, switch to writes
-                        */
-                       goto dispatch_writes;
-
-               if (ad->batch_data_dir == BLK_RW_ASYNC) {
-                       WARN_ON(ad->new_batch);
-                       ad->changed_batch = 1;
-               }
-               ad->batch_data_dir = BLK_RW_SYNC;
-               rq = rq_entry_fifo(ad->fifo_list[BLK_RW_SYNC].next);
-               ad->last_check_fifo[ad->batch_data_dir] = jiffies;
-               goto dispatch_request;
-       }
-
-       /*
-        * the last batch was a read
-        */
-
-       if (writes) {
-dispatch_writes:
-               BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_ASYNC]));
-
-               if (ad->batch_data_dir == BLK_RW_SYNC) {
-                       ad->changed_batch = 1;
-
-                       /*
-                        * new_batch might be 1 when the queue runs out of
-                        * reads. A subsequent submission of a write might
-                        * cause a change of batch before the read is finished.
-                        */
-                       ad->new_batch = 0;
-               }
-               ad->batch_data_dir = BLK_RW_ASYNC;
-               ad->current_write_count = ad->write_batch_count;
-               ad->write_batch_idled = 0;
-               rq = rq_entry_fifo(ad->fifo_list[BLK_RW_ASYNC].next);
-               ad->last_check_fifo[BLK_RW_ASYNC] = jiffies;
-               goto dispatch_request;
-       }
-
-       BUG();
-       return 0;
-
-dispatch_request:
-       /*
-        * If a request has expired, service it.
-        */
-
-       if (as_fifo_expired(ad, ad->batch_data_dir)) {
-fifo_expired:
-               rq = rq_entry_fifo(ad->fifo_list[ad->batch_data_dir].next);
-       }
-
-       if (ad->changed_batch) {
-               WARN_ON(ad->new_batch);
-
-               if (ad->nr_dispatched)
-                       return 0;
-
-               if (ad->batch_data_dir == BLK_RW_ASYNC)
-                       ad->current_batch_expires = jiffies +
-                                       ad->batch_expire[BLK_RW_ASYNC];
-               else
-                       ad->new_batch = 1;
-
-               ad->changed_batch = 0;
-       }
-
-       /*
-        * rq is the selected appropriate request.
-        */
-       as_move_to_dispatch(ad, rq);
-
-       return 1;
-}
-
-/*
- * add rq to rbtree and fifo
- */
-static void as_add_request(struct request_queue *q, struct request *rq)
-{
-       struct as_data *ad = q->elevator->elevator_data;
-       int data_dir;
-
-       RQ_SET_STATE(rq, AS_RQ_NEW);
-
-       data_dir = rq_is_sync(rq);
-
-       rq->elevator_private = as_get_io_context(q->node);
-
-       if (RQ_IOC(rq)) {
-               as_update_iohist(ad, RQ_IOC(rq)->aic, rq);
-               atomic_inc(&RQ_IOC(rq)->aic->nr_queued);
-       }
-
-       as_add_rq_rb(ad, rq);
-
-       /*
-        * set expire time and add to fifo list
-        */
-       rq_set_fifo_time(rq, jiffies + ad->fifo_expire[data_dir]);
-       list_add_tail(&rq->queuelist, &ad->fifo_list[data_dir]);
-
-       as_update_rq(ad, rq); /* keep state machine up to date */
-       RQ_SET_STATE(rq, AS_RQ_QUEUED);
-}
-
-static void as_activate_request(struct request_queue *q, struct request *rq)
-{
-       WARN_ON(RQ_STATE(rq) != AS_RQ_DISPATCHED);
-       RQ_SET_STATE(rq, AS_RQ_REMOVED);
-       if (RQ_IOC(rq) && RQ_IOC(rq)->aic)
-               atomic_dec(&RQ_IOC(rq)->aic->nr_dispatched);
-}
-
-static void as_deactivate_request(struct request_queue *q, struct request *rq)
-{
-       WARN_ON(RQ_STATE(rq) != AS_RQ_REMOVED);
-       RQ_SET_STATE(rq, AS_RQ_DISPATCHED);
-       if (RQ_IOC(rq) && RQ_IOC(rq)->aic)
-               atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched);
-}
-
-/*
- * as_queue_empty tells us if there are requests left in the device. It may
- * not be the case that a driver can get the next request even if the queue
- * is not empty; it is used in the block layer to check for plugging and
- * merging opportunities.
- */
-static int as_queue_empty(struct request_queue *q)
-{
-       struct as_data *ad = q->elevator->elevator_data;
-
-       return list_empty(&ad->fifo_list[BLK_RW_ASYNC])
-               && list_empty(&ad->fifo_list[BLK_RW_SYNC]);
-}
-
-static int
-as_merge(struct request_queue *q, struct request **req, struct bio *bio)
-{
-       struct as_data *ad = q->elevator->elevator_data;
-       sector_t rb_key = bio->bi_sector + bio_sectors(bio);
-       struct request *__rq;
-
-       /*
-        * check for front merge
-        */
-       __rq = elv_rb_find(&ad->sort_list[bio_data_dir(bio)], rb_key);
-       if (__rq && elv_rq_merge_ok(__rq, bio)) {
-               *req = __rq;
-               return ELEVATOR_FRONT_MERGE;
-       }
-
-       return ELEVATOR_NO_MERGE;
-}
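
The lookup key above is the bio's end sector because a front merge glues the bio onto the beginning of an existing request: a bio covering sectors [100, 108) can only front-merge with a queued request that starts at sector 108, and the sort_list rb-tree is keyed by request start sector (elv_rb_add/elv_rb_find), so elv_rb_find() is asked for exactly that sector.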
-
-static void as_merged_request(struct request_queue *q, struct request *req,
-                             int type)
-{
-       struct as_data *ad = q->elevator->elevator_data;
-
-       /*
-        * if the merge was a front merge, we need to reposition request
-        */
-       if (type == ELEVATOR_FRONT_MERGE) {
-               as_del_rq_rb(ad, req);
-               as_add_rq_rb(ad, req);
-               /*
-                * Note! At this stage of this and the next function, our next
-                * request may not be optimal - eg the request may have "grown"
-                * behind the disk head. We currently don't bother adjusting.
-                */
-       }
-}
-
-static void as_merged_requests(struct request_queue *q, struct request *req,
-                               struct request *next)
-{
-       /*
-        * if next expires before req, assign next's expire time to req
-        * and move req into next's position in the fifo (next will be deleted)
-        */
-       if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
-               if (time_before(rq_fifo_time(next), rq_fifo_time(req))) {
-                       list_move(&req->queuelist, &next->queuelist);
-                       rq_set_fifo_time(req, rq_fifo_time(next));
-               }
-       }
-
-       /*
-        * kill knowledge of next, this one is a goner
-        */
-       as_remove_queued_request(q, next);
-       as_put_io_context(next);
-
-       RQ_SET_STATE(next, AS_RQ_MERGED);
-}
-
-/*
- * This is executed in a "deferred" process context, by kblockd. It calls the
- * driver's request_fn so the driver can submit that request.
- *
- * IMPORTANT! This guy will reenter the elevator, so set up all queue global
- * state before calling, and don't rely on any state over calls.
- *
- * FIXME! dispatch queue is not a queue at all!
- */
-static void as_work_handler(struct work_struct *work)
-{
-       struct as_data *ad = container_of(work, struct as_data, antic_work);
-
-       blk_run_queue(ad->q);
-}
-
-static int as_may_queue(struct request_queue *q, int rw)
-{
-       int ret = ELV_MQUEUE_MAY;
-       struct as_data *ad = q->elevator->elevator_data;
-       struct io_context *ioc;
-       if (ad->antic_status == ANTIC_WAIT_REQ ||
-                       ad->antic_status == ANTIC_WAIT_NEXT) {
-               ioc = as_get_io_context(q->node);
-               if (ad->io_context == ioc)
-                       ret = ELV_MQUEUE_MUST;
-               put_io_context(ioc);
-       }
-
-       return ret;
-}
-
-static void as_exit_queue(struct elevator_queue *e)
-{
-       struct as_data *ad = e->elevator_data;
-
-       del_timer_sync(&ad->antic_timer);
-       cancel_work_sync(&ad->antic_work);
-
-       BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_SYNC]));
-       BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_ASYNC]));
-
-       put_io_context(ad->io_context);
-       kfree(ad);
-}
-
-/*
- * initialize elevator private data (as_data).
- */
-static void *as_init_queue(struct request_queue *q)
-{
-       struct as_data *ad;
-
-       ad = kmalloc_node(sizeof(*ad), GFP_KERNEL | __GFP_ZERO, q->node);
-       if (!ad)
-               return NULL;
-
-       ad->q = q; /* Identify what queue the data belongs to */
-
-       /* anticipatory scheduling helpers */
-       ad->antic_timer.function = as_antic_timeout;
-       ad->antic_timer.data = (unsigned long)q;
-       init_timer(&ad->antic_timer);
-       INIT_WORK(&ad->antic_work, as_work_handler);
-
-       INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_SYNC]);
-       INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_ASYNC]);
-       ad->sort_list[BLK_RW_SYNC] = RB_ROOT;
-       ad->sort_list[BLK_RW_ASYNC] = RB_ROOT;
-       ad->fifo_expire[BLK_RW_SYNC] = default_read_expire;
-       ad->fifo_expire[BLK_RW_ASYNC] = default_write_expire;
-       ad->antic_expire = default_antic_expire;
-       ad->batch_expire[BLK_RW_SYNC] = default_read_batch_expire;
-       ad->batch_expire[BLK_RW_ASYNC] = default_write_batch_expire;
-
-       ad->current_batch_expires = jiffies + ad->batch_expire[BLK_RW_SYNC];
-       ad->write_batch_count = ad->batch_expire[BLK_RW_ASYNC] / 10;
-       if (ad->write_batch_count < 2)
-               ad->write_batch_count = 2;
-
-       return ad;
-}
-
-/*
- * sysfs parts below
- */
-
-static ssize_t
-as_var_show(unsigned int var, char *page)
-{
-       return sprintf(page, "%u\n", var);
-}
-
-static ssize_t
-as_var_store(unsigned long *var, const char *page, size_t count)
-{
-       char *p = (char *) page;
-
-       *var = simple_strtoul(p, &p, 10);
-       return count;
-}
-
-static ssize_t est_time_show(struct elevator_queue *e, char *page)
-{
-       struct as_data *ad = e->elevator_data;
-       int pos = 0;
-
-       pos += sprintf(page+pos, "%lu %% exit probability\n",
-                               100*ad->exit_prob/256);
-       pos += sprintf(page+pos, "%lu %% probability of exiting without a "
-                               "cooperating process submitting IO\n",
-                               100*ad->exit_no_coop/256);
-       pos += sprintf(page+pos, "%lu ms new thinktime\n", ad->new_ttime_mean);
-       pos += sprintf(page+pos, "%llu sectors new seek distance\n",
-                               (unsigned long long)ad->new_seek_mean);
-
-       return pos;
-}
-
-#define SHOW_FUNCTION(__FUNC, __VAR)                           \
-static ssize_t __FUNC(struct elevator_queue *e, char *page)    \
-{                                                              \
-       struct as_data *ad = e->elevator_data;                  \
-       return as_var_show(jiffies_to_msecs((__VAR)), (page));  \
-}
-SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[BLK_RW_SYNC]);
-SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[BLK_RW_ASYNC]);
-SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire);
-SHOW_FUNCTION(as_read_batch_expire_show, ad->batch_expire[BLK_RW_SYNC]);
-SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[BLK_RW_ASYNC]);
-#undef SHOW_FUNCTION
-
-#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)                                \
-static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)        \
-{                                                                      \
-       struct as_data *ad = e->elevator_data;                          \
-       int ret = as_var_store(__PTR, (page), count);                   \
-       if (*(__PTR) < (MIN))                                           \
-               *(__PTR) = (MIN);                                       \
-       else if (*(__PTR) > (MAX))                                      \
-               *(__PTR) = (MAX);                                       \
-       *(__PTR) = msecs_to_jiffies(*(__PTR));                          \
-       return ret;                                                     \
-}
-STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[BLK_RW_SYNC], 0, INT_MAX);
-STORE_FUNCTION(as_write_expire_store,
-                       &ad->fifo_expire[BLK_RW_ASYNC], 0, INT_MAX);
-STORE_FUNCTION(as_antic_expire_store, &ad->antic_expire, 0, INT_MAX);
-STORE_FUNCTION(as_read_batch_expire_store,
-                       &ad->batch_expire[BLK_RW_SYNC], 0, INT_MAX);
-STORE_FUNCTION(as_write_batch_expire_store,
-                       &ad->batch_expire[BLK_RW_ASYNC], 0, INT_MAX);
-#undef STORE_FUNCTION
-
-#define AS_ATTR(name) \
-       __ATTR(name, S_IRUGO|S_IWUSR, as_##name##_show, as_##name##_store)
-
-static struct elv_fs_entry as_attrs[] = {
-       __ATTR_RO(est_time),
-       AS_ATTR(read_expire),
-       AS_ATTR(write_expire),
-       AS_ATTR(antic_expire),
-       AS_ATTR(read_batch_expire),
-       AS_ATTR(write_batch_expire),
-       __ATTR_NULL
-};
-
-static struct elevator_type iosched_as = {
-       .ops = {
-               .elevator_merge_fn =            as_merge,
-               .elevator_merged_fn =           as_merged_request,
-               .elevator_merge_req_fn =        as_merged_requests,
-               .elevator_dispatch_fn =         as_dispatch_request,
-               .elevator_add_req_fn =          as_add_request,
-               .elevator_activate_req_fn =     as_activate_request,
-               .elevator_deactivate_req_fn =   as_deactivate_request,
-               .elevator_queue_empty_fn =      as_queue_empty,
-               .elevator_completed_req_fn =    as_completed_request,
-               .elevator_former_req_fn =       elv_rb_former_request,
-               .elevator_latter_req_fn =       elv_rb_latter_request,
-               .elevator_may_queue_fn =        as_may_queue,
-               .elevator_init_fn =             as_init_queue,
-               .elevator_exit_fn =             as_exit_queue,
-               .trim =                         as_trim,
-       },
-
-       .elevator_attrs = as_attrs,
-       .elevator_name = "anticipatory",
-       .elevator_owner = THIS_MODULE,
-};
-
-static int __init as_init(void)
-{
-       elv_register(&iosched_as);
-
-       return 0;
-}
-
-static void __exit as_exit(void)
-{
-       DECLARE_COMPLETION_ONSTACK(all_gone);
-       elv_unregister(&iosched_as);
-       ioc_gone = &all_gone;
-       /* ioc_gone's update must be visible before reading ioc_count */
-       smp_wmb();
-       if (elv_ioc_count_read(as_ioc_count))
-               wait_for_completion(&all_gone);
-       synchronize_rcu();
-}
-
-module_init(as_init);
-module_exit(as_exit);
-
-MODULE_AUTHOR("Nick Piggin");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("anticipatory IO scheduler");
diff --git a/block/blk-core.c b/block/blk-core.c
index 71da511..718897e 100644
@@ -2358,6 +2358,25 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
                rq->rq_disk = bio->bi_bdev->bd_disk;
 }
 
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
+/**
+ * rq_flush_dcache_pages - Helper function to flush all pages in a request
+ * @rq: the request to be flushed
+ *
+ * Description:
+ *     Flush all pages in @rq.
+ */
+void rq_flush_dcache_pages(struct request *rq)
+{
+       struct req_iterator iter;
+       struct bio_vec *bvec;
+
+       rq_for_each_segment(bvec, rq, iter)
+               flush_dcache_page(bvec->bv_page);
+}
+EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
+#endif
+
 /**
  * blk_lld_busy - Check if underlying low-level drivers of a device are busy
  * @q : the queue of the device being checked
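For illustration, a minimal sketch (not part of this patch) of how a PIO-style
block driver might call the new helper after filling a request's pages through
the kernel mapping; my_transfer() is a hypothetical driver routine:

#include <linux/blkdev.h>

static void my_request_fn(struct request_queue *q)
{
        struct request *rq;

        while ((rq = blk_fetch_request(q)) != NULL) {
                /* hypothetical PIO copy into rq's pages */
                int err = my_transfer(rq);

                /* data was written via the kernel mapping; keep the
                 * D-cache coherent for user mappings on aliasing caches */
                rq_flush_dcache_pages(rq);
                __blk_end_request_all(rq, err);
        }
}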
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 66d4aa8..dd1f1e0 100644
@@ -8,6 +8,7 @@
 #include <linux/blkdev.h>
 #include <linux/bootmem.h>     /* for max_pfn/max_low_pfn */
 #include <linux/gcd.h>
+#include <linux/jiffies.h>
 
 #include "blk.h"
 
@@ -96,7 +97,11 @@ void blk_set_default_limits(struct queue_limits *lim)
        lim->max_segment_size = MAX_SEGMENT_SIZE;
        lim->max_sectors = BLK_DEF_MAX_SECTORS;
        lim->max_hw_sectors = INT_MAX;
-       lim->max_discard_sectors = SAFE_MAX_SECTORS;
+       lim->max_discard_sectors = 0;
+       lim->discard_granularity = 0;
+       lim->discard_alignment = 0;
+       lim->discard_misaligned = 0;
+       lim->discard_zeroes_data = -1;
        lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
        lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
        lim->alignment_offset = 0;
@@ -141,7 +146,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
        q->nr_batching = BLK_BATCH_REQ;
 
        q->unplug_thresh = 4;           /* hmm */
-       q->unplug_delay = (3 * HZ) / 1000;      /* 3 milliseconds */
+       q->unplug_delay = msecs_to_jiffies(3);  /* 3 milliseconds */
        if (q->unplug_delay == 0)
                q->unplug_delay = 1;
 
@@ -488,6 +493,16 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
 }
 EXPORT_SYMBOL(blk_queue_stack_limits);
 
+static unsigned int lcm(unsigned int a, unsigned int b)
+{
+       if (a && b)
+               return (a * b) / gcd(a, b);
+       else if (b)
+               return b;
+
+       return a;
+}
+
 /**
  * blk_stack_limits - adjust queue_limits for stacked devices
  * @t: the stacking driver limits (top)
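The zero handling is what makes this safe for stacking: an unset (zero) limit
on either side simply inherits the other side's value instead of dragging the
lcm to zero. A small userspace check of the arithmetic (gcd() re-implemented
here for the sketch; the kernel takes it from <linux/gcd.h>):

#include <stdio.h>

static unsigned int gcd(unsigned int a, unsigned int b)
{
        while (b) {
                unsigned int t = b;

                b = a % b;
                a = t;
        }
        return a;
}

static unsigned int lcm(unsigned int a, unsigned int b)
{
        if (a && b)
                return (a * b) / gcd(a, b);
        else if (b)
                return b;

        return a;
}

int main(void)
{
        printf("%u\n", lcm(1536, 4096));        /* prints 12288 */
        printf("%u\n", lcm(0, 4096));           /* prints 4096 */
        return 0;
}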
@@ -502,6 +517,10 @@ EXPORT_SYMBOL(blk_queue_stack_limits);
 int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
                     sector_t offset)
 {
+       int ret;
+
+       ret = 0;
+
        t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
        t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
        t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
@@ -526,12 +545,19 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 
        t->io_min = max(t->io_min, b->io_min);
        t->no_cluster |= b->no_cluster;
+       t->discard_zeroes_data &= b->discard_zeroes_data;
 
        /* Bottom device offset aligned? */
        if (offset &&
            (offset & (b->physical_block_size - 1)) != b->alignment_offset) {
                t->misaligned = 1;
-               return -1;
+               ret = -1;
+       }
+
+       if (offset &&
+           (offset & (b->discard_granularity - 1)) != b->discard_alignment) {
+               t->discard_misaligned = 1;
+               ret = -1;
        }
 
        /* If top has no alignment offset, inherit from bottom */
@@ -539,23 +565,26 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
                t->alignment_offset =
                        b->alignment_offset & (b->physical_block_size - 1);
 
+       if (!t->discard_alignment)
+               t->discard_alignment =
+                       b->discard_alignment & (b->discard_granularity - 1);
+
        /* Top device aligned on logical block boundary? */
        if (t->alignment_offset & (t->logical_block_size - 1)) {
                t->misaligned = 1;
-               return -1;
+               ret = -1;
        }
 
-       /* Find lcm() of optimal I/O size */
-       if (t->io_opt && b->io_opt)
-               t->io_opt = (t->io_opt * b->io_opt) / gcd(t->io_opt, b->io_opt);
-       else if (b->io_opt)
-               t->io_opt = b->io_opt;
+       /* Find lcm() of optimal I/O size and granularity */
+       t->io_opt = lcm(t->io_opt, b->io_opt);
+       t->discard_granularity = lcm(t->discard_granularity,
+                                    b->discard_granularity);
 
        /* Verify that optimal I/O size is a multiple of io_min */
        if (t->io_min && t->io_opt % t->io_min)
-               return -1;
+               ret = -1;
 
-       return 0;
+       return ret;
 }
 EXPORT_SYMBOL(blk_stack_limits);
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 8a6d81a..8606c95 100644
@@ -126,6 +126,21 @@ static ssize_t queue_io_opt_show(struct request_queue *q, char *page)
        return queue_var_show(queue_io_opt(q), page);
 }
 
+static ssize_t queue_discard_granularity_show(struct request_queue *q, char *page)
+{
+       return queue_var_show(q->limits.discard_granularity, page);
+}
+
+static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
+{
+       return queue_var_show(q->limits.max_discard_sectors << 9, page);
+}
+
+static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
+{
+       return queue_var_show(queue_discard_zeroes_data(q), page);
+}
+
 static ssize_t
 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 {
@@ -293,6 +308,21 @@ static struct queue_sysfs_entry queue_io_opt_entry = {
        .show = queue_io_opt_show,
 };
 
+static struct queue_sysfs_entry queue_discard_granularity_entry = {
+       .attr = {.name = "discard_granularity", .mode = S_IRUGO },
+       .show = queue_discard_granularity_show,
+};
+
+static struct queue_sysfs_entry queue_discard_max_entry = {
+       .attr = {.name = "discard_max_bytes", .mode = S_IRUGO },
+       .show = queue_discard_max_show,
+};
+
+static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
+       .attr = {.name = "discard_zeroes_data", .mode = S_IRUGO },
+       .show = queue_discard_zeroes_data_show,
+};
+
 static struct queue_sysfs_entry queue_nonrot_entry = {
        .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
        .show = queue_nonrot_show,
@@ -328,6 +358,9 @@ static struct attribute *default_attrs[] = {
        &queue_physical_block_size_entry.attr,
        &queue_io_min_entry.attr,
        &queue_io_opt_entry.attr,
+       &queue_discard_granularity_entry.attr,
+       &queue_discard_max_entry.attr,
+       &queue_discard_zeroes_data_entry.attr,
        &queue_nonrot_entry.attr,
        &queue_nomerges_entry.attr,
        &queue_rq_affinity_entry.attr,
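A userspace sketch that reads the three new attributes back (the disk name
sda is an assumption):

#include <stdio.h>

static void show(const char *attr)
{
        char path[128], buf[64];
        FILE *f;

        snprintf(path, sizeof(path), "/sys/block/sda/queue/%s", attr);
        f = fopen(path, "r");
        if (f && fgets(buf, sizeof(buf), f))
                printf("%s: %s", attr, buf);
        if (f)
                fclose(f);
}

int main(void)
{
        show("discard_granularity");
        show("discard_max_bytes");      /* max_discard_sectors << 9 */
        show("discard_zeroes_data");
        return 0;
}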
diff --git a/block/bsg.c b/block/bsg.c
index 0676301..a9fd2d8 100644
@@ -15,6 +15,7 @@
 #include <linux/blkdev.h>
 #include <linux/poll.h>
 #include <linux/cdev.h>
+#include <linux/jiffies.h>
 #include <linux/percpu.h>
 #include <linux/uio.h>
 #include <linux/idr.h>
@@ -197,7 +198,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
        rq->cmd_len = hdr->request_len;
        rq->cmd_type = REQ_TYPE_BLOCK_PC;
 
-       rq->timeout = (hdr->timeout * HZ) / 1000;
+       rq->timeout = msecs_to_jiffies(hdr->timeout);
        if (!rq->timeout)
                rq->timeout = q->sg_timeout;
        if (!rq->timeout)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index aa1e953..f5b59e1 100644
@@ -9,6 +9,7 @@
 #include <linux/module.h>
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
+#include <linux/jiffies.h>
 #include <linux/rbtree.h>
 #include <linux/ioprio.h>
 #include <linux/blktrace_api.h>
@@ -27,6 +28,8 @@ static const int cfq_slice_sync = HZ / 10;
 static int cfq_slice_async = HZ / 25;
 static const int cfq_slice_async_rq = 2;
 static int cfq_slice_idle = HZ / 125;
+static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
+static const int cfq_hist_divisor = 4;
 
 /*
  * offset from end of service tree
@@ -38,6 +41,12 @@ static int cfq_slice_idle = HZ / 125;
  */
 #define CFQ_MIN_TT             (2)
 
+/*
+ * Allow merged cfqqs to perform this amount of seeky I/O before
+ * deciding to break the queues up again.
+ */
+#define CFQQ_COOP_TOUT         (HZ)
+
 #define CFQ_SLICE_SCALE                (5)
 #define CFQ_HW_QUEUE_MIN       (5)
 
@@ -67,8 +76,9 @@ static DEFINE_SPINLOCK(ioc_gone_lock);
 struct cfq_rb_root {
        struct rb_root rb;
        struct rb_node *left;
+       unsigned count;
 };
-#define CFQ_RB_ROOT    (struct cfq_rb_root) { RB_ROOT, NULL, }
+#define CFQ_RB_ROOT    (struct cfq_rb_root) { RB_ROOT, NULL, 0, }
 
 /*
  * Per process-grouping structure
@@ -112,19 +122,57 @@ struct cfq_queue {
        unsigned short ioprio, org_ioprio;
        unsigned short ioprio_class, org_ioprio_class;
 
+       unsigned int seek_samples;
+       u64 seek_total;
+       sector_t seek_mean;
+       sector_t last_request_pos;
+       unsigned long seeky_start;
+
        pid_t pid;
+
+       struct cfq_rb_root *service_tree;
+       struct cfq_queue *new_cfqq;
+};
+
+/*
+ * First index in the service_trees.
+ * IDLE is handled separately, so it has negative index
+ */
+enum wl_prio_t {
+       IDLE_WORKLOAD = -1,
+       BE_WORKLOAD = 0,
+       RT_WORKLOAD = 1
 };
 
 /*
+ * Second index in the service_trees.
+ */
+enum wl_type_t {
+       ASYNC_WORKLOAD = 0,
+       SYNC_NOIDLE_WORKLOAD = 1,
+       SYNC_WORKLOAD = 2
+};
+
+
+/*
  * Per block device queue structure
  */
 struct cfq_data {
        struct request_queue *queue;
 
        /*
-        * rr list of queues with requests and the count of them
+        * rr lists of queues with requests, one rr for each priority class.
+        * Counts are embedded in the cfq_rb_root
         */
-       struct cfq_rb_root service_tree;
+       struct cfq_rb_root service_trees[2][3];
+       struct cfq_rb_root service_tree_idle;
+       /*
+        * The priority currently being served
+        */
+       enum wl_prio_t serving_prio;
+       enum wl_type_t serving_type;
+       unsigned long workload_expires;
+       bool noidle_tree_requires_idle;
 
        /*
         * Each priority tree is sorted by next_request position.  These
@@ -134,6 +182,7 @@ struct cfq_data {
        struct rb_root prio_trees[CFQ_PRIO_LISTS];
 
        unsigned int busy_queues;
+       unsigned int busy_queues_avg[2];
 
        int rq_in_driver[2];
        int sync_flight;
@@ -143,8 +192,14 @@ struct cfq_data {
         */
        int rq_queued;
        int hw_tag;
-       int hw_tag_samples;
-       int rq_in_driver_peak;
+       /*
+        * hw_tag can be
+        * -1 => indeterminate (cfq will behave as if NCQ is present, to allow better detection)
+        *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
+        *  0 => no NCQ
+        */
+       int hw_tag_est_depth;
+       unsigned int hw_tag_samples;
 
        /*
         * idle window management
@@ -185,6 +240,16 @@ struct cfq_data {
        unsigned long last_end_sync_rq;
 };
 
+static struct cfq_rb_root *service_tree_for(enum wl_prio_t prio,
+                                           enum wl_type_t type,
+                                           struct cfq_data *cfqd)
+{
+       if (prio == IDLE_WORKLOAD)
+               return &cfqd->service_tree_idle;
+
+       return &cfqd->service_trees[prio][type];
+}
+
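The resulting layout: service_trees[2][3] is indexed first by wl_prio_t
(BE_WORKLOAD = 0, RT_WORKLOAD = 1) and then by wl_type_t, so a real-time sync
queue lands in service_trees[1][2], while idle-class queues bypass the matrix
entirely via service_tree_idle.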
 enum cfqq_state_flags {
        CFQ_CFQQ_FLAG_on_rr = 0,        /* on round-robin busy list */
        CFQ_CFQQ_FLAG_wait_request,     /* waiting for a request */
@@ -195,8 +260,8 @@ enum cfqq_state_flags {
        CFQ_CFQQ_FLAG_prio_changed,     /* task priority has changed */
        CFQ_CFQQ_FLAG_slice_new,        /* no requests dispatched in slice */
        CFQ_CFQQ_FLAG_sync,             /* synchronous queue */
-       CFQ_CFQQ_FLAG_coop,             /* has done a coop jump of the queue */
-       CFQ_CFQQ_FLAG_coop_preempt,     /* coop preempt */
+       CFQ_CFQQ_FLAG_coop,             /* cfqq is shared */
+       CFQ_CFQQ_FLAG_deep,             /* sync cfqq experienced large depth */
 };
 
 #define CFQ_CFQQ_FNS(name)                                             \
@@ -223,7 +288,7 @@ CFQ_CFQQ_FNS(prio_changed);
 CFQ_CFQQ_FNS(slice_new);
 CFQ_CFQQ_FNS(sync);
 CFQ_CFQQ_FNS(coop);
-CFQ_CFQQ_FNS(coop_preempt);
+CFQ_CFQQ_FNS(deep);
 #undef CFQ_CFQQ_FNS
 
 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
@@ -231,6 +296,35 @@ CFQ_CFQQ_FNS(coop_preempt);
 #define cfq_log(cfqd, fmt, args...)    \
        blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
 
+static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
+{
+       if (cfq_class_idle(cfqq))
+               return IDLE_WORKLOAD;
+       if (cfq_class_rt(cfqq))
+               return RT_WORKLOAD;
+       return BE_WORKLOAD;
+}
+
+
+static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
+{
+       if (!cfq_cfqq_sync(cfqq))
+               return ASYNC_WORKLOAD;
+       if (!cfq_cfqq_idle_window(cfqq))
+               return SYNC_NOIDLE_WORKLOAD;
+       return SYNC_WORKLOAD;
+}
+
+static inline int cfq_busy_queues_wl(enum wl_prio_t wl, struct cfq_data *cfqd)
+{
+       if (wl == IDLE_WORKLOAD)
+               return cfqd->service_tree_idle.count;
+
+       return cfqd->service_trees[wl][ASYNC_WORKLOAD].count
+               + cfqd->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
+               + cfqd->service_trees[wl][SYNC_WORKLOAD].count;
+}
+
 static void cfq_dispatch_insert(struct request_queue *, struct request *);
 static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
                                       struct io_context *, gfp_t);
@@ -303,10 +397,49 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
        return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
 }
 
+/*
+ * get averaged number of queues of RT/BE priority.
+ * the average is updated with a formula that gives more weight to higher
+ * numbers, so that it follows sudden increases quickly and decays slowly
+ */
+
+static inline unsigned cfq_get_avg_queues(struct cfq_data *cfqd, bool rt)
+{
+       unsigned min_q, max_q;
+       unsigned mult  = cfq_hist_divisor - 1;
+       unsigned round = cfq_hist_divisor / 2;
+       unsigned busy = cfq_busy_queues_wl(rt, cfqd);
+
+       min_q = min(cfqd->busy_queues_avg[rt], busy);
+       max_q = max(cfqd->busy_queues_avg[rt], busy);
+       cfqd->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
+               cfq_hist_divisor;
+       return cfqd->busy_queues_avg[rt];
+}
+
 static inline void
 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
-       cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
+       unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
+       if (cfqd->cfq_latency) {
+               /* interested queues (we consider only the ones with the same
+                * priority class) */
+               unsigned iq = cfq_get_avg_queues(cfqd, cfq_class_rt(cfqq));
+               unsigned sync_slice = cfqd->cfq_slice[1];
+               unsigned expect_latency = sync_slice * iq;
+               if (expect_latency > cfq_target_latency) {
+                       unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
+                       /* scale low_slice according to IO priority
+                        * and sync vs async */
+                       unsigned low_slice =
+                               min(slice, base_low_slice * slice / sync_slice);
+                       /* the adapted slice value is scaled to fit all iqs
+                        * into the target latency */
+                       slice = max(slice * cfq_target_latency / expect_latency,
+                                   low_slice);
+               }
+       }
+       cfqq->slice_end = jiffies + slice;
        cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
 }
 
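To see the asymmetry in the averaging, a userspace trace of the formula above
with cfq_hist_divisor = 4 (so mult = 3 and round = 2):

#include <stdio.h>

int main(void)
{
        unsigned avg = 0, busy[] = { 8, 8, 0, 0, 0, 0 };
        int i;

        for (i = 0; i < 6; i++) {
                unsigned min_q = avg < busy[i] ? avg : busy[i];
                unsigned max_q = avg > busy[i] ? avg : busy[i];

                avg = (3 * max_q + min_q + 2) / 4;
                printf("busy=%u avg=%u\n", busy[i], avg);
        }
        /* avg runs 6, 8, 6, 5, 4, 3: it is near the burst of 8 after
         * two samples, then decays slowly once the queues drain */
        return 0;
}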
@@ -331,9 +464,9 @@ static inline bool cfq_slice_used(struct cfq_queue *cfqq)
  * behind the head is penalized and only allowed to a certain extent.
  */
 static struct request *
-cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
+cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
 {
-       sector_t last, s1, s2, d1 = 0, d2 = 0;
+       sector_t s1, s2, d1 = 0, d2 = 0;
        unsigned long back_max;
 #define CFQ_RQ1_WRAP   0x01 /* request 1 wraps */
 #define CFQ_RQ2_WRAP   0x02 /* request 2 wraps */
@@ -356,8 +489,6 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
        s1 = blk_rq_pos(rq1);
        s2 = blk_rq_pos(rq2);
 
-       last = cfqd->last_position;
-
        /*
         * by definition, 1KiB is 2 sectors
         */
@@ -445,6 +576,7 @@ static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
        if (root->left == n)
                root->left = NULL;
        rb_erase_init(n, &root->rb);
+       --root->count;
 }
 
 /*
@@ -471,7 +603,7 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
                        next = rb_entry_rq(rbnext);
        }
 
-       return cfq_choose_req(cfqd, next, prev);
+       return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
 }
 
 static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
@@ -485,7 +617,7 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
 }
 
 /*
- * The cfqd->service_tree holds all pending cfq_queue's that have
+ * The cfqd->service_trees hold all pending cfq_queues that have
  * requests waiting to be processed. It is sorted in the order that
  * we will service the queues.
  */
@@ -495,11 +627,13 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
        struct rb_node **p, *parent;
        struct cfq_queue *__cfqq;
        unsigned long rb_key;
+       struct cfq_rb_root *service_tree;
        int left;
 
+       service_tree = service_tree_for(cfqq_prio(cfqq), cfqq_type(cfqq), cfqd);
        if (cfq_class_idle(cfqq)) {
                rb_key = CFQ_IDLE_DELAY;
-               parent = rb_last(&cfqd->service_tree.rb);
+               parent = rb_last(&service_tree->rb);
                if (parent && parent != &cfqq->rb_node) {
                        __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
                        rb_key += __cfqq->rb_key;
@@ -517,7 +651,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
                cfqq->slice_resid = 0;
        } else {
                rb_key = -HZ;
-               __cfqq = cfq_rb_first(&cfqd->service_tree);
+               __cfqq = cfq_rb_first(service_tree);
                rb_key += __cfqq ? __cfqq->rb_key : jiffies;
        }
 
@@ -525,15 +659,18 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
                /*
                 * same position, nothing more to do
                 */
-               if (rb_key == cfqq->rb_key)
+               if (rb_key == cfqq->rb_key &&
+                   cfqq->service_tree == service_tree)
                        return;
 
-               cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
+               cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
+               cfqq->service_tree = NULL;
        }
 
        left = 1;
        parent = NULL;
-       p = &cfqd->service_tree.rb.rb_node;
+       cfqq->service_tree = service_tree;
+       p = &service_tree->rb.rb_node;
        while (*p) {
                struct rb_node **n;
 
@@ -541,35 +678,25 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
                __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
 
                /*
-                * sort RT queues first, we always want to give
-                * preference to them. IDLE queues goes to the back.
-                * after that, sort on the next service time.
+                * sort by key, that represents service time.
                 */
-               if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq))
+               if (time_before(rb_key, __cfqq->rb_key))
                        n = &(*p)->rb_left;
-               else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq))
-                       n = &(*p)->rb_right;
-               else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq))
-                       n = &(*p)->rb_left;
-               else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq))
-                       n = &(*p)->rb_right;
-               else if (time_before(rb_key, __cfqq->rb_key))
-                       n = &(*p)->rb_left;
-               else
+               else {
                        n = &(*p)->rb_right;
-
-               if (n == &(*p)->rb_right)
                        left = 0;
+               }
 
                p = n;
        }
 
        if (left)
-               cfqd->service_tree.left = &cfqq->rb_node;
+               service_tree->left = &cfqq->rb_node;
 
        cfqq->rb_key = rb_key;
        rb_link_node(&cfqq->rb_node, parent, p);
-       rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb);
+       rb_insert_color(&cfqq->rb_node, &service_tree->rb);
+       service_tree->count++;
 }
 
 static struct cfq_queue *
@@ -671,8 +798,10 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
        BUG_ON(!cfq_cfqq_on_rr(cfqq));
        cfq_clear_cfqq_on_rr(cfqq);
 
-       if (!RB_EMPTY_NODE(&cfqq->rb_node))
-               cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
+       if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
+               cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
+               cfqq->service_tree = NULL;
+       }
        if (cfqq->p_root) {
                rb_erase(&cfqq->p_node, cfqq->p_root);
                cfqq->p_root = NULL;
@@ -722,7 +851,7 @@ static void cfq_add_rq_rb(struct request *rq)
         * check if this request is a better next-serve candidate
         */
        prev = cfqq->next_rq;
-       cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq);
+       cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
 
        /*
         * adjust priority tree position, if ->next_rq changes
@@ -829,6 +958,7 @@ static void
 cfq_merged_requests(struct request_queue *q, struct request *rq,
                    struct request *next)
 {
+       struct cfq_queue *cfqq = RQ_CFQQ(rq);
        /*
         * reposition in fifo if next is older than rq
         */
@@ -838,6 +968,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
                rq_set_fifo_time(rq, rq_fifo_time(next));
        }
 
+       if (cfqq->next_rq == next)
+               cfqq->next_rq = rq;
        cfq_remove_request(next);
 }
 
@@ -933,10 +1065,12 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
  */
 static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 {
-       if (RB_EMPTY_ROOT(&cfqd->service_tree.rb))
-               return NULL;
+       struct cfq_rb_root *service_tree =
+               service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd);
 
-       return cfq_rb_first(&cfqd->service_tree);
+       if (RB_EMPTY_ROOT(&service_tree->rb))
+               return NULL;
+       return cfq_rb_first(service_tree);
 }
 
 /*
@@ -945,14 +1079,8 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
                                              struct cfq_queue *cfqq)
 {
-       if (!cfqq) {
+       if (!cfqq)
                cfqq = cfq_get_next_queue(cfqd);
-               if (cfqq && !cfq_cfqq_coop_preempt(cfqq))
-                       cfq_clear_cfqq_coop(cfqq);
-       }
-
-       if (cfqq)
-               cfq_clear_cfqq_coop_preempt(cfqq);
 
        __cfq_set_active_queue(cfqd, cfqq);
        return cfqq;
@@ -967,16 +1095,16 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
                return cfqd->last_position - blk_rq_pos(rq);
 }
 
-#define CIC_SEEK_THR   8 * 1024
-#define CIC_SEEKY(cic) ((cic)->seek_mean > CIC_SEEK_THR)
+#define CFQQ_SEEK_THR          8 * 1024
+#define CFQQ_SEEKY(cfqq)       ((cfqq)->seek_mean > CFQQ_SEEK_THR)
 
-static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
+static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+                              struct request *rq)
 {
-       struct cfq_io_context *cic = cfqd->active_cic;
-       sector_t sdist = cic->seek_mean;
+       sector_t sdist = cfqq->seek_mean;
 
-       if (!sample_valid(cic->seek_samples))
-               sdist = CIC_SEEK_THR;
+       if (!sample_valid(cfqq->seek_samples))
+               sdist = CFQQ_SEEK_THR;
 
        return cfq_dist_from_last(cfqd, rq) <= sdist;
 }
@@ -1005,7 +1133,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
         * will contain the closest sector.
         */
        __cfqq = rb_entry(parent, struct cfq_queue, p_node);
-       if (cfq_rq_close(cfqd, __cfqq->next_rq))
+       if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
                return __cfqq;
 
        if (blk_rq_pos(__cfqq->next_rq) < sector)
@@ -1016,7 +1144,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
                return NULL;
 
        __cfqq = rb_entry(node, struct cfq_queue, p_node);
-       if (cfq_rq_close(cfqd, __cfqq->next_rq))
+       if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
                return __cfqq;
 
        return NULL;
@@ -1033,16 +1161,13 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
  * assumption.
  */
 static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
-                                             struct cfq_queue *cur_cfqq,
-                                             bool probe)
+                                             struct cfq_queue *cur_cfqq)
 {
        struct cfq_queue *cfqq;
 
-       /*
-        * A valid cfq_io_context is necessary to compare requests against
-        * the seek_mean of the current cfqq.
-        */
-       if (!cfqd->active_cic)
+       if (!cfq_cfqq_sync(cur_cfqq))
+               return NULL;
+       if (CFQQ_SEEKY(cur_cfqq))
                return NULL;
 
        /*
@@ -1054,14 +1179,53 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
        if (!cfqq)
                return NULL;
 
-       if (cfq_cfqq_coop(cfqq))
+       /*
+        * It only makes sense to merge sync queues.
+        */
+       if (!cfq_cfqq_sync(cfqq))
+               return NULL;
+       if (CFQQ_SEEKY(cfqq))
+               return NULL;
+
+       /*
+        * Do not merge queues of different priority classes
+        */
+       if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
                return NULL;
 
-       if (!probe)
-               cfq_mark_cfqq_coop(cfqq);
        return cfqq;
 }
 
+/*
+ * Determine whether we should enforce idle window for this queue.
+ */
+
+static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+       enum wl_prio_t prio = cfqq_prio(cfqq);
+       struct cfq_rb_root *service_tree = cfqq->service_tree;
+
+       /* We never do for idle class queues. */
+       if (prio == IDLE_WORKLOAD)
+               return false;
+
+       /* We do for queues that were marked with idle window flag. */
+       if (cfq_cfqq_idle_window(cfqq))
+               return true;
+
+       /*
+        * Otherwise, we do only if they are the last ones
+        * in their service tree.
+        */
+       if (!service_tree)
+               service_tree = service_tree_for(prio, cfqq_type(cfqq), cfqd);
+
+       if (service_tree->count == 0)
+               return true;
+
+       return (service_tree->count == 1 && cfq_rb_first(service_tree) == cfqq);
+}
+
 static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 {
        struct cfq_queue *cfqq = cfqd->active_queue;
@@ -1082,13 +1246,13 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
        /*
         * idle is disabled, either manually or by past process history
         */
-       if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq))
+       if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq))
                return;
 
        /*
-        * still requests with the driver, don't idle
+        * still active requests from this queue, don't idle
         */
-       if (rq_in_driver(cfqd))
+       if (cfqq->dispatched)
                return;
 
        /*
@@ -1109,14 +1273,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 
        cfq_mark_cfqq_wait_request(cfqq);
 
-       /*
-        * we don't want to idle for seeks, but we do want to allow
-        * fair distribution of slice time for a process doing back-to-back
-        * seeks. so allow a little bit of time for him to submit a new rq
-        */
        sl = cfqd->cfq_slice_idle;
-       if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
-               sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
 
        mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
        cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
@@ -1175,6 +1332,153 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 }
 
 /*
+ * Must be called with the queue_lock held.
+ */
+static int cfqq_process_refs(struct cfq_queue *cfqq)
+{
+       int process_refs, io_refs;
+
+       io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
+       process_refs = atomic_read(&cfqq->ref) - io_refs;
+       BUG_ON(process_refs < 0);
+       return process_refs;
+}
+
+static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
+{
+       int process_refs, new_process_refs;
+       struct cfq_queue *__cfqq;
+
+       /* Avoid a circular list and skip interim queue merges */
+       while ((__cfqq = new_cfqq->new_cfqq)) {
+               if (__cfqq == cfqq)
+                       return;
+               new_cfqq = __cfqq;
+       }
+
+       process_refs = cfqq_process_refs(cfqq);
+       /*
+        * If the process for the cfqq has gone away, there is no
+        * sense in merging the queues.
+        */
+       if (process_refs == 0)
+               return;
+
+       /*
+        * Merge in the direction of the lesser amount of work.
+        */
+       new_process_refs = cfqq_process_refs(new_cfqq);
+       if (new_process_refs >= process_refs) {
+               cfqq->new_cfqq = new_cfqq;
+               atomic_add(process_refs, &new_cfqq->ref);
+       } else {
+               new_cfqq->new_cfqq = cfqq;
+               atomic_add(new_process_refs, &cfqq->ref);
+       }
+}
+
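The reference arithmetic: cfqq_process_refs() derives the number of processes
still attached to a cfqq by subtracting the in-flight I/O references
(allocated[READ] + allocated[WRITE]) from the atomic ref count, so e.g.
ref = 5 with 3 allocated requests leaves 2 process references. The merge then
points the queue with fewer process refs at the busier one and transfers that
many references, guaranteeing the target outlives every process that may
still follow the new_cfqq link.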
+static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, enum wl_prio_t prio,
+                                   bool prio_changed)
+{
+       struct cfq_queue *queue;
+       int i;
+       bool key_valid = false;
+       unsigned long lowest_key = 0;
+       enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
+
+       if (prio_changed) {
+               /*
+                * When the serving priority has just switched, we prefer starting
+                * from SYNC_NOIDLE (first choice), or just SYNC
+                * over ASYNC
+                */
+               if (service_tree_for(prio, cur_best, cfqd)->count)
+                       return cur_best;
+               cur_best = SYNC_WORKLOAD;
+               if (service_tree_for(prio, cur_best, cfqd)->count)
+                       return cur_best;
+
+               return ASYNC_WORKLOAD;
+       }
+
+       for (i = 0; i < 3; ++i) {
+               /* otherwise, select the one with lowest rb_key */
+               queue = cfq_rb_first(service_tree_for(prio, i, cfqd));
+               if (queue &&
+                   (!key_valid || time_before(queue->rb_key, lowest_key))) {
+                       lowest_key = queue->rb_key;
+                       cur_best = i;
+                       key_valid = true;
+               }
+       }
+
+       return cur_best;
+}
+
+static void choose_service_tree(struct cfq_data *cfqd)
+{
+       enum wl_prio_t previous_prio = cfqd->serving_prio;
+       bool prio_changed;
+       unsigned slice;
+       unsigned count;
+
+       /* Choose next priority. RT > BE > IDLE */
+       if (cfq_busy_queues_wl(RT_WORKLOAD, cfqd))
+               cfqd->serving_prio = RT_WORKLOAD;
+       else if (cfq_busy_queues_wl(BE_WORKLOAD, cfqd))
+               cfqd->serving_prio = BE_WORKLOAD;
+       else {
+               cfqd->serving_prio = IDLE_WORKLOAD;
+               cfqd->workload_expires = jiffies + 1;
+               return;
+       }
+
+       /*
+        * For RT and BE, we also have to choose the type
+        * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
+        * expiration time
+        */
+       prio_changed = (cfqd->serving_prio != previous_prio);
+       count = service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd)
+               ->count;
+
+       /*
+        * If priority didn't change, check workload expiration,
+        * and that we still have other queues ready
+        */
+       if (!prio_changed && count &&
+           !time_after(jiffies, cfqd->workload_expires))
+               return;
+
+       /* otherwise select new workload type */
+       cfqd->serving_type =
+               cfq_choose_wl(cfqd, cfqd->serving_prio, prio_changed);
+       count = service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd)
+               ->count;
+
+       /*
+        * the workload slice is computed as a fraction of target latency
+        * proportional to the number of queues in that workload, over
+        * all the queues in the same priority class
+        */
+       slice = cfq_target_latency * count /
+               max_t(unsigned, cfqd->busy_queues_avg[cfqd->serving_prio],
+                     cfq_busy_queues_wl(cfqd->serving_prio, cfqd));
+
+       if (cfqd->serving_type == ASYNC_WORKLOAD)
+               /* async workload slice is scaled down according to
+                * the sync/async slice ratio. */
+               slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
+       else
+               /* sync workload slice is at least 2 * cfq_slice_idle */
+               slice = max(slice, 2 * cfqd->cfq_slice_idle);
+
+       slice = max_t(unsigned, slice, CFQ_MIN_TT);
+       cfqd->workload_expires = jiffies + slice;
+       cfqd->noidle_tree_requires_idle = false;
+}
+
+/*
  * Select a queue for service. If we have a current active queue,
  * check whether to continue servicing it, or retrieve and set a new one.
  */
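Worked through with the defaults declared at the top of this file
(cfq_target_latency = 300 ms, sync slice 100 ms, async slice 40 ms,
cfq_slice_idle = 8 ms): two queues on the serving tree out of six busy BE
queues get slice = 300 * 2 / 6 = 100 ms; the same count on the ASYNC tree is
scaled by the 40/100 slice ratio down to 40 ms, and a sync workload slice is
never cut below 2 * 8 = 16 ms.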
@@ -1203,11 +1507,14 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
         * If another queue has a request waiting within our mean seek
         * distance, let it run.  The expire code will check for close
         * cooperators and put the close queue at the front of the service
-        * tree.
+        * tree.  If possible, merge the expiring queue with the new cfqq.
         */
-       new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0);
-       if (new_cfqq)
+       new_cfqq = cfq_close_cooperator(cfqd, cfqq);
+       if (new_cfqq) {
+               if (!cfqq->new_cfqq)
+                       cfq_setup_merge(cfqq, new_cfqq);
                goto expire;
+       }
 
        /*
         * No requests pending. If the active queue still has requests in
@@ -1215,7 +1522,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
         * conditions to happen (or time out) before selecting a new queue.
         */
        if (timer_pending(&cfqd->idle_slice_timer) ||
-           (cfqq->dispatched && cfq_cfqq_idle_window(cfqq))) {
+           (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) {
                cfqq = NULL;
                goto keep_queue;
        }
@@ -1223,6 +1530,13 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 expire:
        cfq_slice_expired(cfqd, 0);
 new_queue:
+       /*
+        * Current queue expired. Check if we have to switch to a new
+        * service tree
+        */
+       if (!new_cfqq)
+               choose_service_tree(cfqd);
+
        cfqq = cfq_set_active_queue(cfqd, new_cfqq);
 keep_queue:
        return cfqq;
@@ -1249,8 +1563,14 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
 {
        struct cfq_queue *cfqq;
        int dispatched = 0;
-
-       while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL)
+       int i, j;
+       for (i = 0; i < 2; ++i)
+               for (j = 0; j < 3; ++j)
+                       while ((cfqq = cfq_rb_first(&cfqd->service_trees[i][j]))
+                               != NULL)
+                               dispatched += __cfq_forced_dispatch_cfqq(cfqq);
+
+       while ((cfqq = cfq_rb_first(&cfqd->service_tree_idle)) != NULL)
                dispatched += __cfq_forced_dispatch_cfqq(cfqq);
 
        cfq_slice_expired(cfqd, 0);
@@ -1268,7 +1588,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
        /*
         * Drain async requests before we start sync IO
         */
-       if (cfq_cfqq_idle_window(cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC])
+       if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC])
                return false;
 
        /*
@@ -1298,9 +1618,9 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
                        return false;
 
                /*
-                * Sole queue user, allow bigger slice
+                * Sole queue user, no limit
                 */
-               max_dispatch *= 4;
+               max_dispatch = -1;
        }
 
        /*
@@ -1518,11 +1838,29 @@ static void cfq_free_io_context(struct io_context *ioc)
 
 static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
+       struct cfq_queue *__cfqq, *next;
+
        if (unlikely(cfqq == cfqd->active_queue)) {
                __cfq_slice_expired(cfqd, cfqq, 0);
                cfq_schedule_dispatch(cfqd);
        }
 
+       /*
+        * If this queue was scheduled to merge with another queue, be
+        * sure to drop the reference taken on that queue (and others in
+        * the merge chain).  See cfq_setup_merge and cfq_merge_cfqqs.
+        */
+       __cfqq = cfqq->new_cfqq;
+       while (__cfqq) {
+               if (__cfqq == cfqq) {
+                       WARN(1, "cfqq->new_cfqq loop detected\n");
+                       break;
+               }
+               next = __cfqq->new_cfqq;
+               cfq_put_queue(__cfqq);
+               __cfqq = next;
+       }
+
        cfq_put_queue(cfqq);
 }
 
@@ -1952,33 +2290,46 @@ cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
 }
 
 static void
-cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
+cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
                       struct request *rq)
 {
        sector_t sdist;
        u64 total;
 
-       if (!cic->last_request_pos)
+       if (!cfqq->last_request_pos)
                sdist = 0;
-       else if (cic->last_request_pos < blk_rq_pos(rq))
-               sdist = blk_rq_pos(rq) - cic->last_request_pos;
+       else if (cfqq->last_request_pos < blk_rq_pos(rq))
+               sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
        else
-               sdist = cic->last_request_pos - blk_rq_pos(rq);
+               sdist = cfqq->last_request_pos - blk_rq_pos(rq);
 
        /*
         * Don't allow the seek distance to get too large from the
         * odd fragment, pagein, etc
         */
-       if (cic->seek_samples <= 60) /* second&third seek */
-               sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
+       if (cfqq->seek_samples <= 60) /* second&third seek */
+               sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*1024);
        else
-               sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64);
+               sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*64);
+
+       cfqq->seek_samples = (7*cfqq->seek_samples + 256) / 8;
+       cfqq->seek_total = (7*cfqq->seek_total + (u64)256*sdist) / 8;
+       total = cfqq->seek_total + (cfqq->seek_samples/2);
+       do_div(total, cfqq->seek_samples);
+       cfqq->seek_mean = (sector_t)total;
 
-       cic->seek_samples = (7*cic->seek_samples + 256) / 8;
-       cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
-       total = cic->seek_total + (cic->seek_samples/2);
-       do_div(total, cic->seek_samples);
-       cic->seek_mean = (sector_t)total;
+       /*
+        * If this cfqq is shared between multiple processes, check to
+        * make sure that those processes are still issuing I/Os within
+        * the mean seek distance.  If not, it may be time to break the
+        * queues apart again.
+        */
+       if (cfq_cfqq_coop(cfqq)) {
+               if (CFQQ_SEEKY(cfqq) && !cfqq->seeky_start)
+                       cfqq->seeky_start = jiffies;
+               else if (!CFQQ_SEEKY(cfqq))
+                       cfqq->seeky_start = 0;
+       }
 }
 
 /*
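The per-cfqq statistics keep the cic code's fixed-point scheme: the sample
count converges on 256 and the mean tracks recent distances. A userspace
sketch of the convergence (the steady distance of 1000 is illustrative):

#include <stdio.h>

int main(void)
{
        unsigned samples = 0;
        unsigned long long total = 0, mean = 0;
        int i;

        for (i = 0; i < 64; i++) {
                unsigned long long sdist = 1000;

                samples = (7 * samples + 256) / 8;
                total = (7 * total + 256 * sdist) / 8;
                mean = (total + samples / 2) / samples;
        }
        /* samples settles near 256, mean near the steady distance */
        printf("samples=%u mean=%llu\n", samples, mean);
        return 0;
}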
@@ -1999,14 +2350,15 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 
        enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
 
+       if (cfqq->queued[0] + cfqq->queued[1] >= 4)
+               cfq_mark_cfqq_deep(cfqq);
+
        if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
-           (!cfqd->cfq_latency && cfqd->hw_tag && CIC_SEEKY(cic)))
+           (!cfq_cfqq_deep(cfqq) && sample_valid(cfqq->seek_samples)
+            && CFQQ_SEEKY(cfqq)))
                enable_idle = 0;
        else if (sample_valid(cic->ttime_samples)) {
-               unsigned int slice_idle = cfqd->cfq_slice_idle;
-               if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
-                       slice_idle = msecs_to_jiffies(CFQ_MIN_TT);
-               if (cic->ttime_mean > slice_idle)
+               if (cic->ttime_mean > cfqd->cfq_slice_idle)
                        enable_idle = 0;
                else
                        enable_idle = 1;
@@ -2044,6 +2396,11 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
        if (cfq_class_idle(cfqq))
                return true;
 
+       if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
+           cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
+           new_cfqq->service_tree->count == 1)
+               return true;
+
        /*
         * if the new request is sync, but the currently running queue is
         * not, let the sync request have priority.
@@ -2071,16 +2428,8 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
         * if this request is as-good as one we would expect from the
         * current cfqq, let it preempt
         */
-       if (cfq_rq_close(cfqd, rq) && (!cfq_cfqq_coop(new_cfqq) ||
-           cfqd->busy_queues == 1)) {
-               /*
-                * Mark new queue coop_preempt, so its coop flag will not be
-                * cleared when new queue gets scheduled at the very first time
-                */
-               cfq_mark_cfqq_coop_preempt(new_cfqq);
-               cfq_mark_cfqq_coop(new_cfqq);
+       if (cfq_rq_close(cfqd, cfqq, rq))
                return true;
-       }
 
        return false;
 }
@@ -2121,10 +2470,10 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
                cfqq->meta_pending++;
 
        cfq_update_io_thinktime(cfqd, cic);
-       cfq_update_io_seektime(cfqd, cic, rq);
+       cfq_update_io_seektime(cfqd, cfqq, rq);
        cfq_update_idle_window(cfqd, cfqq, cic);
 
-       cic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
+       cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
 
        if (cfqq == cfqd->active_queue) {
                /*
@@ -2165,10 +2514,9 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
        cfq_log_cfqq(cfqd, cfqq, "insert_request");
        cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
 
-       cfq_add_rq_rb(rq);
-
        rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
        list_add_tail(&rq->queuelist, &cfqq->fifo);
+       cfq_add_rq_rb(rq);
 
        cfq_rq_enqueued(cfqd, cfqq, rq);
 }
@@ -2179,23 +2527,35 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
  */
 static void cfq_update_hw_tag(struct cfq_data *cfqd)
 {
-       if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak)
-               cfqd->rq_in_driver_peak = rq_in_driver(cfqd);
+       struct cfq_queue *cfqq = cfqd->active_queue;
+
+       if (rq_in_driver(cfqd) > cfqd->hw_tag_est_depth)
+               cfqd->hw_tag_est_depth = rq_in_driver(cfqd);
+
+       if (cfqd->hw_tag == 1)
+               return;
 
        if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
            rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN)
                return;
 
+       /*
+        * If the active queue doesn't have enough requests and can idle,
+        * cfq might not dispatch sufficient requests to hardware. Don't
+        * zero hw_tag in this case.
+        */
+       if (cfqq && cfq_cfqq_idle_window(cfqq) &&
+           cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
+           CFQ_HW_QUEUE_MIN && rq_in_driver(cfqd) < CFQ_HW_QUEUE_MIN)
+               return;
+
        if (cfqd->hw_tag_samples++ < 50)
                return;
 
-       if (cfqd->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN)
+       if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
                cfqd->hw_tag = 1;
        else
                cfqd->hw_tag = 0;
-
-       cfqd->hw_tag_samples = 0;
-       cfqd->rq_in_driver_peak = 0;
 }
 
 static void cfq_completed_request(struct request_queue *q, struct request *rq)
@@ -2235,17 +2595,27 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
                        cfq_clear_cfqq_slice_new(cfqq);
                }
                /*
-                * If there are no requests waiting in this queue, and
-                * there are other queues ready to issue requests, AND
-                * those other queues are issuing requests within our
-                * mean seek distance, give them a chance to run instead
-                * of idling.
+                * Idling is not enabled on:
+                * - expired queues
+                * - idle-priority queues
+                * - async queues
+                * - queues with still some requests queued
+                * - when there is a close cooperator
                 */
                if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
                        cfq_slice_expired(cfqd, 1);
-               else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) &&
-                        sync && !rq_noidle(rq))
-                       cfq_arm_slice_timer(cfqd);
+               else if (sync && cfqq_empty &&
+                        !cfq_close_cooperator(cfqd, cfqq)) {
+                       cfqd->noidle_tree_requires_idle |= !rq_noidle(rq);
+                       /*
+                        * Idling is enabled for SYNC_WORKLOAD.
+                        * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
+                        * only if we processed at least one !rq_noidle request
+                        */
+                       if (cfqd->serving_type == SYNC_WORKLOAD
+                           || cfqd->noidle_tree_requires_idle)
+                               cfq_arm_slice_timer(cfqd);
+               }
        }
 
        if (!rq_in_driver(cfqd))
@@ -2269,12 +2639,10 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
                        cfqq->ioprio = IOPRIO_NORM;
        } else {
                /*
-                * check if we need to unboost the queue
+                * unboost the queue (if needed)
                 */
-               if (cfqq->ioprio_class != cfqq->org_ioprio_class)
-                       cfqq->ioprio_class = cfqq->org_ioprio_class;
-               if (cfqq->ioprio != cfqq->org_ioprio)
-                       cfqq->ioprio = cfqq->org_ioprio;
+               cfqq->ioprio_class = cfqq->org_ioprio_class;
+               cfqq->ioprio = cfqq->org_ioprio;
        }
 }
 
@@ -2338,6 +2706,43 @@ static void cfq_put_request(struct request *rq)
        }
 }
 
+static struct cfq_queue *
+cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
+               struct cfq_queue *cfqq)
+{
+       cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
+       cic_set_cfqq(cic, cfqq->new_cfqq, 1);
+       cfq_mark_cfqq_coop(cfqq->new_cfqq);
+       cfq_put_queue(cfqq);
+       return cic_to_cfqq(cic, 1);
+}
+
+static int should_split_cfqq(struct cfq_queue *cfqq)
+{
+       if (cfqq->seeky_start &&
+           time_after(jiffies, cfqq->seeky_start + CFQQ_COOP_TOUT))
+               return 1;
+       return 0;
+}
+
+/*
+ * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
+ * was the last process referring to said cfqq.
+ */
+static struct cfq_queue *
+split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
+{
+       if (cfqq_process_refs(cfqq) == 1) {
+               cfqq->seeky_start = 0;
+               cfqq->pid = current->pid;
+               cfq_clear_cfqq_coop(cfqq);
+               return cfqq;
+       }
+
+       cic_set_cfqq(cic, NULL, 1);
+       cfq_put_queue(cfqq);
+       return NULL;
+}
 /*
  * Allocate cfq data structures associated with this request.
  */
@@ -2360,10 +2765,30 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
        if (!cic)
                goto queue_fail;
 
+new_queue:
        cfqq = cic_to_cfqq(cic, is_sync);
        if (!cfqq || cfqq == &cfqd->oom_cfqq) {
                cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
                cic_set_cfqq(cic, cfqq, is_sync);
+       } else {
+               /*
+                * If the queue was seeky for too long, break it apart.
+                */
+               if (cfq_cfqq_coop(cfqq) && should_split_cfqq(cfqq)) {
+                       cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
+                       cfqq = split_cfqq(cic, cfqq);
+                       if (!cfqq)
+                               goto new_queue;
+               }
+
+               /*
+                * Check to see if this queue is scheduled to merge with
+                * another, closely cooperating queue.  The merging of
+                * queues happens here as it must be done in process context.
+                * The reference on new_cfqq was taken in cfq_setup_merge.
+                */
+               if (cfqq->new_cfqq)
+                       cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
        }
 
        cfqq->allocated[rw]++;
@@ -2438,6 +2863,11 @@ static void cfq_idle_slice_timer(unsigned long data)
                 */
                if (!RB_EMPTY_ROOT(&cfqq->sort_list))
                        goto out_kick;
+
+               /*
+                * Queue depth flag is reset only when the idle didn't succeed
+                */
+               cfq_clear_cfqq_deep(cfqq);
        }
 expire:
        cfq_slice_expired(cfqd, timed_out);
@@ -2500,13 +2930,16 @@ static void cfq_exit_queue(struct elevator_queue *e)
 static void *cfq_init_queue(struct request_queue *q)
 {
        struct cfq_data *cfqd;
-       int i;
+       int i, j;
 
        cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
        if (!cfqd)
                return NULL;
 
-       cfqd->service_tree = CFQ_RB_ROOT;
+       for (i = 0; i < 2; ++i)
+               for (j = 0; j < 3; ++j)
+                       cfqd->service_trees[i][j] = CFQ_RB_ROOT;
+       cfqd->service_tree_idle = CFQ_RB_ROOT;
 
        /*
         * Not strictly needed (since RB_ROOT just clears the node and we
@@ -2544,7 +2977,7 @@ static void *cfq_init_queue(struct request_queue *q)
        cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
        cfqd->cfq_slice_idle = cfq_slice_idle;
        cfqd->cfq_latency = 1;
-       cfqd->hw_tag = 1;
+       cfqd->hw_tag = -1;
        cfqd->last_end_sync_rq = jiffies;
        return cfqd;
 }
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 9bd086c..4eb8e9e 100644
@@ -747,6 +747,8 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
                return compat_put_uint(arg, bdev_io_opt(bdev));
        case BLKALIGNOFF:
                return compat_put_int(arg, bdev_alignment_offset(bdev));
+       case BLKDISCARDZEROES:
+               return compat_put_uint(arg, bdev_discard_zeroes_data(bdev));
        case BLKFLSBUF:
        case BLKROSET:
        case BLKDISCARD:
index a847046..9ad5ccc 100644 (file)
@@ -154,10 +154,7 @@ static struct elevator_type *elevator_get(const char *name)
 
                spin_unlock(&elv_list_lock);
 
-               if (!strcmp(name, "anticipatory"))
-                       sprintf(elv, "as-iosched");
-               else
-                       sprintf(elv, "%s-iosched", name);
+               sprintf(elv, "%s-iosched", name);
 
                request_module("%s", elv);
                spin_lock(&elv_list_lock);
@@ -193,10 +190,7 @@ static int __init elevator_setup(char *str)
         * Be backwards-compatible with previous kernels, so users
         * won't get the wrong elevator.
         */
-       if (!strcmp(str, "as"))
-               strcpy(chosen_elevator, "anticipatory");
-       else
-               strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
+       strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
        return 1;
 }
 
index 517e433..b11a4ad 100644 (file)
@@ -861,12 +861,23 @@ static ssize_t disk_alignment_offset_show(struct device *dev,
        return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
 }
 
+static ssize_t disk_discard_alignment_show(struct device *dev,
+                                          struct device_attribute *attr,
+                                          char *buf)
+{
+       struct gendisk *disk = dev_to_disk(dev);
+
+       return sprintf(buf, "%u\n", queue_discard_alignment(disk->queue));
+}
+
 static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
 static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
 static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
 static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
+static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
+                  NULL);
 static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
@@ -887,6 +898,7 @@ static struct attribute *disk_attrs[] = {
        &dev_attr_ro.attr,
        &dev_attr_size.attr,
        &dev_attr_alignment_offset.attr,
+       &dev_attr_discard_alignment.attr,
        &dev_attr_capability.attr,
        &dev_attr_stat.attr,
        &dev_attr_inflight.attr,
index 1f4d1de..be48ea5 100644 (file)
@@ -280,6 +280,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
                return put_uint(arg, bdev_io_opt(bdev));
        case BLKALIGNOFF:
                return put_int(arg, bdev_alignment_offset(bdev));
+       case BLKDISCARDZEROES:
+               return put_uint(arg, bdev_discard_zeroes_data(bdev));
        case BLKSECTGET:
                return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
        case BLKRASET:
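
For reference, a minimal userspace sketch of the two discard-reporting interfaces added above (illustrative only: the device path is hypothetical, and BLKDISCARDZEROES must be visible in the installed <linux/fs.h> from this series):

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>	/* BLKDISCARDZEROES, added in this series */

	int main(void)
	{
		unsigned int zeroes = 0;
		char buf[32];
		ssize_t n;
		int fd = open("/dev/sda", O_RDONLY);	/* hypothetical device */

		if (fd < 0)
			return 1;
		/* 1 if discarded blocks read back as zeroes, 0 otherwise */
		if (ioctl(fd, BLKDISCARDZEROES, &zeroes) == 0)
			printf("discard zeroes data: %u\n", zeroes);
		close(fd);

		/* the matching sysfs attribute added in genhd.c above */
		fd = open("/sys/block/sda/discard_alignment", O_RDONLY);
		if (fd < 0)
			return 1;
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("discard alignment: %s", buf);
		}
		close(fd);
		return 0;
	}
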
index e5b1001..a8b5a10 100644 (file)
@@ -35,7 +35,9 @@
 struct blk_cmd_filter {
        unsigned long read_ok[BLK_SCSI_CMD_PER_LONG];
        unsigned long write_ok[BLK_SCSI_CMD_PER_LONG];
-} blk_default_cmd_filter;
+};
+
+static struct blk_cmd_filter blk_default_cmd_filter;
 
 /* Command group 3 is reserved and should never be used.  */
 const unsigned char scsi_command_size_tbl[8] =
@@ -675,7 +677,7 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod
 }
 EXPORT_SYMBOL(scsi_cmd_ioctl);
 
-int __init blk_scsi_ioctl_init(void)
+static int __init blk_scsi_ioctl_init(void)
 {
        blk_set_cmd_filter_defaults(&blk_default_cmd_filter);
        return 0;
index 1d886e0..77bfce5 100644 (file)
@@ -271,6 +271,8 @@ config BLK_DEV_CRYPTOLOOP
          instead, which can be configured to be on-disk compatible with the
          cryptoloop device.
 
+source "drivers/block/drbd/Kconfig"
+
 config BLK_DEV_NBD
        tristate "Network block device support"
        depends on NET
index cdaa3f8..aff5ac9 100644 (file)
@@ -36,5 +36,6 @@ obj-$(CONFIG_BLK_DEV_UB)      += ub.o
 obj-$(CONFIG_BLK_DEV_HD)       += hd.o
 
 obj-$(CONFIG_XEN_BLKDEV_FRONTEND)      += xen-blkfront.o
+obj-$(CONFIG_BLK_DEV_DRBD)     += drbd/
 
 swim_mod-objs  := swim.o swim_asm.o
index 92b1263..873e594 100644 (file)
@@ -179,19 +179,17 @@ static int rebuild_lun_table(ctlr_info_t *h, int first_time, int via_ioctl);
 static int deregister_disk(ctlr_info_t *h, int drv_index,
                           int clear_all, int via_ioctl);
 
-static void cciss_read_capacity(int ctlr, int logvol, int withirq,
+static void cciss_read_capacity(int ctlr, int logvol,
                        sector_t *total_size, unsigned int *block_size);
-static void cciss_read_capacity_16(int ctlr, int logvol, int withirq,
+static void cciss_read_capacity_16(int ctlr, int logvol,
                        sector_t *total_size, unsigned int *block_size);
 static void cciss_geometry_inquiry(int ctlr, int logvol,
-                       int withirq, sector_t total_size,
+                       sector_t total_size,
                        unsigned int block_size, InquiryData_struct *inq_buff,
                                   drive_info_struct *drv);
 static void __devinit cciss_interrupt_mode(ctlr_info_t *, struct pci_dev *,
                                           __u32);
 static void start_io(ctlr_info_t *h);
-static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
-                  __u8 page_code, unsigned char *scsi3addr, int cmd_type);
 static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size,
                        __u8 page_code, unsigned char scsi3addr[],
                        int cmd_type);
@@ -424,12 +422,9 @@ cciss_proc_write(struct file *file, const char __user *buf,
        if (strncmp(ENGAGE_SCSI, buffer, sizeof ENGAGE_SCSI - 1) == 0) {
                struct seq_file *seq = file->private_data;
                ctlr_info_t *h = seq->private;
-               int rc;
 
-               rc = cciss_engage_scsi(h->ctlr);
-               if (rc != 0)
-                       err = -rc;
-               else
+               err = cciss_engage_scsi(h->ctlr);
+               if (err == 0)
                        err = length;
        } else
 #endif /* CONFIG_CISS_SCSI_TAPE */
@@ -1657,9 +1652,11 @@ static void cciss_softirq_done(struct request *rq)
 {
        CommandList_struct *cmd = rq->completion_data;
        ctlr_info_t *h = hba[cmd->ctlr];
+       SGDescriptor_struct *curr_sg = cmd->SG;
        unsigned long flags;
        u64bit temp64;
        int i, ddir;
+       int sg_index = 0;
 
        if (cmd->Request.Type.Direction == XFER_READ)
                ddir = PCI_DMA_FROMDEVICE;
@@ -1669,9 +1666,22 @@ static void cciss_softirq_done(struct request *rq)
        /* command did not need to be retried */
        /* unmap the DMA mapping for all the scatter gather elements */
        for (i = 0; i < cmd->Header.SGList; i++) {
-               temp64.val32.lower = cmd->SG[i].Addr.lower;
-               temp64.val32.upper = cmd->SG[i].Addr.upper;
-               pci_unmap_page(h->pdev, temp64.val, cmd->SG[i].Len, ddir);
+               if (curr_sg[sg_index].Ext == CCISS_SG_CHAIN) {
+                       temp64.val32.lower = cmd->SG[i].Addr.lower;
+                       temp64.val32.upper = cmd->SG[i].Addr.upper;
+                       pci_dma_sync_single_for_cpu(h->pdev, temp64.val,
+                                               cmd->SG[i].Len, ddir);
+                       pci_unmap_single(h->pdev, temp64.val,
+                                               cmd->SG[i].Len, ddir);
+                       /* Point to the next block */
+                       curr_sg = h->cmd_sg_list[cmd->cmdindex]->sgchain;
+                       sg_index = 0;
+               }
+               temp64.val32.lower = curr_sg[sg_index].Addr.lower;
+               temp64.val32.upper = curr_sg[sg_index].Addr.upper;
+               pci_unmap_page(h->pdev, temp64.val, curr_sg[sg_index].Len,
+                               ddir);
+               ++sg_index;
        }
 
 #ifdef CCISS_DEBUG
@@ -1701,7 +1711,7 @@ static inline void log_unit_to_scsi3addr(ctlr_info_t *h,
  * via the inquiry page 0.  Model, vendor, and rev are set to empty strings if
  * they cannot be read.
  */
-static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
+static void cciss_get_device_descr(int ctlr, int logvol,
                                   char *vendor, char *model, char *rev)
 {
        int rc;
@@ -1717,14 +1727,8 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
                return;
 
        log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
-       if (withirq)
-               rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf,
-                            sizeof(InquiryData_struct), 0,
-                               scsi3addr, TYPE_CMD);
-       else
-               rc = sendcmd(CISS_INQUIRY, ctlr, inq_buf,
-                            sizeof(InquiryData_struct), 0,
-                               scsi3addr, TYPE_CMD);
+       rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf, sizeof(*inq_buf), 0,
+                       scsi3addr, TYPE_CMD);
        if (rc == IO_OK) {
                memcpy(vendor, &inq_buf->data_byte[8], VENDOR_LEN);
                vendor[VENDOR_LEN] = '\0';
@@ -1743,7 +1747,7 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
  * number cannot be had, for whatever reason, 16 bytes of 0xff
  * are returned instead.
  */
-static void cciss_get_serial_no(int ctlr, int logvol, int withirq,
+static void cciss_get_serial_no(int ctlr, int logvol,
                                unsigned char *serial_no, int buflen)
 {
 #define PAGE_83_INQ_BYTES 64
@@ -1759,12 +1763,8 @@ static void cciss_get_serial_no(int ctlr, int logvol, int withirq,
                return;
        memset(serial_no, 0, buflen);
        log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
-       if (withirq)
-               rc = sendcmd_withirq(CISS_INQUIRY, ctlr, buf,
-                       PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
-       else
-               rc = sendcmd(CISS_INQUIRY, ctlr, buf,
-                       PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
+       rc = sendcmd_withirq(CISS_INQUIRY, ctlr, buf,
+               PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
        if (rc == IO_OK)
                memcpy(serial_no, &buf[8], buflen);
        kfree(buf);
@@ -1793,10 +1793,10 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
        blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask);
 
        /* This is a hardware imposed limit. */
-       blk_queue_max_hw_segments(disk->queue, MAXSGENTRIES);
+       blk_queue_max_hw_segments(disk->queue, h->maxsgentries);
 
        /* This is a limit in the driver and could be eliminated. */
-       blk_queue_max_phys_segments(disk->queue, MAXSGENTRIES);
+       blk_queue_max_phys_segments(disk->queue, h->maxsgentries);
 
        blk_queue_max_sectors(disk->queue, h->cciss_max_sectors);
 
@@ -1852,18 +1852,16 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time,
 
        /* testing to see if 16-byte CDBs are already being used */
        if (h->cciss_read == CCISS_READ_16) {
-               cciss_read_capacity_16(h->ctlr, drv_index, 1,
+               cciss_read_capacity_16(h->ctlr, drv_index,
                        &total_size, &block_size);
 
        } else {
-               cciss_read_capacity(ctlr, drv_index, 1,
-                                   &total_size, &block_size);
-
+               cciss_read_capacity(ctlr, drv_index, &total_size, &block_size);
                /* if read_capacity returns all F's this volume is >2TB */
                /* in size so we switch to 16-byte CDB's for all */
                /* read/write ops */
                if (total_size == 0xFFFFFFFFULL) {
-                       cciss_read_capacity_16(ctlr, drv_index, 1,
+                       cciss_read_capacity_16(ctlr, drv_index,
                        &total_size, &block_size);
                        h->cciss_read = CCISS_READ_16;
                        h->cciss_write = CCISS_WRITE_16;
@@ -1873,14 +1871,14 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time,
                }
        }
 
-       cciss_geometry_inquiry(ctlr, drv_index, 1, total_size, block_size,
+       cciss_geometry_inquiry(ctlr, drv_index, total_size, block_size,
                               inq_buff, drvinfo);
        drvinfo->block_size = block_size;
        drvinfo->nr_blocks = total_size + 1;
 
-       cciss_get_device_descr(ctlr, drv_index, 1, drvinfo->vendor,
+       cciss_get_device_descr(ctlr, drv_index, drvinfo->vendor,
                                drvinfo->model, drvinfo->rev);
-       cciss_get_serial_no(ctlr, drv_index, 1, drvinfo->serial_no,
+       cciss_get_serial_no(ctlr, drv_index, drvinfo->serial_no,
                        sizeof(drvinfo->serial_no));
        /* Save the lunid in case we deregister the disk, below. */
        memcpy(drvinfo->LunID, h->drv[drv_index]->LunID,
@@ -2531,6 +2529,8 @@ static int check_target_status(ctlr_info_t *h, CommandList_struct *c)
                case 0: return IO_OK; /* no sense */
                case 1: return IO_OK; /* recovered error */
                default:
+                       if (check_for_unit_attention(h, c))
+                               return IO_NEEDS_RETRY;
                        printk(KERN_WARNING "cciss%d: cmd 0x%02x "
                                "check condition, sense key = 0x%02x\n",
                                h->ctlr, c->Request.CDB[0],
@@ -2672,7 +2672,7 @@ static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size,
 }
 
 static void cciss_geometry_inquiry(int ctlr, int logvol,
-                                  int withirq, sector_t total_size,
+                                  sector_t total_size,
                                   unsigned int block_size,
                                   InquiryData_struct *inq_buff,
                                   drive_info_struct *drv)
@@ -2683,14 +2683,8 @@ static void cciss_geometry_inquiry(int ctlr, int logvol,
 
        memset(inq_buff, 0, sizeof(InquiryData_struct));
        log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
-       if (withirq)
-               return_code = sendcmd_withirq(CISS_INQUIRY, ctlr,
-                                             inq_buff, sizeof(*inq_buff),
-                                             0xC1, scsi3addr, TYPE_CMD);
-       else
-               return_code = sendcmd(CISS_INQUIRY, ctlr, inq_buff,
-                                     sizeof(*inq_buff), 0xC1, scsi3addr,
-                                     TYPE_CMD);
+       return_code = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buff,
+                       sizeof(*inq_buff), 0xC1, scsi3addr, TYPE_CMD);
        if (return_code == IO_OK) {
                if (inq_buff->data_byte[8] == 0xFF) {
                        printk(KERN_WARNING
@@ -2723,7 +2717,7 @@ static void cciss_geometry_inquiry(int ctlr, int logvol,
 }
 
 static void
-cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size,
+cciss_read_capacity(int ctlr, int logvol, sector_t *total_size,
                    unsigned int *block_size)
 {
        ReadCapdata_struct *buf;
@@ -2737,14 +2731,8 @@ cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size,
        }
 
        log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
-       if (withirq)
-               return_code = sendcmd_withirq(CCISS_READ_CAPACITY,
-                               ctlr, buf, sizeof(ReadCapdata_struct),
-                                       0, scsi3addr, TYPE_CMD);
-       else
-               return_code = sendcmd(CCISS_READ_CAPACITY,
-                               ctlr, buf, sizeof(ReadCapdata_struct),
-                                       0, scsi3addr, TYPE_CMD);
+       return_code = sendcmd_withirq(CCISS_READ_CAPACITY, ctlr, buf,
+               sizeof(ReadCapdata_struct), 0, scsi3addr, TYPE_CMD);
        if (return_code == IO_OK) {
                *total_size = be32_to_cpu(*(__be32 *) buf->total_size);
                *block_size = be32_to_cpu(*(__be32 *) buf->block_size);
@@ -2756,8 +2744,8 @@ cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size,
        kfree(buf);
 }
 
-static void
-cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size,                                unsigned int *block_size)
+static void cciss_read_capacity_16(int ctlr, int logvol,
+       sector_t *total_size, unsigned int *block_size)
 {
        ReadCapdata_struct_16 *buf;
        int return_code;
@@ -2770,16 +2758,9 @@ cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size,
        }
 
        log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
-       if (withirq) {
-               return_code = sendcmd_withirq(CCISS_READ_CAPACITY_16,
-                       ctlr, buf, sizeof(ReadCapdata_struct_16),
-                               0, scsi3addr, TYPE_CMD);
-       }
-       else {
-               return_code = sendcmd(CCISS_READ_CAPACITY_16,
-                       ctlr, buf, sizeof(ReadCapdata_struct_16),
-                               0, scsi3addr, TYPE_CMD);
-       }
+       return_code = sendcmd_withirq(CCISS_READ_CAPACITY_16,
+               ctlr, buf, sizeof(ReadCapdata_struct_16),
+                       0, scsi3addr, TYPE_CMD);
        if (return_code == IO_OK) {
                *total_size = be64_to_cpu(*(__be64 *) buf->total_size);
                *block_size = be32_to_cpu(*(__be32 *) buf->block_size);
@@ -2820,13 +2801,13 @@ static int cciss_revalidate(struct gendisk *disk)
                return 1;
        }
        if (h->cciss_read == CCISS_READ_10) {
-               cciss_read_capacity(h->ctlr, logvol, 1,
+               cciss_read_capacity(h->ctlr, logvol,
                                        &total_size, &block_size);
        } else {
-               cciss_read_capacity_16(h->ctlr, logvol, 1,
+               cciss_read_capacity_16(h->ctlr, logvol,
                                        &total_size, &block_size);
        }
-       cciss_geometry_inquiry(h->ctlr, logvol, 1, total_size, block_size,
+       cciss_geometry_inquiry(h->ctlr, logvol, total_size, block_size,
                               inq_buff, drv);
 
        blk_queue_logical_block_size(drv->queue, drv->block_size);
@@ -2837,167 +2818,6 @@ static int cciss_revalidate(struct gendisk *disk)
 }
 
 /*
- *   Wait polling for a command to complete.
- *   The memory mapped FIFO is polled for the completion.
- *   Used only at init time, interrupts from the HBA are disabled.
- */
-static unsigned long pollcomplete(int ctlr)
-{
-       unsigned long done;
-       int i;
-
-       /* Wait (up to 20 seconds) for a command to complete */
-
-       for (i = 20 * HZ; i > 0; i--) {
-               done = hba[ctlr]->access.command_completed(hba[ctlr]);
-               if (done == FIFO_EMPTY)
-                       schedule_timeout_uninterruptible(1);
-               else
-                       return done;
-       }
-       /* Invalid address to tell caller we ran out of time */
-       return 1;
-}
-
-/* Send command c to controller h and poll for it to complete.
- * Turns interrupts off on the board.  Used at driver init time
- * and during SCSI error recovery.
- */
-static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c)
-{
-       int i;
-       unsigned long complete;
-       int status = IO_ERROR;
-       u64bit buff_dma_handle;
-
-resend_cmd1:
-
-       /* Disable interrupt on the board. */
-       h->access.set_intr_mask(h, CCISS_INTR_OFF);
-
-       /* Make sure there is room in the command FIFO */
-       /* Actually it should be completely empty at this time */
-       /* unless we are in here doing error handling for the scsi */
-       /* tape side of the driver. */
-       for (i = 200000; i > 0; i--) {
-               /* if fifo isn't full go */
-               if (!(h->access.fifo_full(h)))
-                       break;
-               udelay(10);
-               printk(KERN_WARNING "cciss cciss%d: SendCmd FIFO full,"
-                      " waiting!\n", h->ctlr);
-       }
-       h->access.submit_command(h, c); /* Send the cmd */
-       do {
-               complete = pollcomplete(h->ctlr);
-
-#ifdef CCISS_DEBUG
-               printk(KERN_DEBUG "cciss: command completed\n");
-#endif                         /* CCISS_DEBUG */
-
-               if (complete == 1) {
-                       printk(KERN_WARNING
-                              "cciss cciss%d: SendCmd Timeout out, "
-                              "No command list address returned!\n", h->ctlr);
-                       status = IO_ERROR;
-                       break;
-               }
-
-               /* Make sure it's the command we're expecting. */
-               if ((complete & ~CISS_ERROR_BIT) != c->busaddr) {
-                       printk(KERN_WARNING "cciss%d: Unexpected command "
-                               "completion.\n", h->ctlr);
-                       continue;
-               }
-
-               /* It is our command.  If no error, we're done. */
-               if (!(complete & CISS_ERROR_BIT)) {
-                       status = IO_OK;
-                       break;
-               }
-
-               /* There is an error... */
-
-               /* if data overrun or underun on Report command ignore it */
-               if (((c->Request.CDB[0] == CISS_REPORT_LOG) ||
-                    (c->Request.CDB[0] == CISS_REPORT_PHYS) ||
-                    (c->Request.CDB[0] == CISS_INQUIRY)) &&
-                       ((c->err_info->CommandStatus == CMD_DATA_OVERRUN) ||
-                        (c->err_info->CommandStatus == CMD_DATA_UNDERRUN))) {
-                       complete = c->busaddr;
-                       status = IO_OK;
-                       break;
-               }
-
-               if (c->err_info->CommandStatus == CMD_UNSOLICITED_ABORT) {
-                       printk(KERN_WARNING "cciss%d: unsolicited abort %p\n",
-                               h->ctlr, c);
-                       if (c->retry_count < MAX_CMD_RETRIES) {
-                               printk(KERN_WARNING "cciss%d: retrying %p\n",
-                                  h->ctlr, c);
-                               c->retry_count++;
-                               /* erase the old error information */
-                               memset(c->err_info, 0, sizeof(c->err_info));
-                               goto resend_cmd1;
-                       }
-                       printk(KERN_WARNING "cciss%d: retried %p too many "
-                               "times\n", h->ctlr, c);
-                       status = IO_ERROR;
-                       break;
-               }
-
-               if (c->err_info->CommandStatus == CMD_UNABORTABLE) {
-                       printk(KERN_WARNING "cciss%d: command could not be "
-                               "aborted.\n", h->ctlr);
-                       status = IO_ERROR;
-                       break;
-               }
-
-               if (c->err_info->CommandStatus == CMD_TARGET_STATUS) {
-                       status = check_target_status(h, c);
-                       break;
-               }
-
-               printk(KERN_WARNING "cciss%d: sendcmd error\n", h->ctlr);
-               printk(KERN_WARNING "cmd = 0x%02x, CommandStatus = 0x%02x\n",
-                       c->Request.CDB[0], c->err_info->CommandStatus);
-               status = IO_ERROR;
-               break;
-
-       } while (1);
-
-       /* unlock the data buffer from DMA */
-       buff_dma_handle.val32.lower = c->SG[0].Addr.lower;
-       buff_dma_handle.val32.upper = c->SG[0].Addr.upper;
-       pci_unmap_single(h->pdev, (dma_addr_t) buff_dma_handle.val,
-                        c->SG[0].Len, PCI_DMA_BIDIRECTIONAL);
-       return status;
-}
-
-/*
- * Send a command to the controller, and wait for it to complete.
- * Used at init time, and during SCSI error recovery.
- */
-static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
-       __u8 page_code, unsigned char *scsi3addr, int cmd_type)
-{
-       CommandList_struct *c;
-       int status;
-
-       c = cmd_alloc(hba[ctlr], 1);
-       if (!c) {
-               printk(KERN_WARNING "cciss: unable to get memory");
-               return IO_ERROR;
-       }
-       status = fill_cmd(c, cmd, ctlr, buff, size, page_code,
-               scsi3addr, cmd_type);
-       if (status == IO_OK)
-               status = sendcmd_core(hba[ctlr], c);
-       cmd_free(hba[ctlr], c, 1);
-       return status;
-}
-
-/*
  * Map (physical) PCI mem into (virtual) kernel space
  */
 static void __iomem *remap_pci_mem(ulong base, ulong size)
@@ -3255,9 +3075,13 @@ static void do_cciss_request(struct request_queue *q)
        int seg;
        struct request *creq;
        u64bit temp64;
-       struct scatterlist tmp_sg[MAXSGENTRIES];
+       struct scatterlist *tmp_sg;
+       SGDescriptor_struct *curr_sg;
        drive_info_struct *drv;
        int i, dir;
+       int nseg = 0;
+       int sg_index = 0;
+       int chained = 0;
 
        /* We call start_io here in case there is a command waiting on the
         * queue that has not been sent.
@@ -3270,13 +3094,14 @@ static void do_cciss_request(struct request_queue *q)
        if (!creq)
                goto startio;
 
-       BUG_ON(creq->nr_phys_segments > MAXSGENTRIES);
+       BUG_ON(creq->nr_phys_segments > h->maxsgentries);
 
        if ((c = cmd_alloc(h, 1)) == NULL)
                goto full;
 
        blk_start_request(creq);
 
+       tmp_sg = h->scatter_list[c->cmdindex];
        spin_unlock_irq(q->queue_lock);
 
        c->cmd_type = CMD_RWREQ;
@@ -3305,7 +3130,7 @@ static void do_cciss_request(struct request_queue *q)
               (int)blk_rq_pos(creq), (int)blk_rq_sectors(creq));
 #endif                         /* CCISS_DEBUG */
 
-       sg_init_table(tmp_sg, MAXSGENTRIES);
+       sg_init_table(tmp_sg, h->maxsgentries);
        seg = blk_rq_map_sg(q, creq, tmp_sg);
 
        /* get the DMA records for the setup */
@@ -3314,25 +3139,70 @@ static void do_cciss_request(struct request_queue *q)
        else
                dir = PCI_DMA_TODEVICE;
 
+       curr_sg = c->SG;
+       sg_index = 0;
+       chained = 0;
+
        for (i = 0; i < seg; i++) {
-               c->SG[i].Len = tmp_sg[i].length;
+               if (((sg_index+1) == (h->max_cmd_sgentries)) &&
+                       !chained && ((seg - i) > 1)) {
+                       nseg = seg - i;
+                       curr_sg[sg_index].Len = (nseg) *
+                                       sizeof(SGDescriptor_struct);
+                       curr_sg[sg_index].Ext = CCISS_SG_CHAIN;
+
+                       /* Point to next chain block. */
+                       curr_sg = h->cmd_sg_list[c->cmdindex]->sgchain;
+                       sg_index = 0;
+                       chained = 1;
+               }
+               curr_sg[sg_index].Len = tmp_sg[i].length;
                temp64.val = (__u64) pci_map_page(h->pdev, sg_page(&tmp_sg[i]),
-                                                 tmp_sg[i].offset,
-                                                 tmp_sg[i].length, dir);
-               c->SG[i].Addr.lower = temp64.val32.lower;
-               c->SG[i].Addr.upper = temp64.val32.upper;
-               c->SG[i].Ext = 0;       // we are not chaining
+                                               tmp_sg[i].offset,
+                                               tmp_sg[i].length, dir);
+               curr_sg[sg_index].Addr.lower = temp64.val32.lower;
+               curr_sg[sg_index].Addr.upper = temp64.val32.upper;
+               curr_sg[sg_index].Ext = 0;  /* we are not chaining */
+
+               ++sg_index;
+       }
+
+       if (chained) {
+               int len;
+               curr_sg = c->SG;
+               sg_index = h->max_cmd_sgentries - 1;
+               len = curr_sg[sg_index].Len;
+               /* Setup pointer to next chain block.
+                * Fill out last element in current chain
+                * block with address of next chain block.
+                */
+               temp64.val = pci_map_single(h->pdev,
+                                       h->cmd_sg_list[c->cmdindex]->sgchain,
+                                       len, dir);
+
+               h->cmd_sg_list[c->cmdindex]->sg_chain_dma = temp64.val;
+               curr_sg[sg_index].Addr.lower = temp64.val32.lower;
+               curr_sg[sg_index].Addr.upper = temp64.val32.upper;
+
+               pci_dma_sync_single_for_device(h->pdev,
+                               h->cmd_sg_list[c->cmdindex]->sg_chain_dma,
+                               len, dir);
        }
+
        /* track how many SG entries we are using */
        if (seg > h->maxSG)
                h->maxSG = seg;
 
 #ifdef CCISS_DEBUG
-       printk(KERN_DEBUG "cciss: Submitting %u sectors in %d segments\n",
-              blk_rq_sectors(creq), seg);
+       printk(KERN_DEBUG "cciss: Submitting %u sectors in %d segments "
+                       "chained[%d]\n",
+                       blk_rq_sectors(creq), seg, chained);
 #endif                         /* CCISS_DEBUG */
 
-       c->Header.SGList = c->Header.SGTotal = seg;
+       c->Header.SGList = c->Header.SGTotal = seg + chained;
+       if (seg > h->max_cmd_sgentries)
+               c->Header.SGList = h->max_cmd_sgentries;
+
        if (likely(blk_fs_request(creq))) {
                if(h->cciss_read == CCISS_READ_10) {
                        c->Request.CDB[1] = 0;
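
To visualize the chained layout built above (assuming the usual h->max_cmd_sgentries of 32; field names are the driver's own):

	SG[0..30]     in-line data descriptors
	SG[31]        Ext = CCISS_SG_CHAIN; Len = size in bytes of the
	              remaining descriptors; Addr = DMA address of the
	              sgchain block (mapped and synced just above)
	sgchain[0..]  the remaining data descriptors

cciss_softirq_done() walks the same structure when unmapping: on seeing an entry with Ext == CCISS_SG_CHAIN it syncs and unmaps the chain block, then continues with the descriptors inside it.
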
@@ -3513,28 +3383,33 @@ static int add_to_scan_list(struct ctlr_info *h)
  * @h:                    Pointer to the controller.
  *
  * Removes the controller from the rescan queue if present. Blocks if
- * the controller is currently conducting a rescan.
+ * the controller is currently conducting a rescan.  The controller
+ * can be in one of three states:
+ * 1. Doesn't need a scan
+ * 2. On the scan list, but not scanning yet (we remove it)
+ * 3. Busy scanning (and not on the list). In this case we want to wait for
+ *    the scan to complete to make sure the scanning thread for this
+ *    controller is completely idle.
  **/
 static void remove_from_scan_list(struct ctlr_info *h)
 {
        struct ctlr_info *test_h, *tmp_h;
-       int scanning = 0;
 
        mutex_lock(&scan_mutex);
        list_for_each_entry_safe(test_h, tmp_h, &scan_q, scan_list) {
-               if (test_h == h) {
+               if (test_h == h) { /* state 2. */
                        list_del(&h->scan_list);
                        complete_all(&h->scan_wait);
                        mutex_unlock(&scan_mutex);
                        return;
                }
        }
-       if (&h->busy_scanning)
-               scanning = 0;
-       mutex_unlock(&scan_mutex);
-
-       if (scanning)
+       if (h->busy_scanning) { /* state 3. */
+               mutex_unlock(&scan_mutex);
                wait_for_completion(&h->scan_wait);
+       } else { /* state 1, nothing to do. */
+               mutex_unlock(&scan_mutex);
+       }
 }
 
 /**
@@ -3573,13 +3448,11 @@ static int scan_thread(void *data)
                        h->busy_scanning = 1;
                        mutex_unlock(&scan_mutex);
 
-                       if (h) {
-                               rebuild_lun_table(h, 0, 0);
-                               complete_all(&h->scan_wait);
-                               mutex_lock(&scan_mutex);
-                               h->busy_scanning = 0;
-                               mutex_unlock(&scan_mutex);
-                       }
+                       rebuild_lun_table(h, 0, 0);
+                       complete_all(&h->scan_wait);
+                       mutex_lock(&scan_mutex);
+                       h->busy_scanning = 0;
+                       mutex_unlock(&scan_mutex);
                }
        }
 
@@ -3605,8 +3478,22 @@ static int check_for_unit_attention(ctlr_info_t *h, CommandList_struct *c)
        case REPORT_LUNS_CHANGED:
                printk(KERN_WARNING "cciss%d: report LUN data "
                        "changed\n", h->ctlr);
-               add_to_scan_list(h);
-               wake_up_process(cciss_scan_thread);
+       /*
+        * Here, we could call add_to_scan_list and wake up the scan thread,
+        * except that it's quite likely that we will get more than one
+        * REPORT_LUNS_CHANGED condition in quick succession, which means
+        * that those which occur after the first one will likely happen
+        * *during* the scan_thread's rescan.  And the rescan code is not
+        * robust enough to restart in the middle, undoing what it has already
+        * done, and it's not clear that it's even possible to do this, since
+        * part of what it does is notify the block layer, which starts
+        * doing its own i/o to read partition tables and so on, and the
+        * driver doesn't have visibility to know what might need undoing.
+        * In any event, even if it were possible, it is horribly complicated
+        * to get right, so we just don't do it for now.
+        *
+        * Note: this REPORT_LUNS_CHANGED condition only occurs on the MSA2012.
+        */
                return 1;
        break;
        case POWER_OR_RESET:
@@ -3888,6 +3775,23 @@ static int __devinit cciss_pci_init(ctlr_info_t *c, struct pci_dev *pdev)
         * leave a little room for ioctl calls.
         */
        c->max_commands = readl(&(c->cfgtable->CmdsOutMax));
+       c->maxsgentries = readl(&(c->cfgtable->MaxSGElements));
+
+       /*
+        * Limit native commands to 32 s/g elements to save DMA'able memory.
+        * However, the spec says if 0, use 31.
+        */
+
+       c->max_cmd_sgentries = 31;
+       if (c->maxsgentries > 512) {
+               c->max_cmd_sgentries = 32;
+               c->chainsize = c->maxsgentries - c->max_cmd_sgentries + 1;
+               c->maxsgentries -= 1;   /* account for chain pointer */
+       } else {
+               c->maxsgentries = 31;   /* Default to traditional value */
+               c->chainsize = 0;       /* traditional */
+       }
+
        c->product_name = products[prod_index].product_name;
        c->access = *(products[prod_index].access);
        c->nr_cmds = c->max_commands - 4;
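
A worked example of the sizing logic above, using a hypothetical controller report:

	MaxSGElements = 544 (> 512), so:
	  max_cmd_sgentries = 32                   in-line slots per command
	  chainsize         = 544 - 32 + 1 = 513   descriptors per chain block
	  maxsgentries      = 544 - 1 = 543        one slot is spent on the chain pointer

Controllers reporting 512 or fewer elements (including 0) keep the traditional 31 in-line entries and no chain block (chainsize = 0).
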
@@ -4214,6 +4118,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 {
        int i;
        int j = 0;
+       int k = 0;
        int rc;
        int dac, return_code;
        InquiryData_struct *inq_buff;
@@ -4317,6 +4222,53 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
                printk(KERN_ERR "cciss: out of memory");
                goto clean4;
        }
+
+       /* Need space for temp scatter list */
+       hba[i]->scatter_list = kmalloc(hba[i]->max_commands *
+                                               sizeof(struct scatterlist *),
+                                               GFP_KERNEL);
+       for (k = 0; k < hba[i]->nr_cmds; k++) {
+               hba[i]->scatter_list[k] = kmalloc(sizeof(struct scatterlist) *
+                                                       hba[i]->maxsgentries,
+                                                       GFP_KERNEL);
+               if (hba[i]->scatter_list[k] == NULL) {
+                       printk(KERN_ERR "cciss%d: could not allocate "
+                               "s/g lists\n", i);
+                       goto clean4;
+               }
+       }
+       hba[i]->cmd_sg_list = kmalloc(sizeof(struct Cmd_sg_list *) *
+                                               hba[i]->nr_cmds,
+                                               GFP_KERNEL);
+       if (!hba[i]->cmd_sg_list) {
+               printk(KERN_ERR "cciss%d: Cannot get memory for "
+                       "s/g chaining.\n", i);
+               goto clean4;
+       }
+       /* Build up chain blocks for each command */
+       if (hba[i]->chainsize > 0) {
+               for (j = 0; j < hba[i]->nr_cmds; j++) {
+                       hba[i]->cmd_sg_list[j] =
+                                       kmalloc(sizeof(struct Cmd_sg_list),
+                                                       GFP_KERNEL);
+                       if (!hba[i]->cmd_sg_list[j]) {
+                               printk(KERN_ERR "cciss%d: Cannot get memory "
+                                       "for chain block.\n", i);
+                               goto clean4;
+                       }
+                       /* Need a block of chainsized s/g elements. */
+                       hba[i]->cmd_sg_list[j]->sgchain =
+                                       kmalloc((hba[i]->chainsize *
+                                               sizeof(SGDescriptor_struct)),
+                                               GFP_KERNEL);
+                       if (!hba[i]->cmd_sg_list[j]->sgchain) {
+                               printk(KERN_ERR "cciss%d: Cannot get memory "
+                                       "for s/g chains\n", i);
+                               goto clean4;
+                       }
+               }
+       }
+
        spin_lock_init(&hba[i]->lock);
 
        /* Initialize the pdev driver private data.
@@ -4362,7 +4314,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 
        cciss_procinit(i);
 
-       hba[i]->cciss_max_sectors = 2048;
+       hba[i]->cciss_max_sectors = 8192;
 
        rebuild_lun_table(hba[i], 1, 0);
        hba[i]->busy_initializing = 0;
@@ -4370,6 +4322,20 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 
 clean4:
        kfree(hba[i]->cmd_pool_bits);
+       /* Free up sg elements */
+       for (k = 0; k < hba[i]->nr_cmds; k++)
+               kfree(hba[i]->scatter_list[k]);
+       kfree(hba[i]->scatter_list);
+       /* Only free up extra s/g lists if controller supports them */
+       if (hba[i]->chainsize > 0) {
+               for (j = 0; j < hba[i]->nr_cmds; j++) {
+                       if (hba[i]->cmd_sg_list[j]) {
+                               kfree(hba[i]->cmd_sg_list[j]->sgchain);
+                               kfree(hba[i]->cmd_sg_list[j]);
+                       }
+               }
+               kfree(hba[i]->cmd_sg_list);
+       }
        if (hba[i]->cmd_pool)
                pci_free_consistent(hba[i]->pdev,
                                    hba[i]->nr_cmds * sizeof(CommandList_struct),
@@ -4400,30 +4366,28 @@ clean_no_release_regions:
 
 static void cciss_shutdown(struct pci_dev *pdev)
 {
-       ctlr_info_t *tmp_ptr;
-       int i;
-       char flush_buf[4];
+       ctlr_info_t *h;
+       char *flush_buf;
        int return_code;
 
-       tmp_ptr = pci_get_drvdata(pdev);
-       if (tmp_ptr == NULL)
-               return;
-       i = tmp_ptr->ctlr;
-       if (hba[i] == NULL)
+       h = pci_get_drvdata(pdev);
+       flush_buf = kzalloc(4, GFP_KERNEL);
+       if (!flush_buf) {
+               printk(KERN_WARNING
+                       "cciss:%d cache not flushed, out of memory.\n",
+                       h->ctlr);
                return;
-
-       /* Turn board interrupts off  and send the flush cache command */
-       /* sendcmd will turn off interrupt, and send the flush...
-        * To write all data in the battery backed cache to disks */
-       memset(flush_buf, 0, 4);
-       return_code = sendcmd(CCISS_CACHE_FLUSH, i, flush_buf, 4, 0,
-               CTLR_LUNID, TYPE_CMD);
-       if (return_code == IO_OK) {
-               printk(KERN_INFO "Completed flushing cache on controller %d\n", i);
-       } else {
-               printk(KERN_WARNING "Error flushing cache on controller %d\n", i);
        }
-       free_irq(hba[i]->intr[2], hba[i]);
+       /* write all data in the battery backed cache to disk */
+       memset(flush_buf, 0, 4);
+       return_code = sendcmd_withirq(CCISS_CACHE_FLUSH, h->ctlr, flush_buf,
+               4, 0, CTLR_LUNID, TYPE_CMD);
+       kfree(flush_buf);
+       if (return_code != IO_OK)
+               printk(KERN_WARNING "cciss%d: Error flushing cache\n",
+                       h->ctlr);
+       h->access.set_intr_mask(h, CCISS_INTR_OFF);
+       free_irq(h->intr[2], h);
 }
 
 static void __devexit cciss_remove_one(struct pci_dev *pdev)
@@ -4485,6 +4449,20 @@ static void __devexit cciss_remove_one(struct pci_dev *pdev)
        pci_free_consistent(hba[i]->pdev, hba[i]->nr_cmds * sizeof(ErrorInfo_struct),
                            hba[i]->errinfo_pool, hba[i]->errinfo_pool_dhandle);
        kfree(hba[i]->cmd_pool_bits);
+       /* Free up sg elements */
+       for (j = 0; j < hba[i]->nr_cmds; j++)
+               kfree(hba[i]->scatter_list[j]);
+       kfree(hba[i]->scatter_list);
+       /* Only free up extra s/g lists if controller supports them */
+       if (hba[i]->chainsize > 0) {
+               for (j = 0; j < hba[i]->nr_cmds; j++) {
+                       if (hba[i]->cmd_sg_list[j]) {
+                               kfree(hba[i]->cmd_sg_list[j]->sgchain);
+                               kfree(hba[i]->cmd_sg_list[j]);
+                       }
+               }
+               kfree(hba[i]->cmd_sg_list);
+       }
        /*
         * Deliberately omit pci_disable_device(): it does something nasty to
         * Smart Array controllers that pci_enable_device does not undo
index 31524cf..1d95db2 100644 (file)
@@ -55,7 +55,13 @@ typedef struct _drive_info_struct
        char device_initialized;     /* indicates whether dev is initialized */
 } drive_info_struct;
 
-struct ctlr_info 
+struct Cmd_sg_list {
+       SGDescriptor_struct     *sgchain;
+       dma_addr_t              sg_chain_dma;
+       int                     chain_block_size;
+};
+
+struct ctlr_info
 {
        int     ctlr;
        char    devname[8];
@@ -75,6 +81,16 @@ struct ctlr_info
        int     num_luns;
        int     highest_lun;
        int     usage_count;  /* number of opens on all minor devices */
+       /* Need space for temp sg list
+        * number of scatter/gathers supported
+        * number of scatter/gathers in chained block
+        */
+       struct  scatterlist **scatter_list;
+       int     maxsgentries;
+       int     chainsize;
+       int     max_cmd_sgentries;
+       struct Cmd_sg_list **cmd_sg_list;
+
 #      define DOORBELL_INT     0
 #      define PERF_MODE_INT    1
 #      define SIMPLE_MODE_INT  2
index dbaed1e..b50a9b2 100644 (file)
@@ -7,7 +7,8 @@
 
 //general boundary definitions
 #define SENSEINFOBYTES          32//note that this value may vary between host implementations
-#define MAXSGENTRIES            31
+#define MAXSGENTRIES            32
+#define CCISS_SG_CHAIN          0x80000000
 #define MAXREPLYQS              256
 
 //Command Status value
@@ -319,6 +320,10 @@ typedef struct _CfgTable_struct {
   BYTE             ServerName[16];
   DWORD            HeartBeat;
   DWORD            SCSI_Prefetch;
+  DWORD            MaxSGElements;
+  DWORD            MaxLogicalUnits;
+  DWORD            MaxPhysicalDrives;
+  DWORD            MaxPhysicalDrivesPerLogicalUnit;
 } CfgTable_struct;
 #pragma pack()  
 #endif // CCISS_CMD_H
index 3315268..5d0e46d 100644 (file)
@@ -755,7 +755,7 @@ complete_scsi_command( CommandList_struct *cp, int timeout, __u32 tag)
                                                cp,  
                                                ei->ScsiStatus); 
 #endif
-                                       cmd->result |= (ei->ScsiStatus < 1);
+                                       cmd->result |= (ei->ScsiStatus << 1);
                                }
                                else {  /* scsi status is zero??? How??? */
                                        
@@ -1547,7 +1547,7 @@ cciss_engage_scsi(int ctlr)
        if (sa->registered) {
                printk("cciss%d: SCSI subsystem already engaged.\n", ctlr);
                spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
-               return ENXIO;
+               return -ENXIO;
        }
        sa->registered = 1;
        spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig
new file mode 100644 (file)
index 0000000..f4acd04
--- /dev/null
@@ -0,0 +1,71 @@
+#
+# DRBD device driver configuration
+#
+
+comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected"
+       depends on !PROC_FS || !INET || !CONNECTOR
+
+config BLK_DEV_DRBD
+       tristate "DRBD Distributed Replicated Block Device support"
+       depends on PROC_FS && INET && CONNECTOR
+       select LRU_CACHE
+       default n
+       help
+
+         NOTE: In order to authenticate connections you have to select
+         CRYPTO_HMAC and a hash function as well.
+
+         DRBD is a shared-nothing, synchronously replicated block device. It
+         is designed to serve as a building block for high availability
+         clusters and in this context, is a "drop-in" replacement for shared
+         storage. Simplistically, you could see it as a network RAID 1.
+
+         Each minor device has a role, which can be 'primary' or 'secondary'.
+         On the node with the primary device the application is supposed to
+         run and to access the device (/dev/drbdX). Every write is sent to
+         the local 'lower level block device' and, across the network, to the
+         node with the device in 'secondary' state.  The secondary device
+         simply writes the data to its lower level block device.
+
+         DRBD can also be used in dual-Primary mode (device writable on both
+         nodes), which means it can exhibit shared disk semantics in a
+         shared-nothing cluster.  Needless to say, on top of dual-Primary
+         DRBD utilizing a cluster file system is necessary to maintain for
+         cache coherency.
+
+         For automatic failover you need a cluster manager (e.g. heartbeat).
+         See also: http://www.drbd.org/, http://www.linux-ha.org
+
+         If unsure, say N.
+
+config DRBD_FAULT_INJECTION
+       bool "DRBD fault injection"
+       depends on BLK_DEV_DRBD
+       help
+
+         Say Y here if you want to simulate IO errors, in order to test DRBD's
+         behavior.
+
+         The actual simulation of IO errors is done by writing 3 values to
+         /sys/module/drbd/parameters/
+
+         enable_faults: bitmask of...
+         1     meta data write
+         2               read
+         4     resync data write
+         8                 read
+         16    data write
+         32    data read
+         64    read ahead
+         128   kmalloc of bitmap
+         256   allocation of EE (epoch_entries)
+
+         fault_devs: bitmask of minor numbers
+         fault_rate: frequency in percent
+
+         Example: Simulate data write errors on /dev/drbd0 with a probability of 5%.
+               echo 16 > /sys/module/drbd/parameters/enable_faults
+               echo 1 > /sys/module/drbd/parameters/fault_devs
+               echo 5 > /sys/module/drbd/parameters/fault_rate
+
+         If unsure, say N.
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
new file mode 100644 (file)
index 0000000..0d3f337
--- /dev/null
@@ -0,0 +1,5 @@
+drbd-y := drbd_bitmap.o drbd_proc.o
+drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
+drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
+
+obj-$(CONFIG_BLK_DEV_DRBD)     += drbd.o
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
new file mode 100644 (file)
index 0000000..17956ff
--- /dev/null
@@ -0,0 +1,1424 @@
+/*
+   drbd_actlog.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+#include "drbd_wrappers.h"
+
+/* We maintain a trivial checksum in our on-disk activity log.
+ * With that we can ensure correct operation even when the storage
+ * device might do a partial (last) sector write while losing power.
+ */
+struct __packed al_transaction {
+       u32       magic;
+       u32       tr_number;
+       struct __packed {
+               u32 pos;
+               u32 extent; } updates[1 + AL_EXTENTS_PT];
+       u32       xor_sum;
+};
+
+struct update_odbm_work {
+       struct drbd_work w;
+       unsigned int enr;
+};
+
+struct update_al_work {
+       struct drbd_work w;
+       struct lc_element *al_ext;
+       struct completion event;
+       unsigned int enr;
+       /* if old_enr != LC_FREE, write corresponding bitmap sector, too */
+       unsigned int old_enr;
+};
+
+struct drbd_atodb_wait {
+       atomic_t           count;
+       struct completion  io_done;
+       struct drbd_conf   *mdev;
+       int                error;
+};
+
+
+int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);
+
+static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
+                                struct drbd_backing_dev *bdev,
+                                struct page *page, sector_t sector,
+                                int rw, int size)
+{
+       struct bio *bio;
+       struct drbd_md_io md_io;
+       int ok;
+
+       md_io.mdev = mdev;
+       init_completion(&md_io.event);
+       md_io.error = 0;
+
+       if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags))
+               rw |= (1 << BIO_RW_BARRIER);
+       rw |= ((1<<BIO_RW_UNPLUG) | (1<<BIO_RW_SYNCIO));
+
+ retry:
+       bio = bio_alloc(GFP_NOIO, 1);
+       bio->bi_bdev = bdev->md_bdev;
+       bio->bi_sector = sector;
+       ok = (bio_add_page(bio, page, size, 0) == size);
+       if (!ok)
+               goto out;
+       bio->bi_private = &md_io;
+       bio->bi_end_io = drbd_md_io_complete;
+       bio->bi_rw = rw;
+
+       if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
+               bio_endio(bio, -EIO);
+       else
+               submit_bio(rw, bio);
+       wait_for_completion(&md_io.event);
+       ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;
+
+       /* check for unsupported barrier op.
+        * would rather check on EOPNOTSUPP, but that is not reliable.
+        * don't try again for ANY return value != 0 */
+       if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && !ok)) {
+               /* Try again with no barrier */
+               dev_warn(DEV, "Barriers not supported on meta data device - disabling\n");
+               set_bit(MD_NO_BARRIER, &mdev->flags);
+               rw &= ~(1 << BIO_RW_BARRIER);
+               bio_put(bio);
+               goto retry;
+       }
+ out:
+       bio_put(bio);
+       return ok;
+}
+
+int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
+                        sector_t sector, int rw)
+{
+       int logical_block_size, mask, ok;
+       int offset = 0;
+       struct page *iop = mdev->md_io_page;
+
+       D_ASSERT(mutex_is_locked(&mdev->md_io_mutex));
+
+       BUG_ON(!bdev->md_bdev);
+
+       logical_block_size = bdev_logical_block_size(bdev->md_bdev);
+       if (logical_block_size == 0)
+               logical_block_size = MD_SECTOR_SIZE;
+
+       /* in case logical_block_size != 512 [ s390 only? ] */
+       if (logical_block_size != MD_SECTOR_SIZE) {
+               mask = (logical_block_size / MD_SECTOR_SIZE) - 1;
+               D_ASSERT(mask == 1 || mask == 3 || mask == 7);
+               D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE);
+               offset = sector & mask;
+               sector = sector & ~mask;
+               iop = mdev->md_io_tmpp;
+
+               if (rw & WRITE) {
+                       /* these are GFP_KERNEL pages, pre-allocated
+                        * on device initialization */
+                       void *p = page_address(mdev->md_io_page);
+                       void *hp = page_address(mdev->md_io_tmpp);
+
+                       ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector,
+                                       READ, logical_block_size);
+
+                       if (unlikely(!ok)) {
+                               dev_err(DEV, "drbd_md_sync_page_io(,%llus,"
+                                   "READ [logical_block_size!=512]) failed!\n",
+                                   (unsigned long long)sector);
+                               return 0;
+                       }
+
+                       memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE);
+               }
+       }
+
+       if (sector < drbd_md_first_sector(bdev) ||
+           sector > drbd_md_last_sector(bdev))
+               dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
+                    current->comm, current->pid, __func__,
+                    (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
+
+       ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size);
+       if (unlikely(!ok)) {
+               dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n",
+                   (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
+               return 0;
+       }
+
+       if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) {
+               void *p = page_address(mdev->md_io_page);
+               void *hp = page_address(mdev->md_io_tmpp);
+
+               memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE);
+       }
+
+       return ok;
+}
+
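+/*
+ * Worked example for the remapping above (illustrative numbers only):
+ * with logical_block_size = 4096, mask = (4096 / 512) - 1 = 7, so an
+ * access to 512-byte sector 21 becomes offset = 21 & 7 = 5 within the
+ * aligned hardware sector 21 & ~7 = 16, serviced as a read-modify-write
+ * through the pre-allocated md_io_tmpp bounce page.
+ */
+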
+static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
+{
+       struct lc_element *al_ext;
+       struct lc_element *tmp;
+       unsigned long     al_flags = 0;
+
+       spin_lock_irq(&mdev->al_lock);
+       tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
+       if (unlikely(tmp != NULL)) {
+               struct bm_extent  *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+               if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
+                       spin_unlock_irq(&mdev->al_lock);
+                       return NULL;
+               }
+       }
+       al_ext   = lc_get(mdev->act_log, enr);
+       al_flags = mdev->act_log->flags;
+       spin_unlock_irq(&mdev->al_lock);
+
+       /*
+       if (!al_ext) {
+               if (al_flags & LC_STARVING)
+                       dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n");
+               if (al_flags & LC_DIRTY)
+                       dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n");
+       }
+       */
+
+       return al_ext;
+}
+
+void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector)
+{
+       unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
+       struct lc_element *al_ext;
+       struct update_al_work al_work;
+
+       D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
+
+       wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)));
+
+       if (al_ext->lc_number != enr) {
+               /* drbd_al_write_transaction(mdev,al_ext,enr);
+                * recurses into generic_make_request(), which
+                * disallows recursion, bios being serialized on the
+                * current->bio_tail list now.
+                * we have to delegate updates to the activity log
+                * to the worker thread. */
+               init_completion(&al_work.event);
+               al_work.al_ext = al_ext;
+               al_work.enr = enr;
+               al_work.old_enr = al_ext->lc_number;
+               al_work.w.cb = w_al_write_transaction;
+               drbd_queue_work_front(&mdev->data.work, &al_work.w);
+               wait_for_completion(&al_work.event);
+
+               mdev->al_writ_cnt++;
+
+               spin_lock_irq(&mdev->al_lock);
+               lc_changed(mdev->act_log, al_ext);
+               spin_unlock_irq(&mdev->al_lock);
+               wake_up(&mdev->al_wait);
+       }
+}
+
+void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)
+{
+       unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
+       struct lc_element *extent;
+       unsigned long flags;
+
+       spin_lock_irqsave(&mdev->al_lock, flags);
+
+       extent = lc_find(mdev->act_log, enr);
+
+       if (!extent) {
+               spin_unlock_irqrestore(&mdev->al_lock, flags);
+               dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);
+               return;
+       }
+
+       if (lc_put(mdev->act_log, extent) == 0)
+               wake_up(&mdev->al_wait);
+
+       spin_unlock_irqrestore(&mdev->al_lock, flags);
+}
+
+int
+w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+       struct update_al_work *aw = container_of(w, struct update_al_work, w);
+       struct lc_element *updated = aw->al_ext;
+       const unsigned int new_enr = aw->enr;
+       const unsigned int evicted = aw->old_enr;
+       struct al_transaction *buffer;
+       sector_t sector;
+       int i, n, mx;
+       unsigned int extent_nr;
+       u32 xor_sum = 0;
+
+       if (!get_ldev(mdev)) {
+               dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n");
+               complete(&aw->event);
+               return 1;
+       }
+       /* do we have to do a bitmap write, first?
+        * TODO reduce maximum latency:
+        * submit both bios, then wait for both,
+        * instead of doing two synchronous sector writes. */
+       if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
+               drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);
+
+       mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */
+       buffer = (struct al_transaction *)page_address(mdev->md_io_page);
+
+       buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
+       buffer->tr_number = cpu_to_be32(mdev->al_tr_number);
+
+       n = lc_index_of(mdev->act_log, updated);
+
+       buffer->updates[0].pos = cpu_to_be32(n);
+       buffer->updates[0].extent = cpu_to_be32(new_enr);
+
+       xor_sum ^= new_enr;
+
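+       /* Slot 0 carries the hot update; the remaining AL_EXTENTS_PT slots
+        * carry a cyclic window (al_tr_cycle) over the in-core activity
+        * log, so after nr_elements/AL_EXTENTS_PT transactions the whole
+        * AL has been re-journaled and old transactions may be reused. */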
+       mx = min_t(int, AL_EXTENTS_PT,
+                  mdev->act_log->nr_elements - mdev->al_tr_cycle);
+       for (i = 0; i < mx; i++) {
+               unsigned idx = mdev->al_tr_cycle + i;
+               extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number;
+               buffer->updates[i+1].pos = cpu_to_be32(idx);
+               buffer->updates[i+1].extent = cpu_to_be32(extent_nr);
+               xor_sum ^= extent_nr;
+       }
+       for (; i < AL_EXTENTS_PT; i++) {
+               buffer->updates[i+1].pos = __constant_cpu_to_be32(-1);
+               buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE);
+               xor_sum ^= LC_FREE;
+       }
+       mdev->al_tr_cycle += AL_EXTENTS_PT;
+       if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
+               mdev->al_tr_cycle = 0;
+
+       buffer->xor_sum = cpu_to_be32(xor_sum);
+
+       sector =  mdev->ldev->md.md_offset
+               + mdev->ldev->md.al_offset + mdev->al_tr_pos;
+
+       if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))
+               drbd_chk_io_error(mdev, 1, TRUE);
+
+       if (++mdev->al_tr_pos >
+           div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
+               mdev->al_tr_pos = 0;
+
+       D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
+       mdev->al_tr_number++;
+
+       mutex_unlock(&mdev->md_io_mutex);
+
+       complete(&aw->event);
+       put_ldev(mdev);
+
+       return 1;
+}
+
+/**
+ * drbd_al_read_tr() - Read a single transaction from the on disk activity log
+ * @mdev:      DRBD device.
+ * @bdev:      Block device to read from.
+ * @b:         pointer to an al_transaction.
+ * @index:     On disk slot of the transaction to read.
+ *
+ * Returns -1 on IO error, 0 on checksum error and 1 upon success.
+ */
+static int drbd_al_read_tr(struct drbd_conf *mdev,
+                          struct drbd_backing_dev *bdev,
+                          struct al_transaction *b,
+                          int index)
+{
+       sector_t sector;
+       int rv, i;
+       u32 xor_sum = 0;
+
+       sector = bdev->md.md_offset + bdev->md.al_offset + index;
+
+       /* Don't process errors normally,
+        * as this is done before the disk is attached! */
+       if (!drbd_md_sync_page_io(mdev, bdev, sector, READ))
+               return -1;
+
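+       /* Validate: the magic must match, and the xor over all
+        * AL_EXTENTS_PT+1 extent numbers must reproduce the stored
+        * xor_sum.  E.g. (hypothetical values) a transaction with extent 7
+        * in slot 0, extents 16 and 17 in its cyclic slots and LC_FREE
+        * elsewhere must carry 7 ^ 16 ^ 17 ^ (LC_FREE per free slot). */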
+       rv = (be32_to_cpu(b->magic) == DRBD_MAGIC);
+
+       for (i = 0; i < AL_EXTENTS_PT + 1; i++)
+               xor_sum ^= be32_to_cpu(b->updates[i].extent);
+       rv &= (xor_sum == be32_to_cpu(b->xor_sum));
+
+       return rv;
+}
+
+/**
+ * drbd_al_read_log() - Restores the activity log from its on disk representation.
+ * @mdev:      DRBD device.
+ * @bdev:      Block device to read from.
+ *
+ * Returns 1 on success, 0 when reading the log failed due to IO errors.
+ */
+int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
+{
+       struct al_transaction *buffer;
+       int i;
+       int rv;
+       int mx;
+       int active_extents = 0;
+       int transactions = 0;
+       int found_valid = 0;
+       int from = 0;
+       int to = 0;
+       u32 from_tnr = 0;
+       u32 to_tnr = 0;
+       u32 cnr;
+
+       mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT);
+
+       /* lock out all other meta data io for now,
+        * and make sure the page is mapped.
+        */
+       mutex_lock(&mdev->md_io_mutex);
+       buffer = page_address(mdev->md_io_page);
+
+       /* Find the valid transaction in the log */
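+       /* The mx+1 slots form a ring ordered by tr_number (mod 2^32).
+        * Worked example with 4 slots holding tr_numbers 8, 9, 6, 7:
+        * "from" ends at the slot holding 6 (oldest), "to" at the slot
+        * holding 9 (newest); the signed differences below handle the
+        * tr_number wraparound. */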
+       for (i = 0; i <= mx; i++) {
+               rv = drbd_al_read_tr(mdev, bdev, buffer, i);
+               if (rv == 0)
+                       continue;
+               if (rv == -1) {
+                       mutex_unlock(&mdev->md_io_mutex);
+                       return 0;
+               }
+               cnr = be32_to_cpu(buffer->tr_number);
+
+               if (++found_valid == 1) {
+                       from = i;
+                       to = i;
+                       from_tnr = cnr;
+                       to_tnr = cnr;
+                       continue;
+               }
+               if ((int)cnr - (int)from_tnr < 0) {
+                       D_ASSERT(from_tnr - cnr + i - from == mx+1);
+                       from = i;
+                       from_tnr = cnr;
+               }
+               if ((int)cnr - (int)to_tnr > 0) {
+                       D_ASSERT(cnr - to_tnr == i - to);
+                       to = i;
+                       to_tnr = cnr;
+               }
+       }
+
+       if (!found_valid) {
+               dev_warn(DEV, "No usable activity log found.\n");
+               mutex_unlock(&mdev->md_io_mutex);
+               return 1;
+       }
+
+       /* Read the valid transactions.
+        * dev_info(DEV, "Reading from %d to %d.\n",from,to); */
+       i = from;
+       while (1) {
+               int j, pos;
+               unsigned int extent_nr;
+               unsigned int trn;
+
+               rv = drbd_al_read_tr(mdev, bdev, buffer, i);
+               ERR_IF(rv == 0) goto cancel;
+               if (rv == -1) {
+                       mutex_unlock(&mdev->md_io_mutex);
+                       return 0;
+               }
+
+               trn = be32_to_cpu(buffer->tr_number);
+
+               spin_lock_irq(&mdev->al_lock);
+
+               /* This loop runs backwards because the cyclic slots may
+                  still contain an old position of the element updated in
+                  slot 0.  Processing slot 0 last lets it overwrite such
+                  stale entries. */
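+               /* Example: slot 0 says AL position 3 now holds extent 7,
+                  while a stale cyclic slot of the same transaction may
+                  still record an older extent at position 3; visiting
+                  slot 0 last makes its entry win. */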
+               for (j = AL_EXTENTS_PT; j >= 0; j--) {
+                       pos = be32_to_cpu(buffer->updates[j].pos);
+                       extent_nr = be32_to_cpu(buffer->updates[j].extent);
+
+                       if (extent_nr == LC_FREE)
+                               continue;
+
+                       lc_set(mdev->act_log, extent_nr, pos);
+                       active_extents++;
+               }
+               spin_unlock_irq(&mdev->al_lock);
+
+               transactions++;
+
+cancel:
+               if (i == to)
+                       break;
+               i++;
+               if (i > mx)
+                       i = 0;
+       }
+
+       mdev->al_tr_number = to_tnr+1;
+       mdev->al_tr_pos = to;
+       if (++mdev->al_tr_pos >
+           div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
+               mdev->al_tr_pos = 0;
+
+       /* ok, we are done with it */
+       mutex_unlock(&mdev->md_io_mutex);
+
+       dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
+            transactions, active_extents);
+
+       return 1;
+}
+
+static void atodb_endio(struct bio *bio, int error)
+{
+       struct drbd_atodb_wait *wc = bio->bi_private;
+       struct drbd_conf *mdev = wc->mdev;
+       struct page *page;
+       int uptodate = bio_flagged(bio, BIO_UPTODATE);
+
+       /* strange behavior of some lower level drivers...
+        * fail the request by clearing the uptodate flag,
+        * but do not return any error?! */
+       if (!error && !uptodate)
+               error = -EIO;
+
+       drbd_chk_io_error(mdev, error, TRUE);
+       if (error && wc->error == 0)
+               wc->error = error;
+
+       if (atomic_dec_and_test(&wc->count))
+               complete(&wc->io_done);
+
+       page = bio->bi_io_vec[0].bv_page;
+       put_page(page);
+       bio_put(bio);
+       mdev->bm_writ_cnt++;
+       put_ldev(mdev);
+}
+
+#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
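+/* S2W converts an on-disk bitmap sector number to a word offset into the
+ * in-core bitmap: one 512-byte sector holds 512/sizeof(long) longs, hence
+ * the shift BM_EXT_SHIFT - BM_BLOCK_SHIFT - LN2_BPL (e.g. 24 - 12 - 6 = 6
+ * on 64 bit: 64 words per sector, each sector covering 16 MB of data). */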
+/* activity log to on disk bitmap -- prepare bio unless that sector
+ * is already covered by previously prepared bios */
+static int atodb_prepare_unless_covered(struct drbd_conf *mdev,
+                                       struct bio **bios,
+                                       unsigned int enr,
+                                       struct drbd_atodb_wait *wc) __must_hold(local)
+{
+       struct bio *bio;
+       struct page *page;
+       sector_t on_disk_sector = enr + mdev->ldev->md.md_offset
+                                     + mdev->ldev->md.bm_offset;
+       unsigned int page_offset = PAGE_SIZE;
+       int offset;
+       int i = 0;
+       int err = -ENOMEM;
+
+       /* Check if that enr is already covered by a previously created bio.
+        * Caution, bios[] is not NULL terminated,
+        * but only initialized to all NULL.
+        * For a completely scattered activity log,
+        * the last invocation iterates over all bios
+        * and finds the last NULL entry.
+        */
+       while ((bio = bios[i])) {
+               if (bio->bi_sector == on_disk_sector)
+                       return 0;
+               i++;
+       }
+       /* bios[i] == NULL, the next not yet used slot */
+
+       /* GFP_KERNEL, we are not in the write-out path */
+       bio = bio_alloc(GFP_KERNEL, 1);
+       if (bio == NULL)
+               return -ENOMEM;
+
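+       /* Pack consecutive 512-byte bitmap sectors into a shared highmem
+        * page: continue at the previous bio's page and offset until the
+        * page is full, taking an extra page reference per reuse so that
+        * atodb_endio() can put_page() unconditionally for every bio. */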
+       if (i > 0) {
+               const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec;
+               page_offset = prev_bv->bv_offset + prev_bv->bv_len;
+               page = prev_bv->bv_page;
+       }
+       if (page_offset == PAGE_SIZE) {
+               /* GFP_KERNEL base flags: we may sleep here, see above */
+               page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+               if (page == NULL)
+                       goto out_bio_put;
+               page_offset = 0;
+       } else {
+               get_page(page);
+       }
+
+       offset = S2W(enr);
+       drbd_bm_get_lel(mdev, offset,
+                       min_t(size_t, S2W(1), drbd_bm_words(mdev) - offset),
+                       kmap(page) + page_offset);
+       kunmap(page);
+
+       bio->bi_private = wc;
+       bio->bi_end_io = atodb_endio;
+       bio->bi_bdev = mdev->ldev->md_bdev;
+       bio->bi_sector = on_disk_sector;
+
+       if (bio_add_page(bio, page, MD_SECTOR_SIZE, page_offset) != MD_SECTOR_SIZE)
+               goto out_put_page;
+
+       atomic_inc(&wc->count);
+       /* we already know that we may do this...
+        * get_ldev_if_state(mdev,D_ATTACHING);
+        * just take the extra reference, so that local_cnt reflects
+        * the number of IO requests DRBD has pending at its backing
+        * device.
+        */
+       atomic_inc(&mdev->local_cnt);
+
+       bios[i] = bio;
+
+       return 0;
+
+out_put_page:
+       err = -EINVAL;
+       put_page(page);
+out_bio_put:
+       bio_put(bio);
+       return err;
+}
+
+/**
+ * drbd_al_to_on_disk_bm() - Writes bitmap parts covered by active AL extents
+ * @mdev:      DRBD device.
+ *
+ * Called when we detach (unconfigure) local storage,
+ * or when we go from R_PRIMARY to R_SECONDARY role.
+ */
+void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
+{
+       int i, nr_elements;
+       unsigned int enr;
+       struct bio **bios;
+       struct drbd_atodb_wait wc;
+
+       ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING))
+               return; /* sorry, I don't have any act_log etc... */
+
+       wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+
+       nr_elements = mdev->act_log->nr_elements;
+
+       /* GFP_KERNEL, we are not in anyone's write-out path */
+       bios = kcalloc(nr_elements, sizeof(struct bio *), GFP_KERNEL);
+       if (!bios)
+               goto submit_one_by_one;
+
+       atomic_set(&wc.count, 0);
+       init_completion(&wc.io_done);
+       wc.mdev = mdev;
+       wc.error = 0;
+
+       for (i = 0; i < nr_elements; i++) {
+               enr = lc_element_by_index(mdev->act_log, i)->lc_number;
+               if (enr == LC_FREE)
+                       continue;
+               /* the next statement also does atomic_inc on wc.count and local_cnt */
+               if (atodb_prepare_unless_covered(mdev, bios,
+                                               enr/AL_EXT_PER_BM_SECT,
+                                               &wc))
+                       goto free_bios_submit_one_by_one;
+       }
+
+       /* unnecessary optimization? */
+       lc_unlock(mdev->act_log);
+       wake_up(&mdev->al_wait);
+
+       /* all prepared, submit them */
+       for (i = 0; i < nr_elements; i++) {
+               if (bios[i] == NULL)
+                       break;
+               if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) {
+                       bios[i]->bi_rw = WRITE;
+                       bio_endio(bios[i], -EIO);
+               } else {
+                       submit_bio(WRITE, bios[i]);
+               }
+       }
+
+       drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
+
+       /* always (try to) flush bitmap to stable storage */
+       drbd_md_flush(mdev);
+
+       /* If we did not submit a single IO, do not wait for
+        * them to complete (we would wait forever here).
+        *
+        * If we did submit IOs and they have already completed,
+        * there is no point in waiting either.
+        * Therefore this if () ... */
+       if (atomic_read(&wc.count))
+               wait_for_completion(&wc.io_done);
+
+       put_ldev(mdev);
+
+       kfree(bios);
+       return;
+
+ free_bios_submit_one_by_one:
+       /* free everything by calling the endio callback directly. */
+       for (i = 0; i < nr_elements && bios[i]; i++)
+               bio_endio(bios[i], 0);
+
+       kfree(bios);
+
+ submit_one_by_one:
+       dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n");
+
+       for (i = 0; i < mdev->act_log->nr_elements; i++) {
+               enr = lc_element_by_index(mdev->act_log, i)->lc_number;
+               if (enr == LC_FREE)
+                       continue;
+               /* Really slow: if we have al-extents 16..19 active,
+                * sector 4 will be written four times! Synchronous! */
+               drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT);
+       }
+
+       lc_unlock(mdev->act_log);
+       wake_up(&mdev->al_wait);
+       put_ldev(mdev);
+}
+
+/**
+ * drbd_al_apply_to_bm() - Sets the bitmap to dirty (1) where covered by active AL extents
+ * @mdev:      DRBD device.
+ */
+void drbd_al_apply_to_bm(struct drbd_conf *mdev)
+{
+       unsigned int enr;
+       unsigned long add = 0;
+       char ppb[10];
+       int i;
+
+       wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+
+       for (i = 0; i < mdev->act_log->nr_elements; i++) {
+               enr = lc_element_by_index(mdev->act_log, i)->lc_number;
+               if (enr == LC_FREE)
+                       continue;
+               add += drbd_bm_ALe_set_all(mdev, enr);
+       }
+
+       lc_unlock(mdev->act_log);
+       wake_up(&mdev->al_wait);
+
+       dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n",
+            ppsize(ppb, Bit2KB(add)));
+}
+
+static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
+{
+       int rv;
+
+       spin_lock_irq(&mdev->al_lock);
+       rv = (al_ext->refcnt == 0);
+       if (likely(rv))
+               lc_del(mdev->act_log, al_ext);
+       spin_unlock_irq(&mdev->al_lock);
+
+       return rv;
+}
+
+/**
+ * drbd_al_shrink() - Removes all active extents from the activity log
+ * @mdev:      DRBD device.
+ *
+ * Removes all active extents from the activity log, waiting until
+ * the reference count of each entry has dropped to 0 first, of course.
+ *
+ * You need to lock mdev->act_log with lc_try_lock() / lc_unlock()
+ */
+void drbd_al_shrink(struct drbd_conf *mdev)
+{
+       struct lc_element *al_ext;
+       int i;
+
+       D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags));
+
+       for (i = 0; i < mdev->act_log->nr_elements; i++) {
+               al_ext = lc_element_by_index(mdev->act_log, i);
+               if (al_ext->lc_number == LC_FREE)
+                       continue;
+               wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext));
+       }
+
+       wake_up(&mdev->al_wait);
+}
+
+static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+       struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
+
+       if (!get_ldev(mdev)) {
+               if (__ratelimit(&drbd_ratelimit_state))
+                       dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n");
+               kfree(udw);
+               return 1;
+       }
+
+       drbd_bm_write_sect(mdev, udw->enr);
+       put_ldev(mdev);
+
+       kfree(udw);
+
+       if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) {
+               switch (mdev->state.conn) {
+               case C_SYNC_SOURCE:  case C_SYNC_TARGET:
+               case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
+                       drbd_resync_finished(mdev);
+               default:
+                       /* nothing to do */
+                       break;
+               }
+       }
+       drbd_bcast_sync_progress(mdev);
+
+       return 1;
+}
+
+
+/* ATTENTION. The AL's extents are 4MB each, while the extents in the
+ * resync LRU-cache are 16MB each.
+ * The caller of this function has to hold a get_ldev() reference.
+ *
+ * TODO will be obsoleted once we have a caching lru of the on disk bitmap
+ */
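+/* Since AL extents are 4 MB and resync extents 16 MB (see above),
+ * AL_EXT_PER_BM_SECT evaluates to 4: enr/AL_EXT_PER_BM_SECT maps an AL
+ * extent number to the resync extent / on-disk bitmap sector holding it. */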
+static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
+                                     int count, int success)
+{
+       struct lc_element *e;
+       struct update_odbm_work *udw;
+
+       unsigned int enr;
+
+       D_ASSERT(atomic_read(&mdev->local_cnt));
+
+       /* I simply assume that a sector/size pair never crosses
+        * a 16 MB extent border. (Currently this is true...) */
+       enr = BM_SECT_TO_EXT(sector);
+
+       e = lc_get(mdev->resync, enr);
+       if (e) {
+               struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
+               if (ext->lce.lc_number == enr) {
+                       if (success)
+                               ext->rs_left -= count;
+                       else
+                               ext->rs_failed += count;
+                       if (ext->rs_left < ext->rs_failed) {
+                               dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
+                                   "rs_failed=%d count=%d\n",
+                                    (unsigned long long)sector,
+                                    ext->lce.lc_number, ext->rs_left,
+                                    ext->rs_failed, count);
+                               dump_stack();
+
+                               lc_put(mdev->resync, &ext->lce);
+                               drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+                               return;
+                       }
+               } else {
+                       /* Normally this element should be in the cache,
+                        * since drbd_rs_begin_io() already pulled it in.
+                        *
+                        * But maybe an application write finished, and we set
+                        * something outside the resync lru_cache in sync.
+                        */
+                       int rs_left = drbd_bm_e_weight(mdev, enr);
+                       if (ext->flags != 0) {
+                               dev_warn(DEV, "changing resync lce: %d[%u;%02lx]"
+                                    " -> %d[%u;00]\n",
+                                    ext->lce.lc_number, ext->rs_left,
+                                    ext->flags, enr, rs_left);
+                               ext->flags = 0;
+                       }
+                       if (ext->rs_failed) {
+                               dev_warn(DEV, "Kicking resync_lru element enr=%u "
+                                    "out with rs_failed=%d\n",
+                                    ext->lce.lc_number, ext->rs_failed);
+                               set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
+                       }
+                       ext->rs_left = rs_left;
+                       ext->rs_failed = success ? 0 : count;
+                       lc_changed(mdev->resync, &ext->lce);
+               }
+               lc_put(mdev->resync, &ext->lce);
+               /* no race, we are within the al_lock! */
+
+               if (ext->rs_left == ext->rs_failed) {
+                       ext->rs_failed = 0;
+
+                       udw = kmalloc(sizeof(*udw), GFP_ATOMIC);
+                       if (udw) {
+                               udw->enr = ext->lce.lc_number;
+                               udw->w.cb = w_update_odbm;
+                               drbd_queue_work_front(&mdev->data.work, &udw->w);
+                       } else {
+                               dev_warn(DEV, "Could not kmalloc an udw\n");
+                               set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
+                       }
+               }
+       } else {
+               dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n",
+                   mdev->resync_locked,
+                   mdev->resync->nr_elements,
+                   mdev->resync->flags);
+       }
+}
+
+/* Clear the bits corresponding to the piece of storage in question:
+ * size bytes of data starting from sector.  Only clear bits of the affected
+ * one or more _aligned_ BM_BLOCK_SIZE blocks.
+ *
+ * called by worker on C_SYNC_TARGET and receiver on C_SYNC_SOURCE.
+ *
+ */
+void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
+                      const char *file, const unsigned int line)
+{
+       /* Is called from worker and receiver context _only_ */
+       unsigned long sbnr, ebnr, lbnr;
+       unsigned long count = 0;
+       sector_t esector, nr_sectors;
+       int wake_up = 0;
+       unsigned long flags;
+
+       if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+               dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
+                               (unsigned long long)sector, size);
+               return;
+       }
+       nr_sectors = drbd_get_capacity(mdev->this_bdev);
+       esector = sector + (size >> 9) - 1;
+
+       ERR_IF(sector >= nr_sectors) return;
+       ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
+
+       lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+
+       /* we clear it (in sync).
+        * round up start sector, round down end sector.  we make sure we only
+        * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
+       if (unlikely(esector < BM_SECT_PER_BIT-1))
+               return;
+       if (unlikely(esector == (nr_sectors-1)))
+               ebnr = lbnr;
+       else
+               ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
+       sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
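+       /* Example, assuming BM_SECT_PER_BIT == 8 (512-byte sectors, 4K
+        * bitmap blocks): sector=9, size=8192 spans sectors 9..24; only
+        * bit 2 (sectors 16..23) is fully covered, so sbnr == ebnr == 2,
+        * and the partially covered blocks at both ends stay out of sync. */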
+
+       if (sbnr > ebnr)
+               return;
+
+       /*
+        * ok, (capacity & 7) != 0 sometimes, but who cares...
+        * we count rs_{total,left} in bits, not sectors.
+        */
+       spin_lock_irqsave(&mdev->al_lock, flags);
+       count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
+       if (count) {
+               /* we need the lock for drbd_try_clear_on_disk_bm */
+               if (jiffies - mdev->rs_mark_time > HZ*10) {
+                       /* should be rolling marks,
+                        * but we estimate only anyways. */
+                       if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) &&
+                           mdev->state.conn != C_PAUSED_SYNC_T &&
+                           mdev->state.conn != C_PAUSED_SYNC_S) {
+                               mdev->rs_mark_time = jiffies;
+                               mdev->rs_mark_left = drbd_bm_total_weight(mdev);
+                       }
+               }
+               if (get_ldev(mdev)) {
+                       drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE);
+                       put_ldev(mdev);
+               }
+               /* wake_up unconditionally now, to cover the various
+                * lc_changed(), lc_put() in drbd_try_clear_on_disk_bm(). */
+               wake_up = 1;
+       }
+       spin_unlock_irqrestore(&mdev->al_lock, flags);
+       if (wake_up)
+               wake_up(&mdev->al_wait);
+}
+
+/*
+ * this is intended to set one request's worth of data out of sync.
+ * it affects at least 1 bit,
+ * and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits.
+ *
+ * called by tl_clear and drbd_send_dblock (==drbd_make_request).
+ * so this can be _any_ process.
+ */
+void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
+                           const char *file, const unsigned int line)
+{
+       unsigned long sbnr, ebnr, lbnr, flags;
+       sector_t esector, nr_sectors;
+       unsigned int enr, count;
+       struct lc_element *e;
+
+       if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+               dev_err(DEV, "sector: %llus, size: %d\n",
+                       (unsigned long long)sector, size);
+               return;
+       }
+
+       if (!get_ldev(mdev))
+               return; /* no disk, no metadata, no bitmap to set bits in */
+
+       nr_sectors = drbd_get_capacity(mdev->this_bdev);
+       esector = sector + (size >> 9) - 1;
+
+       ERR_IF(sector >= nr_sectors)
+               goto out;
+       ERR_IF(esector >= nr_sectors)
+               esector = (nr_sectors-1);
+
+       lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+
+       /* we set it out of sync,
+        * we do not need to round anything here */
+       sbnr = BM_SECT_TO_BIT(sector);
+       ebnr = BM_SECT_TO_BIT(esector);
+
+       /* ok, (capacity & 7) != 0 sometimes, but who cares...
+        * we count rs_{total,left} in bits, not sectors.  */
+       spin_lock_irqsave(&mdev->al_lock, flags);
+       count = drbd_bm_set_bits(mdev, sbnr, ebnr);
+
+       enr = BM_SECT_TO_EXT(sector);
+       e = lc_find(mdev->resync, enr);
+       if (e)
+               lc_entry(e, struct bm_extent, lce)->rs_left += count;
+       spin_unlock_irqrestore(&mdev->al_lock, flags);
+
+out:
+       put_ldev(mdev);
+}
+
+static
+struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)
+{
+       struct lc_element *e;
+       struct bm_extent *bm_ext;
+       int wakeup = 0;
+       unsigned long rs_flags;
+
+       spin_lock_irq(&mdev->al_lock);
+       if (mdev->resync_locked > mdev->resync->nr_elements/2) {
+               spin_unlock_irq(&mdev->al_lock);
+               return NULL;
+       }
+       e = lc_get(mdev->resync, enr);
+       bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+       if (bm_ext) {
+               if (bm_ext->lce.lc_number != enr) {
+                       bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
+                       bm_ext->rs_failed = 0;
+                       lc_changed(mdev->resync, &bm_ext->lce);
+                       wakeup = 1;
+               }
+               if (bm_ext->lce.refcnt == 1)
+                       mdev->resync_locked++;
+               set_bit(BME_NO_WRITES, &bm_ext->flags);
+       }
+       rs_flags = mdev->resync->flags;
+       spin_unlock_irq(&mdev->al_lock);
+       if (wakeup)
+               wake_up(&mdev->al_wait);
+
+       if (!bm_ext) {
+               if (rs_flags & LC_STARVING)
+                       dev_warn(DEV, "Have to wait for element"
+                            " (resync LRU too small?)\n");
+               BUG_ON(rs_flags & LC_DIRTY);
+       }
+
+       return bm_ext;
+}
+
+static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
+{
+       struct lc_element *al_ext;
+       int rv = 0;
+
+       spin_lock_irq(&mdev->al_lock);
+       if (unlikely(enr == mdev->act_log->new_number))
+               rv = 1;
+       else {
+               al_ext = lc_find(mdev->act_log, enr);
+               if (al_ext) {
+                       if (al_ext->refcnt)
+                               rv = 1;
+               }
+       }
+       spin_unlock_irq(&mdev->al_lock);
+
+       /*
+       if (unlikely(rv)) {
+               dev_info(DEV, "Delaying sync read until app's write is done\n");
+       }
+       */
+       return rv;
+}
+
+/**
+ * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED
+ * @mdev:      DRBD device.
+ * @sector:    The sector number.
+ *
+ * This function sleeps on al_wait. Returns 1 on success, 0 if interrupted.
+ */
+int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
+{
+       unsigned int enr = BM_SECT_TO_EXT(sector);
+       struct bm_extent *bm_ext;
+       int i, sig;
+
+       sig = wait_event_interruptible(mdev->al_wait,
+                       (bm_ext = _bme_get(mdev, enr)));
+       if (sig)
+               return 0;
+
+       if (test_bit(BME_LOCKED, &bm_ext->flags))
+               return 1;
+
+       for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
+               sig = wait_event_interruptible(mdev->al_wait,
+                               !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i));
+               if (sig) {
+                       spin_lock_irq(&mdev->al_lock);
+                       if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
+                               clear_bit(BME_NO_WRITES, &bm_ext->flags);
+                               mdev->resync_locked--;
+                               wake_up(&mdev->al_wait);
+                       }
+                       spin_unlock_irq(&mdev->al_lock);
+                       return 0;
+               }
+       }
+
+       set_bit(BME_LOCKED, &bm_ext->flags);
+
+       return 1;
+}
+
+/**
+ * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
+ * @mdev:      DRBD device.
+ * @sector:    The sector number.
+ *
+ * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
+ * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
+ * if there is still application IO going on in this area.
+ */
+int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
+{
+       unsigned int enr = BM_SECT_TO_EXT(sector);
+       const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
+       struct lc_element *e;
+       struct bm_extent *bm_ext;
+       int i;
+
+       spin_lock_irq(&mdev->al_lock);
+       if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) {
+               /* Under very heavy scattered IO, the syncer may stall
+                * for an undefined time if we give up the refcount
+                * when we try again and requeue.
+                *
+                * If we don't give up the refcount, but by the next time
+                * we are scheduled this extent has been "synced" by new
+                * application writes, we'd miss the lc_put on the
+                * extent we keep the refcount on.
+                * So we remember which extent we had to try again, and
+                * if the next requested one is something else, we do
+                * the lc_put here...
+                * We also have to wake_up.
+                */
+               e = lc_find(mdev->resync, mdev->resync_wenr);
+               bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+               if (bm_ext) {
+                       D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
+                       D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
+                       clear_bit(BME_NO_WRITES, &bm_ext->flags);
+                       mdev->resync_wenr = LC_FREE;
+                       if (lc_put(mdev->resync, &bm_ext->lce) == 0)
+                               mdev->resync_locked--;
+                       wake_up(&mdev->al_wait);
+               } else {
+                       dev_alert(DEV, "LOGIC BUG\n");
+               }
+       }
+       /* TRY. */
+       e = lc_try_get(mdev->resync, enr);
+       bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+       if (bm_ext) {
+               if (test_bit(BME_LOCKED, &bm_ext->flags))
+                       goto proceed;
+               if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
+                       mdev->resync_locked++;
+               } else {
+                       /* we already set BME_NO_WRITES in an earlier
+                        * attempt, but then could not set BME_LOCKED,
+                        * so we are trying again now.
+                        * drop the extra reference. */
+                       bm_ext->lce.refcnt--;
+                       D_ASSERT(bm_ext->lce.refcnt > 0);
+               }
+               goto check_al;
+       } else {
+               /* do we rather want to try later? */
+               if (mdev->resync_locked > mdev->resync->nr_elements-3)
+                       goto try_again;
+               /* Do or do not. There is no try. -- Yoda */
+               e = lc_get(mdev->resync, enr);
+               bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+               if (!bm_ext) {
+                       const unsigned long rs_flags = mdev->resync->flags;
+                       if (rs_flags & LC_STARVING)
+                               dev_warn(DEV, "Have to wait for element"
+                                    " (resync LRU too small?)\n");
+                       BUG_ON(rs_flags & LC_DIRTY);
+                       goto try_again;
+               }
+               if (bm_ext->lce.lc_number != enr) {
+                       bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
+                       bm_ext->rs_failed = 0;
+                       lc_changed(mdev->resync, &bm_ext->lce);
+                       wake_up(&mdev->al_wait);
+                       D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0);
+               }
+               set_bit(BME_NO_WRITES, &bm_ext->flags);
+               D_ASSERT(bm_ext->lce.refcnt == 1);
+               mdev->resync_locked++;
+               goto check_al;
+       }
+check_al:
+       for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
+               if (unlikely(al_enr+i == mdev->act_log->new_number))
+                       goto try_again;
+               if (lc_is_used(mdev->act_log, al_enr+i))
+                       goto try_again;
+       }
+       set_bit(BME_LOCKED, &bm_ext->flags);
+proceed:
+       mdev->resync_wenr = LC_FREE;
+       spin_unlock_irq(&mdev->al_lock);
+       return 0;
+
+try_again:
+       if (bm_ext)
+               mdev->resync_wenr = enr;
+       spin_unlock_irq(&mdev->al_lock);
+       return -EAGAIN;
+}
+
+void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector)
+{
+       unsigned int enr = BM_SECT_TO_EXT(sector);
+       struct lc_element *e;
+       struct bm_extent *bm_ext;
+       unsigned long flags;
+
+       spin_lock_irqsave(&mdev->al_lock, flags);
+       e = lc_find(mdev->resync, enr);
+       bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+       if (!bm_ext) {
+               spin_unlock_irqrestore(&mdev->al_lock, flags);
+               if (__ratelimit(&drbd_ratelimit_state))
+                       dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n");
+               return;
+       }
+
+       if (bm_ext->lce.refcnt == 0) {
+               spin_unlock_irqrestore(&mdev->al_lock, flags);
+               dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, "
+                   "but refcnt is 0!?\n",
+                   (unsigned long long)sector, enr);
+               return;
+       }
+
+       if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
+               clear_bit(BME_LOCKED, &bm_ext->flags);
+               clear_bit(BME_NO_WRITES, &bm_ext->flags);
+               mdev->resync_locked--;
+               wake_up(&mdev->al_wait);
+       }
+
+       spin_unlock_irqrestore(&mdev->al_lock, flags);
+}
+
+/**
+ * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)
+ * @mdev:      DRBD device.
+ */
+void drbd_rs_cancel_all(struct drbd_conf *mdev)
+{
+       spin_lock_irq(&mdev->al_lock);
+
+       if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */
+               lc_reset(mdev->resync);
+               put_ldev(mdev);
+       }
+       mdev->resync_locked = 0;
+       mdev->resync_wenr = LC_FREE;
+       spin_unlock_irq(&mdev->al_lock);
+       wake_up(&mdev->al_wait);
+}
+
+/**
+ * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU
+ * @mdev:      DRBD device.
+ *
+ * Returns 0 upon success, -EAGAIN if at least one reference count was
+ * not zero.
+ */
+int drbd_rs_del_all(struct drbd_conf *mdev)
+{
+       struct lc_element *e;
+       struct bm_extent *bm_ext;
+       int i;
+
+       spin_lock_irq(&mdev->al_lock);
+
+       if (get_ldev_if_state(mdev, D_FAILED)) {
+               /* ok, ->resync is there. */
+               for (i = 0; i < mdev->resync->nr_elements; i++) {
+                       e = lc_element_by_index(mdev->resync, i);
+                       bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+                       if (bm_ext->lce.lc_number == LC_FREE)
+                               continue;
+                       if (bm_ext->lce.lc_number == mdev->resync_wenr) {
+                               dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently"
+                                    " got 'synced' by application io\n",
+                                    mdev->resync_wenr);
+                               D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
+                               D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
+                               clear_bit(BME_NO_WRITES, &bm_ext->flags);
+                               mdev->resync_wenr = LC_FREE;
+                               lc_put(mdev->resync, &bm_ext->lce);
+                       }
+                       if (bm_ext->lce.refcnt != 0) {
+                               dev_info(DEV, "Retrying drbd_rs_del_all() later. "
+                                    "refcnt=%d\n", bm_ext->lce.refcnt);
+                               put_ldev(mdev);
+                               spin_unlock_irq(&mdev->al_lock);
+                               return -EAGAIN;
+                       }
+                       D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
+                       D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags));
+                       lc_del(mdev->resync, &bm_ext->lce);
+               }
+               D_ASSERT(mdev->resync->used == 0);
+               put_ldev(mdev);
+       }
+       spin_unlock_irq(&mdev->al_lock);
+
+       return 0;
+}
+
+/**
+ * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
+ * @mdev:      DRBD device.
+ * @sector:    The sector number.
+ * @size:      Size of failed IO operation, in bytes.
+ */
+void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
+{
+       /* Is called from worker and receiver context _only_ */
+       unsigned long sbnr, ebnr, lbnr;
+       unsigned long count;
+       sector_t esector, nr_sectors;
+       int wake_up = 0;
+
+       if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
+               dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
+                               (unsigned long long)sector, size);
+               return;
+       }
+       nr_sectors = drbd_get_capacity(mdev->this_bdev);
+       esector = sector + (size >> 9) - 1;
+
+       ERR_IF(sector >= nr_sectors) return;
+       ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
+
+       lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+
+       /*
+        * round up start sector, round down end sector.  we make sure we only
+        * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
+       if (unlikely(esector < BM_SECT_PER_BIT-1))
+               return;
+       if (unlikely(esector == (nr_sectors-1)))
+               ebnr = lbnr;
+       else
+               ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
+       sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
+
+       if (sbnr > ebnr)
+               return;
+
+       /*
+        * ok, (capacity & 7) != 0 sometimes, but who cares...
+        * we count rs_{total,left} in bits, not sectors.
+        */
+       spin_lock_irq(&mdev->al_lock);
+       count = drbd_bm_count_bits(mdev, sbnr, ebnr);
+       if (count) {
+               mdev->rs_failed += count;
+
+               if (get_ldev(mdev)) {
+                       drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE);
+                       put_ldev(mdev);
+               }
+
+               /* wake_up unconditionally now, to cover the various
+                * lc_changed(), lc_put() in drbd_try_clear_on_disk_bm(). */
+               wake_up = 1;
+       }
+       spin_unlock_irq(&mdev->al_lock);
+       if (wake_up)
+               wake_up(&mdev->al_wait);
+}
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
new file mode 100644 (file)
index 0000000..b61057e
--- /dev/null
@@ -0,0 +1,1327 @@
+/*
+   drbd_bitmap.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/bitops.h>
+#include <linux/vmalloc.h>
+#include <linux/string.h>
+#include <linux/drbd.h>
+#include <asm/kmap_types.h>
+#include "drbd_int.h"
+
+/* OPAQUE outside this file!
+ * interface defined in drbd_int.h
+
+ * convention:
+ * function name drbd_bm_... => used elsewhere, "public".
+ * function name      bm_... => internal to implementation, "private".
+
+ * Note that since find_first_bit returns int, at the current granularity of
+ * the bitmap (4 KB of data per bit), this implementation "only" supports up to
+ * 1<<(32+12) == 16 TB...
+ */
+
+/*
+ * NOTE
+ *  Access to the *bm_pages is protected by bm_lock.
+ *  It is safe to read the other members within the lock.
+ *
+ *  drbd_bm_set_bits is called from bio_endio callbacks.
+ *  We may be called with irq already disabled,
+ *  so we need spin_lock_irqsave().
+ *  And we need kmap_atomic.
+ */
+struct drbd_bitmap {
+       struct page **bm_pages;
+       spinlock_t bm_lock;
+       /* WARNING unsigned long bm_*:
+        * 32bit number of bit offset is just enough for 512 MB bitmap.
+        * it will blow up if we make the bitmap bigger...
+        * not that it makes much sense to have a bitmap that large,
+        * rather change the granularity to 16k or 64k or something.
+        * (that implies other problems, however...)
+        */
+       unsigned long bm_set;       /* nr of set bits; THINK maybe atomic_t? */
+       unsigned long bm_bits;
+       size_t   bm_words;
+       size_t   bm_number_of_pages;
+       sector_t bm_dev_capacity;
+       struct semaphore bm_change; /* serializes resize operations */
+
+       atomic_t bm_async_io;
+       wait_queue_head_t bm_io_wait;
+
+       unsigned long  bm_flags;
+
+       /* debugging aid, in case we are still racy somewhere */
+       char          *bm_why;