Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
Linus Torvalds [Thu, 21 Oct 2010 19:54:49 +0000 (12:54 -0700)]
* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (163 commits)
  tracing: Fix compile issue for trace_sched_wakeup.c
  [S390] hardirq: remove pointless header file includes
  [IA64] Move local_softirq_pending() definition
  perf, powerpc: Fix power_pmu_event_init to not use event->ctx
  ftrace: Remove recursion between recordmcount and scripts/mod/empty
  jump_label: Add COND_STMT(), reducer wrappery
  perf: Optimize sw events
  perf: Use jump_labels to optimize the scheduler hooks
  jump_label: Add atomic_t interface
  jump_label: Use more consistent naming
  perf, hw_breakpoint: Fix crash in hw_breakpoint creation
  perf: Find task before event alloc
  perf: Fix task refcount bugs
  perf: Fix group moving
  irq_work: Add generic hardirq context callbacks
  perf_events: Fix transaction recovery in group_sched_in()
  perf_events: Fix bogus AMD64 generic TLB events
  perf_events: Fix bogus context time tracking
  tracing: Remove parent recording in latency tracer graph options
  tracing: Use one prologue for the preempt irqs off tracer function tracers
  ...

460 files changed:
Documentation/DocBook/kernel-locking.tmpl
Documentation/RCU/checklist.txt
Documentation/RCU/stallwarn.txt
Documentation/RCU/trace.txt
Documentation/networking/e1000.txt
Documentation/networking/e1000e.txt [new file with mode: 0644]
Documentation/networking/ixgbevf.txt [changed mode: 0755->0644]
Documentation/vm/page-types.c
MAINTAINERS
Makefile
arch/arm/Kconfig
arch/arm/kernel/kprobes-decode.c
arch/arm/mach-at91/include/mach/system.h
arch/arm/mach-bcmring/dma.c
arch/arm/mach-ep93xx/dma-m2p.c
arch/arm/mach-imx/Kconfig
arch/arm/mach-imx/mach-cpuimx27.c
arch/arm/mach-s5p6440/cpu.c
arch/arm/mach-s5p6442/cpu.c
arch/arm/mach-s5pc100/cpu.c
arch/arm/mach-s5pv210/clock.c
arch/arm/mach-s5pv210/cpu.c
arch/arm/mach-vexpress/ct-ca9x4.c
arch/arm/mach-vexpress/v2m.c
arch/arm/mm/ioremap.c
arch/arm/mm/mmu.c
arch/arm/mm/proc-v7.S
arch/arm/plat-omap/iommu.c
arch/arm/plat-samsung/adc.c
arch/arm/plat-samsung/clock.c
arch/m32r/include/asm/elf.h
arch/m32r/kernel/.gitignore [new file with mode: 0644]
arch/m32r/kernel/signal.c
arch/mips/Kbuild
arch/mips/Kconfig
arch/mips/boot/compressed/Makefile
arch/mips/dec/Platform
arch/mips/include/asm/fcntl.h
arch/mips/include/asm/siginfo.h
arch/mips/jz4740/Platform
arch/mips/kernel/branch.c
arch/mips/kernel/mips-mt-fpaff.c
arch/mips/kernel/ptrace.c
arch/mips/kernel/scall32-o32.S
arch/mips/kernel/scall64-64.S
arch/mips/kernel/scall64-n32.S
arch/mips/kernel/scall64-o32.S
arch/mips/kernel/signal.c
arch/mips/kernel/signal_n32.c
arch/mips/kernel/unaligned.c
arch/um/drivers/hostaudio_kern.c
arch/um/drivers/ubd_kern.c
arch/x86/ia32/ia32_aout.c
arch/x86/include/asm/amd_iommu.h
arch/x86/include/asm/amd_iommu_proto.h
arch/x86/include/asm/amd_iommu_types.h
arch/x86/include/asm/gart.h
arch/x86/include/asm/kvm_host.h
arch/x86/kernel/amd_iommu.c
arch/x86/kernel/amd_iommu_init.c
arch/x86/kernel/aperture_64.c
arch/x86/kernel/cpu/mcheck/mce_amd.c
arch/x86/kernel/cpu/mcheck/therm_throt.c
arch/x86/kernel/pci-gart_64.c
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/mm/srat_64.c
block/bsg.c
block/elevator.c
drivers/acpi/blacklist.c
drivers/acpi/processor_core.c
drivers/atm/iphase.c
drivers/atm/iphase.h
drivers/atm/solos-pci.c
drivers/block/Kconfig
drivers/block/Makefile
drivers/block/ps3disk.c
drivers/block/rbd.c [new file with mode: 0644]
drivers/block/rbd_types.h [new file with mode: 0644]
drivers/block/virtio_blk.c
drivers/char/agp/amd64-agp.c
drivers/char/agp/generic.c
drivers/char/tpm/tpm.c
drivers/char/virtio_console.c
drivers/dma/ioat/dma_v2.c
drivers/firewire/ohci.c
drivers/firewire/ohci.h
drivers/gpu/drm/i915/i915_dma.c
drivers/gpu/drm/i915/intel_fb.c
drivers/gpu/drm/nouveau/nouveau_fbcon.c
drivers/gpu/drm/nouveau/nouveau_notifier.c
drivers/gpu/drm/radeon/evergreen.c
drivers/gpu/drm/radeon/r100.c
drivers/gpu/drm/radeon/r600.c
drivers/gpu/drm/radeon/r600_blit_kms.c
drivers/gpu/drm/radeon/radeon.h
drivers/gpu/drm/radeon/radeon_atombios.c
drivers/gpu/drm/radeon/radeon_combios.c
drivers/gpu/drm/radeon/radeon_cursor.c
drivers/gpu/drm/radeon/radeon_fb.c
drivers/gpu/drm/radeon/radeon_object.c
drivers/gpu/drm/radeon/radeon_object.h
drivers/gpu/drm/radeon/rs600.c
drivers/gpu/drm/radeon/rs690.c
drivers/gpu/drm/radeon/rv770.c
drivers/gpu/drm/ttm/ttm_bo.c
drivers/hid/hid-cando.c
drivers/hid/hid-core.c
drivers/hid/hid-ids.h
drivers/hid/hidraw.c
drivers/hid/usbhid/hid-quirks.c
drivers/i2c/busses/i2c-cpm.c
drivers/i2c/busses/i2c-davinci.c
drivers/i2c/busses/i2c-ibm_iic.c
drivers/i2c/busses/i2c-imx.c
drivers/i2c/busses/i2c-mpc.c
drivers/i2c/busses/i2c-pca-isa.c
drivers/i2c/busses/i2c-pca-platform.c
drivers/i2c/i2c-core.c
drivers/idle/intel_idle.c
drivers/input/evdev.c
drivers/input/joydev.c
drivers/input/misc/hp_sdc_rtc.c
drivers/input/misc/uinput.c
drivers/input/serio/hil_mlc.c
drivers/input/serio/hp_sdc.c
drivers/input/tablet/wacom_sys.c
drivers/input/tablet/wacom_wac.c
drivers/isdn/sc/interrupt.c
drivers/macintosh/adb.c
drivers/md/bitmap.c
drivers/md/raid1.c
drivers/media/IR/ir-keytable.c
drivers/media/IR/ir-lirc-codec.c
drivers/media/IR/ir-raw-event.c
drivers/media/IR/ir-sysfs.c
drivers/media/IR/keymaps/rc-rc6-mce.c
drivers/media/IR/mceusb.c
drivers/media/dvb/dvb-usb/dib0700_core.c
drivers/media/dvb/dvb-usb/dib0700_devices.c
drivers/media/dvb/dvb-usb/opera1.c
drivers/media/dvb/frontends/dib7000p.c
drivers/media/dvb/frontends/dib7000p.h
drivers/media/dvb/siano/smscoreapi.c
drivers/media/radio/si470x/radio-si470x-i2c.c
drivers/media/video/cx231xx/Makefile
drivers/media/video/cx231xx/cx231xx-cards.c
drivers/media/video/cx25840/cx25840-core.c
drivers/media/video/cx88/Kconfig
drivers/media/video/gspca/gspca.c
drivers/media/video/gspca/sn9c20x.c
drivers/media/video/ivtv/ivtvfb.c
drivers/media/video/mem2mem_testdev.c
drivers/media/video/mt9m111.c
drivers/media/video/mt9v022.c
drivers/media/video/mx2_camera.c
drivers/media/video/pvrusb2/pvrusb2-ctrl.c
drivers/media/video/s5p-fimc/fimc-core.c
drivers/media/video/saa7134/saa7134-cards.c
drivers/media/video/saa7164/saa7164-buffer.c
drivers/media/video/uvc/uvc_driver.c
drivers/media/video/uvc/uvcvideo.h
drivers/media/video/v4l2-compat-ioctl32.c
drivers/media/video/videobuf-dma-contig.c
drivers/media/video/videobuf-dma-sg.c
drivers/misc/bh1780gli.c
drivers/mmc/core/core.c
drivers/mtd/nand/mxc_nand.c
drivers/net/3c527.c
drivers/net/Kconfig
drivers/net/b44.c
drivers/net/bonding/bond_main.c
drivers/net/ehea/ehea_main.c
drivers/net/ehea/ehea_qmr.h
drivers/net/fec.c
drivers/net/hamradio/6pack.c
drivers/net/hamradio/mkiss.c
drivers/net/irda/sir_dev.c
drivers/net/ppp_async.c
drivers/net/r8169.c
drivers/net/skge.c
drivers/net/tg3.c
drivers/net/tg3.h
drivers/net/wan/cosa.c
drivers/net/wimax/i2400m/rx.c
drivers/net/wireless/ath/ath9k/ani.c
drivers/parport/share.c
drivers/platform/x86/intel_ips.c
drivers/regulator/ad5398.c
drivers/regulator/isl6271a-regulator.c
drivers/rtc/rtc-ds3232.c
drivers/scsi/scsi.c
drivers/serial/ioc3_serial.c
drivers/staging/tm6000/Kconfig
drivers/staging/tm6000/tm6000-input.c
drivers/vhost/net.c
drivers/vhost/vhost.c
drivers/vhost/vhost.h
fs/affs/super.c
fs/binfmt_aout.c
fs/ceph/Kconfig
fs/ceph/Makefile
fs/ceph/README [deleted file]
fs/ceph/addr.c
fs/ceph/caps.c
fs/ceph/ceph_frag.c
fs/ceph/ceph_strings.c [deleted file]
fs/ceph/debugfs.c
fs/ceph/dir.c
fs/ceph/export.c
fs/ceph/file.c
fs/ceph/inode.c
fs/ceph/ioctl.c
fs/ceph/ioctl.h
fs/ceph/locks.c
fs/ceph/mds_client.c
fs/ceph/mds_client.h
fs/ceph/mdsmap.c
fs/ceph/pagelist.c [deleted file]
fs/ceph/pagelist.h [deleted file]
fs/ceph/snap.c
fs/ceph/strings.c [new file with mode: 0644]
fs/ceph/super.c
fs/ceph/super.h
fs/ceph/xattr.c
fs/exec.c
fs/exofs/inode.c
fs/gfs2/Kconfig
fs/gfs2/aops.c
fs/gfs2/bmap.c
fs/gfs2/bmap.h
fs/gfs2/dentry.c
fs/gfs2/dir.c
fs/gfs2/dir.h
fs/gfs2/export.c
fs/gfs2/file.c
fs/gfs2/glock.c
fs/gfs2/glock.h
fs/gfs2/glops.c
fs/gfs2/incore.h
fs/gfs2/inode.c
fs/gfs2/inode.h
fs/gfs2/lock_dlm.c
fs/gfs2/main.c
fs/gfs2/ops_fstype.c
fs/gfs2/ops_inode.c
fs/gfs2/quota.c
fs/gfs2/recovery.c
fs/gfs2/rgrp.c
fs/gfs2/rgrp.h
fs/gfs2/super.c
fs/gfs2/sys.c
fs/gfs2/trace_gfs2.h
fs/gfs2/trans.h
fs/gfs2/xattr.c
fs/hfs/bfind.c
fs/hfs/btree.c
fs/hfs/btree.h
fs/hfsplus/bfind.c
fs/hfsplus/bitmap.c
fs/hfsplus/brec.c
fs/hfsplus/btree.c
fs/hfsplus/catalog.c
fs/hfsplus/dir.c
fs/hfsplus/extents.c
fs/hfsplus/hfsplus_fs.h
fs/hfsplus/hfsplus_raw.h
fs/hfsplus/inode.c
fs/hfsplus/ioctl.c
fs/hfsplus/options.c
fs/hfsplus/part_tbl.c
fs/hfsplus/super.c
fs/hfsplus/unicode.c
fs/hfsplus/wrapper.c
fs/nfsd/nfsfh.h
fs/notify/Kconfig
fs/xfs/linux-2.6/xfs_sync.c
include/drm/ttm/ttm_bo_api.h
include/linux/Kbuild
include/linux/ceph/auth.h [moved from fs/ceph/auth.h with 97% similarity]
include/linux/ceph/buffer.h [moved from fs/ceph/buffer.h with 100% similarity]
include/linux/ceph/ceph_debug.h [moved from fs/ceph/ceph_debug.h with 86% similarity]
include/linux/ceph/ceph_frag.h [moved from fs/ceph/ceph_frag.h with 100% similarity]
include/linux/ceph/ceph_fs.h [moved from fs/ceph/ceph_fs.h with 99% similarity]
include/linux/ceph/ceph_hash.h [moved from fs/ceph/ceph_hash.h with 100% similarity]
include/linux/ceph/debugfs.h [new file with mode: 0644]
include/linux/ceph/decode.h [moved from fs/ceph/decode.h with 96% similarity]
include/linux/ceph/libceph.h [new file with mode: 0644]
include/linux/ceph/mdsmap.h [moved from fs/ceph/mdsmap.h with 100% similarity]
include/linux/ceph/messenger.h [moved from fs/ceph/messenger.h with 95% similarity]
include/linux/ceph/mon_client.h [moved from fs/ceph/mon_client.h with 99% similarity]
include/linux/ceph/msgpool.h [moved from fs/ceph/msgpool.h with 100% similarity]
include/linux/ceph/msgr.h [moved from fs/ceph/msgr.h with 100% similarity]
include/linux/ceph/osd_client.h [moved from fs/ceph/osd_client.h with 76% similarity]
include/linux/ceph/osdmap.h [moved from fs/ceph/osdmap.h with 97% similarity]
include/linux/ceph/pagelist.h [new file with mode: 0644]
include/linux/ceph/rados.h [moved from fs/ceph/rados.h with 100% similarity]
include/linux/ceph/types.h [moved from fs/ceph/types.h with 100% similarity]
include/linux/cgroup.h
include/linux/compiler.h
include/linux/coredump.h
include/linux/cred.h
include/linux/crush/crush.h [moved from fs/ceph/crush/crush.h with 100% similarity]
include/linux/crush/hash.h [moved from fs/ceph/crush/hash.h with 100% similarity]
include/linux/crush/mapper.h [moved from fs/ceph/crush/mapper.h with 100% similarity]
include/linux/debug_locks.h
include/linux/elevator.h
include/linux/fdtable.h
include/linux/fs.h
include/linux/genhd.h
include/linux/hardirq.h
include/linux/idr.h
include/linux/init_task.h
include/linux/input.h
include/linux/iocontext.h
include/linux/kernel.h
include/linux/key.h
include/linux/kvm_host.h
include/linux/lockdep.h
include/linux/mm_types.h
include/linux/netfilter/nfnetlink_conntrack.h
include/linux/netfilter/xt_SECMARK.h
include/linux/nfs_fs.h
include/linux/notifier.h
include/linux/radix-tree.h
include/linux/rculist.h
include/linux/rculist_nulls.h
include/linux/rcupdate.h
include/linux/rcutiny.h
include/linux/rcutree.h
include/linux/sched.h
include/linux/security.h
include/linux/selinux.h
include/linux/srcu.h
include/linux/sunrpc/auth_gss.h
include/linux/types.h
include/media/videobuf-dma-sg.h
include/net/bluetooth/bluetooth.h
include/net/cls_cgroup.h
include/net/netfilter/nf_conntrack.h
init/Kconfig
kernel/Makefile
kernel/cgroup.c
kernel/cpuset.c
kernel/hrtimer.c
kernel/hung_task.c
kernel/lockdep.c
kernel/perf_event.c
kernel/pid.c
kernel/printk.c
kernel/rcupdate.c
kernel/rcutiny.c
kernel/rcutiny_plugin.h
kernel/rcutorture.c
kernel/rcutree.c
kernel/rcutree.h
kernel/rcutree_plugin.h
kernel/rcutree_trace.c
kernel/sched.c
kernel/sched_fair.c
kernel/signal.c
kernel/srcu.c
kernel/sysctl.c
kernel/sysctl_check.c
kernel/trace/ring_buffer.c
kernel/watchdog.c
lib/Kconfig.debug
lib/radix-tree.c
mm/memcontrol.c
mm/memory-failure.c
mm/page_alloc.c
net/Kconfig
net/Makefile
net/atm/mpc.c
net/bluetooth/l2cap.c
net/bluetooth/rfcomm/sock.c
net/caif/caif_socket.c
net/ceph/Kconfig [new file with mode: 0644]
net/ceph/Makefile [new file with mode: 0644]
net/ceph/armor.c [moved from fs/ceph/armor.c with 100% similarity]
net/ceph/auth.c [moved from fs/ceph/auth.c with 97% similarity]
net/ceph/auth_none.c [moved from fs/ceph/auth_none.c with 96% similarity]
net/ceph/auth_none.h [moved from fs/ceph/auth_none.h with 94% similarity]
net/ceph/auth_x.c [moved from fs/ceph/auth_x.c with 99% similarity]
net/ceph/auth_x.h [moved from fs/ceph/auth_x.h with 96% similarity]
net/ceph/auth_x_protocol.h [moved from fs/ceph/auth_x_protocol.h with 100% similarity]
net/ceph/buffer.c [moved from fs/ceph/buffer.c with 86% similarity]
net/ceph/ceph_common.c [new file with mode: 0644]
net/ceph/ceph_fs.c [moved from fs/ceph/ceph_fs.c with 92% similarity]
net/ceph/ceph_hash.c [moved from fs/ceph/ceph_hash.c with 98% similarity]
net/ceph/ceph_strings.c [new file with mode: 0644]
net/ceph/crush/crush.c [moved from fs/ceph/crush/crush.c with 99% similarity]
net/ceph/crush/hash.c [moved from fs/ceph/crush/hash.c with 99% similarity]
net/ceph/crush/mapper.c [moved from fs/ceph/crush/mapper.c with 99% similarity]
net/ceph/crypto.c [moved from fs/ceph/crypto.c with 99% similarity]
net/ceph/crypto.h [moved from fs/ceph/crypto.h with 95% similarity]
net/ceph/debugfs.c [new file with mode: 0644]
net/ceph/messenger.c [moved from fs/ceph/messenger.c with 89% similarity]
net/ceph/mon_client.c [moved from fs/ceph/mon_client.c with 94% similarity]
net/ceph/msgpool.c [moved from fs/ceph/msgpool.c with 95% similarity]
net/ceph/osd_client.c [moved from fs/ceph/osd_client.c with 84% similarity]
net/ceph/osdmap.c [moved from fs/ceph/osdmap.c with 97% similarity]
net/ceph/pagelist.c [new file with mode: 0644]
net/ceph/pagevec.c [new file with mode: 0644]
net/core/ethtool.c
net/core/sock.c
net/core/stream.c
net/ipv4/Kconfig
net/ipv4/igmp.c
net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
net/ipv4/netfilter/nf_nat_core.c
net/ipv6/route.c
net/mac80211/agg-tx.c
net/mac80211/status.c
net/netfilter/core.c
net/netfilter/nf_conntrack_ecache.c
net/netfilter/nf_conntrack_extend.c
net/netfilter/nf_conntrack_netlink.c
net/netfilter/nf_conntrack_proto.c
net/netfilter/nf_conntrack_standalone.c
net/netfilter/nf_log.c
net/netfilter/nf_queue.c
net/netfilter/xt_CT.c
net/netfilter/xt_SECMARK.c
net/rds/page.c
net/sched/cls_u32.c
net/sctp/auth.c
net/sctp/socket.c
scripts/kconfig/conf.c
scripts/kconfig/expr.h
scripts/kconfig/menu.c
scripts/kconfig/symbol.c
security/apparmor/.gitignore
security/apparmor/apparmorfs.c
security/capability.c
security/commoncap.c
security/security.c
security/selinux/Makefile
security/selinux/exports.c
security/selinux/hooks.c
security/selinux/include/classmap.h
security/selinux/include/security.h
security/selinux/selinuxfs.c
security/selinux/ss/Makefile [deleted file]
security/selinux/ss/avtab.c
security/selinux/ss/avtab.h
security/selinux/ss/conditional.c
security/selinux/ss/conditional.h
security/selinux/ss/ebitmap.c
security/selinux/ss/ebitmap.h
security/selinux/ss/policydb.c
security/selinux/ss/policydb.h
security/selinux/ss/services.c
security/selinux/ss/status.c [new file with mode: 0644]
security/smack/smack_lsm.c
security/tomoyo/common.c
sound/core/rawmidi.c
sound/oss/soundcard.c
sound/pci/hda/patch_sigmatel.c
tools/perf/perf.h

index a0d479d..f66f4df 100644 (file)
@@ -1645,7 +1645,9 @@ the amount of locking which needs to be done.
       all the readers who were traversing the list when we deleted the
       element are finished.  We use <function>call_rcu()</function> to
       register a callback which will actually destroy the object once
-      the readers are finished.
+      all pre-existing readers are finished.  Alternatively,
+      <function>synchronize_rcu()</function> may be used to block until
+      all pre-existing are finished.
     </para>
     <para>
       But how does Read Copy Update know when the readers are
@@ -1714,7 +1716,7 @@ the amount of locking which needs to be done.
 -        object_put(obj);
 +        list_del_rcu(&amp;obj-&gt;list);
          cache_num--;
-+        call_rcu(&amp;obj-&gt;rcu, cache_delete_rcu, obj);
++        call_rcu(&amp;obj-&gt;rcu, cache_delete_rcu);
  }
 
  /* Must be holding cache_lock */
@@ -1725,14 +1727,6 @@ the amount of locking which needs to be done.
          if (++cache_num > MAX_CACHE_SIZE) {
                  struct object *i, *outcast = NULL;
                  list_for_each_entry(i, &amp;cache, list) {
-@@ -85,6 +94,7 @@
-         obj-&gt;popularity = 0;
-         atomic_set(&amp;obj-&gt;refcnt, 1); /* The cache holds a reference */
-         spin_lock_init(&amp;obj-&gt;lock);
-+        INIT_RCU_HEAD(&amp;obj-&gt;rcu);
-
-         spin_lock_irqsave(&amp;cache_lock, flags);
-         __cache_add(obj);
 @@ -104,12 +114,11 @@
  struct object *cache_find(int id)
  {
index 790d1a8..0c134f8 100644 (file)
@@ -218,13 +218,22 @@ over a rather long period of time, but improvements are always welcome!
        include:
 
        a.      Keeping a count of the number of data-structure elements
-               used by the RCU-protected data structure, including those
-               waiting for a grace period to elapse.  Enforce a limit
-               on this number, stalling updates as needed to allow
-               previously deferred frees to complete.
-
-               Alternatively, limit only the number awaiting deferred
-               free rather than the total number of elements.
+               used by the RCU-protected data structure, including
+               those waiting for a grace period to elapse.  Enforce a
+               limit on this number, stalling updates as needed to allow
+               previously deferred frees to complete.  Alternatively,
+               limit only the number awaiting deferred free rather than
+               the total number of elements.
+
+               One way to stall the updates is to acquire the update-side
+               mutex.  (Don't try this with a spinlock -- other CPUs
+               spinning on the lock could prevent the grace period
+               from ever ending.)  Another way to stall the updates
+               is for the updates to use a wrapper function around
+               the memory allocator, so that this wrapper function
+               simulates OOM when there is too much memory awaiting an
+               RCU grace period.  There are of course many other
+               variations on this theme.
 
        b.      Limiting update rate.  For example, if updates occur only
                once per hour, then no explicit rate limiting is required,
@@ -365,3 +374,26 @@ over a rather long period of time, but improvements are always welcome!
        and the compiler to freely reorder code into and out of RCU
        read-side critical sections.  It is the responsibility of the
        RCU update-side primitives to deal with this.
+
+17.    Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and
+       the __rcu sparse checks to validate your RCU code.  These
+       can help find problems as follows:
+
+       CONFIG_PROVE_RCU: check that accesses to RCU-protected data
+               structures are carried out under the proper RCU
+               read-side critical section, while holding the right
+               combination of locks, or whatever other conditions
+               are appropriate.
+
+       CONFIG_DEBUG_OBJECTS_RCU_HEAD: check that you don't pass the
+               same object to call_rcu() (or friends) before an RCU
+               grace period has elapsed since the last time that you
+               passed that same object to call_rcu() (or friends).
+
+       __rcu sparse checks: tag the pointer to the RCU-protected data
+               structure with __rcu, and sparse will warn you if you
+               access that pointer without the services of one of the
+               variants of rcu_dereference().
+
+       These debugging aids can help you find problems that are
+       otherwise extremely difficult to spot.
index 44c6dcc..862c08e 100644 (file)
@@ -80,6 +80,24 @@ o    A CPU looping with bottom halves disabled.  This condition can
 o      For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
        without invoking schedule().
 
+o      A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
+       happen to preempt a low-priority task in the middle of an RCU
+       read-side critical section.   This is especially damaging if
+       that low-priority task is not permitted to run on any other CPU,
+       in which case the next RCU grace period can never complete, which
+       will eventually cause the system to run out of memory and hang.
+       While the system is in the process of running itself out of
+       memory, you might see stall-warning messages.
+
+o      A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
+       is running at a higher priority than the RCU softirq threads.
+       This will prevent RCU callbacks from ever being invoked,
+       and in a CONFIG_TREE_PREEMPT_RCU kernel will further prevent
+       RCU grace periods from ever completing.  Either way, the
+       system will eventually run out of memory and hang.  In the
+       CONFIG_TREE_PREEMPT_RCU case, you might see stall-warning
+       messages.
+
 o      A bug in the RCU implementation.
 
 o      A hardware failure.  This is quite unlikely, but has occurred
index efd8cc9..a851118 100644 (file)
@@ -125,6 +125,17 @@ o  "b" is the batch limit for this CPU.  If more than this number
        of RCU callbacks is ready to invoke, then the remainder will
        be deferred.
 
+o      "ci" is the number of RCU callbacks that have been invoked for
+       this CPU.  Note that ci+ql is the number of callbacks that have
+       been registered in absence of CPU-hotplug activity.
+
+o      "co" is the number of RCU callbacks that have been orphaned due to
+       this CPU going offline.
+
+o      "ca" is the number of RCU callbacks that have been adopted due to
+       other CPUs going offline.  Note that ci+co-ca+ql is the number of
+       RCU callbacks registered on this CPU.
+
 There is also an rcu/rcudata.csv file with the same information in
 comma-separated-variable spreadsheet format.
 
@@ -180,7 +191,7 @@ o   "s" is the "signaled" state that drives force_quiescent_state()'s
 
 o      "jfq" is the number of jiffies remaining for this grace period
        before force_quiescent_state() is invoked to help push things
-       along.  Note that CPUs in dyntick-idle mode thoughout the grace
+       along.  Note that CPUs in dyntick-idle mode throughout the grace
        period will not report on their own, but rather must be check by
        some other CPU via force_quiescent_state().
 
index 2df7186..d9271e7 100644 (file)
@@ -1,82 +1,35 @@
 Linux* Base Driver for the Intel(R) PRO/1000 Family of Adapters
 ===============================================================
 
-September 26, 2006
-
+Intel Gigabit Linux driver.
+Copyright(c) 1999 - 2010 Intel Corporation.
 
 Contents
 ========
 
-- In This Release
 - Identifying Your Adapter
-- Building and Installation
 - Command Line Parameters
 - Speed and Duplex Configuration
 - Additional Configurations
-- Known Issues
 - Support
 
-
-In This Release
-===============
-
-This file describes the Linux* Base Driver for the Intel(R) PRO/1000 Family
-of Adapters.  This driver includes support for Itanium(R)2-based systems.
-
-For questions related to hardware requirements, refer to the documentation
-supplied with your Intel PRO/1000 adapter. All hardware requirements listed
-apply to use with Linux.
-
-The following features are now available in supported kernels:
- - Native VLANs
- - Channel Bonding (teaming)
- - SNMP
-
-Channel Bonding documentation can be found in the Linux kernel source:
-/Documentation/networking/bonding.txt
-
-The driver information previously displayed in the /proc filesystem is not
-supported in this release.  Alternatively, you can use ethtool (version 1.6
-or later), lspci, and ifconfig to obtain the same information.
-
-Instructions on updating ethtool can be found in the section "Additional
-Configurations" later in this document.
-
-NOTE: The Intel(R) 82562v 10/100 Network Connection only provides 10/100
-support.
-
-
 Identifying Your Adapter
 ========================
 
 For more information on how to identify your adapter, go to the Adapter &
 Driver ID Guide at:
 
-    http://support.intel.com/support/network/adapter/pro100/21397.htm
+    http://support.intel.com/support/go/network/adapter/idguide.htm
 
 For the latest Intel network drivers for Linux, refer to the following
 website.  In the search field, enter your adapter name or type, or use the
 networking link on the left to search for your adapter:
 
-    http://downloadfinder.intel.com/scripts-df/support_intel.asp
-
+    http://support.intel.com/support/go/network/adapter/home.htm
 
 Command Line Parameters
 =======================
 
-If the driver is built as a module, the  following optional parameters
-are used by entering them on the command line with the modprobe command
-using this syntax:
-
-     modprobe e1000 [<option>=<VAL1>,<VAL2>,...]
-
-For example, with two PRO/1000 PCI adapters, entering:
-
-     modprobe e1000 TxDescriptors=80,128
-
-loads the e1000 driver with 80 TX descriptors for the first adapter and
-128 TX descriptors for the second adapter.
-
 The default value for each parameter is generally the recommended setting,
 unless otherwise noted.
 
@@ -89,10 +42,6 @@ NOTES:  For more information about the AutoNeg, Duplex, and Speed
         parameters, see the application note at:
         http://www.intel.com/design/network/applnots/ap450.htm
 
-        A descriptor describes a data buffer and attributes related to
-        the data buffer.  This information is accessed by the hardware.
-
-
 AutoNeg
 -------
 (Supported only on adapters with copper connections)
@@ -106,7 +55,6 @@ Duplex parameters must not be specified.
 NOTE:  Refer to the Speed and Duplex section of this readme for more
        information on the AutoNeg parameter.
 
-
 Duplex
 ------
 (Supported only on adapters with copper connections)
@@ -119,7 +67,6 @@ set to auto-negotiate, the board auto-detects the correct duplex.  If the
 link partner is forced (either full or half), Duplex defaults to half-
 duplex.
 
-
 FlowControl
 -----------
 Valid Range:   0-3 (0=none, 1=Rx only, 2=Tx only, 3=Rx&Tx)
@@ -128,16 +75,16 @@ Default Value: Reads flow control settings from the EEPROM
 This parameter controls the automatic generation(Tx) and response(Rx)
 to Ethernet PAUSE frames.
 
-
 InterruptThrottleRate
 ---------------------
 (not supported on Intel(R) 82542, 82543 or 82544-based adapters)
-Valid Range:   0,1,3,100-100000 (0=off, 1=dynamic, 3=dynamic conservative)
+Valid Range:   0,1,3,4,100-100000 (0=off, 1=dynamic, 3=dynamic conservative,
+                                   4=simplified balancing)
 Default Value: 3
 
 The driver can limit the amount of interrupts per second that the adapter
-will generate for incoming packets. It does this by writing a value to the 
-adapter that is based on the maximum amount of interrupts that the adapter 
+will generate for incoming packets. It does this by writing a value to the
+adapter that is based on the maximum amount of interrupts that the adapter
 will generate per second.
 
 Setting InterruptThrottleRate to a value greater or equal to 100
@@ -146,37 +93,43 @@ per second, even if more packets have come in. This reduces interrupt
 load on the system and can lower CPU utilization under heavy load,
 but will increase latency as packets are not processed as quickly.
 
-The default behaviour of the driver previously assumed a static 
-InterruptThrottleRate value of 8000, providing a good fallback value for 
-all traffic types,but lacking in small packet performance and latency. 
-The hardware can handle many more small packets per second however, and 
+The default behaviour of the driver previously assumed a static
+InterruptThrottleRate value of 8000, providing a good fallback value for
+all traffic types,but lacking in small packet performance and latency.
+The hardware can handle many more small packets per second however, and
 for this reason an adaptive interrupt moderation algorithm was implemented.
 
 Since 7.3.x, the driver has two adaptive modes (setting 1 or 3) in which
-it dynamically adjusts the InterruptThrottleRate value based on the traffic 
+it dynamically adjusts the InterruptThrottleRate value based on the traffic
 that it receives. After determining the type of incoming traffic in the last
-timeframe, it will adjust the InterruptThrottleRate to an appropriate value 
+timeframe, it will adjust the InterruptThrottleRate to an appropriate value
 for that traffic.
 
 The algorithm classifies the incoming traffic every interval into
-classes.  Once the class is determined, the InterruptThrottleRate value is 
-adjusted to suit that traffic type the best. There are three classes defined: 
+classes.  Once the class is determined, the InterruptThrottleRate value is
+adjusted to suit that traffic type the best. There are three classes defined:
 "Bulk traffic", for large amounts of packets of normal size; "Low latency",
 for small amounts of traffic and/or a significant percentage of small
-packets; and "Lowest latency", for almost completely small packets or 
+packets; and "Lowest latency", for almost completely small packets or
 minimal traffic.
 
-In dynamic conservative mode, the InterruptThrottleRate value is set to 4000 
-for traffic that falls in class "Bulk traffic". If traffic falls in the "Low 
-latency" or "Lowest latency" class, the InterruptThrottleRate is increased 
+In dynamic conservative mode, the InterruptThrottleRate value is set to 4000
+for traffic that falls in class "Bulk traffic". If traffic falls in the "Low
+latency" or "Lowest latency" class, the InterruptThrottleRate is increased
 stepwise to 20000. This default mode is suitable for most applications.
 
 For situations where low latency is vital such as cluster or
 grid computing, the algorithm can reduce latency even more when
 InterruptThrottleRate is set to mode 1. In this mode, which operates
-the same as mode 3, the InterruptThrottleRate will be increased stepwise to 
+the same as mode 3, the InterruptThrottleRate will be increased stepwise to
 70000 for traffic in class "Lowest latency".
 
+In simplified mode the interrupt rate is based on the ratio of Tx and
+Rx traffic.  If the bytes per second rate is approximately equal, the
+interrupt rate will drop as low as 2000 interrupts per second.  If the
+traffic is mostly transmit or mostly receive, the interrupt rate could
+be as high as 8000.
+
 Setting InterruptThrottleRate to 0 turns off any interrupt moderation
 and may improve small packet latency, but is generally not suitable
 for bulk throughput traffic.
@@ -212,8 +165,6 @@ NOTE:  When e1000 is loaded with default settings and multiple adapters
        be platform-specific.  If CPU utilization is not a concern, use
        RX_POLLING (NAPI) and default driver settings.
 
-
-
 RxDescriptors
 -------------
 Valid Range:   80-256 for 82542 and 82543-based adapters
@@ -225,15 +176,14 @@ by the driver.  Increasing this value allows the driver to buffer more
 incoming packets, at the expense of increased system memory utilization.
 
 Each descriptor is 16 bytes.  A receive buffer is also allocated for each
-descriptor and can be either 2048, 4096, 8192, or 16384 bytes, depending 
+descriptor and can be either 2048, 4096, 8192, or 16384 bytes, depending
 on the MTU setting. The maximum MTU size is 16110.
 
-NOTE:  MTU designates the frame size.  It only needs to be set for Jumbo 
-       Frames.  Depending on the available system resources, the request 
-       for a higher number of receive descriptors may be denied.  In this 
+NOTE:  MTU designates the frame size.  It only needs to be set for Jumbo
+       Frames.  Depending on the available system resources, the request
+       for a higher number of receive descriptors may be denied.  In this
        case, use a lower number.
 
-
 RxIntDelay
 ----------
 Valid Range:   0-65535 (0=off)
@@ -254,7 +204,6 @@ CAUTION:  When setting RxIntDelay to a value other than 0, adapters may
           restoring the network connection.  To eliminate the potential
           for the hang ensure that RxIntDelay is set to 0.
 
-
 RxAbsIntDelay
 -------------
 (This parameter is supported only on 82540, 82545 and later adapters.)
@@ -268,7 +217,6 @@ packet is received within the set amount of time.  Proper tuning,
 along with RxIntDelay, may improve traffic throughput in specific network
 conditions.
 
-
 Speed
 -----
 (This parameter is supported only on adapters with copper connections.)
@@ -280,7 +228,6 @@ Speed forces the line speed to the specified value in megabits per second
 partner is set to auto-negotiate, the board will auto-detect the correct
 speed.  Duplex should also be set when Speed is set to either 10 or 100.
 
-
 TxDescriptors
 -------------
 Valid Range:   80-256 for 82542 and 82543-based adapters
@@ -295,6 +242,36 @@ NOTE:  Depending on the available system resources, the request for a
        higher number of transmit descriptors may be denied.  In this case,
        use a lower number.
 
+TxDescriptorStep
+----------------
+Valid Range:    1 (use every Tx Descriptor)
+               4 (use every 4th Tx Descriptor)
+
+Default Value:  1 (use every Tx Descriptor)
+
+On certain non-Intel architectures, it has been observed that intense TX
+traffic bursts of short packets may result in an improper descriptor
+writeback. If this occurs, the driver will report a "TX Timeout" and reset
+the adapter, after which the transmit flow will restart, though data may
+have stalled for as much as 10 seconds before it resumes.
+
+The improper writeback does not occur on the first descriptor in a system
+memory cache-line, which is typically 32 bytes, or 4 descriptors long.
+
+Setting TxDescriptorStep to a value of 4 will ensure that all TX descriptors
+are aligned to the start of a system memory cache line, and so this problem
+will not occur.
+
+NOTES: Setting TxDescriptorStep to 4 effectively reduces the number of
+       TxDescriptors available for transmits to 1/4 of the normal allocation.
+       This has a possible negative performance impact, which may be
+       compensated for by allocating more descriptors using the TxDescriptors
+       module parameter.
+
+       There are other conditions which may result in "TX Timeout", which will
+       not be resolved by the use of the TxDescriptorStep parameter. As the
+       issue addressed by this parameter has never been observed on Intel
+       Architecture platforms, it should not be used on Intel platforms.
 
 TxIntDelay
 ----------
@@ -307,7 +284,6 @@ efficiency if properly tuned for specific network traffic.  If the
 system is reporting dropped transmits, this value may be set too high
 causing the driver to run out of available transmit descriptors.
 
-
 TxAbsIntDelay
 -------------
 (This parameter is supported only on 82540, 82545 and later adapters.)
@@ -330,6 +306,35 @@ Default Value: 1
 A value of '1' indicates that the driver should enable IP checksum
 offload for received packets (both UDP and TCP) to the adapter hardware.
 
+Copybreak
+---------
+Valid Range:   0-xxxxxxx (0=off)
+Default Value: 256
+Usage: insmod e1000.ko copybreak=128
+
+Driver copies all packets below or equaling this size to a fresh Rx
+buffer before handing it up the stack.
+
+This parameter is different than other parameters, in that it is a
+single (not 1,1,1 etc.) parameter applied to all driver instances and
+it is also available during runtime at
+/sys/module/e1000/parameters/copybreak
+
+SmartPowerDownEnable
+--------------------
+Valid Range: 0-1
+Default Value:  0 (disabled)
+
+Allows PHY to turn off in lower power states. The user can turn off
+this parameter in supported chipsets.
+
+KumeranLockLoss
+---------------
+Valid Range: 0-1
+Default Value: 1 (enabled)
+
+This workaround skips resetting the PHY at shutdown for the initial
+silicon releases of ICH8 systems.
 
 Speed and Duplex Configuration
 ==============================
@@ -385,40 +390,9 @@ If the link partner is forced to a specific speed and duplex, then this
 parameter should not be used.  Instead, use the Speed and Duplex parameters
 previously mentioned to force the adapter to the same speed and duplex.
 
-
 Additional Configurations
 =========================
 
-  Configuring the Driver on Different Distributions
-  -------------------------------------------------
-  Configuring a network driver to load properly when the system is started
-  is distribution dependent.  Typically, the configuration process involves
-  adding an alias line to /etc/modules.conf or /etc/modprobe.conf as well
-  as editing other system startup scripts and/or configuration files.  Many
-  popular Linux distributions ship with tools to make these changes for you.
-  To learn the proper way to configure a network device for your system,
-  refer to your distribution documentation.  If during this process you are
-  asked for the driver or module name, the name for the Linux Base Driver
-  for the Intel(R) PRO/1000 Family of Adapters is e1000.
-
-  As an example, if you install the e1000 driver for two PRO/1000 adapters
-  (eth0 and eth1) and set the speed and duplex to 10full and 100half, add
-  the following to modules.conf or or modprobe.conf:
-
-       alias eth0 e1000
-       alias eth1 e1000
-       options e1000 Speed=10,100 Duplex=2,1
-
-  Viewing Link Messages
-  ---------------------
-  Link messages will not be displayed to the console if the distribution is
-  restricting system messages.  In order to see network driver link messages
-  on your console, set dmesg to eight by entering the following:
-
-       dmesg -n 8
-
-  NOTE: This setting is not saved across reboots.
-
   Jumbo Frames
   ------------
   Jumbo Frames support is enabled by changing the MTU to a value larger than
@@ -437,9 +411,11 @@ Additional Configurations
    setting in a different location.
 
   Notes:
-
-  - To enable Jumbo Frames, increase the MTU size on the interface beyond
-    1500.
+  Degradation in throughput performance may be observed in some Jumbo frames
+  environments. If this is observed, increasing the application's socket buffer
+  size and/or increasing the /proc/sys/net/ipv4/tcp_*mem entry values may help.
+  See the specific application manual and /usr/src/linux*/Documentation/
+  networking/ip-sysctl.txt for more details.
 
   - The maximum MTU setting for Jumbo Frames is 16110.  This value coincides
     with the maximum Jumbo Frames size of 16128.
@@ -447,40 +423,11 @@ Additional Configurations
   - Using Jumbo Frames at 10 or 100 Mbps may result in poor performance or
     loss of link.
 
-  - Some Intel gigabit adapters that support Jumbo Frames have a frame size
-    limit of 9238 bytes, with a corresponding MTU size limit of 9216 bytes.
-    The adapters with this limitation are based on the Intel(R) 82571EB,
-    82572EI, 82573L and 80003ES2LAN controller.  These correspond to the
-    following product names:
-     Intel(R) PRO/1000 PT Server Adapter
-     Intel(R) PRO/1000 PT Desktop Adapter
-     Intel(R) PRO/1000 PT Network Connection
-     Intel(R) PRO/1000 PT Dual Port Server Adapter
-     Intel(R) PRO/1000 PT Dual Port Network Connection
-     Intel(R) PRO/1000 PF Server Adapter
-     Intel(R) PRO/1000 PF Network Connection
-     Intel(R) PRO/1000 PF Dual Port Server Adapter
-     Intel(R) PRO/1000 PB Server Connection
-     Intel(R) PRO/1000 PL Network Connection
-     Intel(R) PRO/1000 EB Network Connection with I/O Acceleration
-     Intel(R) PRO/1000 EB Backplane Connection with I/O Acceleration
-     Intel(R) PRO/1000 PT Quad Port Server Adapter
-
   - Adapters based on the Intel(R) 82542 and 82573V/E controller do not
     support Jumbo Frames. These correspond to the following product names:
      Intel(R) PRO/1000 Gigabit Server Adapter
      Intel(R) PRO/1000 PM Network Connection
 
-  - The following adapters do not support Jumbo Frames:
-     Intel(R) 82562V 10/100 Network Connection
-     Intel(R) 82566DM Gigabit Network Connection
-     Intel(R) 82566DC Gigabit Network Connection
-     Intel(R) 82566MM Gigabit Network Connection
-     Intel(R) 82566MC Gigabit Network Connection
-     Intel(R) 82562GT 10/100 Network Connection
-     Intel(R) 82562G 10/100 Network Connection
-
-
   Ethtool
   -------
   The driver utilizes the ethtool interface for driver configuration and
@@ -490,142 +437,14 @@ Additional Configurations
   The latest release of ethtool can be found from
   http://sourceforge.net/projects/gkernel.
 
-  NOTE: Ethtool 1.6 only supports a limited set of ethtool options.  Support
-  for a more complete ethtool feature set can be enabled by upgrading
-  ethtool to ethtool-1.8.1.
-
   Enabling Wake on LAN* (WoL)
   ---------------------------
-  WoL is configured through the Ethtool* utility.  Ethtool is included with
-  all versions of Red Hat after Red Hat 7.2.  For other Linux distributions,
-  download and install Ethtool from the following website:
-  http://sourceforge.net/projects/gkernel.
-
-  For instructions on enabling WoL with Ethtool, refer to the website listed
-  above.
+  WoL is configured through the Ethtool* utility.
 
   WoL will be enabled on the system during the next shut down or reboot.
   For this driver version, in order to enable WoL, the e1000 driver must be
   loaded when shutting down or rebooting the system.
 
-  Wake On LAN is only supported on port A for the following devices:
-  Intel(R) PRO/1000 PT Dual Port Network Connection
-  Intel(R) PRO/1000 PT Dual Port Server Connection
-  Intel(R) PRO/1000 PT Dual Port Server Adapter
-  Intel(R) PRO/1000 PF Dual Port Server Adapter
-  Intel(R) PRO/1000 PT Quad Port Server Adapter
-
-  NAPI
-  ----
-  NAPI (Rx polling mode) is enabled in the e1000 driver.
-
-  See www.cyberus.ca/~hadi/usenix-paper.tgz for more information on NAPI.
-
-
-Known Issues
-============
-
-Dropped Receive Packets on Half-duplex 10/100 Networks
-------------------------------------------------------
-If you have an Intel PCI Express adapter running at 10mbps or 100mbps, half-
-duplex, you may observe occasional dropped receive packets.  There are no
-workarounds for this problem in this network configuration.  The network must
-be updated to operate in full-duplex, and/or 1000mbps only.
-
-Jumbo Frames System Requirement
--------------------------------
-Memory allocation failures have been observed on Linux systems with 64 MB
-of RAM or less that are running Jumbo Frames.  If you are using Jumbo
-Frames, your system may require more than the advertised minimum
-requirement of 64 MB of system memory.
-
-Performance Degradation with Jumbo Frames
------------------------------------------
-Degradation in throughput performance may be observed in some Jumbo frames
-environments.  If this is observed, increasing the application's socket
-buffer size and/or increasing the /proc/sys/net/ipv4/tcp_*mem entry values
-may help.  See the specific application manual and
-/usr/src/linux*/Documentation/
-networking/ip-sysctl.txt for more details.
-
-Jumbo Frames on Foundry BigIron 8000 switch
--------------------------------------------
-There is a known issue using Jumbo frames when connected to a Foundry
-BigIron 8000 switch.  This is a 3rd party limitation.  If you experience
-loss of packets, lower the MTU size.
-
-Allocating Rx Buffers when Using Jumbo Frames 
----------------------------------------------
-Allocating Rx buffers when using Jumbo Frames on 2.6.x kernels may fail if 
-the available memory is heavily fragmented. This issue may be seen with PCI-X 
-adapters or with packet split disabled. This can be reduced or eliminated 
-by changing the amount of available memory for receive buffer allocation, by
-increasing /proc/sys/vm/min_free_kbytes. 
-
-Multiple Interfaces on Same Ethernet Broadcast Network
-------------------------------------------------------
-Due to the default ARP behavior on Linux, it is not possible to have
-one system on two IP networks in the same Ethernet broadcast domain
-(non-partitioned switch) behave as expected.  All Ethernet interfaces
-will respond to IP traffic for any IP address assigned to the system.
-This results in unbalanced receive traffic.
-
-If you have multiple interfaces in a server, either turn on ARP
-filtering by entering:
-
-    echo 1 > /proc/sys/net/ipv4/conf/all/arp_filter
-(this only works if your kernel's version is higher than 2.4.5),
-
-NOTE: This setting is not saved across reboots.  The configuration
-change can be made permanent by adding the line:
-    net.ipv4.conf.all.arp_filter = 1
-to the file /etc/sysctl.conf
-
-      or,
-
-install the interfaces in separate broadcast domains (either in
-different switches or in a switch partitioned to VLANs).
-
-82541/82547 can't link or are slow to link with some link partners
------------------------------------------------------------------
-There is a known compatibility issue with 82541/82547 and some
-low-end switches where the link will not be established, or will
-be slow to establish.  In particular, these switches are known to
-be incompatible with 82541/82547:
-
-    Planex FXG-08TE
-    I-O Data ETG-SH8
-
-To workaround this issue, the driver can be compiled with an override
-of the PHY's master/slave setting.  Forcing master or forcing slave
-mode will improve time-to-link.
-
-    # make CFLAGS_EXTRA=-DE1000_MASTER_SLAVE=<n>
-
-Where <n> is:
-
-    0 = Hardware default
-    1 = Master mode
-    2 = Slave mode
-    3 = Auto master/slave
-
-Disable rx flow control with ethtool
-------------------------------------
-In order to disable receive flow control using ethtool, you must turn
-off auto-negotiation on the same command line.
-
-For example:
-
-   ethtool -A eth? autoneg off rx off
-
-Unplugging network cable while ethtool -p is running
-----------------------------------------------------
-In kernel versions 2.5.50 and later (including 2.6 kernel), unplugging
-the network cable while ethtool -p is running will cause the system to
-become unresponsive to keyboard commands, except for control-alt-delete.
-Restarting the system appears to be the only remedy.
-
-
 Support
 =======
 
diff --git a/Documentation/networking/e1000e.txt b/Documentation/networking/e1000e.txt
new file mode 100644 (file)
index 0000000..6aa048b
--- /dev/null
@@ -0,0 +1,302 @@
+Linux* Driver for Intel(R) Network Connection
+===============================================================
+
+Intel Gigabit Linux driver.
+Copyright(c) 1999 - 2010 Intel Corporation.
+
+Contents
+========
+
+- Identifying Your Adapter
+- Command Line Parameters
+- Additional Configurations
+- Support
+
+Identifying Your Adapter
+========================
+
+The e1000e driver supports all PCI Express Intel(R) Gigabit Network
+Connections, except those that are 82575, 82576 and 82580-based*.
+
+* NOTE: The Intel(R) PRO/1000 P Dual Port Server Adapter is supported by
+  the e1000 driver, not the e1000e driver due to the 82546 part being used
+  behind a PCI Express bridge.
+
+For more information on how to identify your adapter, go to the Adapter &
+Driver ID Guide at:
+
+    http://support.intel.com/support/go/network/adapter/idguide.htm
+
+For the latest Intel network drivers for Linux, refer to the following
+website.  In the search field, enter your adapter name or type, or use the
+networking link on the left to search for your adapter:
+
+    http://support.intel.com/support/go/network/adapter/home.htm
+
+Command Line Parameters
+=======================
+
+The default value for each parameter is generally the recommended setting,
+unless otherwise noted.
+
+NOTES:  For more information about the InterruptThrottleRate,
+        RxIntDelay, TxIntDelay, RxAbsIntDelay, and TxAbsIntDelay
+        parameters, see the application note at:
+        http://www.intel.com/design/network/applnots/ap450.htm
+
+InterruptThrottleRate
+---------------------
+Valid Range:   0,1,3,4,100-100000 (0=off, 1=dynamic, 3=dynamic conservative,
+                                   4=simplified balancing)
+Default Value: 3
+
+The driver can limit the amount of interrupts per second that the adapter
+will generate for incoming packets. It does this by writing a value to the
+adapter that is based on the maximum amount of interrupts that the adapter
+will generate per second.
+
+Setting InterruptThrottleRate to a value greater or equal to 100
+will program the adapter to send out a maximum of that many interrupts
+per second, even if more packets have come in. This reduces interrupt
+load on the system and can lower CPU utilization under heavy load,
+but will increase latency as packets are not processed as quickly.
+
+The driver has two adaptive modes (setting 1 or 3) in which
+it dynamically adjusts the InterruptThrottleRate value based on the traffic
+that it receives. After determining the type of incoming traffic in the last
+timeframe, it will adjust the InterruptThrottleRate to an appropriate value
+for that traffic.
+
+The algorithm classifies the incoming traffic every interval into
+classes.  Once the class is determined, the InterruptThrottleRate value is
+adjusted to suit that traffic type the best. There are three classes defined:
+"Bulk traffic", for large amounts of packets of normal size; "Low latency",
+for small amounts of traffic and/or a significant percentage of small
+packets; and "Lowest latency", for almost completely small packets or
+minimal traffic.
+
+In dynamic conservative mode, the InterruptThrottleRate value is set to 4000
+for traffic that falls in class "Bulk traffic". If traffic falls in the "Low
+latency" or "Lowest latency" class, the InterruptThrottleRate is increased
+stepwise to 20000. This default mode is suitable for most applications.
+
+For situations where low latency is vital such as cluster or
+grid computing, the algorithm can reduce latency even more when
+InterruptThrottleRate is set to mode 1. In this mode, which operates
+the same as mode 3, the InterruptThrottleRate will be increased stepwise to
+70000 for traffic in class "Lowest latency".
+
+In simplified mode the interrupt rate is based on the ratio of Tx and
+Rx traffic.  If the bytes per second rate is approximately equal the
+interrupt rate will drop as low as 2000 interrupts per second.  If the
+traffic is mostly transmit or mostly receive, the interrupt rate could
+be as high as 8000.
+
+Setting InterruptThrottleRate to 0 turns off any interrupt moderation
+and may improve small packet latency, but is generally not suitable
+for bulk throughput traffic.
+
+NOTE:  InterruptThrottleRate takes precedence over the TxAbsIntDelay and
+       RxAbsIntDelay parameters.  In other words, minimizing the receive
+       and/or transmit absolute delays does not force the controller to
+       generate more interrupts than what the Interrupt Throttle Rate
+       allows.
+
+NOTE:  When e1000e is loaded with default settings and multiple adapters
+       are in use simultaneously, the CPU utilization may increase non-
+       linearly.  In order to limit the CPU utilization without impacting
+       the overall throughput, we recommend that you load the driver as
+       follows:
+
+           modprobe e1000e InterruptThrottleRate=3000,3000,3000
+
+       This sets the InterruptThrottleRate to 3000 interrupts/sec for
+       the first, second, and third instances of the driver.  The range
+       of 2000 to 3000 interrupts per second works on a majority of
+       systems and is a good starting point, but the optimal value will
+       be platform-specific.  If CPU utilization is not a concern, use
+       RX_POLLING (NAPI) and default driver settings.
+
+RxIntDelay
+----------
+Valid Range:   0-65535 (0=off)
+Default Value: 0
+
+This value delays the generation of receive interrupts in units of 1.024
+microseconds.  Receive interrupt reduction can improve CPU efficiency if
+properly tuned for specific network traffic.  Increasing this value adds
+extra latency to frame reception and can end up decreasing the throughput
+of TCP traffic.  If the system is reporting dropped receives, this value
+may be set too high, causing the driver to run out of available receive
+descriptors.
+
+CAUTION:  When setting RxIntDelay to a value other than 0, adapters may
+          hang (stop transmitting) under certain network conditions.  If
+          this occurs a NETDEV WATCHDOG message is logged in the system
+          event log.  In addition, the controller is automatically reset,
+          restoring the network connection.  To eliminate the potential
+          for the hang ensure that RxIntDelay is set to 0.
+
+RxAbsIntDelay
+-------------
+Valid Range:   0-65535 (0=off)
+Default Value: 8
+
+This value, in units of 1.024 microseconds, limits the delay in which a
+receive interrupt is generated.  Useful only if RxIntDelay is non-zero,
+this value ensures that an interrupt is generated after the initial
+packet is received within the set amount of time.  Proper tuning,
+along with RxIntDelay, may improve traffic throughput in specific network
+conditions.
+
+TxIntDelay
+----------
+Valid Range:   0-65535 (0=off)
+Default Value: 8
+
+This value delays the generation of transmit interrupts in units of
+1.024 microseconds.  Transmit interrupt reduction can improve CPU
+efficiency if properly tuned for specific network traffic.  If the
+system is reporting dropped transmits, this value may be set too high
+causing the driver to run out of available transmit descriptors.
+
+TxAbsIntDelay
+-------------
+Valid Range:   0-65535 (0=off)
+Default Value: 32
+
+This value, in units of 1.024 microseconds, limits the delay in which a
+transmit interrupt is generated.  Useful only if TxIntDelay is non-zero,
+this value ensures that an interrupt is generated after the initial
+packet is sent on the wire within the set amount of time.  Proper tuning,
+along with TxIntDelay, may improve traffic throughput in specific
+network conditions.
+
+Copybreak
+---------
+Valid Range:   0-xxxxxxx (0=off)
+Default Value: 256
+
+Driver copies all packets below or equaling this size to a fresh Rx
+buffer before handing it up the stack.
+
+This parameter is different than other parameters, in that it is a
+single (not 1,1,1 etc.) parameter applied to all driver instances and
+it is also available during runtime at
+/sys/module/e1000e/parameters/copybreak
+
+SmartPowerDownEnable
+--------------------
+Valid Range: 0-1
+Default Value:  0 (disabled)
+
+Allows PHY to turn off in lower power states. The user can set this parameter
+in supported chipsets.
+
+KumeranLockLoss
+---------------
+Valid Range: 0-1
+Default Value: 1 (enabled)
+
+This workaround skips resetting the PHY at shutdown for the initial
+silicon releases of ICH8 systems.
+
+IntMode
+-------
+Valid Range: 0-2 (0=legacy, 1=MSI, 2=MSI-X)
+Default Value: 2
+
+Allows changing the interrupt mode at module load time, without requiring a
+recompile. If the driver load fails to enable a specific interrupt mode, the
+driver will try other interrupt modes, from least to most compatible.  The
+interrupt order is MSI-X, MSI, Legacy.  If specifying MSI (IntMode=1)
+interrupts, only MSI and Legacy will be attempted.
+
+CrcStripping
+------------
+Valid Range: 0-1
+Default Value: 1 (enabled)
+
+Strip the CRC from received packets before sending up the network stack.  If
+you have a machine with a BMC enabled but cannot receive IPMI traffic after
+loading or enabling the driver, try disabling this feature.
+
+WriteProtectNVM
+---------------
+Valid Range: 0-1
+Default Value: 1 (enabled)
+
+Set the hardware to ignore all write/erase cycles to the GbE region in the
+ICHx NVM (non-volatile memory).  This feature can be disabled by the
+WriteProtectNVM module parameter (enabled by default) only after a hardware
+reset, but the machine must be power cycled before trying to enable writes.
+
+Note: the kernel boot option iomem=relaxed may need to be set if the kernel
+config option CONFIG_STRICT_DEVMEM=y, if the root user wants to write the
+NVM from user space via ethtool.
+
+Additional Configurations
+=========================
+
+  Jumbo Frames
+  ------------
+  Jumbo Frames support is enabled by changing the MTU to a value larger than
+  the default of 1500.  Use the ifconfig command to increase the MTU size.
+  For example:
+
+       ifconfig eth<x> mtu 9000 up
+
+  This setting is not saved across reboots.
+
+  Notes:
+
+  - The maximum MTU setting for Jumbo Frames is 9216.  This value coincides
+    with the maximum Jumbo Frames size of 9234 bytes.
+
+  - Using Jumbo Frames at 10 or 100 Mbps is not supported and may result in
+    poor performance or loss of link.
+
+  - Some adapters limit Jumbo Frames sized packets to a maximum of
+    4096 bytes and some adapters do not support Jumbo Frames.
+
+
+  Ethtool
+  -------
+  The driver utilizes the ethtool interface for driver configuration and
+  diagnostics, as well as displaying statistical information.  We
+  strongly recommend downloading the latest version of Ethtool at:
+
+  http://sourceforge.net/projects/gkernel.
+
+  Speed and Duplex
+  ----------------
+  Speed and Duplex are configured through the Ethtool* utility. For
+  instructions,  refer to the Ethtool man page.
+
+  Enabling Wake on LAN* (WoL)
+  ---------------------------
+  WoL is configured through the Ethtool* utility. For instructions on
+  enabling WoL with Ethtool, refer to the Ethtool man page.
+
+  WoL will be enabled on the system during the next shut down or reboot.
+  For this driver version, in order to enable WoL, the e1000e driver must be
+  loaded when shutting down or rebooting the system.
+
+  In most cases Wake On LAN is only supported on port A for multiple port
+  adapters. To verify if a port supports Wake on LAN run ethtool eth<X>.
+
+
+Support
+=======
+
+For general information, go to the Intel support website at:
+
+    www.intel.com/support/
+
+or the Intel Wired Networking project hosted by Sourceforge at:
+
+    http://sourceforge.net/projects/e1000
+
+If an issue is identified with the released source code on the supported
+kernel with a supported adapter, email the specific information related
+to the issue to e1000-devel@lists.sf.net
old mode 100755 (executable)
new mode 100644 (file)
index 19015de..21dd5d1
@@ -1,19 +1,16 @@
 Linux* Base Driver for Intel(R) Network Connection
 ==================================================
 
-November 24, 2009
+Intel Gigabit Linux driver.
+Copyright(c) 1999 - 2010 Intel Corporation.
 
 Contents
 ========
 
-- In This Release
 - Identifying Your Adapter
 - Known Issues/Troubleshooting
 - Support
 
-In This Release
-===============
-
 This file describes the ixgbevf Linux* Base Driver for Intel Network
 Connection.
 
@@ -33,7 +30,7 @@ Identifying Your Adapter
 For more information on how to identify your adapter, go to the Adapter &
 Driver ID Guide at:
 
-    http://support.intel.com/support/network/sb/CS-008441.htm
+    http://support.intel.com/support/go/network/adapter/idguide.htm
 
 Known Issues/Troubleshooting
 ============================
@@ -57,34 +54,3 @@ or the Intel Wired Networking project hosted by Sourceforge at:
 If an issue is identified with the released source code on the supported
 kernel with a supported adapter, email the specific information related
 to the issue to e1000-devel@lists.sf.net
-
-License
-=======
-
-Intel 10 Gigabit Linux driver.
-Copyright(c) 1999 - 2009 Intel Corporation.
-
-This program is free software; you can redistribute it and/or modify it
-under the terms and conditions of the GNU General Public License,
-version 2, as published by the Free Software Foundation.
-
-This program is distributed in the hope it will be useful, but WITHOUT
-ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-The full GNU General Public License is included in this distribution in
-the file called "COPYING".
-
-Trademarks
-==========
-
-Intel, Itanium, and Pentium are trademarks or registered trademarks of
-Intel Corporation or its subsidiaries in the United States and other
-countries.
-
-* Other names and brands may be claimed as the property of others.
index ccd951f..cc96ee2 100644 (file)
@@ -478,7 +478,7 @@ static void prepare_hwpoison_fd(void)
        }
 
        if (opt_unpoison && !hwpoison_forget_fd) {
-               sprintf(buf, "%s/renew-pfn", hwpoison_debug_fs);
+               sprintf(buf, "%s/unpoison-pfn", hwpoison_debug_fs);
                hwpoison_forget_fd = checked_open(buf, O_WRONLY);
        }
 }
index f46d8e6..3d4179f 100644 (file)
@@ -969,6 +969,16 @@ L: linux-samsung-soc@vger.kernel.org (moderated for non-subscribers)
 S:     Maintained
 F:     arch/arm/mach-s5p*/
 
+ARM/SAMSUNG S5P SERIES FIMC SUPPORT
+M:     Kyungmin Park <kyungmin.park@samsung.com>
+M:     Sylwester Nawrocki <s.nawrocki@samsung.com>
+L:     linux-arm-kernel@lists.infradead.org
+L:     linux-media@vger.kernel.org
+S:     Maintained
+F:     arch/arm/plat-s5p/dev-fimc*
+F:     arch/arm/plat-samsung/include/plat/*fimc*
+F:     drivers/media/video/s5p-fimc/
+
 ARM/SHMOBILE ARM ARCHITECTURE
 M:     Paul Mundt <lethal@linux-sh.org>
 M:     Magnus Damm <magnus.damm@gmail.com>
@@ -1517,6 +1527,8 @@ T:        git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
 S:     Supported
 F:     Documentation/filesystems/ceph.txt
 F:     fs/ceph
+F:     net/ceph
+F:     include/linux/ceph
 
 CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
 M:     David Vrabel <david.vrabel@csr.com>
@@ -2535,7 +2547,7 @@ S:        Supported
 F:     drivers/scsi/gdt*
 
 GENERIC GPIO I2C DRIVER
-M:     Haavard Skinnemoen <hskinnemoen@atmel.com>
+M:     Haavard Skinnemoen <hskinnemoen@gmail.com>
 S:     Supported
 F:     drivers/i2c/busses/i2c-gpio.c
 F:     include/linux/i2c-gpio.h
@@ -3063,16 +3075,27 @@ L:      netdev@vger.kernel.org
 S:     Maintained
 F:     drivers/net/ixp2000/
 
-INTEL ETHERNET DRIVERS (e100/e1000/e1000e/igb/igbvf/ixgb/ixgbe)
+INTEL ETHERNET DRIVERS (e100/e1000/e1000e/igb/igbvf/ixgb/ixgbe/ixgbevf)
 M:     Jeff Kirsher <jeffrey.t.kirsher@intel.com>
 M:     Jesse Brandeburg <jesse.brandeburg@intel.com>
 M:     Bruce Allan <bruce.w.allan@intel.com>
-M:     Alex Duyck <alexander.h.duyck@intel.com>
+M:     Carolyn Wyborny <carolyn.wyborny@intel.com>
+M:     Don Skidmore <donald.c.skidmore@intel.com>
+M:     Greg Rose <gregory.v.rose@intel.com>
 M:     PJ Waskiewicz <peter.p.waskiewicz.jr@intel.com>
+M:     Alex Duyck <alexander.h.duyck@intel.com>
 M:     John Ronciak <john.ronciak@intel.com>
 L:     e1000-devel@lists.sourceforge.net
 W:     http://e1000.sourceforge.net/
 S:     Supported
+F:     Documentation/networking/e100.txt
+F:     Documentation/networking/e1000.txt
+F:     Documentation/networking/e1000e.txt
+F:     Documentation/networking/igb.txt
+F:     Documentation/networking/igbvf.txt
+F:     Documentation/networking/ixgb.txt
+F:     Documentation/networking/ixgbe.txt
+F:     Documentation/networking/ixgbevf.txt
 F:     drivers/net/e100.c
 F:     drivers/net/e1000/
 F:     drivers/net/e1000e/
@@ -3080,6 +3103,7 @@ F:        drivers/net/igb/
 F:     drivers/net/igbvf/
 F:     drivers/net/ixgb/
 F:     drivers/net/ixgbe/
+F:     drivers/net/ixgbevf/
 
 INTEL PRO/WIRELESS 2100 NETWORK CONNECTION SUPPORT
 L:     linux-wireless@vger.kernel.org
@@ -3140,7 +3164,7 @@ F:        drivers/net/ioc3-eth.c
 
 IOC3 SERIAL DRIVER
 M:     Pat Gefre <pfg@sgi.com>
-L:     linux-mips@linux-mips.org
+L:     linux-serial@vger.kernel.org
 S:     Maintained
 F:     drivers/serial/ioc3_serial.c
 
@@ -4783,6 +4807,15 @@ F:       fs/qnx4/
 F:     include/linux/qnx4_fs.h
 F:     include/linux/qnxtypes.h
 
+RADOS BLOCK DEVICE (RBD)
+F:     include/linux/qnxtypes.h
+M:     Yehuda Sadeh <yehuda@hq.newdream.net>
+M:     Sage Weil <sage@newdream.net>
+M:     ceph-devel@vger.kernel.org
+S:     Supported
+F:     drivers/block/rbd.c
+F:     drivers/block/rbd_types.h
+
 RADEON FRAMEBUFFER DISPLAY DRIVER
 M:     Benjamin Herrenschmidt <benh@kernel.crashing.org>
 L:     linux-fbdev@vger.kernel.org
@@ -5008,6 +5041,12 @@ F:       drivers/media/common/saa7146*
 F:     drivers/media/video/*7146*
 F:     include/media/*7146*
 
+SAMSUNG AUDIO (ASoC) DRIVERS
+M:     Jassi Brar <jassi.brar@samsung.com>
+L:     alsa-devel@alsa-project.org (moderated for non-subscribers)
+S:     Supported
+F:     sound/soc/s3c24xx
+
 TLG2300 VIDEO4LINUX-2 DRIVER
 M:     Huang Shijie <shijie8@gmail.com>
 M:     Kang Yong <kangyong@telegent.com>
@@ -6450,8 +6489,10 @@ F:       include/linux/wm97xx.h
 WOLFSON MICROELECTRONICS DRIVERS
 M:     Mark Brown <broonie@opensource.wolfsonmicro.com>
 M:     Ian Lartey <ian@opensource.wolfsonmicro.com>
+M:     Dimitris Papastamos <dp@opensource.wolfsonmicro.com>
+T:     git git://opensource.wolfsonmicro.com/linux-2.6-asoc
 T:     git git://opensource.wolfsonmicro.com/linux-2.6-audioplus
-W:     http://opensource.wolfsonmicro.com/node/8
+W:     http://opensource.wolfsonmicro.com/content/linux-drivers-wolfson-devices
 S:     Supported
 F:     Documentation/hwmon/wm83??
 F:     drivers/leds/leds-wm83*.c
index c2362c2..d3c1071 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 36
-EXTRAVERSION = -rc7
-NAME = Sheep on Meth
+EXTRAVERSION =
+NAME = Flesh-Eating Bats with Fangs
 
 # *DOCUMENTATION*
 # To see a list of typical targets execute "make help"
index 7c0dfcc..9103904 100644 (file)
@@ -1102,6 +1102,20 @@ config ARM_ERRATA_720789
          invalidated are not, resulting in an incoherency in the system page
          tables. The workaround changes the TLB flushing routines to invalidate
          entries regardless of the ASID.
+
+config ARM_ERRATA_743622
+       bool "ARM errata: Faulty hazard checking in the Store Buffer may lead to data corruption"
+       depends on CPU_V7
+       help
+         This option enables the workaround for the 743622 Cortex-A9
+         (r2p0..r2p2) erratum. Under very rare conditions, a faulty
+         optimisation in the Cortex-A9 Store Buffer may lead to data
+         corruption. This workaround sets a specific bit in the diagnostic
+         register of the Cortex-A9 which disables the Store Buffer
+         optimisation, preventing the defect from occurring. This has no
+         visible impact on the overall performance or power consumption of the
+         processor.
+
 endmenu
 
 source "arch/arm/common/Kconfig"
index 8bccbfa..2c1f005 100644 (file)
@@ -1162,11 +1162,12 @@ space_cccc_001x(kprobe_opcode_t insn, struct arch_specific_insn *asi)
 {
        /*
         * MSR   : cccc 0011 0x10 xxxx xxxx xxxx xxxx xxxx
-        * Undef : cccc 0011 0x00 xxxx xxxx xxxx xxxx xxxx
+        * Undef : cccc 0011 0100 xxxx xxxx xxxx xxxx xxxx
         * ALU op with S bit and Rd == 15 :
         *         cccc 001x xxx1 xxxx 1111 xxxx xxxx xxxx
         */
-       if ((insn & 0x0f900000) == 0x03200000 ||        /* MSR & Undef */
+       if ((insn & 0x0fb00000) == 0x03200000 ||        /* MSR */
+           (insn & 0x0ff00000) == 0x03400000 ||        /* Undef */
            (insn & 0x0e10f000) == 0x0210f000)          /* ALU s-bit, R15  */
                return INSN_REJECTED;
 
@@ -1177,7 +1178,7 @@ space_cccc_001x(kprobe_opcode_t insn, struct arch_specific_insn *asi)
         * *S (bit 20) updates condition codes
         * ADC/SBC/RSC reads the C flag
         */
-       insn &= 0xfff00fff;     /* Rn = r0, Rd = r0 */
+       insn &= 0xffff0fff;     /* Rd = r0 */
        asi->insn[0] = insn;
        asi->insn_handler = (insn & (1 << 20)) ?  /* S-bit */
                        emulate_alu_imm_rwflags : emulate_alu_imm_rflags;
index c80e090..ee8db15 100644 (file)
 
 static inline void arch_idle(void)
 {
-#ifndef CONFIG_DEBUG_KERNEL
        /*
         * Disable the processor clock.  The processor will be automatically
         * re-enabled by an interrupt or by a reset.
         */
        at91_sys_write(AT91_PMC_SCDR, AT91_PMC_PCK);
-#else
+#ifndef CONFIG_CPU_ARM920T
        /*
         * Set the processor (CP15) into 'Wait for Interrupt' mode.
-        * Unlike disabling the processor clock via the PMC (above)
-        *  this allows the processor to be woken via JTAG.
+        * Post-RM9200 processors need this in conjunction with the above
+        * to save power when idle.
         */
        cpu_do_idle();
 #endif
index 29c0a91..77eb35c 100644 (file)
@@ -691,7 +691,7 @@ int dma_init(void)
 
        memset(&gDMA, 0, sizeof(gDMA));
 
-       init_MUTEX_LOCKED(&gDMA.lock);
+       sema_init(&gDMA.lock, 0);
        init_waitqueue_head(&gDMA.freeChannelQ);
 
        /* Initialize the Hardware */
@@ -1574,7 +1574,7 @@ int dma_init_mem_map(DMA_MemMap_t *memMap)
 {
        memset(memMap, 0, sizeof(*memMap));
 
-       init_MUTEX(&memMap->lock);
+       sema_init(&memMap->lock, 1);
 
        return 0;
 }
index 8904ca4..a696d35 100644 (file)
@@ -276,7 +276,7 @@ static void channel_disable(struct m2p_channel *ch)
        v &= ~(M2P_CONTROL_STALL_IRQ_EN | M2P_CONTROL_NFB_IRQ_EN);
        m2p_set_control(ch, v);
 
-       while (m2p_channel_state(ch) == STATE_ON)
+       while (m2p_channel_state(ch) >= STATE_ON)
                cpu_relax();
 
        m2p_set_control(ch, 0x0);
index c5c0369..2f7e272 100644 (file)
@@ -122,6 +122,7 @@ config MACH_CPUIMX27
        select IMX_HAVE_PLATFORM_IMX_I2C
        select IMX_HAVE_PLATFORM_IMX_UART
        select IMX_HAVE_PLATFORM_MXC_NAND
+       select MXC_ULPI if USB_ULPI
        help
          Include support for Eukrea CPUIMX27 platform. This includes
          specific configurations for the module and its peripherals.
index 339150a..6830afd 100644 (file)
@@ -259,7 +259,7 @@ static void __init eukrea_cpuimx27_init(void)
        i2c_register_board_info(0, eukrea_cpuimx27_i2c_devices,
                                ARRAY_SIZE(eukrea_cpuimx27_i2c_devices));
 
-       imx27_add_i2c_imx1(&cpuimx27_i2c1_data);
+       imx27_add_i2c_imx0(&cpuimx27_i2c1_data);
 
        platform_add_devices(platform_devices, ARRAY_SIZE(platform_devices));
 
index 526f33a..ec592e8 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/sysdev.h>
 #include <linux/serial_core.h>
 #include <linux/platform_device.h>
+#include <linux/sched.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
index a48fb55..70ac681 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/sysdev.h>
 #include <linux/serial_core.h>
 #include <linux/platform_device.h>
+#include <linux/sched.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
index 251c92a..cd1afbc 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/sysdev.h>
 #include <linux/serial_core.h>
 #include <linux/platform_device.h>
+#include <linux/sched.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
index cfecd70..d562670 100644 (file)
@@ -173,11 +173,6 @@ static int s5pv210_clk_ip3_ctrl(struct clk *clk, int enable)
        return s5p_gatectrl(S5P_CLKGATE_IP3, clk, enable);
 }
 
-static int s5pv210_clk_ip4_ctrl(struct clk *clk, int enable)
-{
-       return s5p_gatectrl(S5P_CLKGATE_IP4, clk, enable);
-}
-
 static int s5pv210_clk_mask0_ctrl(struct clk *clk, int enable)
 {
        return s5p_gatectrl(S5P_CLK_SRC_MASK0, clk, enable);
index 77f456c..245b82b 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/io.h>
 #include <linux/sysdev.h>
 #include <linux/platform_device.h>
+#include <linux/sched.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
index efb1270..71fb173 100644 (file)
@@ -68,7 +68,7 @@ static void __init ct_ca9x4_init_irq(void)
 }
 
 #if 0
-static void ct_ca9x4_timer_init(void)
+static void __init ct_ca9x4_timer_init(void)
 {
        writel(0, MMIO_P2V(CT_CA9X4_TIMER0) + TIMER_CTRL);
        writel(0, MMIO_P2V(CT_CA9X4_TIMER1) + TIMER_CTRL);
@@ -222,7 +222,7 @@ static struct platform_device pmu_device = {
        .resource       = pmu_resources,
 };
 
-static void ct_ca9x4_init(void)
+static void __init ct_ca9x4_init(void)
 {
        int i;
 
index 817f0ad..7eaa232 100644 (file)
@@ -48,7 +48,7 @@ void __init v2m_map_io(struct map_desc *tile, size_t num)
 }
 
 
-static void v2m_timer_init(void)
+static void __init v2m_timer_init(void)
 {
        writel(0, MMIO_P2V(V2M_TIMER0) + TIMER_CTRL);
        writel(0, MMIO_P2V(V2M_TIMER1) + TIMER_CTRL);
index ab50627..17e7b0b 100644 (file)
@@ -204,8 +204,12 @@ void __iomem * __arm_ioremap_pfn_caller(unsigned long pfn,
        /*
         * Don't allow RAM to be mapped - this causes problems with ARMv6+
         */
-       if (WARN_ON(pfn_valid(pfn)))
-               return NULL;
+       if (pfn_valid(pfn)) {
+               printk(KERN_WARNING "BUG: Your driver calls ioremap() on system memory.  This leads\n"
+                      KERN_WARNING "to architecturally unpredictable behaviour on ARMv6+, and ioremap()\n"
+                      KERN_WARNING "will fail in the next kernel release.  Please fix your driver.\n");
+               WARN_ON(1);
+       }
 
        type = get_mem_type(mtype);
        if (!type)
index 6a3a2d0..e8ed9dc 100644 (file)
@@ -248,7 +248,7 @@ static struct mem_type mem_types[] = {
        },
        [MT_MEMORY] = {
                .prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
-                               L_PTE_USER | L_PTE_EXEC,
+                               L_PTE_WRITE | L_PTE_EXEC,
                .prot_l1   = PMD_TYPE_TABLE,
                .prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
                .domain    = DOMAIN_KERNEL,
@@ -259,7 +259,7 @@ static struct mem_type mem_types[] = {
        },
        [MT_MEMORY_NONCACHED] = {
                .prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
-                               L_PTE_USER | L_PTE_EXEC | L_PTE_MT_BUFFERABLE,
+                               L_PTE_WRITE | L_PTE_EXEC | L_PTE_MT_BUFFERABLE,
                .prot_l1   = PMD_TYPE_TABLE,
                .prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
                .domain    = DOMAIN_KERNEL,
index 7563ff0..197f21b 100644 (file)
@@ -253,6 +253,14 @@ __v7_setup:
        orreq   r10, r10, #1 << 22              @ set bit #22
        mcreq   p15, 0, r10, c15, c0, 1         @ write diagnostic register
 #endif
+#ifdef CONFIG_ARM_ERRATA_743622
+       teq     r6, #0x20                       @ present in r2p0
+       teqne   r6, #0x21                       @ present in r2p1
+       teqne   r6, #0x22                       @ present in r2p2
+       mrceq   p15, 0, r10, c15, c0, 1         @ read diagnostic register
+       orreq   r10, r10, #1 << 6               @ set bit #6
+       mcreq   p15, 0, r10, c15, c0, 1         @ write diagnostic register
+#endif
 
 3:     mov     r10, #0
 #ifdef HARVARD_CACHE
@@ -365,7 +373,7 @@ __v7_ca9mp_proc_info:
        b       __v7_ca9mp_setup
        .long   cpu_arch_name
        .long   cpu_elf_name
-       .long   HWCAP_SWP|HWCAP_HALF|HWCAP_THUMB|HWCAP_FAST_MULT|HWCAP_EDSP
+       .long   HWCAP_SWP|HWCAP_HALF|HWCAP_THUMB|HWCAP_FAST_MULT|HWCAP_EDSP|HWCAP_TLS
        .long   cpu_v7_name
        .long   v7_processor_functions
        .long   v7wbi_tlb_fns
index a202a2c..6cd151b 100644 (file)
@@ -320,6 +320,7 @@ void flush_iotlb_page(struct iommu *obj, u32 da)
                if ((start <= da) && (da < start + bytes)) {
                        dev_dbg(obj->dev, "%s: %08x<=%08x(%x)\n",
                                __func__, start, da, bytes);
+                       iotlb_load_cr(obj, &cr);
                        iommu_write_reg(obj, 1, MMU_FLUSH_ENTRY);
                }
        }
index 04d9521..e8f2be2 100644 (file)
@@ -435,7 +435,6 @@ static int s3c_adc_suspend(struct platform_device *pdev, pm_message_t state)
 static int s3c_adc_resume(struct platform_device *pdev)
 {
        struct adc_device *adc = platform_get_drvdata(pdev);
-       unsigned long flags;
 
        clk_enable(adc->clk);
        enable_irq(adc->irq);
index 90a2051..e8d20b0 100644 (file)
@@ -48,6 +48,9 @@
 #include <plat/clock.h>
 #include <plat/cpu.h>
 
+#include <linux/serial_core.h>
+#include <plat/regs-serial.h> /* for s3c24xx_uart_devs */
+
 /* clock information */
 
 static LIST_HEAD(clocks);
@@ -65,6 +68,28 @@ static int clk_null_enable(struct clk *clk, int enable)
        return 0;
 }
 
+static int dev_is_s3c_uart(struct device *dev)
+{
+       struct platform_device **pdev = s3c24xx_uart_devs;
+       int i;
+       for (i = 0; i < ARRAY_SIZE(s3c24xx_uart_devs); i++, pdev++)
+               if (*pdev && dev == &(*pdev)->dev)
+                       return 1;
+       return 0;
+}
+
+/*
+ * Serial drivers call get_clock() very early, before platform bus
+ * has been set up, this requires a special check to let them get
+ * a proper clock
+ */
+
+static int dev_is_platform_device(struct device *dev)
+{
+       return dev->bus == &platform_bus_type ||
+              (dev->bus == NULL && dev_is_s3c_uart(dev));
+}
+
 /* Clock API calls */
 
 struct clk *clk_get(struct device *dev, const char *id)
@@ -73,7 +98,7 @@ struct clk *clk_get(struct device *dev, const char *id)
        struct clk *clk = ERR_PTR(-ENOENT);
        int idno;
 
-       if (dev == NULL || dev->bus != &platform_bus_type)
+       if (dev == NULL || !dev_is_platform_device(dev))
                idno = -1;
        else
                idno = to_platform_device(dev)->id;
index 2f85412..b8da7d0 100644 (file)
@@ -82,9 +82,9 @@ typedef elf_fpreg_t elf_fpregset_t;
  * These are used to set parameters in the core dumps.
  */
 #define ELF_CLASS      ELFCLASS32
-#if defined(__LITTLE_ENDIAN)
+#if defined(__LITTLE_ENDIAN__)
 #define ELF_DATA       ELFDATA2LSB
-#elif defined(__BIG_ENDIAN)
+#elif defined(__BIG_ENDIAN__)
 #define ELF_DATA       ELFDATA2MSB
 #else
 #error no endian defined
diff --git a/arch/m32r/kernel/.gitignore b/arch/m32r/kernel/.gitignore
new file mode 100644 (file)
index 0000000..c5f676c
--- /dev/null
@@ -0,0 +1 @@
+vmlinux.lds
index 7bbe386..a08697f 100644 (file)
@@ -28,6 +28,8 @@
 
 #define DEBUG_SIG 0
 
+#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
+
 asmlinkage int
 sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
                unsigned long r2, unsigned long r3, unsigned long r4,
@@ -254,7 +256,7 @@ give_sigsegv:
 static int prev_insn(struct pt_regs *regs)
 {
        u16 inst;
-       if (get_user(&inst, (u16 __user *)(regs->bpc - 2)))
+       if (get_user(inst, (u16 __user *)(regs->bpc - 2)))
                return -EFAULT;
        if ((inst & 0xfff0) == 0x10f0)  /* trap ? */
                regs->bpc -= 2;
index e322d65..7dd65cf 100644 (file)
@@ -7,6 +7,10 @@ subdir-ccflags-y := -Werror
 include arch/mips/Kbuild.platforms
 obj-y := $(platform-y)
 
+# make clean traverses $(obj-) without having included .config, so
+# everything ends up here
+obj- := $(platform-)
+
 # mips object files
 # The object files are linked as core-y files would be linked
 
index 5526faa..4c9f402 100644 (file)
@@ -881,11 +881,15 @@ config NO_IOPORT
 config GENERIC_ISA_DMA
        bool
        select ZONE_DMA if GENERIC_ISA_DMA_SUPPORT_BROKEN=n
+       select ISA_DMA_API
 
 config GENERIC_ISA_DMA_SUPPORT_BROKEN
        bool
        select GENERIC_ISA_DMA
 
+config ISA_DMA_API
+       bool
+
 config GENERIC_GPIO
        bool
 
index 5fd7f7a..5042d51 100644 (file)
@@ -105,4 +105,4 @@ OBJCOPYFLAGS_vmlinuz.srec := $(OBJCOPYFLAGS) -S -O srec
 vmlinuz.srec: vmlinuz
        $(call cmd,objcopy)
 
-clean-files := $(objtree)/vmlinuz.*
+clean-files := $(objtree)/vmlinuz $(objtree)/vmlinuz.{32,ecoff,bin,srec}
index 3adbcbd..cf55a6f 100644 (file)
@@ -1,7 +1,7 @@
 #
 # DECstation family
 #
-platform-$(CONFIG_MACH_DECSTATION)     = dec/
+platform-$(CONFIG_MACH_DECSTATION)     += dec/
 cflags-$(CONFIG_MACH_DECSTATION)       += \
                        -I$(srctree)/arch/mips/include/asm/mach-dec
 libs-$(CONFIG_MACH_DECSTATION)         += arch/mips/dec/prom/
index e482fe9..75edded 100644 (file)
@@ -56,6 +56,7 @@
  */
 
 #ifdef CONFIG_32BIT
+#include <linux/types.h>
 
 struct flock {
        short   l_type;
index 96e28f1..1ca64b4 100644 (file)
@@ -88,6 +88,7 @@ typedef struct siginfo {
 #ifdef __ARCH_SI_TRAPNO
                        int _trapno;    /* TRAP # which caused the signal */
 #endif
+                       short _addr_lsb;
                } _sigfault;
 
                /* SIGPOLL, SIGXFSZ (To do ...)  */
index 6a97230..ba91be9 100644 (file)
@@ -1,3 +1,3 @@
-core-$(CONFIG_MACH_JZ4740)     += arch/mips/jz4740/
+platform-$(CONFIG_MACH_JZ4740) += jz4740/
 cflags-$(CONFIG_MACH_JZ4740)   += -I$(srctree)/arch/mips/include/asm/mach-jz4740
 load-$(CONFIG_MACH_JZ4740)     += 0xffffffff80010000
index 0176ed0..32103cc 100644 (file)
@@ -40,7 +40,6 @@ int __compute_return_epc(struct pt_regs *regs)
                return -EFAULT;
        }
 
-       regs->regs[0] = 0;
        switch (insn.i_format.opcode) {
        /*
         * jr and jalr are in r_format format.
index 2340f11..9a526ba 100644 (file)
@@ -103,7 +103,7 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len,
        if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                goto out_unlock;
 
-       retval = security_task_setscheduler(p, 0, NULL);
+       retval = security_task_setscheduler(p)
        if (retval)
                goto out_unlock;
 
index c51b95f..c877733 100644 (file)
@@ -536,7 +536,7 @@ asmlinkage void do_syscall_trace(struct pt_regs *regs, int entryexit)
 {
        /* do the secure computing check first */
        if (!entryexit)
-               secure_computing(regs->regs[0]);
+               secure_computing(regs->regs[2]);
 
        if (unlikely(current->audit_context) && entryexit)
                audit_syscall_exit(AUDITSC_RESULT(regs->regs[2]),
@@ -565,7 +565,7 @@ asmlinkage void do_syscall_trace(struct pt_regs *regs, int entryexit)
 
 out:
        if (unlikely(current->audit_context) && !entryexit)
-               audit_syscall_entry(audit_arch(), regs->regs[0],
+               audit_syscall_entry(audit_arch(), regs->regs[2],
                                    regs->regs[4], regs->regs[5],
                                    regs->regs[6], regs->regs[7]);
 }
index 584415e..fbaabad 100644 (file)
@@ -63,9 +63,9 @@ stack_done:
        sw      t0, PT_R7(sp)           # set error flag
        beqz    t0, 1f
 
+       lw      t1, PT_R2(sp)           # syscall number
        negu    v0                      # error
-       sw      v0, PT_R0(sp)           # set flag for syscall
-                                       # restarting
+       sw      t1, PT_R0(sp)           # save it for syscall restarting
 1:     sw      v0, PT_R2(sp)           # result
 
 o32_syscall_exit:
@@ -104,9 +104,9 @@ syscall_trace_entry:
        sw      t0, PT_R7(sp)           # set error flag
        beqz    t0, 1f
 
+       lw      t1, PT_R2(sp)           # syscall number
        negu    v0                      # error
-       sw      v0, PT_R0(sp)           # set flag for syscall
-                                       # restarting
+       sw      t1, PT_R0(sp)           # save it for syscall restarting
 1:     sw      v0, PT_R2(sp)           # result
 
        j       syscall_exit
@@ -169,8 +169,7 @@ stackargs:
         * We probably should handle this case a bit more drastic.
         */
 bad_stack:
-       negu    v0                              # error
-       sw      v0, PT_R0(sp)
+       li      v0, EFAULT
        sw      v0, PT_R2(sp)
        li      t0, 1                           # set error flag
        sw      t0, PT_R7(sp)
index 5573f8e..3f41792 100644 (file)
@@ -66,9 +66,9 @@ NESTED(handle_sys64, PT_SIZE, sp)
        sd      t0, PT_R7(sp)           # set error flag
        beqz    t0, 1f
 
+       ld      t1, PT_R2(sp)           # syscall number
        dnegu   v0                      # error
-       sd      v0, PT_R0(sp)           # set flag for syscall
-                                       # restarting
+       sd      t1, PT_R0(sp)           # save it for syscall restarting
 1:     sd      v0, PT_R2(sp)           # result
 
 n64_syscall_exit:
@@ -109,8 +109,9 @@ syscall_trace_entry:
        sd      t0, PT_R7(sp)           # set error flag
        beqz    t0, 1f
 
+       ld      t1, PT_R2(sp)           # syscall number
        dnegu   v0                      # error
-       sd      v0, PT_R0(sp)           # set flag for syscall restarting
+       sd      t1, PT_R0(sp)           # save it for syscall restarting
 1:     sd      v0, PT_R2(sp)           # result
 
        j       syscall_exit
index 1e38ec9..f08ece6 100644 (file)
@@ -65,8 +65,9 @@ NESTED(handle_sysn32, PT_SIZE, sp)
        sd      t0, PT_R7(sp)           # set error flag
        beqz    t0, 1f
 
+       ld      t1, PT_R2(sp)           # syscall number
        dnegu   v0                      # error
-       sd      v0, PT_R0(sp)           # set flag for syscall restarting
+       sd      t1, PT_R0(sp)           # save it for syscall restarting
 1:     sd      v0, PT_R2(sp)           # result
 
        local_irq_disable               # make sure need_resched and
@@ -106,8 +107,9 @@ n32_syscall_trace_entry:
        sd      t0, PT_R7(sp)           # set error flag
        beqz    t0, 1f
 
+       ld      t1, PT_R2(sp)           # syscall number
        dnegu   v0                      # error
-       sd      v0, PT_R0(sp)           # set flag for syscall restarting
+       sd      t1, PT_R0(sp)           # save it for syscall restarting
 1:     sd      v0, PT_R2(sp)           # result
 
        j       syscall_exit
@@ -320,10 +322,10 @@ EXPORT(sysn32_call_table)
        PTR     sys_cacheflush
        PTR     sys_cachectl
        PTR     sys_sysmips
-       PTR     sys_io_setup                    /* 6200 */
+       PTR     compat_sys_io_setup                     /* 6200 */
        PTR     sys_io_destroy
-       PTR     sys_io_getevents
-       PTR     sys_io_submit
+       PTR     compat_sys_io_getevents
+       PTR     compat_sys_io_submit
        PTR     sys_io_cancel
        PTR     sys_exit_group                  /* 6205 */
        PTR     sys_lookup_dcookie
index 171979f..78d768a 100644 (file)
@@ -93,8 +93,9 @@ NESTED(handle_sys, PT_SIZE, sp)
        sd      t0, PT_R7(sp)           # set error flag
        beqz    t0, 1f
 
+       ld      t1, PT_R2(sp)           # syscall number
        dnegu   v0                      # error
-       sd      v0, PT_R0(sp)           # flag for syscall restarting
+       sd      t1, PT_R0(sp)           # save it for syscall restarting
 1:     sd      v0, PT_R2(sp)           # result
 
 o32_syscall_exit:
@@ -142,8 +143,9 @@ trace_a_syscall:
        sd      t0, PT_R7(sp)           # set error flag
        beqz    t0, 1f
 
+       ld      t1, PT_R2(sp)           # syscall number
        dnegu   v0                      # error
-       sd      v0, PT_R0(sp)           # set flag for syscall restarting
+       sd      t1, PT_R0(sp)           # save it for syscall restarting
 1:     sd      v0, PT_R2(sp)           # result
 
        j       syscall_exit
@@ -154,8 +156,7 @@ trace_a_syscall:
         * The stackpointer for a call with more than 4 arguments is bad.
         */
 bad_stack:
-       dnegu   v0                      # error
-       sd      v0, PT_R0(sp)
+       li      v0, EFAULT
        sd      v0, PT_R2(sp)
        li      t0, 1                   # set error flag
        sd      t0, PT_R7(sp)
@@ -444,10 +445,10 @@ sys_call_table:
        PTR     compat_sys_futex
        PTR     compat_sys_sched_setaffinity
        PTR     compat_sys_sched_getaffinity    /* 4240 */
-       PTR     sys_io_setup
+       PTR     compat_sys_io_setup
        PTR     sys_io_destroy
-       PTR     sys_io_getevents
-       PTR     sys_io_submit
+       PTR     compat_sys_io_getevents
+       PTR     compat_sys_io_submit
        PTR     sys_io_cancel                   /* 4245 */
        PTR     sys_exit_group
        PTR     sys32_lookup_dcookie
index 2099d5a..5922342 100644 (file)
@@ -390,7 +390,6 @@ asmlinkage void sys_rt_sigreturn(nabi_no_regargs struct pt_regs regs)
 {
        struct rt_sigframe __user *frame;
        sigset_t set;
-       stack_t st;
        int sig;
 
        frame = (struct rt_sigframe __user *) regs.regs[29];
@@ -411,11 +410,9 @@ asmlinkage void sys_rt_sigreturn(nabi_no_regargs struct pt_regs regs)
        else if (sig)
                force_sig(sig, current);
 
-       if (__copy_from_user(&st, &frame->rs_uc.uc_stack, sizeof(st)))
-               goto badframe;
        /* It is more difficult to avoid calling this function than to
           call it and ignore errors.  */
-       do_sigaltstack((stack_t __user *)&st, NULL, regs.regs[29]);
+       do_sigaltstack(&frame->rs_uc.uc_stack, NULL, regs.regs[29]);
 
        /*
         * Don't let your children do this ...
@@ -550,23 +547,26 @@ static int handle_signal(unsigned long sig, siginfo_t *info,
        struct mips_abi *abi = current->thread.abi;
        void *vdso = current->mm->context.vdso;
 
-       switch(regs->regs[0]) {
-       case ERESTART_RESTARTBLOCK:
-       case ERESTARTNOHAND:
-               regs->regs[2] = EINTR;
-               break;
-       case ERESTARTSYS:
-               if (!(ka->sa.sa_flags & SA_RESTART)) {
+       if (regs->regs[0]) {
+               switch(regs->regs[2]) {
+               case ERESTART_RESTARTBLOCK:
+               case ERESTARTNOHAND:
                        regs->regs[2] = EINTR;
                        break;
+               case ERESTARTSYS:
+                       if (!(ka->sa.sa_flags & SA_RESTART)) {
+                               regs->regs[2] = EINTR;
+                               break;
+                       }
+               /* fallthrough */
+               case ERESTARTNOINTR:
+                       regs->regs[7] = regs->regs[26];
+                       regs->regs[2] = regs->regs[0];
+                       regs->cp0_epc -= 4;
                }
-       /* fallthrough */
-       case ERESTARTNOINTR:            /* Userland will reload $v0.  */
-               regs->regs[7] = regs->regs[26];
-               regs->cp0_epc -= 8;
-       }
 
-       regs->regs[0] = 0;              /* Don't deal with this again.  */
+               regs->regs[0] = 0;              /* Don't deal with this again.  */
+       }
 
        if (sig_uses_siginfo(ka))
                ret = abi->setup_rt_frame(vdso + abi->rt_signal_return_offset,
@@ -575,6 +575,9 @@ static int handle_signal(unsigned long sig, siginfo_t *info,
                ret = abi->setup_frame(vdso + abi->signal_return_offset,
                                       ka, regs, sig, oldset);
 
+       if (ret)
+               return ret;
+
        spin_lock_irq(&current->sighand->siglock);
        sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
        if (!(ka->sa.sa_flags & SA_NODEFER))
@@ -622,17 +625,13 @@ static void do_signal(struct pt_regs *regs)
                return;
        }
 
-       /*
-        * Who's code doesn't conform to the restartable syscall convention
-        * dies here!!!  The li instruction, a single machine instruction,
-        * must directly be followed by the syscall instruction.
-        */
        if (regs->regs[0]) {
                if (regs->regs[2] == ERESTARTNOHAND ||
                    regs->regs[2] == ERESTARTSYS ||
                    regs->regs[2] == ERESTARTNOINTR) {
+                       regs->regs[2] = regs->regs[0];
                        regs->regs[7] = regs->regs[26];
-                       regs->cp0_epc -= 8;
+                       regs->cp0_epc -= 4;
                }
                if (regs->regs[2] == ERESTART_RESTARTBLOCK) {
                        regs->regs[2] = current->thread.abi->restart;
index 2c5df81..ee24d81 100644 (file)
@@ -109,6 +109,7 @@ asmlinkage int sysn32_rt_sigsuspend(nabi_no_regargs struct pt_regs regs)
 asmlinkage void sysn32_rt_sigreturn(nabi_no_regargs struct pt_regs regs)
 {
        struct rt_sigframe_n32 __user *frame;
+       mm_segment_t old_fs;
        sigset_t set;
        stack_t st;
        s32 sp;
@@ -143,7 +144,11 @@ asmlinkage void sysn32_rt_sigreturn(nabi_no_regargs struct pt_regs regs)
 
        /* It is more difficult to avoid calling this function than to
           call it and ignore errors.  */
+       old_fs = get_fs();
+       set_fs(KERNEL_DS);
        do_sigaltstack((stack_t __user *)&st, NULL, regs.regs[29]);
+       set_fs(old_fs);
+
 
        /*
         * Don't let your children do this ...
index 69b039c..33d5a5c 100644 (file)
@@ -109,8 +109,6 @@ static void emulate_load_store_insn(struct pt_regs *regs,
        unsigned long value;
        unsigned int res;
 
-       regs->regs[0] = 0;
-
        /*
         * This load never faults.
         */
index 0c46e39..63c740a 100644 (file)
@@ -40,6 +40,11 @@ static char *mixer = HOSTAUDIO_DEV_MIXER;
 "    This is used to specify the host mixer device to the hostaudio driver.\n"\
 "    The default is \"" HOSTAUDIO_DEV_MIXER "\".\n\n"
 
+module_param(dsp, charp, 0644);
+MODULE_PARM_DESC(dsp, DSP_HELP);
+module_param(mixer, charp, 0644);
+MODULE_PARM_DESC(mixer, MIXER_HELP);
+
 #ifndef MODULE
 static int set_dsp(char *name, int *add)
 {
@@ -56,15 +61,6 @@ static int set_mixer(char *name, int *add)
 }
 
 __uml_setup("mixer=", set_mixer, "mixer=<mixer device>\n" MIXER_HELP);
-
-#else /*MODULE*/
-
-module_param(dsp, charp, 0644);
-MODULE_PARM_DESC(dsp, DSP_HELP);
-
-module_param(mixer, charp, 0644);
-MODULE_PARM_DESC(mixer, MIXER_HELP);
-
 #endif
 
 /* /dev/dsp file operations */
index 1bcd208..9734994 100644 (file)
@@ -163,6 +163,7 @@ struct ubd {
        struct scatterlist sg[MAX_SG];
        struct request *request;
        int start_sg, end_sg;
+       sector_t rq_pos;
 };
 
 #define DEFAULT_COW { \
@@ -187,6 +188,7 @@ struct ubd {
        .request =              NULL, \
        .start_sg =             0, \
        .end_sg =               0, \
+       .rq_pos =               0, \
 }
 
 /* Protected by ubd_lock */
@@ -1228,7 +1230,6 @@ static void do_ubd_request(struct request_queue *q)
 {
        struct io_thread_req *io_req;
        struct request *req;
-       sector_t sector;
        int n;
 
        while(1){
@@ -1239,12 +1240,12 @@ static void do_ubd_request(struct request_queue *q)
                                return;
 
                        dev->request = req;
+                       dev->rq_pos = blk_rq_pos(req);
                        dev->start_sg = 0;
                        dev->end_sg = blk_rq_map_sg(q, req, dev->sg);
                }
 
                req = dev->request;
-               sector = blk_rq_pos(req);
                while(dev->start_sg < dev->end_sg){
                        struct scatterlist *sg = &dev->sg[dev->start_sg];
 
@@ -1256,10 +1257,9 @@ static void do_ubd_request(struct request_queue *q)
                                return;
                        }
                        prepare_request(req, io_req,
-                                       (unsigned long long)sector << 9,
+                                       (unsigned long long)dev->rq_pos << 9,
                                        sg->offset, sg->length, sg_page(sg));
 
-                       sector += sg->length >> 9;
                        n = os_write_file(thread_fd, &io_req,
                                          sizeof(struct io_thread_req *));
                        if(n != sizeof(struct io_thread_req *)){
@@ -1272,6 +1272,7 @@ static void do_ubd_request(struct request_queue *q)
                                return;
                        }
 
+                       dev->rq_pos += sg->length >> 9;
                        dev->start_sg++;
                }
                dev->end_sg = 0;
index 0350311..2d93bdb 100644 (file)
@@ -34,7 +34,7 @@
 #include <asm/ia32.h>
 
 #undef WARN_OLD
-#undef CORE_DUMP /* probably broken */
+#undef CORE_DUMP /* definitely broken */
 
 static int load_aout_binary(struct linux_binprm *, struct pt_regs *regs);
 static int load_aout_library(struct file *);
@@ -131,21 +131,15 @@ static void set_brk(unsigned long start, unsigned long end)
  * macros to write out all the necessary info.
  */
 
-static int dump_write(struct file *file, const void *addr, int nr)
-{
-       return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
-}
+#include <linux/coredump.h>
 
 #define DUMP_WRITE(addr, nr)                        \
        if (!dump_write(file, (void *)(addr), (nr))) \
                goto end_coredump;
 
-#define DUMP_SEEK(offset)                                              \
-       if (file->f_op->llseek) {                                       \
-               if (file->f_op->llseek(file, (offset), 0) != (offset))  \
-                       goto end_coredump;                              \
-       } else                                                          \
-               file->f_pos = (offset)
+#define DUMP_SEEK(offset)              \
+       if (!dump_seek(file, offset))   \
+               goto end_coredump;
 
 #define START_DATA()   (u.u_tsize << PAGE_SHIFT)
 #define START_STACK(u) (u.start_stack)
@@ -217,12 +211,6 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file,
                dump_size = dump.u_ssize << PAGE_SHIFT;
                DUMP_WRITE(dump_start, dump_size);
        }
-       /*
-        * Finally dump the task struct.  Not be used by gdb, but
-        * could be useful
-        */
-       set_fs(KERNEL_DS);
-       DUMP_WRITE(current, sizeof(*current));
 end_coredump:
        set_fs(fs);
        return has_dumped;
index 5af2982..f16a2ca 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
  * Author: Joerg Roedel <joerg.roedel@amd.com>
  *         Leo Duran <leo.duran@amd.com>
  *
index cb03037..916bc81 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2009-2010 Advanced Micro Devices, Inc.
  * Author: Joerg Roedel <joerg.roedel@amd.com>
  *
  * This program is free software; you can redistribute it and/or modify it
index 0861618..e3509fc 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
  * Author: Joerg Roedel <joerg.roedel@amd.com>
  *         Leo Duran <leo.duran@amd.com>
  *
@@ -416,13 +416,22 @@ struct amd_iommu {
        struct dma_ops_domain *default_dom;
 
        /*
-        * This array is required to work around a potential BIOS bug.
-        * The BIOS may miss to restore parts of the PCI configuration
-        * space when the system resumes from S3. The result is that the
-        * IOMMU does not execute commands anymore which leads to system
-        * failure.
+        * We can't rely on the BIOS to restore all values on reinit, so we
+        * need to stash them
         */
-       u32 cache_cfg[4];
+
+       /* The iommu BAR */
+       u32 stored_addr_lo;
+       u32 stored_addr_hi;
+
+       /*
+        * Each iommu has 6 l1s, each of which is documented as having 0x12
+        * registers
+        */
+       u32 stored_l1[6][0x12];
+
+       /* The l2 indirect registers */
+       u32 stored_l2[0x83];
 };
 
 /*
index 4ac5b0f..bf357f9 100644 (file)
@@ -17,6 +17,7 @@ extern int fix_aperture;
 #define GARTEN         (1<<0)
 #define DISGARTCPU     (1<<4)
 #define DISGARTIO      (1<<5)
+#define DISTLBWALKPRB  (1<<6)
 
 /* GART cache control register bits. */
 #define INVGART                (1<<0)
@@ -27,7 +28,6 @@ extern int fix_aperture;
 #define AMD64_GARTAPERTUREBASE 0x94
 #define AMD64_GARTTABLEBASE    0x98
 #define AMD64_GARTCACHECTL     0x9c
-#define AMD64_GARTEN           (1<<0)
 
 #ifdef CONFIG_GART_IOMMU
 extern int gart_iommu_aperture;
@@ -57,6 +57,19 @@ static inline void gart_iommu_hole_init(void)
 
 extern int agp_amd64_init(void);
 
+static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order)
+{
+       u32 ctl;
+
+       /*
+        * Don't enable translation but enable GART IO and CPU accesses.
+        * Also, set DISTLBWALKPRB since GART tables memory is UC.
+        */
+       ctl = DISTLBWALKPRB | order << 1;
+
+       pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
+}
+
 static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
 {
        u32 tmp, ctl;
index 502e53f..c52e2eb 100644 (file)
@@ -652,20 +652,6 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
        return (struct kvm_mmu_page *)page_private(page);
 }
 
-static inline u16 kvm_read_fs(void)
-{
-       u16 seg;
-       asm("mov %%fs, %0" : "=g"(seg));
-       return seg;
-}
-
-static inline u16 kvm_read_gs(void)
-{
-       u16 seg;
-       asm("mov %%gs, %0" : "=g"(seg));
-       return seg;
-}
-
 static inline u16 kvm_read_ldt(void)
 {
        u16 ldt;
@@ -673,16 +659,6 @@ static inline u16 kvm_read_ldt(void)
        return ldt;
 }
 
-static inline void kvm_load_fs(u16 sel)
-{
-       asm("mov %0, %%fs" : : "rm"(sel));
-}
-
-static inline void kvm_load_gs(u16 sel)
-{
-       asm("mov %0, %%gs" : : "rm"(sel));
-}
-
 static inline void kvm_load_ldt(u16 sel)
 {
        asm("lldt %0" : : "rm"(sel));
index 679b645..d2fdb08 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
  * Author: Joerg Roedel <joerg.roedel@amd.com>
  *         Leo Duran <leo.duran@amd.com>
  *
index 5a170cb..3cb482e 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
  * Author: Joerg Roedel <joerg.roedel@amd.com>
  *         Leo Duran <leo.duran@amd.com>
  *
@@ -194,6 +194,39 @@ static inline unsigned long tbl_size(int entry_size)
        return 1UL << shift;
 }
 
+/* Access to l1 and l2 indexed register spaces */
+
+static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
+{
+       u32 val;
+
+       pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
+       pci_read_config_dword(iommu->dev, 0xfc, &val);
+       return val;
+}
+
+static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
+{
+       pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
+       pci_write_config_dword(iommu->dev, 0xfc, val);
+       pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
+}
+
+static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
+{
+       u32 val;
+
+       pci_write_config_dword(iommu->dev, 0xf0, address);
+       pci_read_config_dword(iommu->dev, 0xf4, &val);
+       return val;
+}
+
+static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
+{
+       pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
+       pci_write_config_dword(iommu->dev, 0xf4, val);
+}
+
 /****************************************************************************
  *
  * AMD IOMMU MMIO register space handling functions
@@ -619,6 +652,7 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
 {
        int cap_ptr = iommu->cap_ptr;
        u32 range, misc;
+       int i, j;
 
        pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
                              &iommu->cap);
@@ -633,12 +667,29 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
                                        MMIO_GET_LD(range));
        iommu->evt_msi_num = MMIO_MSI_NUM(misc);
 
-       if (is_rd890_iommu(iommu->dev)) {
-               pci_read_config_dword(iommu->dev, 0xf0, &iommu->cache_cfg[0]);
-               pci_read_config_dword(iommu->dev, 0xf4, &iommu->cache_cfg[1]);
-               pci_read_config_dword(iommu->dev, 0xf8, &iommu->cache_cfg[2]);
-               pci_read_config_dword(iommu->dev, 0xfc, &iommu->cache_cfg[3]);
-       }
+       if (!is_rd890_iommu(iommu->dev))
+               return;
+
+       /*
+        * Some rd890 systems may not be fully reconfigured by the BIOS, so
+        * it's necessary for us to store this information so it can be
+        * reprogrammed on resume
+        */
+
+       pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
+                             &iommu->stored_addr_lo);
+       pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
+                             &iommu->stored_addr_hi);
+
+       /* Low bit locks writes to configuration space */
+       iommu->stored_addr_lo &= ~1;
+
+       for (i = 0; i < 6; i++)
+               for (j = 0; j < 0x12; j++)
+                       iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
+
+       for (i = 0; i < 0x83; i++)
+               iommu->stored_l2[i] = iommu_read_l2(iommu, i);
 }
 
 /*
@@ -1127,14 +1178,53 @@ static void iommu_init_flags(struct amd_iommu *iommu)
        iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
 }
 
-static void iommu_apply_quirks(struct amd_iommu *iommu)
+static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
 {
-       if (is_rd890_iommu(iommu->dev)) {
-               pci_write_config_dword(iommu->dev, 0xf0, iommu->cache_cfg[0]);
-               pci_write_config_dword(iommu->dev, 0xf4, iommu->cache_cfg[1]);
-               pci_write_config_dword(iommu->dev, 0xf8, iommu->cache_cfg[2]);
-               pci_write_config_dword(iommu->dev, 0xfc, iommu->cache_cfg[3]);
-       }
+       int i, j;
+       u32 ioc_feature_control;
+       struct pci_dev *pdev = NULL;
+
+       /* RD890 BIOSes may not have completely reconfigured the iommu */
+       if (!is_rd890_iommu(iommu->dev))
+               return;
+
+       /*
+        * First, we need to ensure that the iommu is enabled. This is
+        * controlled by a register in the northbridge
+        */
+       pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
+
+       if (!pdev)
+               return;
+
+       /* Select Northbridge indirect register 0x75 and enable writing */
+       pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
+       pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
+
+       /* Enable the iommu */
+       if (!(ioc_feature_control & 0x1))
+               pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
+
+       pci_dev_put(pdev);
+
+       /* Restore the iommu BAR */
+       pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
+                              iommu->stored_addr_lo);
+       pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
+                              iommu->stored_addr_hi);
+
+       /* Restore the l1 indirect regs for each of the 6 l1s */
+       for (i = 0; i < 6; i++)
+               for (j = 0; j < 0x12; j++)
+                       iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
+
+       /* Restore the l2 indirect regs */
+       for (i = 0; i < 0x83; i++)
+               iommu_write_l2(iommu, i, iommu->stored_l2[i]);
+
+       /* Lock PCI setup registers */
+       pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
+                              iommu->stored_addr_lo | 1);
 }
 
 /*
@@ -1147,7 +1237,6 @@ static void enable_iommus(void)
 
        for_each_iommu(iommu) {
                iommu_disable(iommu);
-               iommu_apply_quirks(iommu);
                iommu_init_flags(iommu);
                iommu_set_device_table(iommu);
                iommu_enable_command_buffer(iommu);
@@ -1173,6 +1262,11 @@ static void disable_iommus(void)
 
 static int amd_iommu_resume(struct sys_device *dev)
 {
+       struct amd_iommu *iommu;
+
+       for_each_iommu(iommu)
+               iommu_apply_resume_quirks(iommu);
+
        /* re-load the hardware */
        enable_iommus();
 
index a2e0caf..c9cb173 100644 (file)
@@ -307,7 +307,7 @@ void __init early_gart_iommu_check(void)
                                continue;
 
                        ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
-                       aper_enabled = ctl & AMD64_GARTEN;
+                       aper_enabled = ctl & GARTEN;
                        aper_order = (ctl >> 1) & 7;
                        aper_size = (32 * 1024 * 1024) << aper_order;
                        aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
@@ -362,7 +362,7 @@ void __init early_gart_iommu_check(void)
                                continue;
 
                        ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
-                       ctl &= ~AMD64_GARTEN;
+                       ctl &= ~GARTEN;
                        write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
                }
        }
@@ -505,8 +505,13 @@ out:
 
        /* Fix up the north bridges */
        for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
-               int bus;
-               int dev_base, dev_limit;
+               int bus, dev_base, dev_limit;
+
+               /*
+                * Don't enable translation yet but enable GART IO and CPU
+                * accesses and set DISTLBWALKPRB since GART table memory is UC.
+                */
+               u32 ctl = DISTLBWALKPRB | aper_order << 1;
 
                bus = bus_dev_ranges[i].bus;
                dev_base = bus_dev_ranges[i].dev_base;
@@ -515,10 +520,7 @@ out:
                        if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
                                continue;
 
-                       /* Don't enable translation yet. That is done later.
-                          Assume this BIOS didn't initialise the GART so
-                          just overwrite all previous bits */
-                       write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1);
+                       write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
                        write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
                }
        }
index 5e97529..39aaee5 100644 (file)
@@ -141,6 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
                                address = (low & MASK_BLKPTR_LO) >> 21;
                                if (!address)
                                        break;
+
                                address += MCG_XBLK_ADDR;
                        } else
                                ++address;
@@ -148,12 +149,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
                        if (rdmsr_safe(address, &low, &high))
                                break;
 
-                       if (!(high & MASK_VALID_HI)) {
-                               if (block)
-                                       continue;
-                               else
-                                       break;
-                       }
+                       if (!(high & MASK_VALID_HI))
+                               continue;
 
                        if (!(high & MASK_CNTP_HI)  ||
                             (high & MASK_LOCKED_HI))
index d9368ee..169d880 100644 (file)
@@ -216,7 +216,7 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
                err = sysfs_add_file_to_group(&sys_dev->kobj,
                                              &attr_core_power_limit_count.attr,
                                              thermal_attr_group.name);
-       if (cpu_has(c, X86_FEATURE_PTS))
+       if (cpu_has(c, X86_FEATURE_PTS)) {
                err = sysfs_add_file_to_group(&sys_dev->kobj,
                                              &attr_package_throttle_count.attr,
                                              thermal_attr_group.name);
@@ -224,6 +224,7 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
                        err = sysfs_add_file_to_group(&sys_dev->kobj,
                                        &attr_package_power_limit_count.attr,
                                        thermal_attr_group.name);
+       }
 
        return err;
 }
index 0f7f130..6015ee1 100644 (file)
@@ -601,7 +601,7 @@ static void gart_fixup_northbridges(struct sys_device *dev)
                 * Don't enable translations just yet.  That is the next
                 * step.  Restore the pre-suspend aperture settings.
                 */
-               pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1);
+               gart_set_size_and_enable(dev, aperture_order);
                pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
        }
 }
index bc5b9b8..8a3f9f6 100644 (file)
@@ -766,7 +766,6 @@ static void init_vmcb(struct vcpu_svm *svm)
 
        control->iopm_base_pa = iopm_base;
        control->msrpm_base_pa = __pa(svm->msrpm);
-       control->tsc_offset = 0;
        control->int_ctl = V_INTR_MASKING_MASK;
 
        init_seg(&save->es);
@@ -902,6 +901,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
        svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
        svm->asid_generation = 0;
        init_vmcb(svm);
+       svm->vmcb->control.tsc_offset = 0-native_read_tsc();
 
        err = fx_init(&svm->vcpu);
        if (err)
@@ -3163,8 +3163,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
        sync_lapic_to_cr8(vcpu);
 
        save_host_msrs(vcpu);
-       fs_selector = kvm_read_fs();
-       gs_selector = kvm_read_gs();
+       savesegment(fs, fs_selector);
+       savesegment(gs, gs_selector);
        ldt_selector = kvm_read_ldt();
        svm->vmcb->save.cr2 = vcpu->arch.cr2;
        /* required for live migration with NPT */
@@ -3251,10 +3251,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
        vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
        vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
 
-       kvm_load_fs(fs_selector);
-       kvm_load_gs(gs_selector);
-       kvm_load_ldt(ldt_selector);
        load_host_msrs(vcpu);
+       loadsegment(fs, fs_selector);
+#ifdef CONFIG_X86_64
+       load_gs_index(gs_selector);
+       wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
+#else
+       loadsegment(gs, gs_selector);
+#endif
+       kvm_load_ldt(ldt_selector);
 
        reload_tss(vcpu);
 
index 49b25ee..7bddfab 100644 (file)
@@ -803,7 +803,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
         */
        vmx->host_state.ldt_sel = kvm_read_ldt();
        vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
-       vmx->host_state.fs_sel = kvm_read_fs();
+       savesegment(fs, vmx->host_state.fs_sel);
        if (!(vmx->host_state.fs_sel & 7)) {
                vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
                vmx->host_state.fs_reload_needed = 0;
@@ -811,7 +811,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
                vmcs_write16(HOST_FS_SELECTOR, 0);
                vmx->host_state.fs_reload_needed = 1;
        }
-       vmx->host_state.gs_sel = kvm_read_gs();
+       savesegment(gs, vmx->host_state.gs_sel);
        if (!(vmx->host_state.gs_sel & 7))
                vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
        else {
@@ -841,27 +841,21 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 
 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 {
-       unsigned long flags;
-
        if (!vmx->host_state.loaded)
                return;
 
        ++vmx->vcpu.stat.host_state_reload;
        vmx->host_state.loaded = 0;
        if (vmx->host_state.fs_reload_needed)
-               kvm_load_fs(vmx->host_state.fs_sel);
+               loadsegment(fs, vmx->host_state.fs_sel);
        if (vmx->host_state.gs_ldt_reload_needed) {
                kvm_load_ldt(vmx->host_state.ldt_sel);
-               /*
-                * If we have to reload gs, we must take care to
-                * preserve our gs base.
-                */
-               local_irq_save(flags);
-               kvm_load_gs(vmx->host_state.gs_sel);
 #ifdef CONFIG_X86_64
-               wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
+               load_gs_index(vmx->host_state.gs_sel);
+               wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
+#else
+               loadsegment(gs, vmx->host_state.gs_sel);
 #endif
-               local_irq_restore(flags);
        }
        reload_tss();
 #ifdef CONFIG_X86_64
@@ -2589,8 +2583,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
        vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
        vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
-       vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs());    /* 22.2.4 */
-       vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs());    /* 22.2.4 */
+       vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
+       vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
        vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
 #ifdef CONFIG_X86_64
        rdmsrl(MSR_FS_BASE, a);
index f9897f7..9c0d0d3 100644 (file)
@@ -420,9 +420,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
                return -1;
        }
 
-       for_each_node_mask(i, nodes_parsed)
-               e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
-                                               nodes[i].end >> PAGE_SHIFT);
+       for (i = 0; i < num_node_memblks; i++)
+               e820_register_active_regions(memblk_nodeid[i],
+                               node_memblk_range[i].start >> PAGE_SHIFT,
+                               node_memblk_range[i].end >> PAGE_SHIFT);
+
        /* for out of order entries in SRAT */
        sort_node_map();
        if (!nodes_cover_memory(nodes)) {
index 82d5882..0c00870 100644 (file)
@@ -426,7 +426,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
        /*
         * fill in all the output members
         */
-       hdr->device_status = status_byte(rq->errors);
+       hdr->device_status = rq->errors & 0xff;
        hdr->transport_status = host_byte(rq->errors);
        hdr->driver_status = driver_byte(rq->errors);
        hdr->info = 0;
index 205b09a..4e11559 100644 (file)
@@ -938,6 +938,7 @@ int elv_register_queue(struct request_queue *q)
                        }
                }
                kobject_uevent(&e->kobj, KOBJ_ADD);
+               e->registered = 1;
        }
        return error;
 }
@@ -947,6 +948,7 @@ static void __elv_unregister_queue(struct elevator_queue *e)
 {
        kobject_uevent(&e->kobj, KOBJ_REMOVE);
        kobject_del(&e->kobj);
+       e->registered = 0;
 }
 
 void elv_unregister_queue(struct request_queue *q)
@@ -1042,11 +1044,13 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 
        spin_unlock_irq(q->queue_lock);
 
-       __elv_unregister_queue(old_elevator);
+       if (old_elevator->registered) {
+               __elv_unregister_queue(old_elevator);
 
-       err = elv_register_queue(q);
-       if (err)
-               goto fail_register;
+               err = elv_register_queue(q);
+               if (err)
+                       goto fail_register;
+       }
 
        /*
         * finally exit old elevator and turn off BYPASS.
index f761960..af308d0 100644 (file)
@@ -204,6 +204,23 @@ static struct dmi_system_id acpi_osi_dmi_table[] __initdata = {
                },
        },
        {
+       /*
+        * There have a NVIF method in MSI GX723 DSDT need call by Nvidia
+        * driver (e.g. nouveau) when user press brightness hotkey.
+        * Currently, nouveau driver didn't do the job and it causes there
+        * have a infinite while loop in DSDT when user press hotkey.
+        * We add MSI GX723's dmi information to this table for workaround
+        * this issue.
+        * Will remove MSI GX723 from the table after nouveau grows support.
+        */
+       .callback = dmi_disable_osi_vista,
+       .ident = "MSI GX723",
+       .matches = {
+                    DMI_MATCH(DMI_SYS_VENDOR, "Micro-Star International"),
+                    DMI_MATCH(DMI_PRODUCT_NAME, "GX723"),
+               },
+       },
+       {
        .callback = dmi_disable_osi_vista,
        .ident = "Sony VGN-NS10J_S",
        .matches = {
index b618f88..bec561c 100644 (file)
@@ -346,4 +346,5 @@ void __init acpi_early_processor_set_pdc(void)
        acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
                            ACPI_UINT32_MAX,
                            early_init_pdc, NULL, NULL, NULL);
+       acpi_get_devices("ACPI0007", early_init_pdc, NULL, NULL);
 }
index ee9ddeb..8cb0347 100644 (file)
@@ -3156,7 +3156,6 @@ static int __devinit ia_init_one(struct pci_dev *pdev,
 {  
        struct atm_dev *dev;  
        IADEV *iadev;  
-        unsigned long flags;
        int ret;
 
        iadev = kzalloc(sizeof(*iadev), GFP_KERNEL);
@@ -3188,19 +3187,14 @@ static int __devinit ia_init_one(struct pci_dev *pdev,
        ia_dev[iadev_count] = iadev;
        _ia_dev[iadev_count] = dev;
        iadev_count++;
-       spin_lock_init(&iadev->misc_lock);
-       /* First fixes first. I don't want to think about this now. */
-       spin_lock_irqsave(&iadev->misc_lock, flags); 
        if (ia_init(dev) || ia_start(dev)) {  
                IF_INIT(printk("IA register failed!\n");)
                iadev_count--;
                ia_dev[iadev_count] = NULL;
                _ia_dev[iadev_count] = NULL;
-               spin_unlock_irqrestore(&iadev->misc_lock, flags); 
                ret = -EINVAL;
                goto err_out_deregister_dev;
        }
-       spin_unlock_irqrestore(&iadev->misc_lock, flags); 
        IF_EVENT(printk("iadev_count = %d\n", iadev_count);)
 
        iadev->next_board = ia_boards;  
index b2cd20f..077735e 100644 (file)
@@ -1022,7 +1022,7 @@ typedef struct iadev_t {
        struct dle_q rx_dle_q;  
        struct free_desc_q *rx_free_desc_qhead;  
        struct sk_buff_head rx_dma_q;  
-        spinlock_t rx_lock, misc_lock;
+       spinlock_t rx_lock;
        struct atm_vcc **rx_open;       /* list of all open VCs */  
         u16 num_rx_desc, rx_buf_sz, rxing;
         u32 rx_pkt_ram, rx_tmp_cnt;
index f916ddf..f46138a 100644 (file)
@@ -444,6 +444,7 @@ static ssize_t console_show(struct device *dev, struct device_attribute *attr,
        struct atm_dev *atmdev = container_of(dev, struct atm_dev, class_dev);
        struct solos_card *card = atmdev->dev_data;
        struct sk_buff *skb;
+       unsigned int len;
 
        spin_lock(&card->cli_queue_lock);
        skb = skb_dequeue(&card->cli_queue[SOLOS_CHAN(atmdev)]);
@@ -451,11 +452,12 @@ static ssize_t console_show(struct device *dev, struct device_attribute *attr,
        if(skb == NULL)
                return sprintf(buf, "No data.\n");
 
-       memcpy(buf, skb->data, skb->len);
-       dev_dbg(&card->dev->dev, "len: %d\n", skb->len);
+       len = skb->len;
+       memcpy(buf, skb->data, len);
+       dev_dbg(&card->dev->dev, "len: %d\n", len);
 
        kfree_skb(skb);
-       return skb->len;
+       return len;
 }
 
 static int send_command(struct solos_card *card, int dev, const char *buf, size_t size)
index de27768..4b9359a 100644 (file)
@@ -488,4 +488,21 @@ config BLK_DEV_HD
 
          If unsure, say N.
 
+config BLK_DEV_RBD
+       tristate "Rados block device (RBD)"
+       depends on INET && EXPERIMENTAL && BLOCK
+       select CEPH_LIB
+       select LIBCRC32C
+       select CRYPTO_AES
+       select CRYPTO
+       default n
+       help
+         Say Y here if you want include the Rados block device, which stripes
+         a block device over objects stored in the Ceph distributed object
+         store.
+
+         More information at http://ceph.newdream.net/.
+
+         If unsure, say N.
+
 endif # BLK_DEV
index aff5ac9..d7f463d 100644 (file)
@@ -37,5 +37,6 @@ obj-$(CONFIG_BLK_DEV_HD)      += hd.o
 
 obj-$(CONFIG_XEN_BLKDEV_FRONTEND)      += xen-blkfront.o
 obj-$(CONFIG_BLK_DEV_DRBD)     += drbd/
+obj-$(CONFIG_BLK_DEV_RBD)     += rbd.o
 
 swim_mod-objs  := swim.o swim_asm.o
index e9da874..03688c2 100644 (file)
@@ -113,7 +113,7 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev,
                        memcpy(buf, dev->bounce_buf+offset, size);
                offset += size;
                flush_kernel_dcache_page(bvec->bv_page);
-               bvec_kunmap_irq(bvec, &flags);
+               bvec_kunmap_irq(buf, &flags);
                i++;
        }
 }
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
new file mode 100644 (file)
index 0000000..6ec9d53
--- /dev/null
@@ -0,0 +1,1841 @@
+/*
+   rbd.c -- Export ceph rados objects as a Linux block device
+
+
+   based on drivers/block/osdblk.c:
+
+   Copyright 2009 Red Hat, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+
+   Instructions for use
+   --------------------
+
+   1) Map a Linux block device to an existing rbd image.
+
+      Usage: <mon ip addr> <options> <pool name> <rbd image name> [snap name]
+
+      $ echo "192.168.0.1 name=admin rbd foo" > /sys/class/rbd/add
+
+      The snapshot name can be "-" or omitted to map the image read/write.
+
+   2) List all active blkdev<->object mappings.
+
+      In this example, we have performed step #1 twice, creating two blkdevs,
+      mapped to two separate rados objects in the rados rbd pool
+
+      $ cat /sys/class/rbd/list
+      #id     major   client_name     pool    name    snap    KB
+      0       254     client4143      rbd     foo     -      1024000
+
+      The columns, in order, are:
+      - blkdev unique id
+      - blkdev assigned major
+      - rados client id
+      - rados pool name
+      - rados block device name
+      - mapped snapshot ("-" if none)
+      - device size in KB
+
+
+   3) Create a snapshot.
+
+      Usage: <blkdev id> <snapname>
+
+      $ echo "0 mysnap" > /sys/class/rbd/snap_create
+
+
+   4) Listing a snapshot.
+
+      $ cat /sys/class/rbd/snaps_list
+      #id     snap    KB
+      0       -       1024000 (*)
+      0       foo     1024000
+
+      The columns, in order, are:
+      - blkdev unique id
+      - snapshot name, '-' means none (active read/write version)
+      - size of device at time of snapshot
+      - the (*) indicates this is the active version
+
+   5) Rollback to snapshot.
+
+      Usage: <blkdev id> <snapname>
+
+      $ echo "0 mysnap" > /sys/class/rbd/snap_rollback
+
+
+   6) Mapping an image using snapshot.
+
+      A snapshot mapping is read-only. This is being done by passing
+      snap=<snapname> to the options when adding a device.
+
+      $ echo "192.168.0.1 name=admin,snap=mysnap rbd foo" > /sys/class/rbd/add
+
+
+   7) Remove an active blkdev<->rbd image mapping.
+
+      In this example, we remove the mapping with blkdev unique id 1.
+
+      $ echo 1 > /sys/class/rbd/remove
+
+
+   NOTE:  The actual creation and deletion of rados objects is outside the scope
+   of this driver.
+
+ */
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osd_client.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/decode.h>
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+
+#include "rbd_types.h"
+
+#define DRV_NAME "rbd"
+#define DRV_NAME_LONG "rbd (rados block device)"
+
+#define RBD_MINORS_PER_MAJOR   256             /* max minors per blkdev */
+
+#define RBD_MAX_MD_NAME_LEN    (96 + sizeof(RBD_SUFFIX))
+#define RBD_MAX_POOL_NAME_LEN  64
+#define RBD_MAX_SNAP_NAME_LEN  32
+#define RBD_MAX_OPT_LEN                1024
+
+#define RBD_SNAP_HEAD_NAME     "-"
+
+#define DEV_NAME_LEN           32
+
+/*
+ * block device image metadata (in-memory version)
+ */
+struct rbd_image_header {
+       u64 image_size;
+       char block_name[32];
+       __u8 obj_order;
+       __u8 crypt_type;
+       __u8 comp_type;
+       struct rw_semaphore snap_rwsem;
+       struct ceph_snap_context *snapc;
+       size_t snap_names_len;
+       u64 snap_seq;
+       u32 total_snaps;
+
+       char *snap_names;
+       u64 *snap_sizes;
+};
+
+/*
+ * an instance of the client.  multiple devices may share a client.
+ */
+struct rbd_client {
+       struct ceph_client      *client;
+       struct kref             kref;
+       struct list_head        node;
+};
+
+/*
+ * a single io request
+ */
+struct rbd_request {
+       struct request          *rq;            /* blk layer request */
+       struct bio              *bio;           /* cloned bio */
+       struct page             **pages;        /* list of used pages */
+       u64                     len;
+};
+
+/*
+ * a single device
+ */
+struct rbd_device {
+       int                     id;             /* blkdev unique id */
+
+       int                     major;          /* blkdev assigned major */
+       struct gendisk          *disk;          /* blkdev's gendisk and rq */
+       struct request_queue    *q;
+
+       struct ceph_client      *client;
+       struct rbd_client       *rbd_client;
+
+       char                    name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
+
+       spinlock_t              lock;           /* queue lock */
+
+       struct rbd_image_header header;
+       char                    obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
+       int                     obj_len;
+       char                    obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
+       char                    pool_name[RBD_MAX_POOL_NAME_LEN];
+       int                     poolid;
+
+       char                    snap_name[RBD_MAX_SNAP_NAME_LEN];
+       u32 cur_snap;   /* index+1 of current snapshot within snap context
+                          0 - for the head */
+       int read_only;
+
+       struct list_head        node;
+};
+
+static spinlock_t node_lock;      /* protects client get/put */
+
+static struct class *class_rbd;          /* /sys/class/rbd */
+static DEFINE_MUTEX(ctl_mutex);          /* Serialize open/close/setup/teardown */
+static LIST_HEAD(rbd_dev_list);    /* devices */
+static LIST_HEAD(rbd_client_list);      /* clients */
+
+
+static int rbd_open(struct block_device *bdev, fmode_t mode)
+{
+       struct gendisk *disk = bdev->bd_disk;
+       struct rbd_device *rbd_dev = disk->private_data;
+
+       set_device_ro(bdev, rbd_dev->read_only);
+
+       if ((mode & FMODE_WRITE) && rbd_dev->read_only)
+               return -EROFS;
+
+       return 0;
+}
+
+static const struct block_device_operations rbd_bd_ops = {
+       .owner                  = THIS_MODULE,
+       .open                   = rbd_open,
+};
+
+/*
+ * Initialize an rbd client instance.
+ * We own *opt.
+ */
+static struct rbd_client *rbd_client_create(struct ceph_options *opt)
+{
+       struct rbd_client *rbdc;
+       int ret = -ENOMEM;
+
+       dout("rbd_client_create\n");
+       rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
+       if (!rbdc)
+               goto out_opt;
+
+       kref_init(&rbdc->kref);
+       INIT_LIST_HEAD(&rbdc->node);
+
+       rbdc->client = ceph_create_client(opt, rbdc);
+       if (IS_ERR(rbdc->client))
+               goto out_rbdc;
+       opt = NULL; /* Now rbdc->client is responsible for opt */
+
+       ret = ceph_open_session(rbdc->client);
+       if (ret < 0)
+               goto out_err;
+
+       spin_lock(&node_lock);
+       list_add_tail(&rbdc->node, &rbd_client_list);
+       spin_unlock(&node_lock);
+
+       dout("rbd_client_create created %p\n", rbdc);
+       return rbdc;
+
+out_err:
+       ceph_destroy_client(rbdc->client);
+out_rbdc:
+       kfree(rbdc);
+out_opt:
+       if (opt)
+               ceph_destroy_options(opt);
+       return ERR_PTR(ret);
+}
+
+/*
+ * Find a ceph client with specific addr and configuration.
+ */
+static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
+{
+       struct rbd_client *client_node;
+
+       if (opt->flags & CEPH_OPT_NOSHARE)
+               return NULL;
+
+       list_for_each_entry(client_node, &rbd_client_list, node)
+               if (ceph_compare_options(opt, client_node->client) == 0)
+                       return client_node;
+       return NULL;
+}
+
+/*
+ * Get a ceph client with specific addr and configuration, if one does
+ * not exist create it.
+ */
+static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
+                         char *options)
+{
+       struct rbd_client *rbdc;
+       struct ceph_options *opt;
+       int ret;
+
+       ret = ceph_parse_options(&opt, options, mon_addr,
+                                mon_addr + strlen(mon_addr), NULL, NULL);
+       if (ret < 0)
+               return ret;
+
+       spin_lock(&node_lock);
+       rbdc = __rbd_client_find(opt);
+       if (rbdc) {
+               ceph_destroy_options(opt);
+
+               /* using an existing client */
+               kref_get(&rbdc->kref);
+               rbd_dev->rbd_client = rbdc;
+               rbd_dev->client = rbdc->client;
+               spin_unlock(&node_lock);
+               return 0;
+       }
+       spin_unlock(&node_lock);
+
+       rbdc = rbd_client_create(opt);
+       if (IS_ERR(rbdc))
+               return PTR_ERR(rbdc);
+
+       rbd_dev->rbd_client = rbdc;
+       rbd_dev->client = rbdc->client;
+       return 0;
+}
+
+/*
+ * Destroy ceph client
+ */
+static void rbd_client_release(struct kref *kref)
+{
+       struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
+
+       dout("rbd_release_client %p\n", rbdc);
+       spin_lock(&node_lock);
+       list_del(&rbdc->node);
+       spin_unlock(&node_lock);
+
+       ceph_destroy_client(rbdc->client);
+       kfree(rbdc);
+}
+
+/*
+ * Drop reference to ceph client node. If it's not referenced anymore, release
+ * it.
+ */
+static void rbd_put_client(struct rbd_device *rbd_dev)
+{
+       kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
+       rbd_dev->rbd_client = NULL;
+       rbd_dev->client = NULL;
+}
+
+
+/*
+ * Create a new header structure, translate header format from the on-disk
+ * header.
+ */
+static int rbd_header_from_disk(struct rbd_image_header *header,
+                                struct rbd_image_header_ondisk *ondisk,
+                                int allocated_snaps,
+                                gfp_t gfp_flags)
+{
+       int i;
+       u32 snap_count = le32_to_cpu(ondisk->snap_count);
+       int ret = -ENOMEM;
+
+       init_rwsem(&header->snap_rwsem);
+
+       header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
+       header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
+                               snap_count *
+                                sizeof(struct rbd_image_snap_ondisk),
+                               gfp_flags);
+       if (!header->snapc)
+               return -ENOMEM;
+       if (snap_count) {
+               header->snap_names = kmalloc(header->snap_names_len,
+                                            GFP_KERNEL);
+               if (!header->snap_names)
+                       goto err_snapc;
+               header->snap_sizes = kmalloc(snap_count * sizeof(u64),
+                                            GFP_KERNEL);
+               if (!header->snap_sizes)
+                       goto err_names;
+       } else {
+               header->snap_names = NULL;
+               header->snap_sizes = NULL;
+       }
+       memcpy(header->block_name, ondisk->block_name,
+              sizeof(ondisk->block_name));
+
+       header->image_size = le64_to_cpu(ondisk->image_size);
+       header->obj_order = ondisk->options.order;
+       header->crypt_type = ondisk->options.crypt_type;
+       header->comp_type = ondisk->options.comp_type;
+
+       atomic_set(&header->snapc->nref, 1);
+       header->snap_seq = le64_to_cpu(ondisk->snap_seq);
+       header->snapc->num_snaps = snap_count;
+       header->total_snaps = snap_count;
+
+       if (snap_count &&
+           allocated_snaps == snap_count) {
+               for (i = 0; i < snap_count; i++) {
+                       header->snapc->snaps[i] =
+                               le64_to_cpu(ondisk->snaps[i].id);
+                       header->snap_sizes[i] =
+                               le64_to_cpu(ondisk->snaps[i].image_size);
+               }
+
+               /* copy snapshot names */
+               memcpy(header->snap_names, &ondisk->snaps[i],
+                       header->snap_names_len);
+       }
+
+       return 0;
+
+err_names:
+       kfree(header->snap_names);
+err_snapc:
+       kfree(header->snapc);
+       return ret;
+}
+
+static int snap_index(struct rbd_image_header *header, int snap_num)
+{
+       return header->total_snaps - snap_num;
+}
+
+static u64 cur_snap_id(struct rbd_device *rbd_dev)
+{
+       struct rbd_image_header *header = &rbd_dev->header;
+
+       if (!rbd_dev->cur_snap)
+               return 0;
+
+       return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
+}
+
+static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
+                       u64 *seq, u64 *size)
+{
+       int i;
+       char *p = header->snap_names;
+
+       for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
+               if (strcmp(snap_name, p) == 0)
+                       break;
+       }
+       if (i == header->total_snaps)
+               return -ENOENT;
+       if (seq)
+               *seq = header->snapc->snaps[i];
+
+       if (size)
+               *size = header->snap_sizes[i];
+
+       return i;
+}
+
+static int rbd_header_set_snap(struct rbd_device *dev,
+                              const char *snap_name,
+                              u64 *size)
+{
+       struct rbd_image_header *header = &dev->header;
+       struct ceph_snap_context *snapc = header->snapc;
+       int ret = -ENOENT;
+
+       down_write(&header->snap_rwsem);
+
+       if (!snap_name ||
+           !*snap_name ||
+           strcmp(snap_name, "-") == 0 ||
+           strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) {
+               if (header->total_snaps)
+                       snapc->seq = header->snap_seq;
+               else
+                       snapc->seq = 0;
+               dev->cur_snap = 0;
+               dev->read_only = 0;
+               if (size)
+                       *size = header->image_size;
+       } else {
+               ret = snap_by_name(header, snap_name, &snapc->seq, size);
+               if (ret < 0)
+                       goto done;
+
+               dev->cur_snap = header->total_snaps - ret;
+               dev->read_only = 1;
+       }
+
+       ret = 0;
+done:
+       up_write(&header->snap_rwsem);
+       return ret;
+}
+
+static void rbd_header_free(struct rbd_image_header *header)
+{
+       kfree(header->snapc);
+       kfree(header->snap_names);
+       kfree(header->snap_sizes);
+}
+
+/*
+ * get the actual striped segment name, offset and length
+ */
+static u64 rbd_get_segment(struct rbd_image_header *header,
+                          const char *block_name,
+                          u64 ofs, u64 len,
+                          char *seg_name, u64 *segofs)
+{
+       u64 seg = ofs >> header->obj_order;
+
+       if (seg_name)
+               snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
+                        "%s.%012llx", block_name, seg);
+
+       ofs = ofs & ((1 << header->obj_order) - 1);
+       len = min_t(u64, len, (1 << header->obj_order) - ofs);
+
+       if (segofs)
+               *segofs = ofs;
+
+       return len;
+}
+
+/*
+ * bio helpers
+ */
+
+static void bio_chain_put(struct bio *chain)
+{
+       struct bio *tmp;
+
+       while (chain) {
+               tmp = chain;
+               chain = chain->bi_next;
+               bio_put(tmp);
+       }
+}
+
+/*
+ * zeros a bio chain, starting at specific offset
+ */
+static void zero_bio_chain(struct bio *chain, int start_ofs)
+{
+       struct bio_vec *bv;
+       unsigned long flags;
+       void *buf;
+       int i;
+       int pos = 0;
+
+       while (chain) {
+               bio_for_each_segment(bv, chain, i) {
+                       if (pos + bv->bv_len > start_ofs) {
+                               int remainder = max(start_ofs - pos, 0);
+                               buf = bvec_kmap_irq(bv, &flags);
+                               memset(buf + remainder, 0,
+                                      bv->bv_len - remainder);
+                               bvec_kunmap_irq(buf, &flags);
+                       }
+                       pos += bv->bv_len;
+               }
+
+               chain = chain->bi_next;
+       }
+}
+
+/*
+ * bio_chain_clone - clone a chain of bios up to a certain length.
+ * might return a bio_pair that will need to be released.
+ */
+static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
+                                  struct bio_pair **bp,
+                                  int len, gfp_t gfpmask)
+{
+       struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
+       int total = 0;
+
+       if (*bp) {
+               bio_pair_release(*bp);
+               *bp = NULL;
+       }
+
+       while (old_chain && (total < len)) {
+               tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
+               if (!tmp)
+                       goto err_out;
+
+               if (total + old_chain->bi_size > len) {
+                       struct bio_pair *bp;
+
+                       /*
+                        * this split can only happen with a single paged bio,
+                        * split_bio will BUG_ON if this is not the case
+                        */
+                       dout("bio_chain_clone split! total=%d remaining=%d"
+                            "bi_size=%d\n",
+                            (int)total, (int)len-total,
+                            (int)old_chain->bi_size);
+
+                       /* split the bio. We'll release it either in the next
+                          call, or it will have to be released outside */
+                       bp = bio_split(old_chain, (len - total) / 512ULL);
+                       if (!bp)
+                               goto err_out;
+
+                       __bio_clone(tmp, &bp->bio1);
+
+                       *next = &bp->bio2;
+               } else {
+                       __bio_clone(tmp, old_chain);
+                       *next = old_chain->bi_next;
+               }
+
+               tmp->bi_bdev = NULL;
+               gfpmask &= ~__GFP_WAIT;
+               tmp->bi_next = NULL;
+
+               if (!new_chain) {
+                       new_chain = tail = tmp;
+               } else {
+                       tail->bi_next = tmp;
+                       tail = tmp;
+               }
+               old_chain = old_chain->bi_next;
+
+               total += tmp->bi_size;
+       }
+
+       BUG_ON(total < len);
+
+       if (tail)
+               tail->bi_next = NULL;
+
+       *old = old_chain;
+
+       return new_chain;
+
+err_out:
+       dout("bio_chain_clone with err\n");
+       bio_chain_put(new_chain);
+       return NULL;
+}
+
+/*
+ * helpers for osd request op vectors.
+ */
+static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
+                           int num_ops,
+                           int opcode,
+                           u32 payload_len)
+{
+       *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
+                      GFP_NOIO);
+       if (!*ops)
+               return -ENOMEM;
+       (*ops)[0].op = opcode;
+       /*
+        * op extent offset and length will be set later on
+        * in calc_raw_layout()
+        */
+       (*ops)[0].payload_len = payload_len;
+       return 0;
+}
+
+static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
+{
+       kfree(ops);
+}
+
+/*
+ * Send ceph osd request
+ */
+static int rbd_do_request(struct request *rq,
+                         struct rbd_device *dev,
+                         struct ceph_snap_context *snapc,
+                         u64 snapid,
+                         const char *obj, u64 ofs, u64 len,
+                         struct bio *bio,
+                         struct page **pages,
+                         int num_pages,
+                         int flags,
+                         struct ceph_osd_req_op *ops,
+                         int num_reply,
+                         void (*rbd_cb)(struct ceph_osd_request *req,
+                                        struct ceph_msg *msg))
+{
+       struct ceph_osd_request *req;
+       struct ceph_file_layout *layout;
+       int ret;
+       u64 bno;
+       struct timespec mtime = CURRENT_TIME;
+       struct rbd_request *req_data;
+       struct ceph_osd_request_head *reqhead;
+       struct rbd_image_header *header = &dev->header;
+
+       ret = -ENOMEM;
+       req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
+       if (!req_data)
+               goto done;
+
+       dout("rbd_do_request len=%lld ofs=%lld\n", len, ofs);
+
+       down_read(&header->snap_rwsem);
+
+       req = ceph_osdc_alloc_request(&dev->client->osdc, flags,
+                                     snapc,
+                                     ops,
+                                     false,
+                                     GFP_NOIO, pages, bio);
+       if (IS_ERR(req)) {
+               up_read(&header->snap_rwsem);
+               ret = PTR_ERR(req);
+               goto done_pages;
+       }
+
+       req->r_callback = rbd_cb;
+
+       req_data->rq = rq;
+       req_data->bio = bio;
+       req_data->pages = pages;
+       req_data->len = len;
+
+       req->r_priv = req_data;
+
+       reqhead = req->r_request->front.iov_base;
+       reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
+
+       strncpy(req->r_oid, obj, sizeof(req->r_oid));
+       req->r_oid_len = strlen(req->r_oid);
+
+       layout = &req->r_file_layout;
+       memset(layout, 0, sizeof(*layout));
+       layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+       layout->fl_stripe_count = cpu_to_le32(1);
+       layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+       layout->fl_pg_preferred = cpu_to_le32(-1);
+       layout->fl_pg_pool = cpu_to_le32(dev->poolid);
+       ceph_calc_raw_layout(&dev->client->osdc, layout, snapid,
+                            ofs, &len, &bno, req, ops);
+
+       ceph_osdc_build_request(req, ofs, &len,
+                               ops,
+                               snapc,
+                               &mtime,
+                               req->r_oid, req->r_oid_len);
+       up_read(&header->snap_rwsem);
+
+       ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
+       if (ret < 0)
+               goto done_err;
+
+       if (!rbd_cb) {
+               ret = ceph_osdc_wait_request(&dev->client->osdc, req);
+               ceph_osdc_put_request(req);
+       }
+       return ret;
+
+done_err:
+       bio_chain_put(req_data->bio);
+       ceph_osdc_put_request(req);
+done_pages:
+       kfree(req_data);
+done:
+       if (rq)
+               blk_end_request(rq, ret, len);
+       return ret;
+}
+
+/*
+ * Ceph osd op callback
+ */
+static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
+{
+       struct rbd_request *req_data = req->r_priv;
+       struct ceph_osd_reply_head *replyhead;
+       struct ceph_osd_op *op;
+       __s32 rc;
+       u64 bytes;
+       int read_op;
+
+       /* parse reply */
+       replyhead = msg->front.iov_base;
+       WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
+       op = (void *)(replyhead + 1);
+       rc = le32_to_cpu(replyhead->result);
+       bytes = le64_to_cpu(op->extent.length);
+       read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);
+
+       dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
+
+       if (rc == -ENOENT && read_op) {
+               zero_bio_chain(req_data->bio, 0);
+               rc = 0;
+       } else if (rc == 0 && read_op && bytes < req_data->len) {
+               zero_bio_chain(req_data->bio, bytes);
+               bytes = req_data->len;
+       }
+
+       blk_end_request(req_data->rq, rc, bytes);
+
+       if (req_data->bio)
+               bio_chain_put(req_data->bio);
+
+       ceph_osdc_put_request(req);
+       kfree(req_data);
+}
+
+/*
+ * Do a synchronous ceph osd operation
+ */
+static int rbd_req_sync_op(struct rbd_device *dev,
+                          struct ceph_snap_context *snapc,
+                          u64 snapid,
+                          int opcode,
+                          int flags,
+                          struct ceph_osd_req_op *orig_ops,
+                          int num_reply,
+                          const char *obj,
+                          u64 ofs, u64 len,
+                          char *buf)
+{
+       int ret;
+       struct page **pages;
+       int num_pages;
+       struct ceph_osd_req_op *ops = orig_ops;
+       u32 payload_len;
+
+       num_pages = calc_pages_for(ofs , len);
+       pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+       if (IS_ERR(pages))
+               return PTR_ERR(pages);
+
+       if (!orig_ops) {
+               payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
+               ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
+               if (ret < 0)
+                       goto done;
+
+               if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
+                       ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
+                       if (ret < 0)
+                               goto done_ops;
+               }
+       }
+
+       ret = rbd_do_request(NULL, dev, snapc, snapid,
+                         obj, ofs, len, NULL,
+                         pages, num_pages,
+                         flags,
+                         ops,
+                         2,
+                         NULL);
+       if (ret < 0)
+               goto done_ops;
+
+       if ((flags & CEPH_OSD_FLAG_READ) && buf)
+               ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
+
+done_ops:
+       if (!orig_ops)
+               rbd_destroy_ops(ops);
+done:
+       ceph_release_page_vector(pages, num_pages);
+       return ret;
+}
+
+/*
+ * Do an asynchronous ceph osd operation
+ */
+static int rbd_do_op(struct request *rq,
+                    struct rbd_device *rbd_dev ,
+                    struct ceph_snap_context *snapc,
+                    u64 snapid,
+                    int opcode, int flags, int num_reply,
+                    u64 ofs, u64 len,
+                    struct bio *bio)
+{
+       char *seg_name;
+       u64 seg_ofs;
+       u64 seg_len;
+       int ret;
+       struct ceph_osd_req_op *ops;
+       u32 payload_len;
+
+       seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
+       if (!seg_name)
+               return -ENOMEM;
+
+       seg_len = rbd_get_segment(&rbd_dev->header,
+                                 rbd_dev->header.block_name,
+                                 ofs, len,
+                                 seg_name, &seg_ofs);
+
+       payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
+
+       ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
+       if (ret < 0)
+               goto done;
+
+       /* we've taken care of segment sizes earlier when we
+          cloned the bios. We should never have a segment
+          truncated at this point */
+       BUG_ON(seg_len < len);
+
+       ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
+                            seg_name, seg_ofs, seg_len,
+                            bio,
+                            NULL, 0,
+                            flags,
+                            ops,
+                            num_reply,
+                            rbd_req_cb);
+done:
+       kfree(seg_name);
+       return ret;
+}
+
+/*
+ * Request async osd write
+ */
+static int rbd_req_write(struct request *rq,
+                        struct rbd_device *rbd_dev,
+                        struct ceph_snap_context *snapc,
+                        u64 ofs, u64 len,
+                        struct bio *bio)
+{
+       return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
+                        CEPH_OSD_OP_WRITE,
+                        CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+                        2,
+                        ofs, len, bio);
+}
+
+/*
+ * Request async osd read
+ */
+static int rbd_req_read(struct request *rq,
+                        struct rbd_device *rbd_dev,
+                        u64 snapid,
+                        u64 ofs, u64 len,
+                        struct bio *bio)
+{
+       return rbd_do_op(rq, rbd_dev, NULL,
+                        (snapid ? snapid : CEPH_NOSNAP),
+                        CEPH_OSD_OP_READ,
+                        CEPH_OSD_FLAG_READ,
+                        2,
+                        ofs, len, bio);
+}
+
+/*
+ * Request sync osd read
+ */
+static int rbd_req_sync_read(struct rbd_device *dev,
+                         struct ceph_snap_context *snapc,
+                         u64 snapid,
+                         const char *obj,
+                         u64 ofs, u64 len,
+                         char *buf)
+{
+       return rbd_req_sync_op(dev, NULL,
+                              (snapid ? snapid : CEPH_NOSNAP),
+                              CEPH_OSD_OP_READ,
+                              CEPH_OSD_FLAG_READ,
+                              NULL,
+                              1, obj, ofs, len, buf);
+}
+
+/*
+ * Request sync osd read
+ */
+static int rbd_req_sync_rollback_obj(struct rbd_device *dev,
+                                    u64 snapid,
+                                    const char *obj)
+{
+       struct ceph_osd_req_op *ops;
+       int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_ROLLBACK, 0);
+       if (ret < 0)
+               return ret;
+
+       ops[0].snap.snapid = snapid;
+
+       ret = rbd_req_sync_op(dev, NULL,
+                              CEPH_NOSNAP,
+                              0,
+                              CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+                              ops,
+                              1, obj, 0, 0, NULL);
+
+       rbd_destroy_ops(ops);
+
+       if (ret < 0)
+               return ret;
+
+       return ret;
+}
+
+/*
+ * Request sync osd read
+ */
+static int rbd_req_sync_exec(struct rbd_device *dev,
+                            const char *obj,
+                            const char *cls,
+                            const char *method,
+                            const char *data,
+                            int len)
+{
+       struct ceph_osd_req_op *ops;
+       int cls_len = strlen(cls);
+       int method_len = strlen(method);
+       int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
+                                   cls_len + method_len + len);
+       if (ret < 0)
+               return ret;
+
+       ops[0].cls.class_name = cls;
+       ops[0].cls.class_len = (__u8)cls_len;
+       ops[0].cls.method_name = method;
+       ops[0].cls.method_len = (__u8)method_len;
+       ops[0].cls.argc = 0;
+       ops[0].cls.indata = data;
+       ops[0].cls.indata_len = len;
+
+       ret = rbd_req_sync_op(dev, NULL,
+                              CEPH_NOSNAP,
+                              0,
+                              CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+                              ops,
+                              1, obj, 0, 0, NULL);
+
+       rbd_destroy_ops(ops);
+
+       dout("cls_exec returned %d\n", ret);
+       return ret;
+}
+
+/*
+ * block device queue callback
+ */
+static void rbd_rq_fn(struct request_queue *q)
+{
+       struct rbd_device *rbd_dev = q->queuedata;
+       struct request *rq;
+       struct bio_pair *bp = NULL;
+
+       rq = blk_fetch_request(q);
+
+       while (1) {
+               struct bio *bio;
+               struct bio *rq_bio, *next_bio = NULL;
+               bool do_write;
+               int size, op_size = 0;
+               u64 ofs;
+
+               /* peek at request from block layer */
+               if (!rq)
+                       break;
+
+               dout("fetched request\n");
+
+               /* filter out block requests we don't understand */
+               if ((rq->cmd_type != REQ_TYPE_FS)) {
+                       __blk_end_request_all(rq, 0);
+                       goto next;
+               }
+
+               /* deduce our operation (read, write) */
+               do_write = (rq_data_dir(rq) == WRITE);
+
+               size = blk_rq_bytes(rq);
+               ofs = blk_rq_pos(rq) * 512ULL;
+               rq_bio = rq->bio;
+               if (do_write && rbd_dev->read_only) {
+                       __blk_end_request_all(rq, -EROFS);
+                       goto next;
+               }
+
+               spin_unlock_irq(q->queue_lock);
+
+               dout("%s 0x%x bytes at 0x%llx\n",
+                    do_write ? "write" : "read",
+                    size, blk_rq_pos(rq) * 512ULL);
+
+               do {
+                       /* a bio clone to be passed down to OSD req */
+                       dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
+                       op_size = rbd_get_segment(&rbd_dev->header,
+                                                 rbd_dev->header.block_name,
+                                                 ofs, size,
+                                                 NULL, NULL);
+                       bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
+                                             op_size, GFP_ATOMIC);
+                       if (!bio) {
+                               spin_lock_irq(q->queue_lock);
+                               __blk_end_request_all(rq, -ENOMEM);
+                               goto next;
+                       }
+
+                       /* init OSD command: write or read */
+                       if (do_write)
+                               rbd_req_write(rq, rbd_dev,
+                                             rbd_dev->header.snapc,
+                                             ofs,
+                                             op_size, bio);
+                       else
+                               rbd_req_read(rq, rbd_dev,
+                                            cur_snap_id(rbd_dev),
+                                            ofs,
+                                            op_size, bio);
+
+                       size -= op_size;
+                       ofs += op_size;
+
+                       rq_bio = next_bio;
+               } while (size > 0);
+
+               if (bp)
+                       bio_pair_release(bp);
+
+               spin_lock_irq(q->queue_lock);
+next:
+               rq = blk_fetch_request(q);
+       }
+}
+
+/*
+ * a queue callback. Makes sure that we don't create a bio that spans across
+ * multiple osd objects. One exception would be with a single page bios,
+ * which we handle later at bio_chain_clone
+ */
+static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
+                         struct bio_vec *bvec)
+{
+       struct rbd_device *rbd_dev = q->queuedata;
+       unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
+       sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
+       unsigned int bio_sectors = bmd->bi_size >> 9;
+       int max;
+
+       max =  (chunk_sectors - ((sector & (chunk_sectors - 1))
+                                + bio_sectors)) << 9;
+       if (max < 0)
+               max = 0; /* bio_add cannot handle a negative return */
+       if (max <= bvec->bv_len && bio_sectors == 0)
+               return bvec->bv_len;
+       return max;
+}
+
+static void rbd_free_disk(struct rbd_device *rbd_dev)
+{
+       struct gendisk *disk = rbd_dev->disk;
+
+       if (!disk)
+               return;
+
+       rbd_header_free(&rbd_dev->header);
+
+       if (disk->flags & GENHD_FL_UP)
+               del_gendisk(disk);
+       if (disk->queue)
+               blk_cleanup_queue(disk->queue);
+       put_disk(disk);
+}
+
+/*
+ * reload the ondisk the header 
+ */
+static int rbd_read_header(struct rbd_device *rbd_dev,
+                          struct rbd_image_header *header)
+{
+       ssize_t rc;
+       struct rbd_image_header_ondisk *dh;
+       int snap_count = 0;
+       u64 snap_names_len = 0;
+
+       while (1) {
+               int len = sizeof(*dh) +
+                         snap_count * sizeof(struct rbd_image_snap_ondisk) +
+                         snap_names_len;
+
+               rc = -ENOMEM;
+               dh = kmalloc(len, GFP_KERNEL);
+               if (!dh)
+                       return -ENOMEM;
+
+               rc = rbd_req_sync_read(rbd_dev,
+                                      NULL, CEPH_NOSNAP,
+                                      rbd_dev->obj_md_name,
+                                      0, len,
+                                      (char *)dh);
+               if (rc < 0)
+                       goto out_dh;
+
+               rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
+               if (rc < 0)
+                       goto out_dh;
+
+               if (snap_count != header->total_snaps) {
+                       snap_count = header->total_snaps;
+                       snap_names_len = header->snap_names_len;
+                       rbd_header_free(header);
+                       kfree(dh);
+                       continue;
+               }
+               break;
+       }
+
+out_dh:
+       kfree(dh);
+       return rc;
+}
+
+/*
+ * create a snapshot
+ */
+static int rbd_header_add_snap(struct rbd_device *dev,
+                              const char *snap_name,
+                              gfp_t gfp_flags)
+{
+       int name_len = strlen(snap_name);
+       u64 new_snapid;
+       int ret;
+       void *data, *data_start, *data_end;
+
+       /* we should create a snapshot only if we're pointing at the head */
+       if (dev->cur_snap)
+               return -EINVAL;
+
+       ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid,
+                                     &new_snapid);
+       dout("created snapid=%lld\n", new_snapid);
+       if (ret < 0)
+               return ret;
+
+       data = kmalloc(name_len + 16, gfp_flags);
+       if (!data)
+               return -ENOMEM;
+
+       data_start = data;
+       data_end = data + name_len + 16;
+
+       ceph_encode_string_safe(&data, data_end, snap_name, name_len, bad);
+       ceph_encode_64_safe(&data, data_end, new_snapid, bad);
+
+       ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
+                               data_start, data - data_start);
+
+       kfree(data_start);
+
+       if (ret < 0)
+               return ret;
+
+       dev->header.snapc->seq =  new_snapid;
+
+       return 0;
+bad:
+       return -ERANGE;
+}
+
+/*
+ * only read the first part of the ondisk header, without the snaps info
+ */
+static int rbd_update_snaps(struct rbd_device *rbd_dev)
+{
+       int ret;
+       struct rbd_image_header h;
+       u64 snap_seq;
+
+       ret = rbd_read_header(rbd_dev, &h);
+       if (ret < 0)
+               return ret;
+
+       down_write(&rbd_dev->header.snap_rwsem);
+
+       snap_seq = rbd_dev->header.snapc->seq;
+
+       kfree(rbd_dev->header.snapc);
+       kfree(rbd_dev->header.snap_names);
+       kfree(rbd_dev->header.snap_sizes);
+
+       rbd_dev->header.total_snaps = h.total_snaps;
+       rbd_dev->header.snapc = h.snapc;
+       rbd_dev->header.snap_names = h.snap_names;
+       rbd_dev->header.snap_sizes = h.snap_sizes;
+       rbd_dev->header.snapc->seq = snap_seq;
+
+       up_write(&rbd_dev->header.snap_rwsem);
+
+       return 0;
+}
+
+static int rbd_init_disk(struct rbd_device *rbd_dev)
+{
+       struct gendisk *disk;
+       struct request_queue *q;
+       int rc;
+       u64 total_size = 0;
+
+       /* contact OSD, request size info about the object being mapped */
+       rc = rbd_read_header(rbd_dev, &rbd_dev->header);
+       if (rc)
+               return rc;
+
+       rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
+       if (rc)
+               return rc;
+
+       /* create gendisk info */
+       rc = -ENOMEM;
+       disk = alloc_disk(RBD_MINORS_PER_MAJOR);
+       if (!disk)
+               goto out;
+
+       sprintf(disk->disk_name, DRV_NAME "%d", rbd_dev->id);
+       disk->major = rbd_dev->major;
+       disk->first_minor = 0;
+       disk->fops = &rbd_bd_ops;
+       disk->private_data = rbd_dev;
+
+       /* init rq */
+       rc = -ENOMEM;
+       q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
+       if (!q)
+               goto out_disk;
+       blk_queue_merge_bvec(q, rbd_merge_bvec);
+       disk->queue = q;
+
+       q->queuedata = rbd_dev;
+
+       rbd_dev->disk = disk;
+       rbd_dev->q = q;
+
+       /* finally, announce the disk to the world */
+       set_capacity(disk, total_size / 512ULL);
+       add_disk(disk);
+
+       pr_info("%s: added with size 0x%llx\n",
+               disk->disk_name, (unsigned long long)total_size);
+       return 0;
+
+out_disk:
+       put_disk(disk);
+out:
+       return rc;
+}
+
+/********************************************************************
+ * /sys/class/rbd/
+ *                   add       map rados objects to blkdev
+ *                   remove    unmap rados objects
+ *                   list      show mappings
+ *******************************************************************/
+
+static void class_rbd_release(struct class *cls)
+{
+       kfree(cls);
+}
+
+static ssize_t class_rbd_list(struct class *c,
+                             struct class_attribute *attr,
+                             char *data)
+{
+       int n = 0;
+       struct list_head *tmp;
+       int max = PAGE_SIZE;
+
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+       n += snprintf(data, max,
+                     "#id\tmajor\tclient_name\tpool\tname\tsnap\tKB\n");
+
+       list_for_each(tmp, &rbd_dev_list) {
+               struct rbd_device *rbd_dev;
+
+               rbd_dev = list_entry(tmp, struct rbd_device, node);
+               n += snprintf(data+n, max-n,
+                             "%d\t%d\tclient%lld\t%s\t%s\t%s\t%lld\n",
+                             rbd_dev->id,
+                             rbd_dev->major,
+                             ceph_client_id(rbd_dev->client),
+                             rbd_dev->pool_name,
+                             rbd_dev->obj, rbd_dev->snap_name,
+                             rbd_dev->header.image_size >> 10);
+               if (n == max)
+                       break;
+       }
+
+       mutex_unlock(&ctl_mutex);
+       return n;
+}
+
+static ssize_t class_rbd_add(struct class *c,
+                            struct class_attribute *attr,
+                            const char *buf, size_t count)
+{
+       struct ceph_osd_client *osdc;
+       struct rbd_device *rbd_dev;
+       ssize_t rc = -ENOMEM;
+       int irc, new_id = 0;
+       struct list_head *tmp;
+       char *mon_dev_name;
+       char *options;
+
+       if (!try_module_get(THIS_MODULE))
+               return -ENODEV;
+
+       mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
+       if (!mon_dev_name)
+               goto err_out_mod;
+
+       options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
+       if (!options)
+               goto err_mon_dev;
+
+       /* new rbd_device object */
+       rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
+       if (!rbd_dev)
+               goto err_out_opt;
+
+       /* static rbd_device initialization */
+       spin_lock_init(&rbd_dev->lock);
+       INIT_LIST_HEAD(&rbd_dev->node);
+
+       /* generate unique id: find highest unique id, add one */
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+       list_for_each(tmp, &rbd_dev_list) {
+               struct rbd_device *rbd_dev;
+
+               rbd_dev = list_entry(tmp, struct rbd_device, node);
+               if (rbd_dev->id >= new_id)
+                       new_id = rbd_dev->id + 1;
+       }
+
+       rbd_dev->id = new_id;
+
+       /* add to global list */
+       list_add_tail(&rbd_dev->node, &rbd_dev_list);
+
+       /* parse add command */
+       if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
+                  "%" __stringify(RBD_MAX_OPT_LEN) "s "
+                  "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
+                  "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
+                  "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+                  mon_dev_name, options, rbd_dev->pool_name,
+                  rbd_dev->obj, rbd_dev->snap_name) < 4) {
+               rc = -EINVAL;
+               goto err_out_slot;
+       }
+
+       if (rbd_dev->snap_name[0] == 0)
+               rbd_dev->snap_name[0] = '-';
+
+       rbd_dev->obj_len = strlen(rbd_dev->obj);
+       snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
+                rbd_dev->obj, RBD_SUFFIX);
+
+       /* initialize rest of new object */
+       snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id);
+       rc = rbd_get_client(rbd_dev, mon_dev_name, options);
+       if (rc < 0)
+               goto err_out_slot;
+
+       mutex_unlock(&ctl_mutex);
+
+       /* pick the pool */
+       osdc = &rbd_dev->client->osdc;
+       rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
+       if (rc < 0)
+               goto err_out_client;
+       rbd_dev->poolid = rc;
+
+       /* register our block device */
+       irc = register_blkdev(0, rbd_dev->name);
+       if (irc < 0) {
+               rc = irc;
+               goto err_out_client;
+       }
+       rbd_dev->major = irc;
+
+       /* set up and announce blkdev mapping */
+       rc = rbd_init_disk(rbd_dev);
+       if (rc)
+               goto err_out_blkdev;
+
+       return count;
+
+err_out_blkdev:
+       unregister_blkdev(rbd_dev->major, rbd_dev->name);
+err_out_client:
+       rbd_put_client(rbd_dev);
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+err_out_slot:
+       list_del_init(&rbd_dev->node);
+       mutex_unlock(&ctl_mutex);
+
+       kfree(rbd_dev);
+err_out_opt:
+       kfree(options);
+err_mon_dev:
+       kfree(mon_dev_name);
+err_out_mod:
+       dout("Error adding device %s\n", buf);
+       module_put(THIS_MODULE);
+       return rc;
+}
+
+static struct rbd_device *__rbd_get_dev(unsigned long id)
+{
+       struct list_head *tmp;
+       struct rbd_device *rbd_dev;
+
+       list_for_each(tmp, &rbd_dev_list) {
+               rbd_dev = list_entry(tmp, struct rbd_device, node);
+               if (rbd_dev->id == id)
+                       return rbd_dev;
+       }
+       return NULL;
+}
+
+static ssize_t class_rbd_remove(struct class *c,
+                               struct class_attribute *attr,
+                               const char *buf,
+                               size_t count)
+{
+       struct rbd_device *rbd_dev = NULL;
+       int target_id, rc;
+       unsigned long ul;
+
+       rc = strict_strtoul(buf, 10, &ul);
+       if (rc)
+               return rc;
+
+       /* convert to int; abort if we lost anything in the conversion */
+       target_id = (int) ul;
+       if (target_id != ul)
+               return -EINVAL;
+
+       /* remove object from list immediately */
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+       rbd_dev = __rbd_get_dev(target_id);
+       if (rbd_dev)
+               list_del_init(&rbd_dev->node);
+
+       mutex_unlock(&ctl_mutex);
+
+       if (!rbd_dev)
+               return -ENOENT;
+
+       rbd_put_client(rbd_dev);
+
+       /* clean up and free blkdev */
+       rbd_free_disk(rbd_dev);
+       unregister_blkdev(rbd_dev->major, rbd_dev->name);
+       kfree(rbd_dev);
+
+       /* release module ref */
+       module_put(THIS_MODULE);
+
+       return count;
+}
+
+static ssize_t class_rbd_snaps_list(struct class *c,
+                             struct class_attribute *attr,
+                             char *data)
+{
+       struct rbd_device *rbd_dev = NULL;
+       struct list_head *tmp;
+       struct rbd_image_header *header;
+       int i, n = 0, max = PAGE_SIZE;
+       int ret;
+
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+       n += snprintf(data, max, "#id\tsnap\tKB\n");
+
+       list_for_each(tmp, &rbd_dev_list) {
+               char *names, *p;
+               struct ceph_snap_context *snapc;
+
+               rbd_dev = list_entry(tmp, struct rbd_device, node);
+               header = &rbd_dev->header;
+
+               down_read(&header->snap_rwsem);
+
+               names = header->snap_names;
+               snapc = header->snapc;
+
+               n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n",
+                             rbd_dev->id, RBD_SNAP_HEAD_NAME,
+                             header->image_size >> 10,
+                             (!rbd_dev->cur_snap ? " (*)" : ""));
+               if (n == max)
+                       break;
+
+               p = names;
+               for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
+                       n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n",
+                             rbd_dev->id, p, header->snap_sizes[i] >> 10,
+                             (rbd_dev->cur_snap &&
+                              (snap_index(header, i) == rbd_dev->cur_snap) ?
+                              " (*)" : ""));
+                       if (n == max)
+                               break;
+               }
+
+               up_read(&header->snap_rwsem);
+       }
+
+
+       ret = n;
+       mutex_unlock(&ctl_mutex);
+       return ret;
+}
+
+static ssize_t class_rbd_snaps_refresh(struct class *c,
+                               struct class_attribute *attr,
+                               const char *buf,
+                               size_t count)
+{
+       struct rbd_device *rbd_dev = NULL;
+       int target_id, rc;
+       unsigned long ul;
+       int ret = count;
+
+       rc = strict_strtoul(buf, 10, &ul);
+       if (rc)
+               return rc;
+
+       /* convert to int; abort if we lost anything in the conversion */
+       target_id = (int) ul;
+       if (target_id != ul)
+               return -EINVAL;
+
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+       rbd_dev = __rbd_get_dev(target_id);
+       if (!rbd_dev) {
+               ret = -ENOENT;
+               goto done;
+       }
+
+       rc = rbd_update_snaps(rbd_dev);
+       if (rc < 0)
+               ret = rc;
+
+done:
+       mutex_unlock(&ctl_mutex);
+       return ret;
+}
+
+static ssize_t class_rbd_snap_create(struct class *c,
+                               struct class_attribute *attr,
+                               const char *buf,
+                               size_t count)
+{
+       struct rbd_device *rbd_dev = NULL;
+       int target_id, ret;
+       char *name;
+
+       name = kmalloc(RBD_MAX_SNAP_NAME_LEN + 1, GFP_KERNEL);
+       if (!name)
+               return -ENOMEM;
+
+       /* parse snaps add command */
+       if (sscanf(buf, "%d "
+                  "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+                  &target_id,
+                  name) != 2) {
+               ret = -EINVAL;
+               goto done;
+       }
+
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+       rbd_dev = __rbd_get_dev(target_id);
+       if (!rbd_dev) {
+               ret = -ENOENT;
+               goto done_unlock;
+       }
+
+       ret = rbd_header_add_snap(rbd_dev,
+                                 name, GFP_KERNEL);
+       if (ret < 0)
+               goto done_unlock;
+
+       ret = rbd_update_snaps(rbd_dev);
+       if (ret < 0)
+               goto done_unlock;
+
+       ret = count;
+done_unlock:
+       mutex_unlock(&ctl_mutex);
+done:
+       kfree(name);
+       return ret;
+}
+
+static ssize_t class_rbd_rollback(struct class *c,
+                               struct class_attribute *attr,
+                               const char *buf,
+                               size_t count)
+{
+       struct rbd_device *rbd_dev = NULL;
+       int target_id, ret;
+       u64 snapid;
+       char snap_name[RBD_MAX_SNAP_NAME_LEN];
+       u64 cur_ofs;
+       char *seg_name;
+
+       /* parse snaps add command */
+       if (sscanf(buf, "%d "
+                  "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+                  &target_id,
+                  snap_name) != 2) {
+               return -EINVAL;
+       }
+
+       ret = -ENOMEM;
+       seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
+       if (!seg_name)
+               return ret;
+
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+       rbd_dev = __rbd_get_dev(target_id);
+       if (!rbd_dev) {
+               ret = -ENOENT;
+               goto done_unlock;
+       }
+
+       ret = snap_by_name(&rbd_dev->header, snap_name, &snapid, NULL);
+       if (ret < 0)
+               goto done_unlock;
+
+       dout("snapid=%lld\n", snapid);
+
+       cur_ofs = 0;
+       while (cur_ofs < rbd_dev->header.image_size) {
+               cur_ofs += rbd_get_segment(&rbd_dev->header,
+                                          rbd_dev->obj,
+                                          cur_ofs, (u64)-1,
+                                          seg_name, NULL);
+               dout("seg_name=%s\n", seg_name);
+
+               ret = rbd_req_sync_rollback_obj(rbd_dev, snapid, seg_name);
+               if (ret < 0)
+                       pr_warning("could not roll back obj %s err=%d\n",
+                                  seg_name, ret);
+       }
+
+       ret = rbd_update_snaps(rbd_dev);
+       if (ret < 0)
+               goto done_unlock;
+
+       ret = count;
+
+done_unlock:
+       mutex_unlock(&ctl_mutex);
+       kfree(seg_name);
+
+       return ret;
+}
+
+static struct class_attribute class_rbd_attrs[] = {
+       __ATTR(add,             0200, NULL, class_rbd_add),
+       __ATTR(remove,          0200, NULL, class_rbd_remove),
+       __ATTR(list,            0444, class_rbd_list, NULL),
+       __ATTR(snaps_refresh,   0200, NULL, class_rbd_snaps_refresh),
+       __ATTR(snap_create,     0200, NULL, class_rbd_snap_create),
+       __ATTR(snaps_list,      0444, class_rbd_snaps_list, NULL),
+       __ATTR(snap_rollback,   0200, NULL, class_rbd_rollback),
+       __ATTR_NULL
+};
+
+/*
+ * create control files in sysfs
+ * /sys/class/rbd/...
+ */
+static int rbd_sysfs_init(void)
+{
+       int ret = -ENOMEM;
+
+       class_rbd = kzalloc(sizeof(*class_rbd), GFP_KERNEL);
+       if (!class_rbd)
+               goto out;
+
+       class_rbd->name = DRV_NAME;
+       class_rbd->owner = THIS_MODULE;
+       class_rbd->class_release = class_rbd_release;
+       class_rbd->class_attrs = class_rbd_attrs;
+
+       ret = class_register(class_rbd);
+       if (ret)
+               goto out_class;
+       return 0;
+
+out_class:
+       kfree(class_rbd);
+       class_rbd = NULL;
+       pr_err(DRV_NAME ": failed to create class rbd\n");
+out:
+       return ret;
+}
+
+static void rbd_sysfs_cleanup(void)
+{
+       if (class_rbd)
+               class_destroy(class_rbd);
+       class_rbd = NULL;
+}
+
+int __init rbd_init(void)
+{
+       int rc;
+
+       rc = rbd_sysfs_init();
+       if (rc)
+               return rc;
+       spin_lock_init(&node_lock);
+       pr_info("loaded " DRV_NAME_LONG "\n");
+       return 0;
+}
+
+void __exit rbd_exit(void)
+{
+       rbd_sysfs_cleanup();
+}
+
+module_init(rbd_init);
+module_exit(rbd_exit);
+
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+MODULE_DESCRIPTION("rados block device");
+
+/* following authorship retained from original osdblk.c */
+MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
new file mode 100644 (file)
index 0000000..fc6c678
--- /dev/null
@@ -0,0 +1,73 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2010 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RBD_TYPES_H
+#define CEPH_RBD_TYPES_H
+
+#include <linux/types.h>
+
+/*
+ * rbd image 'foo' consists of objects
+ *   foo.rbd      - image metadata
+ *   foo.00000000
+ *   foo.00000001
+ *   ...          - data
+ */
+
+#define RBD_SUFFIX             ".rbd"
+#define RBD_DIRECTORY           "rbd_directory"
+#define RBD_INFO                "rbd_info"
+
+#define RBD_DEFAULT_OBJ_ORDER  22   /* 4MB */
+#define RBD_MIN_OBJ_ORDER       16
+#define RBD_MAX_OBJ_ORDER       30
+
+#define RBD_MAX_OBJ_NAME_LEN   96
+#define RBD_MAX_SEG_NAME_LEN   128
+
+#define RBD_COMP_NONE          0
+#define RBD_CRYPT_NONE         0
+
+#define RBD_HEADER_TEXT                "<<< Rados Block Device Image >>>\n"
+#define RBD_HEADER_SIGNATURE   "RBD"
+#define RBD_HEADER_VERSION     "001.005"
+
+struct rbd_info {
+       __le64 max_id;
+} __attribute__ ((packed));
+
+struct rbd_image_snap_ondisk {
+       __le64 id;
+       __le64 image_size;
+} __attribute__((packed));
+
+struct rbd_image_header_ondisk {
+       char text[40];
+       char block_name[24];
+       char signature[4];
+       char version[8];
+       struct {
+               __u8 order;
+               __u8 crypt_type;
+               __u8 comp_type;
+               __u8 unused;
+       } __attribute__((packed)) options;
+       __le64 image_size;
+       __le64 snap_seq;
+       __le32 snap_count;
+       __le32 reserved;
+       __le64 snap_names_len;
+       struct rbd_image_snap_ondisk snaps[0];
+} __attribute__((packed));
+
+
+#endif
index 2aafafc..8320490 100644 (file)
@@ -2,7 +2,6 @@
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
-#include <linux/smp_lock.h>
 #include <linux/hdreg.h>
 #include <linux/virtio.h>
 #include <linux/virtio_blk.h>
@@ -202,6 +201,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
        struct virtio_blk *vblk = disk->private_data;
        struct request *req;
        struct bio *bio;
+       int err;
 
        bio = bio_map_kern(vblk->disk->queue, id_str, VIRTIO_BLK_ID_BYTES,
                           GFP_KERNEL);
@@ -215,11 +215,14 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
        }
 
        req->cmd_type = REQ_TYPE_SPECIAL;
-       return blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
+       err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
+       blk_put_request(req);
+
+       return err;
 }
 
-static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
-                        unsigned cmd, unsigned long data)
+static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
+                            unsigned int cmd, unsigned long data)
 {
        struct gendisk *disk = bdev->bd_disk;
        struct virtio_blk *vblk = disk->private_data;
@@ -234,18 +237,6 @@ static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
                              (void __user *)data);
 }
 
-static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
-                            unsigned int cmd, unsigned long param)
-{
-       int ret;
-
-       lock_kernel();
-       ret = virtblk_locked_ioctl(bdev, mode, cmd, param);
-       unlock_kernel();
-
-       return ret;
-}
-
 /* We provide getgeo only to please some old bootloader/partitioning tools */
 static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
 {
index 70312da..564808a 100644 (file)
@@ -199,7 +199,7 @@ static void amd64_cleanup(void)
                struct pci_dev *dev = k8_northbridges[i];
                /* disable gart translation */
                pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp);
-               tmp &= ~AMD64_GARTEN;
+               tmp &= ~GARTEN;
                pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, tmp);
        }
 }
@@ -313,7 +313,7 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp,
        if (order < 0 || !agp_aperture_valid(aper, (32*1024*1024)<<order))
                return -1;
 
-       pci_write_config_dword(nb, AMD64_GARTAPERTURECTL, order << 1);
+       gart_set_size_and_enable(nb, order);
        pci_write_config_dword(nb, AMD64_GARTAPERTUREBASE, aper >> 25);
 
        return 0;
index d2abf51..64255ce 100644 (file)
@@ -984,7 +984,9 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge)
 
        bridge->driver->cache_flush();
 #ifdef CONFIG_X86
-       set_memory_uc((unsigned long)table, 1 << page_order);
+       if (set_memory_uc((unsigned long)table, 1 << page_order))
+               printk(KERN_WARNING "Could not set GATT table memory to UC!");
+
        bridge->gatt_table = (void *)table;
 #else
        bridge->gatt_table = ioremap_nocache(virt_to_phys(table),
index 05ad4a1..7c41335 100644 (file)
@@ -47,6 +47,16 @@ enum tpm_duration {
 #define TPM_MAX_PROTECTED_ORDINAL 12
 #define TPM_PROTECTED_ORDINAL_MASK 0xFF
 
+/*
+ * Bug workaround - some TPM's don't flush the most
+ * recently changed pcr on suspend, so force the flush
+ * with an extend to the selected _unused_ non-volatile pcr.
+ */
+static int tpm_suspend_pcr;
+module_param_named(suspend_pcr, tpm_suspend_pcr, uint, 0644);
+MODULE_PARM_DESC(suspend_pcr,
+                "PCR to use for dummy writes to faciltate flush on suspend.");
+
 static LIST_HEAD(tpm_chip_list);
 static DEFINE_SPINLOCK(driver_lock);
 static DECLARE_BITMAP(dev_mask, TPM_NUM_DEVICES);
@@ -1077,18 +1087,6 @@ static struct tpm_input_header savestate_header = {
        .ordinal = TPM_ORD_SAVESTATE
 };
 
-/* Bug workaround - some TPM's don't flush the most
- * recently changed pcr on suspend, so force the flush
- * with an extend to the selected _unused_ non-volatile pcr.
- */
-static int tpm_suspend_pcr;
-static int __init tpm_suspend_setup(char *str)
-{
-       get_option(&str, &tpm_suspend_pcr);
-       return 1;
-}
-__setup("tpm_suspend_pcr=", tpm_suspend_setup);
-
 /*
  * We are about to suspend. Save the TPM state
  * so that it can be restored.
index c810481..6c1b676 100644 (file)
@@ -48,6 +48,9 @@ struct ports_driver_data {
        /* Used for exporting per-port information to debugfs */
        struct dentry *debugfs_dir;
 
+       /* List of all the devices we're handling */
+       struct list_head portdevs;
+
        /* Number of devices this driver is handling */
        unsigned int index;
 
@@ -108,6 +111,9 @@ struct port_buffer {
  * ports for that device (vdev->priv).
  */
 struct ports_device {
+       /* Next portdev in the list, head is in the pdrvdata struct */
+       struct list_head list;
+
        /*
         * Workqueue handlers where we process deferred work after
         * notification
@@ -178,15 +184,21 @@ struct port {
        struct console cons;
 
        /* Each port associates with a separate char device */
-       struct cdev cdev;
+       struct cdev *cdev;
        struct device *dev;
 
+       /* Reference-counting to handle port hot-unplugs and file operations */
+       struct kref kref;
+
        /* A waitqueue for poll() or blocking read operations */
        wait_queue_head_t waitqueue;
 
        /* The 'name' of the port that we expose via sysfs properties */
        char *name;
 
+       /* We can notify apps of host connect / disconnect events via SIGIO */
+       struct fasync_struct *async_queue;
+
        /* The 'id' to identify the port with the Host */
        u32 id;
 
@@ -221,6 +233,41 @@ out:
        return port;
 }
 
+static struct port *find_port_by_devt_in_portdev(struct ports_device *portdev,
+                                                dev_t dev)
+{
+       struct port *port;
+       unsigned long flags;
+
+       spin_lock_irqsave(&portdev->ports_lock, flags);
+       list_for_each_entry(port, &portdev->ports, list)
+               if (port->cdev->dev == dev)
+                       goto out;
+       port = NULL;
+out:
+       spin_unlock_irqrestore(&portdev->ports_lock, flags);
+
+       return port;
+}
+
+static struct port *find_port_by_devt(dev_t dev)
+{
+       struct ports_device *portdev;
+       struct port *port;
+       unsigned long flags;
+
+       spin_lock_irqsave(&pdrvdata_lock, flags);
+       list_for_each_entry(portdev, &pdrvdata.portdevs, list) {
+               port = find_port_by_devt_in_portdev(portdev, dev);
+               if (port)
+                       goto out;
+       }
+       port = NULL;
+out:
+       spin_unlock_irqrestore(&pdrvdata_lock, flags);
+       return port;
+}
+
 static struct port *find_port_by_id(struct ports_device *portdev, u32 id)
 {
        struct port *port;
@@ -410,7 +457,10 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id,
 static ssize_t send_control_msg(struct port *port, unsigned int event,
                                unsigned int value)
 {
-       return __send_control_msg(port->portdev, port->id, event, value);
+       /* Did the port get unplugged before userspace closed it? */
+       if (port->portdev)
+               return __send_control_msg(port->portdev, port->id, event, value);
+       return 0;
 }
 
 /* Callers must take the port->outvq_lock */
@@ -459,9 +509,12 @@ static ssize_t send_buf(struct port *port, void *in_buf, size_t in_count,
 
        /*
         * Wait till the host acknowledges it pushed out the data we
-        * sent.  This is done for ports in blocking mode or for data
-        * from the hvc_console; the tty operations are performed with
-        * spinlocks held so we can't sleep here.
+        * sent.  This is done for data from the hvc_console; the tty
+        * operations are performed with spinlocks held so we can't
+        * sleep here.  An alternative would be to copy the data to a
+        * buffer and relax the spinning requirement.  The downside is
+        * we need to kmalloc a GFP_ATOMIC buffer each time the
+        * console driver writes something out.
         */
        while (!virtqueue_get_buf(out_vq, &len))
                cpu_relax();
@@ -522,6 +575,10 @@ static ssize_t fill_readbuf(struct port *port, char *out_buf, size_t out_count,
 /* The condition that must be true for polling to end */
 static bool will_read_block(struct port *port)
 {
+       if (!port->guest_connected) {
+               /* Port got hot-unplugged. Let's exit. */
+               return false;
+       }
        return !port_has_data(port) && port->host_connected;
 }
 
@@ -572,6 +629,9 @@ static ssize_t port_fops_read(struct file *filp, char __user *ubuf,
                if (ret < 0)
                        return ret;
        }
+       /* Port got hot-unplugged. */
+       if (!port->guest_connected)
+               return -ENODEV;
        /*
         * We could've received a disconnection message while we were
         * waiting for more data.
@@ -613,6 +673,9 @@ static ssize_t port_fops_write(struct file *filp, const char __user *ubuf,
                if (ret < 0)
                        return ret;
        }
+       /* Port got hot-unplugged. */
+       if (!port->guest_connected)
+               return -ENODEV;
 
        count = min((size_t)(32 * 1024), count);
 
@@ -626,6 +689,14 @@ static ssize_t port_fops_write(struct file *filp, const char __user *ubuf,
                goto free_buf;
        }
 
+       /*
+        * We now ask send_buf() to not spin for generic ports -- we
+        * can re-use the same code path that non-blocking file
+        * descriptors take for blocking file descriptors since the
+        * wait is already done and we're certain the write will go
+        * through to the host.
+        */
+       nonblock = true;
        ret = send_buf(port, buf, count, nonblock);
 
        if (nonblock && ret > 0)
@@ -645,6 +716,10 @@ static unsigned int port_fops_poll(struct file *filp, poll_table *wait)
        port = filp->private_data;
        poll_wait(filp, &port->waitqueue, wait);
 
+       if (!port->guest_connected) {
+               /* Port got unplugged */
+               return POLLHUP;
+       }
        ret = 0;
        if (!will_read_block(port))
                ret |= POLLIN | POLLRDNORM;
@@ -656,6 +731,8 @@ static unsigned int port_fops_poll(struct file *filp, poll_table *wait)
        return ret;
 }
 
+static void remove_port(struct kref *kref);
+
 static int port_fops_release(struct inode *inode, struct file *filp)
 {
        struct port *port;
@@ -676,6 +753,16 @@ static int port_fops_release(struct inode *inode, struct file *filp)
        reclaim_consumed_buffers(port);
        spin_unlock_irq(&port->outvq_lock);
 
+       /*
+        * Locks aren't necessary here as a port can't be opened after
+        * unplug, and if a port isn't unplugged, a kref would already
+        * exist for the port.  Plus, taking ports_lock here would
+        * create a dependency on other locks taken by functions
+        * inside remove_port if we're the last holder of the port,
+        * creating many problems.
+        */
+       kref_put(&port->kref, remove_port);
+
        return 0;
 }
 
@@ -683,22 +770,31 @@ static int port_fops_open(struct inode *inode, struct file *filp)
 {
        struct cdev *cdev = inode->i_cdev;
        struct port *port;
+       int ret;
 
-       port = container_of(cdev, struct port, cdev);
+       port = find_port_by_devt(cdev->dev);
        filp->private_data = port;
 
+       /* Prevent against a port getting hot-unplugged at the same time */
+       spin_lock_irq(&port->portdev->ports_lock);
+       kref_get(&port->kref);
+       spin_unlock_irq(&port->portdev->ports_lock);
+
        /*
         * Don't allow opening of console port devices -- that's done
         * via /dev/hvc
         */
-       if (is_console_port(port))
-               return -ENXIO;
+       if (is_console_port(port)) {
+               ret = -ENXIO;
+               goto out;
+       }
 
        /* Allow only one process to open a particular port at a time */
        spin_lock_irq(&port->inbuf_lock);
        if (port->guest_connected) {
                spin_unlock_irq(&port->inbuf_lock);
-               return -EMFILE;
+               ret = -EMFILE;
+               goto out;
        }
 
        port->guest_connected = true;
@@ -713,10 +809,23 @@ static int port_fops_open(struct inode *inode, struct file *filp)
        reclaim_consumed_buffers(port);
        spin_unlock_irq(&port->outvq_lock);
 
+       nonseekable_open(inode, filp);
+
        /* Notify host of port being opened */
        send_control_msg(filp->private_data, VIRTIO_CONSOLE_PORT_OPEN, 1);
 
        return 0;
+out:
+       kref_put(&port->kref, remove_port);
+       return ret;
+}
+
+static int port_fops_fasync(int fd, struct file *filp, int mode)
+{
+       struct port *port;
+
+       port = filp->private_data;
+       return fasync_helper(fd, filp, mode, &port->async_queue);
 }
 
 /*
@@ -732,6 +841,8 @@ static const struct file_operations port_fops = {
        .write = port_fops_write,
        .poll  = port_fops_poll,
        .release = port_fops_release,
+       .fasync = port_fops_fasync,
+       .llseek = no_llseek,
 };
 
 /*
@@ -990,6 +1101,12 @@ static unsigned int fill_queue(struct virtqueue *vq, spinlock_t *lock)
        return nr_added_bufs;
 }
 
+static void send_sigio_to_port(struct port *port)
+{
+       if (port->async_queue && port->guest_connected)
+               kill_fasync(&port->async_queue, SIGIO, POLL_OUT);
+}