Merge branch 'upstream' of git://ftp.linux-mips.org/pub/scm/upstream-linus
Linus Torvalds [Tue, 20 Jun 2006 02:07:12 +0000 (19:07 -0700)]
* 'upstream' of git://ftp.linux-mips.org/pub/scm/upstream-linus: (51 commits)
  [MIPS] Make timer interrupt frequency configurable from kconfig.
  [MIPS] Correct HAL2 Kconfig description
  [MIPS] Fix R4K cache macro names
  [MIPS] Add Missing R4K Cache Macros to IP27 & IP32
  [MIPS] Support for the RM9000-based Basler eXcite smart camera platform.
  [MIPS] Support for the R5500-based NEC EMMA2RH Mark-eins board
  [MIPS] Support SNI RM200C in big endian mode and R5000 processors.
  [MIPS] SN: include asm/sn/types.h for nasid_t.
  [MIPS] Random fixes for sb1250
  [MIPS] Fix bcm1480 compile
  [MIPS] Remove support for NEC DDB5476.
  [MIPS] Remove support for NEC DDB5074.
  [MIPS] Cleanup memory management initialization.
  [MIPS] SN: Declare bridge_pci_ops.
  [MIPS] Remove unused function alloc_pci_controller.
  [MIPS] IP27: Extract pci_ops into separate file.
  [MIPS] IP27: Use symbolic constants instead of magic numbers.
  [MIPS] vr41xx: remove unnecessary items from vr41xx/Kconfig.
  [MIPS] IP27: Cleanup N/M mode configuration.
  [MIPS] IP27: Throw away old unused hacks.
  ...

420 files changed:
Documentation/infiniband/ipoib.txt
Documentation/kernel-parameters.txt
Documentation/networking/README.ipw2200
Documentation/networking/bonding.txt
Documentation/networking/ip-sysctl.txt
Documentation/networking/netdevices.txt
MAINTAINERS
arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
arch/i386/kernel/cpu/cpufreq/cpufreq-nforce2.c
arch/i386/kernel/cpu/cpufreq/longhaul.c
arch/i386/kernel/cpu/cpufreq/longrun.c
arch/i386/kernel/cpu/cpufreq/powernow-k7.c
arch/i386/kernel/cpu/cpufreq/powernow-k8.c
arch/i386/kernel/cpu/cpufreq/powernow-k8.h
arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c
drivers/Kconfig
drivers/Makefile
drivers/acpi/pci_link.c
drivers/block/aoe/aoenet.c
drivers/char/agp/alpha-agp.c
drivers/char/agp/generic.c
drivers/char/agp/intel-agp.c
drivers/char/agp/uninorth-agp.c
drivers/connector/cn_queue.c
drivers/cpufreq/cpufreq.c
drivers/cpufreq/cpufreq_stats.c
drivers/cpufreq/freq_table.c
drivers/dma/Kconfig [new file with mode: 0644]
drivers/dma/Makefile [new file with mode: 0644]
drivers/dma/dmaengine.c [new file with mode: 0644]
drivers/dma/ioatdma.c [new file with mode: 0644]
drivers/dma/ioatdma.h [new file with mode: 0644]
drivers/dma/ioatdma_hw.h [new file with mode: 0644]
drivers/dma/ioatdma_io.h [new file with mode: 0644]
drivers/dma/ioatdma_registers.h [new file with mode: 0644]
drivers/dma/iovlock.c [new file with mode: 0644]
drivers/infiniband/Kconfig
drivers/infiniband/core/Makefile
drivers/infiniband/core/addr.c [new file with mode: 0644]
drivers/infiniband/core/cache.c
drivers/infiniband/core/cm.c
drivers/infiniband/core/cma.c [new file with mode: 0644]
drivers/infiniband/core/fmr_pool.c
drivers/infiniband/core/mad.c
drivers/infiniband/core/mad_priv.h
drivers/infiniband/core/sa_query.c
drivers/infiniband/core/ucm.c
drivers/infiniband/core/uverbs.h
drivers/infiniband/core/uverbs_cmd.c
drivers/infiniband/core/uverbs_main.c
drivers/infiniband/core/uverbs_marshall.c [new file with mode: 0644]
drivers/infiniband/core/verbs.c
drivers/infiniband/hw/ipath/ipath_mad.c
drivers/infiniband/hw/mthca/mthca_cmd.c
drivers/infiniband/hw/mthca/mthca_cq.c
drivers/infiniband/hw/mthca/mthca_eq.c
drivers/infiniband/hw/mthca/mthca_mad.c
drivers/infiniband/hw/mthca/mthca_provider.c
drivers/infiniband/hw/mthca/mthca_provider.h
drivers/infiniband/hw/mthca/mthca_qp.c
drivers/infiniband/hw/mthca/mthca_reset.c
drivers/infiniband/hw/mthca/mthca_srq.c
drivers/infiniband/ulp/ipoib/ipoib.h
drivers/infiniband/ulp/ipoib/ipoib_ib.c
drivers/infiniband/ulp/ipoib/ipoib_main.c
drivers/infiniband/ulp/ipoib/ipoib_multicast.c
drivers/infiniband/ulp/ipoib/ipoib_verbs.c
drivers/infiniband/ulp/srp/ib_srp.c
drivers/infiniband/ulp/srp/ib_srp.h
drivers/media/dvb/dvb-core/dvb_net.c
drivers/net/3c501.c
drivers/net/3c503.c
drivers/net/3c505.c
drivers/net/3c507.c
drivers/net/3c523.c
drivers/net/3c527.c
drivers/net/8139cp.c
drivers/net/8139too.c
drivers/net/Kconfig
drivers/net/Makefile
drivers/net/au1000_eth.c
drivers/net/au1000_eth.h
drivers/net/bnx2.c
drivers/net/bnx2.h
drivers/net/bnx2_fw.h
drivers/net/bonding/bond_main.c
drivers/net/cassini.c
drivers/net/e100.c
drivers/net/e1000/Makefile
drivers/net/e1000/e1000.h
drivers/net/e1000/e1000_ethtool.c
drivers/net/e1000/e1000_hw.c
drivers/net/e1000/e1000_hw.h
drivers/net/e1000/e1000_main.c
drivers/net/e1000/e1000_osdep.h
drivers/net/e1000/e1000_param.c
drivers/net/epic100.c
drivers/net/forcedeth.c
drivers/net/hamradio/6pack.c
drivers/net/hamradio/mkiss.c
drivers/net/hp-plus.c
drivers/net/hp.c
drivers/net/ibmlana.c
drivers/net/ibmlana.h
drivers/net/ibmveth.c
drivers/net/ibmveth.h
drivers/net/ifb.c
drivers/net/irda/Kconfig
drivers/net/irda/Makefile
drivers/net/irda/ali-ircc.c
drivers/net/irda/irda-usb.c
drivers/net/irda/irda-usb.h
drivers/net/irda/mcs7780.c [new file with mode: 0644]
drivers/net/irda/mcs7780.h [new file with mode: 0644]
drivers/net/irda/stir4200.c
drivers/net/irda/vlsi_ir.c
drivers/net/ixgb/Makefile
drivers/net/ixgb/ixgb.h
drivers/net/ixgb/ixgb_ee.c
drivers/net/ixgb/ixgb_ee.h
drivers/net/ixgb/ixgb_ethtool.c
drivers/net/ixgb/ixgb_hw.c
drivers/net/ixgb/ixgb_hw.h
drivers/net/ixgb/ixgb_ids.h
drivers/net/ixgb/ixgb_main.c
drivers/net/ixgb/ixgb_osdep.h
drivers/net/ixgb/ixgb_param.c
drivers/net/mv643xx_eth.c
drivers/net/myri10ge/Makefile [new file with mode: 0644]
drivers/net/myri10ge/myri10ge.c [new file with mode: 0644]
drivers/net/myri10ge/myri10ge_mcp.h [new file with mode: 0644]
drivers/net/myri10ge/myri10ge_mcp_gen_header.h [new file with mode: 0644]
drivers/net/natsemi.c
drivers/net/ne.c
drivers/net/ne2.c
drivers/net/pcmcia/pcnet_cs.c
drivers/net/phy/Kconfig
drivers/net/phy/Makefile
drivers/net/phy/smsc.c [new file with mode: 0644]
drivers/net/ppp_generic.c
drivers/net/r8169.c
drivers/net/s2io-regs.h
drivers/net/s2io.c
drivers/net/s2io.h
drivers/net/sis900.c
drivers/net/sis900.h
drivers/net/skge.c
drivers/net/skge.h
drivers/net/smc-ultra.c
drivers/net/smc-ultra32.c
drivers/net/smc911x.c [new file with mode: 0644]
drivers/net/smc911x.h [new file with mode: 0644]
drivers/net/smc9194.c
drivers/net/smc91x.h
drivers/net/sungem_phy.c
drivers/net/tg3.c
drivers/net/tg3.h
drivers/net/tulip/de2104x.c
drivers/net/tulip/de4x5.c
drivers/net/tulip/de4x5.h
drivers/net/tulip/dmfe.c
drivers/net/tulip/eeprom.c
drivers/net/tulip/interrupt.c
drivers/net/tulip/media.c
drivers/net/tulip/tulip.h
drivers/net/tulip/tulip_core.c
drivers/net/tulip/uli526x.c
drivers/net/tulip/winbond-840.c
drivers/net/tulip/xircom_cb.c
drivers/net/via-velocity.c
drivers/net/via-velocity.h
drivers/net/wan/pci200syn.c
drivers/net/wireless/Kconfig
drivers/net/wireless/Makefile
drivers/net/wireless/airo.c
drivers/net/wireless/bcm43xx/bcm43xx.h
drivers/net/wireless/bcm43xx/bcm43xx_debugfs.c
drivers/net/wireless/bcm43xx/bcm43xx_main.c
drivers/net/wireless/hermes.c
drivers/net/wireless/hermes.h
drivers/net/wireless/hostap/hostap_80211_tx.c
drivers/net/wireless/hostap/hostap_ap.c
drivers/net/wireless/hostap/hostap_cs.c
drivers/net/wireless/hostap/hostap_main.c
drivers/net/wireless/ipw2200.c
drivers/net/wireless/ipw2200.h
drivers/net/wireless/orinoco.c
drivers/net/wireless/orinoco.h
drivers/net/wireless/orinoco_cs.c
drivers/net/wireless/orinoco_nortel.c
drivers/net/wireless/orinoco_pci.c
drivers/net/wireless/orinoco_pci.h [new file with mode: 0644]
drivers/net/wireless/orinoco_plx.c
drivers/net/wireless/orinoco_tmd.c
drivers/net/wireless/spectrum_cs.c
drivers/net/wireless/zd1201.c [moved from drivers/usb/net/zd1201.c with 97% similarity]
drivers/net/wireless/zd1201.h [moved from drivers/usb/net/zd1201.h with 100% similarity]
drivers/pci/pci.c
drivers/s390/net/Makefile
drivers/s390/net/ctcmain.c
drivers/s390/net/ctcmain.h
drivers/s390/net/ctctty.c [deleted file]
drivers/s390/net/ctctty.h [deleted file]
drivers/scsi/libata-core.c
drivers/usb/net/Kconfig
drivers/usb/net/Makefile
drivers/video/Kconfig
drivers/video/intelfb/intelfb.h
drivers/video/intelfb/intelfbdrv.c
drivers/video/intelfb/intelfbhw.c
drivers/video/intelfb/intelfbhw.h
include/linux/console.h
include/linux/dmaengine.h [new file with mode: 0644]
include/linux/igmp.h
include/linux/netdevice.h
include/linux/netfilter/nf_conntrack_common.h
include/linux/netfilter/nfnetlink_conntrack.h
include/linux/netfilter/xt_CONNSECMARK.h [new file with mode: 0644]
include/linux/netfilter/xt_SECMARK.h [new file with mode: 0644]
include/linux/netfilter/xt_quota.h [new file with mode: 0644]
include/linux/netfilter/xt_statistic.h [new file with mode: 0644]
include/linux/netfilter_ipv4/ip_conntrack.h
include/linux/netfilter_ipv4/ip_conntrack_h323.h
include/linux/netfilter_ipv4/ip_conntrack_helper_h323_types.h
include/linux/netfilter_ipv4/ip_conntrack_sip.h [new file with mode: 0644]
include/linux/pci.h
include/linux/pci_ids.h
include/linux/pfkeyv2.h
include/linux/security.h
include/linux/selinux.h
include/linux/skbuff.h
include/linux/sysctl.h
include/linux/tcp.h
include/linux/xfrm.h
include/net/ieee80211.h
include/net/ieee80211softmac.h
include/net/ieee80211softmac_wx.h
include/net/ip.h
include/net/llc_if.h
include/net/netdma.h [new file with mode: 0644]
include/net/netfilter/nf_conntrack.h
include/net/netfilter/nf_conntrack_compat.h
include/net/raw.h
include/net/sctp/sctp.h
include/net/sctp/structs.h
include/net/sock.h
include/net/tcp.h
include/net/xfrm.h
include/rdma/ib_addr.h [new file with mode: 0644]
include/rdma/ib_cache.h
include/rdma/ib_cm.h
include/rdma/ib_marshall.h [new file with mode: 0644]
include/rdma/ib_sa.h
include/rdma/ib_smi.h
include/rdma/ib_user_cm.h
include/rdma/ib_user_sa.h [new file with mode: 0644]
include/rdma/ib_user_verbs.h
include/rdma/ib_verbs.h
include/rdma/rdma_cm.h [new file with mode: 0644]
include/rdma/rdma_cm_ib.h [new file with mode: 0644]
include/scsi/srp.h
kernel/power/main.c
kernel/printk.c
net/Kconfig
net/atm/clip.c
net/bridge/Makefile
net/bridge/br.c
net/bridge/br_device.c
net/bridge/br_forward.c
net/bridge/br_if.c
net/bridge/br_netfilter.c
net/bridge/br_netlink.c [new file with mode: 0644]
net/bridge/br_notify.c
net/bridge/br_private.h
net/bridge/br_stp_if.c
net/core/Makefile
net/core/dev.c
net/core/dev_mcast.c
net/core/ethtool.c
net/core/netpoll.c
net/core/pktgen.c
net/core/skbuff.c
net/core/sock.c
net/core/user_dma.c [new file with mode: 0644]
net/dccp/proto.c
net/decnet/dn_nsp_in.c
net/decnet/dn_route.c
net/ieee80211/ieee80211_crypt_tkip.c
net/ieee80211/ieee80211_rx.c
net/ieee80211/ieee80211_tx.c
net/ieee80211/ieee80211_wx.c
net/ieee80211/softmac/Kconfig
net/ieee80211/softmac/ieee80211softmac_assoc.c
net/ieee80211/softmac/ieee80211softmac_auth.c
net/ieee80211/softmac/ieee80211softmac_event.c
net/ieee80211/softmac/ieee80211softmac_io.c
net/ieee80211/softmac/ieee80211softmac_module.c
net/ieee80211/softmac/ieee80211softmac_priv.h
net/ieee80211/softmac/ieee80211softmac_wx.c
net/ipv4/Kconfig
net/ipv4/Makefile
net/ipv4/ah4.c
net/ipv4/esp4.c
net/ipv4/fib_frontend.c
net/ipv4/icmp.c
net/ipv4/igmp.c
net/ipv4/ip_output.c
net/ipv4/ipcomp.c
net/ipv4/netfilter/Kconfig
net/ipv4/netfilter/Makefile
net/ipv4/netfilter/ip_conntrack_amanda.c
net/ipv4/netfilter/ip_conntrack_core.c
net/ipv4/netfilter/ip_conntrack_ftp.c
net/ipv4/netfilter/ip_conntrack_helper_h323.c
net/ipv4/netfilter/ip_conntrack_helper_h323_types.c
net/ipv4/netfilter/ip_conntrack_netlink.c
net/ipv4/netfilter/ip_conntrack_proto_gre.c
net/ipv4/netfilter/ip_conntrack_proto_icmp.c
net/ipv4/netfilter/ip_conntrack_proto_tcp.c
net/ipv4/netfilter/ip_conntrack_proto_udp.c
net/ipv4/netfilter/ip_conntrack_sip.c [new file with mode: 0644]
net/ipv4/netfilter/ip_conntrack_standalone.c
net/ipv4/netfilter/ip_nat_helper_h323.c
net/ipv4/netfilter/ip_nat_sip.c [new file with mode: 0644]
net/ipv4/netfilter/ip_nat_snmp_basic.c
net/ipv4/netfilter/ipt_CLUSTERIP.c
net/ipv4/netfilter/ipt_REJECT.c
net/ipv4/netfilter/ipt_hashlimit.c
net/ipv4/netfilter/ipt_recent.c
net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
net/ipv4/netfilter/nf_conntrack_proto_icmp.c
net/ipv4/raw.c
net/ipv4/sysctl_net_ipv4.c
net/ipv4/tcp.c
net/ipv4/tcp_bic.c
net/ipv4/tcp_compound.c [new file with mode: 0644]
net/ipv4/tcp_cong.c
net/ipv4/tcp_cubic.c
net/ipv4/tcp_highspeed.c
net/ipv4/tcp_htcp.c
net/ipv4/tcp_input.c
net/ipv4/tcp_ipv4.c
net/ipv4/tcp_lp.c [new file with mode: 0644]
net/ipv4/tcp_output.c
net/ipv4/tcp_probe.c [new file with mode: 0644]
net/ipv4/tcp_veno.c [new file with mode: 0644]
net/ipv4/tcp_westwood.c
net/ipv4/xfrm4_input.c
net/ipv4/xfrm4_mode_transport.c [new file with mode: 0644]
net/ipv4/xfrm4_mode_tunnel.c [new file with mode: 0644]
net/ipv4/xfrm4_output.c
net/ipv4/xfrm4_policy.c
net/ipv4/xfrm4_state.c
net/ipv6/Kconfig
net/ipv6/Makefile
net/ipv6/addrconf.c
net/ipv6/ah6.c
net/ipv6/esp6.c
net/ipv6/ip6_output.c
net/ipv6/ipcomp6.c
net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
net/ipv6/netfilter/nf_conntrack_reasm.c
net/ipv6/tcp_ipv6.c
net/ipv6/xfrm6_input.c
net/ipv6/xfrm6_mode_transport.c [new file with mode: 0644]
net/ipv6/xfrm6_mode_tunnel.c [new file with mode: 0644]
net/ipv6/xfrm6_output.c
net/ipv6/xfrm6_policy.c
net/ipv6/xfrm6_state.c
net/ipx/ipx_route.c
net/irda/irlmp.c
net/key/af_key.c
net/llc/af_llc.c
net/llc/llc_if.c
net/llc/llc_input.c
net/llc/llc_sap.c
net/netfilter/Kconfig
net/netfilter/Makefile
net/netfilter/nf_conntrack_core.c
net/netfilter/nf_conntrack_ftp.c
net/netfilter/nf_conntrack_netlink.c
net/netfilter/nf_conntrack_proto_tcp.c
net/netfilter/nf_conntrack_proto_udp.c
net/netfilter/nf_conntrack_standalone.c
net/netfilter/xt_CONNSECMARK.c [new file with mode: 0644]
net/netfilter/xt_SECMARK.c [new file with mode: 0644]
net/netfilter/xt_connmark.c
net/netfilter/xt_dccp.c
net/netfilter/xt_mark.c
net/netfilter/xt_multiport.c
net/netfilter/xt_quota.c [new file with mode: 0644]
net/netfilter/xt_sctp.c
net/netfilter/xt_statistic.c [new file with mode: 0644]
net/netfilter/xt_string.c
net/sched/sch_generic.c
net/sched/sch_teql.c
net/sctp/input.c
net/sctp/ipv6.c
net/sctp/output.c
net/sctp/outqueue.c
net/sctp/protocol.c
net/sctp/sm_statefuns.c
net/sctp/socket.c
net/sctp/ulpevent.c
net/xfrm/xfrm_policy.c
net/xfrm/xfrm_state.c
net/xfrm/xfrm_user.c
security/dummy.c
security/selinux/Kconfig
security/selinux/exports.c
security/selinux/hooks.c
security/selinux/include/av_inherit.h
security/selinux/include/av_perm_to_string.h
security/selinux/include/av_permissions.h
security/selinux/include/class_to_string.h
security/selinux/include/flask.h
security/selinux/include/xfrm.h
security/selinux/selinuxfs.c
security/selinux/xfrm.c

index 5c5a4cc..1870355 100644
@@ -1,10 +1,10 @@
 IP OVER INFINIBAND
 
   The ib_ipoib driver is an implementation of the IP over InfiniBand
-  protocol as specified by the latest Internet-Drafts issued by the
-  IETF ipoib working group.  It is a "native" implementation in the
-  sense of setting the interface type to ARPHRD_INFINIBAND and the
-  hardware address length to 20 (earlier proprietary implementations
+  protocol as specified by RFCs 4391 and 4392, issued by the IETF ipoib
+  working group.  It is a "native" implementation in the sense of
+  setting the interface type to ARPHRD_INFINIBAND and the hardware
+  address length to 20 (earlier proprietary implementations
   masqueraded to the kernel as ethernet interfaces).
 
 Partitions and P_Keys
@@ -53,3 +53,7 @@ References
 
   IETF IP over InfiniBand (ipoib) Working Group
     http://ietf.org/html.charters/ipoib-charter.html
+  Transmission of IP over InfiniBand (IPoIB) (RFC 4391)
+    http://ietf.org/rfc/rfc4391.txt 
+  IP over InfiniBand (IPoIB) Architecture (RFC 4392)
+    http://ietf.org/rfc/rfc4392.txt 
index b3a6187..a9d3a17 100644
@@ -1402,6 +1402,15 @@ running once the system is up.
                        If enabled at boot time, /selinux/disable can be used
                        later to disable prior to initial policy load.
 
+       selinux_compat_net =
+                       [SELINUX] Set initial selinux_compat_net flag value.
+                        Format: { "0" | "1" }
+                        0 -- use new secmark-based packet controls
+                        1 -- use legacy packet controls
+                        Default value is 0 (preferred).
+                        Value can be changed at runtime via
+                        /selinux/compat_net.
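+                        Example (runtime switch to the legacy
+                        controls; selinuxfs assumed mounted at
+                        /selinux):
+                          # echo 1 > /selinux/compat_net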
+
        serialnumber    [BUGS=IA-32]
 
        sg_def_reserved_size=   [SCSI]
index acb30c5..4f2a40f 100644
@@ -14,8 +14,8 @@ Copyright (C) 2004-2006, Intel Corporation
 
 README.ipw2200
 
-Version: 1.0.8
-Date   : October 20, 2005
+Version: 1.1.2
+Date   : March 30, 2006
 
 
 Index
@@ -103,7 +103,7 @@ file.
 
 1.1. Overview of Features
 -----------------------------------------------
-The current release (1.0.8) supports the following features:
+The current release (1.1.2) supports the following features:
 
 + BSS mode (Infrastructure, Managed)
 + IBSS mode (Ad-Hoc)
@@ -247,8 +247,8 @@ and can set the contents via echo.  For example:
 % cat /sys/bus/pci/drivers/ipw2200/debug_level
 
 Will report the current debug level of the driver's logging subsystem 
-(only available if CONFIG_IPW_DEBUG was configured when the driver was 
-built).
+(only available if CONFIG_IPW2200_DEBUG was configured when the driver
+was built).
 
 You can set the debug level via:
 
index 8d8b4e5..afac780 100644
@@ -1,7 +1,7 @@
 
                Linux Ethernet Bonding Driver HOWTO
 
-               Latest update: 21 June 2005
+               Latest update: 24 April 2006
 
 Initial release : Thomas Davis <tadavis at lbl.gov>
 Corrections, HA extensions : 2000/10/03-15 :
@@ -12,6 +12,8 @@ Corrections, HA extensions : 2000/10/03-15 :
   - Jay Vosburgh <fubar at us dot ibm dot com>
 
 Reorganized and updated Feb 2005 by Jay Vosburgh
+Added Sysfs information: 2006/04/24
+  - Mitch Williams <mitch.a.williams at intel.com>
 
 Introduction
 ============
@@ -38,61 +40,62 @@ Table of Contents
 2. Bonding Driver Options
 
 3. Configuring Bonding Devices
-3.1    Configuration with sysconfig support
-3.1.1          Using DHCP with sysconfig
-3.1.2          Configuring Multiple Bonds with sysconfig
-3.2    Configuration with initscripts support
-3.2.1          Using DHCP with initscripts
-3.2.2          Configuring Multiple Bonds with initscripts
-3.3    Configuring Bonding Manually
+3.1    Configuration with Sysconfig Support
+3.1.1          Using DHCP with Sysconfig
+3.1.2          Configuring Multiple Bonds with Sysconfig
+3.2    Configuration with Initscripts Support
+3.2.1          Using DHCP with Initscripts
+3.2.2          Configuring Multiple Bonds with Initscripts
+3.3    Configuring Bonding Manually with Ifenslave
 3.3.1          Configuring Multiple Bonds Manually
+3.4    Configuring Bonding Manually via Sysfs
 
-5. Querying Bonding Configuration
-5.1    Bonding Configuration
-5.2    Network Configuration
+4. Querying Bonding Configuration
+4.1    Bonding Configuration
+4.2    Network Configuration
 
-6. Switch Configuration
+5. Switch Configuration
 
-7. 802.1q VLAN Support
+6. 802.1q VLAN Support
 
-8. Link Monitoring
-8.1    ARP Monitor Operation
-8.2    Configuring Multiple ARP Targets
-8.3    MII Monitor Operation
+7. Link Monitoring
+7.1    ARP Monitor Operation
+7.2    Configuring Multiple ARP Targets
+7.3    MII Monitor Operation
 
-9. Potential Trouble Sources
-9.1    Adventures in Routing
-9.2    Ethernet Device Renaming
-9.3    Painfully Slow Or No Failed Link Detection By Miimon
+8. Potential Trouble Sources
+8.1    Adventures in Routing
+8.2    Ethernet Device Renaming
+8.3    Painfully Slow Or No Failed Link Detection By Miimon
 
-10. SNMP agents
+9. SNMP agents
 
-11. Promiscuous mode
+10. Promiscuous mode
 
-12. Configuring Bonding for High Availability
-12.1   High Availability in a Single Switch Topology
-12.2   High Availability in a Multiple Switch Topology
-12.2.1         HA Bonding Mode Selection for Multiple Switch Topology
-12.2.2         HA Link Monitoring for Multiple Switch Topology
+11. Configuring Bonding for High Availability
+11.1   High Availability in a Single Switch Topology
+11.2   High Availability in a Multiple Switch Topology
+11.2.1         HA Bonding Mode Selection for Multiple Switch Topology
+11.2.2         HA Link Monitoring for Multiple Switch Topology
 
-13. Configuring Bonding for Maximum Throughput
-13.1   Maximum Throughput in a Single Switch Topology
-13.1.1         MT Bonding Mode Selection for Single Switch Topology
-13.1.2         MT Link Monitoring for Single Switch Topology
-13.2   Maximum Throughput in a Multiple Switch Topology
-13.2.1         MT Bonding Mode Selection for Multiple Switch Topology
-13.2.2         MT Link Monitoring for Multiple Switch Topology
+12. Configuring Bonding for Maximum Throughput
+12.1   Maximum Throughput in a Single Switch Topology
+12.1.1         MT Bonding Mode Selection for Single Switch Topology
+12.1.2         MT Link Monitoring for Single Switch Topology
+12.2   Maximum Throughput in a Multiple Switch Topology
+12.2.1         MT Bonding Mode Selection for Multiple Switch Topology
+12.2.2         MT Link Monitoring for Multiple Switch Topology
 
-14. Switch Behavior Issues
-14.1   Link Establishment and Failover Delays
-14.2   Duplicated Incoming Packets
+13. Switch Behavior Issues
+13.1   Link Establishment and Failover Delays
+13.2   Duplicated Incoming Packets
 
-15. Hardware Specific Considerations
-15.1   IBM BladeCenter
+14. Hardware Specific Considerations
+14.1   IBM BladeCenter
 
-16. Frequently Asked Questions
+15. Frequently Asked Questions
 
-17. Resources and Links
+16. Resources and Links
 
 
 1. Bonding Driver Installation
@@ -156,6 +159,9 @@ you're trying to build it for.  Some distros (e.g., Red Hat from 7.1
 onwards) do not have /usr/include/linux symbolically linked to the
 default kernel source include directory.
 
+SECOND IMPORTANT NOTE:
+       If you plan to configure bonding using sysfs, you do not need
+to use ifenslave.
 
 2. Bonding Driver Options
 =========================
@@ -270,7 +276,7 @@ mode
                In bonding version 2.6.2 or later, when a failover
                occurs in active-backup mode, bonding will issue one
                or more gratuitous ARPs on the newly active slave.
-               One gratutious ARP is issued for the bonding master
+               One gratuitous ARP is issued for the bonding master
                interface and each VLAN interfaces configured above
                it, provided that the interface has at least one IP
                address configured.  Gratuitous ARPs issued for VLAN
@@ -377,7 +383,7 @@ mode
                When a link is reconnected or a new slave joins the
                bond the receive traffic is redistributed among all
                active slaves in the bond by initiating ARP Replies
-               with the selected mac address to each of the
+               with the selected MAC address to each of the
                clients. The updelay parameter (detailed below) must
                be set to a value equal or greater than the switch's
                forwarding delay so that the ARP Replies sent to the
@@ -498,11 +504,12 @@ not exist, and the layer2 policy is the only policy.
 3. Configuring Bonding Devices
 ==============================
 
-       There are, essentially, two methods for configuring bonding:
-with support from the distro's network initialization scripts, and
-without.  Distros generally use one of two packages for the network
-initialization scripts: initscripts or sysconfig.  Recent versions of
-these packages have support for bonding, while older versions do not.
+       You can configure bonding using either your distro's network
+initialization scripts, or manually using either ifenslave or the
+sysfs interface.  Distros generally use one of two packages for the
+network initialization scripts: initscripts or sysconfig.  Recent
+versions of these packages have support for bonding, while older
+versions do not.
 
        We will first describe the options for configuring bonding for
 distros using versions of initscripts and sysconfig with full or
@@ -530,7 +537,7 @@ $ grep ifenslave /sbin/ifup
        If this returns any matches, then your initscripts or
 sysconfig has support for bonding.
 
-3.1 Configuration with sysconfig support
+3.1 Configuration with Sysconfig Support
 ----------------------------------------
 
        This section applies to distros using a version of sysconfig
@@ -538,7 +545,7 @@ with bonding support, for example, SuSE Linux Enterprise Server 9.
 
        SuSE SLES 9's networking configuration system does support
 bonding, however, at this writing, the YaST system configuration
-frontend does not provide any means to work with bonding devices.
+front end does not provide any means to work with bonding devices.
 Bonding devices can be managed by hand, however, as follows.
 
        First, if they have not already been configured, configure the
@@ -660,7 +667,7 @@ format can be found in an example ifcfg template file:
        Note that the template does not document the various BONDING_
 settings described above, but does describe many of the other options.
 
-3.1.1 Using DHCP with sysconfig
+3.1.1 Using DHCP with Sysconfig
 -------------------------------
 
        Under sysconfig, configuring a device with BOOTPROTO='dhcp'
@@ -670,7 +677,7 @@ attempt to obtain the device address from DHCP prior to adding any of
 the slave devices.  Without active slaves, the DHCP requests are not
 sent to the network.
 
-3.1.2 Configuring Multiple Bonds with sysconfig
+3.1.2 Configuring Multiple Bonds with Sysconfig
 -----------------------------------------------
 
        The sysconfig network initialization system is capable of
@@ -685,7 +692,7 @@ ifcfg-bondX files.
 options in the ifcfg-bondX file, it is not necessary to add them to
 the system /etc/modules.conf or /etc/modprobe.conf configuration file.
 
-3.2 Configuration with initscripts support
+3.2 Configuration with Initscripts Support
 ------------------------------------------
 
        This section applies to distros using a version of initscripts
@@ -756,7 +763,7 @@ options for your configuration.
 will restart the networking subsystem and your bond link should be now
 up and running.
 
-3.2.1 Using DHCP with initscripts
+3.2.1 Using DHCP with Initscripts
 ---------------------------------
 
        Recent versions of initscripts (the version supplied with
@@ -768,7 +775,7 @@ above, except replace the line "BOOTPROTO=none" with "BOOTPROTO=dhcp"
 and add a line consisting of "TYPE=Bonding".  Note that the TYPE value
 is case sensitive.
 
-3.2.2 Configuring Multiple Bonds with initscripts
+3.2.2 Configuring Multiple Bonds with Initscripts
 -------------------------------------------------
 
        At this writing, the initscripts package does not directly
@@ -784,8 +791,8 @@ Fedora Core kernels, and has been seen on RHEL 4 as well.  On kernels
 exhibiting this problem, it will be impossible to configure multiple
 bonds with differing parameters.
 
-3.3 Configuring Bonding Manually
---------------------------------
+3.3 Configuring Bonding Manually with Ifenslave
+-----------------------------------------------
 
        This section applies to distros whose network initialization
 scripts (the sysconfig or initscripts package) do not have specific
@@ -889,11 +896,139 @@ install bond1 /sbin/modprobe --ignore-install bonding -o bond1 \
        This may be repeated any number of times, specifying a new and
 unique name in place of bond1 for each subsequent instance.
 
+3.4 Configuring Bonding Manually via Sysfs
+------------------------------------------
+
+       Starting with version 3.0, Channel Bonding may be configured
+via the sysfs interface.  This interface allows dynamic configuration
+of all bonds in the system without unloading the module.  It also
+allows for adding and removing bonds at runtime.  Ifenslave is no
+longer required, though it is still supported.
+
+       Use of the sysfs interface allows you to use multiple bonds
+with different configurations without having to reload the module.
+It also allows you to use multiple, differently configured bonds when
+bonding is compiled into the kernel.
+
+       You must have the sysfs filesystem mounted to configure
+bonding this way.  The examples in this document assume that you
+are using the standard mount point for sysfs, e.g. /sys.  If your
+sysfs filesystem is mounted elsewhere, you will need to adjust the
+example paths accordingly.
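+
+       If sysfs is not already mounted, it can usually be mounted by
+hand with:
+# mount -t sysfs sysfs /sys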
+
+Creating and Destroying Bonds
+-----------------------------
+To add a new bond foo:
+# echo +foo > /sys/class/net/bonding_masters
+
+To remove an existing bond bar:
+# echo -bar > /sys/class/net/bonding_masters
+
+To show all existing bonds:
+# cat /sys/class/net/bonding_masters
+
+NOTE: due to the 4K size limitation of sysfs files, this list may be
+truncated if you have more than a few hundred bonds.  This is unlikely
+to occur under normal operating conditions.
+
+Adding and Removing Slaves
+--------------------------
+       Interfaces may be enslaved to a bond using the file
+/sys/class/net/<bond>/bonding/slaves.  The semantics for this file
+are the same as for the bonding_masters file.
+
+To enslave interface eth0 to bond bond0:
+# ifconfig bond0 up
+# echo +eth0 > /sys/class/net/bond0/bonding/slaves
+
+To free slave eth0 from bond bond0:
+# echo -eth0 > /sys/class/net/bond0/bonding/slaves
+
+       NOTE: The bond must be up before slaves can be added.  All
+slaves are freed when the interface is brought down.
+
+       When an interface is enslaved to a bond, symlinks between the
+two are created in the sysfs filesystem.  In this case, you would get
+/sys/class/net/bond0/slave_eth0 pointing to /sys/class/net/eth0, and
+/sys/class/net/eth0/master pointing to /sys/class/net/bond0.
+
+       This means that you can tell quickly whether or not an
+interface is enslaved by looking for the master symlink.  Thus:
+# echo -eth0 > /sys/class/net/eth0/master/bonding/slaves
+will free eth0 from whatever bond it is enslaved to, regardless of
+the name of the bond interface.
+
+Changing a Bond's Configuration
+-------------------------------
+       Each bond may be configured individually by manipulating the
+files located in /sys/class/net/<bond name>/bonding
+
+       The names of these files correspond directly with the command-
+line parameters described elsewhere in this file, and, with the
+exception of arp_ip_target, they accept the same values.  To see the
+current setting, simply cat the appropriate file.
+
+       A few examples will be given here; for specific usage
+guidelines for each parameter, see the appropriate section in this
+document.
+
+To configure bond0 for balance-alb mode:
+# ifconfig bond0 down
+# echo 6 > /sys/class/net/bond0/bonding/mode
+ - or -
+# echo balance-alb > /sys/class/net/bond0/bonding/mode
+       NOTE: The bond interface must be down before the mode can be
+changed.
+
+To enable MII monitoring on bond0 with a 1 second interval:
+# echo 1000 > /sys/class/net/bond0/bonding/miimon
+       NOTE: If ARP monitoring is enabled, it will be disabled when MII
+monitoring is enabled, and vice-versa.
+
+To add ARP targets:
+# echo +192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target
+# echo +192.168.0.101 > /sys/class/net/bond0/bonding/arp_ip_target
+       NOTE:  up to 10 target addresses may be specified.
+
+To remove an ARP target:
+# echo -192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target
+
+Example Configuration
+---------------------
+       We begin with the same example that is shown in section 3.3,
+executed with sysfs, and without using ifenslave.
+
+       To make a simple bond of two e100 devices (presumed to be eth0
+and eth1), and have it persist across reboots, edit the appropriate
+file (/etc/init.d/boot.local or /etc/rc.d/rc.local), and add the
+following:
+
+modprobe bonding
+modprobe e100
+echo balance-alb > /sys/class/net/bond0/bonding/mode
+ifconfig bond0 192.168.1.1 netmask 255.255.255.0 up
+echo 100 > /sys/class/net/bond0/bonding/miimon
+echo +eth0 > /sys/class/net/bond0/bonding/slaves
+echo +eth1 > /sys/class/net/bond0/bonding/slaves
+
+       To add a second bond, with two e1000 interfaces in
+active-backup mode, using ARP monitoring, add the following lines to
+your init script:
+
+modprobe e1000
+echo +bond1 > /sys/class/net/bonding_masters
+echo active-backup > /sys/class/net/bond1/bonding/mode
+ifconfig bond1 192.168.2.1 netmask 255.255.255.0 up
+echo +192.168.2.100 > /sys/class/net/bond1/bonding/arp_ip_target
+echo 2000 > /sys/class/net/bond1/bonding/arp_interval
+echo +eth2 > /sys/class/net/bond1/bonding/slaves
+echo +eth3 > /sys/class/net/bond1/bonding/slaves
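+
+       Either bond can then be inspected through its read-only status
+file (described in the next section), for example:
+
+# cat /proc/net/bonding/bond1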
+
 
-5. Querying Bonding Configuration 
+4. Querying Bonding Configuration 
 =================================
 
-5.1 Bonding Configuration
+4.1 Bonding Configuration
 -------------------------
 
        Each bonding device has a read-only file residing in the
@@ -923,7 +1058,7 @@ generally as follows:
        The precise format and contents will change depending upon the
 bonding configuration, state, and version of the bonding driver.
 
-5.2 Network configuration
+4.2 Network configuration
 -------------------------
 
        The network configuration can be inspected using the ifconfig
@@ -958,7 +1093,7 @@ eth1      Link encap:Ethernet  HWaddr 00:C0:F0:1F:37:B4
           collisions:0 txqueuelen:100
           Interrupt:9 Base address:0x1400
 
-6. Switch Configuration
+5. Switch Configuration
 =======================
 
        For this section, "switch" refers to whatever system the
@@ -991,7 +1126,7 @@ transmit policy for an EtherChannel group; all three will interoperate
 with another EtherChannel group.
 
 
-7. 802.1q VLAN Support
+6. 802.1q VLAN Support
 ======================
 
        It is possible to configure VLAN devices over a bond interface
@@ -1042,7 +1177,7 @@ underlying device -- i.e. the bonding interface -- to promiscuous
 mode, which might not be what you want.
 
 
-8. Link Monitoring
+7. Link Monitoring
 ==================
 
        The bonding driver at present supports two schemes for
@@ -1053,7 +1188,7 @@ monitor.
 bonding driver itself, it is not possible to enable both ARP and MII
 monitoring simultaneously.
 
-8.1 ARP Monitor Operation
+7.1 ARP Monitor Operation
 -------------------------
 
        The ARP monitor operates as its name suggests: it sends ARP
@@ -1071,7 +1206,7 @@ those slaves will stay down.  If networking monitoring (tcpdump, etc)
 shows the ARP requests and replies on the network, then it may be that
 your device driver is not updating last_rx and trans_start.
 
-8.2 Configuring Multiple ARP Targets
+7.2 Configuring Multiple ARP Targets
 ------------------------------------
 
        While ARP monitoring can be done with just one target, it can
@@ -1094,7 +1229,7 @@ alias bond0 bonding
 options bond0 arp_interval=60 arp_ip_target=192.168.0.100
 
 
-8.3 MII Monitor Operation
+7.3 MII Monitor Operation
 -------------------------
 
        The MII monitor monitors only the carrier state of the local
@@ -1120,14 +1255,14 @@ does not support or had some error in processing both the MII register
 and ethtool requests), then the MII monitor will assume the link is
 up.
 
-9. Potential Sources of Trouble
+8. Potential Sources of Trouble
 ===============================
 
-9.1 Adventures in Routing
+8.1 Adventures in Routing
 -------------------------
 
        When bonding is configured, it is important that the slave
-devices not have routes that supercede routes of the master (or,
+devices not have routes that supersede routes of the master (or,
 generally, not have routes at all).  For example, suppose the bonding
 device bond0 has two slaves, eth0 and eth1, and the routing table is
 as follows:
@@ -1154,11 +1289,11 @@ by the state of the routing table.
 
        The solution here is simply to insure that slaves do not have
 routes of their own, and if for some reason they must, those routes do
-not supercede routes of their master.  This should generally be the
+not supersede routes of their master.  This should generally be the
 case, but unusual configurations or errant manual or automatic static
 route additions may cause trouble.
 
-9.2 Ethernet Device Renaming
+8.2 Ethernet Device Renaming
 ----------------------------
 
        On systems with network configuration scripts that do not
@@ -1207,7 +1342,7 @@ modprobe with --ignore-install to cause the normal action to then take
 place.  Full documentation on this can be found in the modprobe.conf
 and modprobe manual pages.
 
-9.3. Painfully Slow Or No Failed Link Detection By Miimon
+8.3. Painfully Slow Or No Failed Link Detection By Miimon
 ---------------------------------------------------------
 
        By default, bonding enables the use_carrier option, which
@@ -1235,7 +1370,7 @@ carrier state.  It has no way to determine the state of devices on or
 beyond other ports of a switch, or if a switch is refusing to pass
 traffic while still maintaining carrier on.
 
-10. SNMP agents
+9. SNMP agents
 ===============
 
        If running SNMP agents, the bonding driver should be loaded
@@ -1281,7 +1416,7 @@ ifDescr, the association between the IP address and IfIndex remains
 and SNMP functions such as Interface_Scan_Next will report that
 association.
 
-11. Promiscuous mode
+10. Promiscuous mode
 ====================
 
        When running network monitoring tools, e.g., tcpdump, it is
@@ -1308,7 +1443,7 @@ sending to peers that are unassigned or if the load is unbalanced.
 the active slave changes (e.g., due to a link failure), the
 promiscuous setting will be propagated to the new active slave.
 
-12. Configuring Bonding for High Availability
+11. Configuring Bonding for High Availability
 =============================================
 
        High Availability refers to configurations that provide
@@ -1318,7 +1453,7 @@ goal is to provide the maximum availability of network connectivity
 (i.e., the network always works), even though other configurations
 could provide higher throughput.
 
-12.1 High Availability in a Single Switch Topology
+11.1 High Availability in a Single Switch Topology
 --------------------------------------------------
 
        If two hosts (or a host and a single switch) are directly
@@ -1332,7 +1467,7 @@ the load will be rebalanced across the remaining devices.
-       See Section 13, "Configuring Bonding for Maximum Throughput"
+       See Section 12, "Configuring Bonding for Maximum Throughput"
 for information on configuring bonding with one peer device.
 
-12.2 High Availability in a Multiple Switch Topology
+11.2 High Availability in a Multiple Switch Topology
 ----------------------------------------------------
 
        With multiple switches, the configuration of bonding and the
@@ -1359,7 +1494,7 @@ switches (ISL, or inter switch link), and multiple ports connecting to
 the outside world ("port3" on each switch).  There is no technical
 reason that this could not be extended to a third switch.
 
-12.2.1 HA Bonding Mode Selection for Multiple Switch Topology
+11.2.1 HA Bonding Mode Selection for Multiple Switch Topology
 -------------------------------------------------------------
 
        In a topology such as the example above, the active-backup and
@@ -1381,7 +1516,7 @@ broadcast: This mode is really a special purpose mode, and is suitable
        necessary for some specific one-way traffic to reach both
        independent networks, then the broadcast mode may be suitable.
 
-12.2.2 HA Link Monitoring Selection for Multiple Switch Topology
+11.2.2 HA Link Monitoring Selection for Multiple Switch Topology
 ----------------------------------------------------------------
 
        The choice of link monitoring ultimately depends upon your
@@ -1402,10 +1537,10 @@ regardless of which switch is active, the ARP monitor has a suitable
 target to query.
 
 
-13. Configuring Bonding for Maximum Throughput
+12. Configuring Bonding for Maximum Throughput
 ==============================================
 
-13.1 Maximizing Throughput in a Single Switch Topology
+12.1 Maximizing Throughput in a Single Switch Topology
 ------------------------------------------------------
 
        In a single switch configuration, the best method to maximize
@@ -1476,7 +1611,7 @@ destination to make load balancing decisions.  The behavior of each
 mode is described below.
 
 
-13.1.1 MT Bonding Mode Selection for Single Switch Topology
+12.1.1 MT Bonding Mode Selection for Single Switch Topology
 -----------------------------------------------------------
 
        This configuration is the easiest to set up and to understand,
@@ -1607,7 +1742,7 @@ balance-alb: This mode is everything that balance-tlb is, and more.
        device driver must support changing the hardware address while
        the device is open.
 
-13.1.2 MT Link Monitoring for Single Switch Topology
+12.1.2 MT Link Monitoring for Single Switch Topology
 ----------------------------------------------------
 
        The choice of link monitoring may largely depend upon which
@@ -1616,7 +1751,7 @@ support the use of the ARP monitor, and are thus restricted to using
 the MII monitor (which does not provide as high a level of end to end
 assurance as the ARP monitor).
 
-13.2 Maximum Throughput in a Multiple Switch Topology
+12.2 Maximum Throughput in a Multiple Switch Topology
 -----------------------------------------------------
 
        Multiple switches may be utilized to optimize for throughput
@@ -1651,7 +1786,7 @@ a single 72 port switch.
 can be equipped with an additional network device connected to an
 external network; this host then additionally acts as a gateway.
 
-13.2.1 MT Bonding Mode Selection for Multiple Switch Topology
+12.2.1 MT Bonding Mode Selection for Multiple Switch Topology
 -------------------------------------------------------------
 
        In actual practice, the bonding mode typically employed in
@@ -1664,7 +1799,7 @@ packets has arrived).  When employed in this fashion, the balance-rr
 mode allows individual connections between two hosts to effectively
 utilize greater than one interface's bandwidth.
 
-13.2.2 MT Link Monitoring for Multiple Switch Topology
+12.2.2 MT Link Monitoring for Multiple Switch Topology
 ------------------------------------------------------
 
        Again, in actual practice, the MII monitor is most often used
@@ -1674,10 +1809,10 @@ advantages over the MII monitor are mitigated by the volume of probes
 needed as the number of systems involved grows (remember that each
 host in the network is configured with bonding).
 
-14. Switch Behavior Issues
+13. Switch Behavior Issues
 ==========================
 
-14.1 Link Establishment and Failover Delays
+13.1 Link Establishment and Failover Delays
 -------------------------------------------
 
        Some switches exhibit undesirable behavior with regard to the
@@ -1712,7 +1847,7 @@ switches take a long time to go into backup mode, it may be desirable
 to not activate a backup interface immediately after a link goes down.
 Failover may be delayed via the downdelay bonding module option.
 
-14.2 Duplicated Incoming Packets
+13.2 Duplicated Incoming Packets
 --------------------------------
 
        It is not uncommon to observe a short burst of duplicated
@@ -1751,14 +1886,14 @@ behavior, it can be induced by clearing the MAC forwarding table (on
 most Cisco switches, the privileged command "clear mac address-table
 dynamic" will accomplish this).
 
-15. Hardware Specific Considerations
+14. Hardware Specific Considerations
 ====================================
 
        This section contains additional information for configuring
 bonding on specific hardware platforms, or for interfacing bonding
 with particular switches or other devices.
 
-15.1 IBM BladeCenter
+14.1 IBM BladeCenter
 --------------------
 
        This applies to the JS20 and similar systems.
@@ -1861,7 +1996,7 @@ bonding driver.
 avoid fail-over delay issues when using bonding.
 
        
-16. Frequently Asked Questions
+15. Frequently Asked Questions
 ==============================
 
 1.  Is it SMP safe?
@@ -1925,7 +2060,7 @@ not have special switch requirements, but do need device drivers that
 support specific features (described in the appropriate section under
 module parameters, above).
 
-       In 802.3ad mode, it works with with systems that support IEEE
+       In 802.3ad mode, it works with systems that support IEEE
 802.3ad Dynamic Link Aggregation.  Most managed and many unmanaged
 switches currently available support 802.3ad.
 
index f12007b..d46338a 100644
@@ -362,6 +362,13 @@ tcp_workaround_signed_windows - BOOLEAN
        not receive a window scaling option from them.
        Default: 0
 
+tcp_slow_start_after_idle - BOOLEAN
+       If set, provide RFC2861 behavior and time out the congestion
+       window after an idle period.  An idle period is defined as
+       the current RTO.  If unset, the congestion window will not
+       be timed out after an idle period.
+       Default: 1
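+
+       Example (runtime toggle via procfs):
+       # echo 0 > /proc/sys/net/ipv4/tcp_slow_start_after_idle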
+
 IP Variables:
 
 ip_local_port_range - 2 INTEGERS
index 3c0a5ba..847cedb 100644
@@ -42,9 +42,9 @@ dev->get_stats:
        Context: nominally process, but don't sleep inside an rwlock
 
 dev->hard_start_xmit:
-       Synchronization: dev->xmit_lock spinlock.
+       Synchronization: netif_tx_lock spinlock.
        When the driver sets NETIF_F_LLTX in dev->features this will be
-       called without holding xmit_lock. In this case the driver 
+       called without holding netif_tx_lock. In this case the driver
        has to lock by itself when needed. It is recommended to use a try lock
        for this and return -1 when the spin lock fails. 
        The locking there should also properly protect against 
@@ -62,12 +62,12 @@ dev->hard_start_xmit:
          Only valid when NETIF_F_LLTX is set.
 
 dev->tx_timeout:
-       Synchronization: dev->xmit_lock spinlock.
+       Synchronization: netif_tx_lock spinlock.
        Context: BHs disabled
        Notes: netif_queue_stopped() is guaranteed true
 
 dev->set_multicast_list:
-       Synchronization: dev->xmit_lock spinlock.
+       Synchronization: netif_tx_lock spinlock.
        Context: BHs disabled
 
 dev->poll:
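
A sketch may help make the NETIF_F_LLTX convention above concrete:
with the flag set, the core calls hard_start_xmit without holding
netif_tx_lock, so the driver takes its own try-lock and returns -1
(NETDEV_TX_LOCKED) to have the core retry.  The names my_priv and
my_hard_start_xmit below are hypothetical; this is an illustration
of the rule, not code from this patch:

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/spinlock.h>

struct my_priv {			/* hypothetical driver private data */
	spinlock_t tx_lock;
};

static int my_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct my_priv *priv = netdev_priv(dev);

	/* NETIF_F_LLTX: netif_tx_lock is NOT held by the core here */
	if (!spin_trylock(&priv->tx_lock))
		return NETDEV_TX_LOCKED;	/* -1: core will retry */

	/* ... queue skb to the hardware ring ... */

	spin_unlock(&priv->tx_lock);
	return NETDEV_TX_OK;
}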
index c3c5842..1421f74 100644
@@ -1425,6 +1425,8 @@ P:        Jesse Brandeburg
 M:     jesse.brandeburg@intel.com
 P:     Jeff Kirsher
 M:     jeffrey.t.kirsher@intel.com
+P:     Auke Kok
+M:     auke-jan.h.kok@intel.com
 W:     http://sourceforge.net/projects/e1000/
 S:     Supported
 
@@ -1437,6 +1439,8 @@ P:        Jesse Brandeburg
 M:     jesse.brandeburg@intel.com
 P:     Jeff Kirsher
 M:     jeffrey.t.kirsher@intel.com
+P:     Auke Kok
+M:     auke-jan.h.kok@intel.com
 W:     http://sourceforge.net/projects/e1000/
 S:     Supported
 
@@ -1449,6 +1453,8 @@ P:        John Ronciak
 M:     john.ronciak@intel.com
 P:     Jesse Brandeburg
 M:     jesse.brandeburg@intel.com
+P:     Auke Kok
+M:     auke-jan.h.kok@intel.com
 W:     http://sourceforge.net/projects/e1000/
 S:     Supported
 
index 3852d0a..1a7bdce 100644
@@ -104,7 +104,7 @@ acpi_processor_set_performance (
 {
        u16                     port = 0;
        u8                      bit_width = 0;
-       int                     ret = 0;
+       int                     ret;
        u32                     value = 0;
        int                     i = 0;
        struct cpufreq_freqs    cpufreq_freqs;
@@ -195,7 +195,6 @@ acpi_processor_set_performance (
                        udelay(10);
                }
        } else {
-               i = 0;
                value = (u32) data->acpi_data.states[state].status;
        }
 
@@ -444,14 +443,15 @@ static struct freq_attr* acpi_cpufreq_attr[] = {
 };
 
 static struct cpufreq_driver acpi_cpufreq_driver = {
-       .verify         = acpi_cpufreq_verify,
-       .target         = acpi_cpufreq_target,
-       .init           = acpi_cpufreq_cpu_init,
-       .exit           = acpi_cpufreq_cpu_exit,
-       .resume         = acpi_cpufreq_resume,
-       .name           = "acpi-cpufreq",
-       .owner          = THIS_MODULE,
-       .attr           = acpi_cpufreq_attr,
+       .verify = acpi_cpufreq_verify,
+       .target = acpi_cpufreq_target,
+       .init   = acpi_cpufreq_cpu_init,
+       .exit   = acpi_cpufreq_cpu_exit,
+       .resume = acpi_cpufreq_resume,
+       .name   = "acpi-cpufreq",
+       .owner  = THIS_MODULE,
+       .attr   = acpi_cpufreq_attr,
+       .flags  = CPUFREQ_STICKY,
 };
 
 
index f275e0d..0d49d73 100644
@@ -1,5 +1,5 @@
 /*
- * (C) 2004  Sebastian Witt <se.witt@gmx.net>
+ * (C) 2004-2006  Sebastian Witt <se.witt@gmx.net>
  *
  *  Licensed under the terms of the GNU GPL License version 2.
  *  Based upon reverse engineered information
@@ -90,7 +90,7 @@ static int nforce2_calc_pll(unsigned int fsb)
 
        /* Try to calculate multiplier and divider up to 4 times */
        while (((mul == 0) || (div == 0)) && (tried <= 3)) {
-               for (xdiv = 1; xdiv <= 0x80; xdiv++)
+               for (xdiv = 2; xdiv <= 0x80; xdiv++)
                        for (xmul = 1; xmul <= 0xfe; xmul++)
                                if (nforce2_calc_fsb(NFORCE2_PLL(xmul, xdiv)) ==
                                    fsb + tried) {
@@ -117,8 +117,7 @@ static void nforce2_write_pll(int pll)
        int temp;
 
        /* Set the pll addr. to 0x00 */
-       temp = 0x00;
-       pci_write_config_dword(nforce2_chipset_dev, NFORCE2_PLLADR, temp);
+       pci_write_config_dword(nforce2_chipset_dev, NFORCE2_PLLADR, 0);
 
        /* Now write the value in all 64 registers */
        for (temp = 0; temp <= 0x3f; temp++)
@@ -266,7 +265,7 @@ static int nforce2_target(struct cpufreq_policy *policy,
        if (freqs.old == freqs.new)
                return 0;
 
-       dprintk(KERN_INFO "cpufreq: Old CPU frequency %d kHz, new %d kHz\n",
+       dprintk("Old CPU frequency %d kHz, new %d kHz\n",
               freqs.old, freqs.new);
 
        cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
@@ -278,7 +277,7 @@ static int nforce2_target(struct cpufreq_policy *policy,
                printk(KERN_ERR "cpufreq: Changing FSB to %d failed\n",
                        target_fsb);
        else
-               dprintk(KERN_INFO "cpufreq: Changed FSB successfully to %d\n",
+               dprintk("Changed FSB successfully to %d\n",
                        target_fsb);
 
        /* Enable IRQs */
index 8ef3854..146f607 100644
@@ -77,13 +77,17 @@ static char speedbuffer[8];
 
 static char *print_speed(int speed)
 {
-       if (speed > 1000) {
-               if (speed%1000 == 0)
-                       sprintf (speedbuffer, "%dGHz", speed/1000);
-               else
-                       sprintf (speedbuffer, "%d.%dGHz", speed/1000, (speed%1000)/100);
-       } else
-               sprintf (speedbuffer, "%dMHz", speed);
+       if (speed < 1000) {
+               snprintf(speedbuffer, sizeof(speedbuffer),"%dMHz", speed);
+               return speedbuffer;
+       }
+
+       if (speed%1000 == 0)
+               snprintf(speedbuffer, sizeof(speedbuffer),
+                       "%dGHz", speed/1000);
+       else
+               snprintf(speedbuffer, sizeof(speedbuffer),
+                       "%d.%dGHz", speed/1000, (speed%1000)/100);
 
        return speedbuffer;
 }
@@ -675,7 +679,7 @@ static int __init longhaul_init(void)
 
 static void __exit longhaul_exit(void)
 {
-       int i=0;
+       int i;
 
        for (i=0; i < numscales; i++) {
                if (clock_ratio[i] == maxmult) {
index e3868de..b268951 100644
@@ -223,7 +223,6 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
                /* set to 0 to try_hi perf_pctg */
                msr_lo &= 0xFFFFFF80;
                msr_hi &= 0xFFFFFF80;
-               msr_lo |= 0;
                msr_hi |= try_hi;
                wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
 
index 2bf4237..694d479 100644
@@ -452,23 +452,23 @@ static int powernow_decode_bios (int maxfid, int startvid)
 
                        pst = (struct pst_s *) p;
 
-                       for (i = 0 ; i <psb->numpst; i++) {
+                       for (j=0; j<psb->numpst; j++) {
                                pst = (struct pst_s *) p;
                                number_scales = pst->numpstates;
 
                                if ((etuple == pst->cpuid) && check_fsb(pst->fsbspeed) &&
                                    (maxfid==pst->maxfid) && (startvid==pst->startvid))
                                {
-                                       dprintk ("PST:%d (@%p)\n", i, pst);
+                                       dprintk ("PST:%d (@%p)\n", j, pst);
                                        dprintk (" cpuid: 0x%x  fsb: %d  maxFID: 0x%x  startvid: 0x%x\n",
                                                 pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid);
 
                                        ret = get_ranges ((char *) pst + sizeof (struct pst_s));
                                        return ret;
-
                                } else {
+                                       unsigned int k;
                                        p = (char *) pst + sizeof (struct pst_s);
-                                       for (j=0 ; j < number_scales; j++)
+                                       for (k=0; k<number_scales; k++)
                                                p+=2;
                                }
                        }
@@ -581,10 +581,7 @@ static int __init powernow_cpu_init (struct cpufreq_policy *policy)
 
        rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val);
 
-       /* recalibrate cpu_khz */
-       result = recalibrate_cpu_khz();
-       if (result)
-               return result;
+       recalibrate_cpu_khz();
 
        fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.CFID];
        if (!fsb) {
index 71fffa1..b4277f5 100644
@@ -1,5 +1,5 @@
 /*
- *   (c) 2003, 2004, 2005 Advanced Micro Devices, Inc.
+ *   (c) 2003-2006 Advanced Micro Devices, Inc.
  *  Your use of this code is subject to the terms and conditions of the
  *  GNU general public license version 2. See "COPYING" or
  *  http://www.gnu.org/licenses/gpl.html
  *  Based upon datasheets & sample CPUs kindly provided by AMD.
  *
  *  Valuable input gratefully received from Dave Jones, Pavel Machek,
- *  Dominik Brodowski, and others.
+ *  Dominik Brodowski, Jacob Shin, and others.
  *  Originally developed by Paul Devriendt.
  *  Processor information obtained from Chapter 9 (Power and Thermal Management)
  *  of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
  *  Opteron Processors" available for download from www.amd.com
  *
- *  Tables for specific CPUs can be infrerred from
+ *  Tables for specific CPUs can be inferred from
  *     http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf
  */
 
@@ -46,7 +46,7 @@
 
 #define PFX "powernow-k8: "
 #define BFX PFX "BIOS error: "
-#define VERSION "version 1.60.2"
+#define VERSION "version 2.00.00"
 #include "powernow-k8.h"
 
 /* serialize freq changes  */
@@ -54,6 +54,8 @@ static DEFINE_MUTEX(fidvid_mutex);
 
 static struct powernow_k8_data *powernow_data[NR_CPUS];
 
+static int cpu_family = CPU_OPTERON;
+
 #ifndef CONFIG_SMP
 static cpumask_t cpu_core_map[1];
 #endif
@@ -64,16 +66,36 @@ static u32 find_freq_from_fid(u32 fid)
        return 800 + (fid * 100);
 }
 
+
 /* Return a frequency in KHz, given an input fid */
 static u32 find_khz_freq_from_fid(u32 fid)
 {
        return 1000 * find_freq_from_fid(fid);
 }
 
-/* Return a voltage in miliVolts, given an input vid */
-static u32 find_millivolts_from_vid(struct powernow_k8_data *data, u32 vid)
+/* Return a frequency in MHz, given an input fid and did */
+static u32 find_freq_from_fiddid(u32 fid, u32 did)
+{
+       return 100 * (fid + 0x10) >> did;
+}
+
+static u32 find_khz_freq_from_fiddid(u32 fid, u32 did)
 {
-       return 1550-vid*25;
+       return 1000 * find_freq_from_fiddid(fid, did);
+}
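+
+/* Worked example (illustrative, not part of this patch): fid 0x2 with
+ * did 1 gives 100 * (0x2 + 0x10) >> 1 = 900 MHz, so
+ * find_khz_freq_from_fiddid(0x2, 1) returns 900000.
+ */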
+
+static u32 find_fid_from_pstate(u32 pstate)
+{
+       u32 hi, lo;
+       rdmsr(MSR_PSTATE_DEF_BASE + pstate, lo, hi);
+       return lo & HW_PSTATE_FID_MASK;
+}
+
+static u32 find_did_from_pstate(u32 pstate)
+{
+       u32 hi, lo;
+       rdmsr(MSR_PSTATE_DEF_BASE + pstate, lo, hi);
+       return (lo & HW_PSTATE_DID_MASK) >> HW_PSTATE_DID_SHIFT;
 }
 
 /* Return the vco fid for an input fid
@@ -98,6 +120,9 @@ static int pending_bit_stuck(void)
 {
        u32 lo, hi;
 
+       if (cpu_family)
+               return 0;
+
        rdmsr(MSR_FIDVID_STATUS, lo, hi);
        return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0;
 }
@@ -111,6 +136,14 @@ static int query_current_values_with_pending_wait(struct powernow_k8_data *data)
        u32 lo, hi;
        u32 i = 0;
 
+       if (cpu_family) {
+               rdmsr(MSR_PSTATE_STATUS, lo, hi);
+               i = lo & HW_PSTATE_MASK;
+               rdmsr(MSR_PSTATE_DEF_BASE + i, lo, hi);
+               data->currfid = lo & HW_PSTATE_FID_MASK;
+               data->currdid = (lo & HW_PSTATE_DID_MASK) >> HW_PSTATE_DID_SHIFT;
+               return 0;
+       }
        do {
                if (i++ > 10000) {
                        dprintk("detected change pending stuck\n");
@@ -175,7 +208,7 @@ static int write_new_fid(struct powernow_k8_data *data, u32 fid)
        do {
                wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION);
                if (i++ > 100) {
-                       printk(KERN_ERR PFX "internal error - pending bit very stuck - no further pstate changes possible\n");
+                       printk(KERN_ERR PFX "Hardware error - pending bit very stuck - no further pstate changes possible\n");
                        return 1;
                }
        } while (query_current_values_with_pending_wait(data));
@@ -255,7 +288,15 @@ static int decrease_vid_code_by_step(struct powernow_k8_data *data, u32 reqvid,
        return 0;
 }
 
-/* Change the fid and vid, by the 3 phases. */
+/* Change hardware pstate by single MSR write */
+static int transition_pstate(struct powernow_k8_data *data, u32 pstate)
+{
+       wrmsr(MSR_PSTATE_CTRL, pstate, 0);
+       data->currfid = find_fid_from_pstate(pstate);
+       return 0;
+}
+
+/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */
 static int transition_fid_vid(struct powernow_k8_data *data, u32 reqfid, u32 reqvid)
 {
        if (core_voltage_pre_transition(data, reqvid))
@@ -474,26 +515,35 @@ static int check_supported_cpu(unsigned int cpu)
                goto out;
 
        eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
-       if ((eax & CPUID_XFAM) != CPUID_XFAM_K8)
+       if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) &&
+           ((eax & CPUID_XFAM) < CPUID_XFAM_10H))
                goto out;
 
-       if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
-           ((eax & CPUID_XMOD) > CPUID_XMOD_REV_G)) {
-               printk(KERN_INFO PFX "Processor cpuid %x not supported\n", eax);
-               goto out;
-       }
+       if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) {
+               if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
+                   ((eax & CPUID_XMOD) > CPUID_XMOD_REV_G)) {
+                       printk(KERN_INFO PFX "Processor cpuid %x not supported\n", eax);
+                       goto out;
+               }
 
-       eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES);
-       if (eax < CPUID_FREQ_VOLT_CAPABILITIES) {
-               printk(KERN_INFO PFX
-                      "No frequency change capabilities detected\n");
-               goto out;
-       }
+               eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES);
+               if (eax < CPUID_FREQ_VOLT_CAPABILITIES) {
+                       printk(KERN_INFO PFX
+                              "No frequency change capabilities detected\n");
+                       goto out;
+               }
 
-       cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
-       if ((edx & P_STATE_TRANSITION_CAPABLE) != P_STATE_TRANSITION_CAPABLE) {
-               printk(KERN_INFO PFX "Power state transitions not supported\n");
-               goto out;
+               cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
+               if ((edx & P_STATE_TRANSITION_CAPABLE) != P_STATE_TRANSITION_CAPABLE) {
+                       printk(KERN_INFO PFX "Power state transitions not supported\n");
+                       goto out;
+               }
+       } else { /* must be a HW Pstate capable processor */
+               cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
+               if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE)
+                       cpu_family = CPU_HW_PSTATE;
+               else
+                       goto out;
        }
 
        rc = 1;
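check_supported_cpu() now accepts two generations: extended family 0 (the CPUID_XFAM_K8 signature, which keeps the old fid/vid capability checks) and extended family at or above 0x10, which is accepted only when bit 7 (USE_HW_PSTATE) of CPUID 0x80000007 EDX is set. A stand-alone sketch of the mask logic (the CPUID_XFAM mask is assumed here to be 0x0ff00000 as in powernow-k8.h, and the sample signatures are hypothetical):

#include <stdio.h>

#define CPUID_XFAM	0x0ff00000	/* extended family (assumed mask value) */
#define CPUID_XFAM_K8	0
#define CPUID_XFAM_10H	0x00100000
#define USE_HW_PSTATE	0x00000080

static const char *classify(unsigned int eax, unsigned int edx_80000007)
{
	if ((eax & CPUID_XFAM) == CPUID_XFAM_K8)
		return "K8: legacy fid/vid MSR interface";
	if ((eax & CPUID_XFAM) >= CPUID_XFAM_10H)
		return (edx_80000007 & USE_HW_PSTATE) ?
		       "family 0x10+: hardware pstates" : "not supported";
	return "not supported";
}

int main(void)
{
	printf("%s\n", classify(0x00040f12, 0));		/* hypothetical K8 */
	printf("%s\n", classify(0x00100f22, USE_HW_PSTATE));	/* hypothetical 10h */
	return 0;
}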
@@ -547,12 +597,18 @@ static void print_basics(struct powernow_k8_data *data)
 {
        int j;
        for (j = 0; j < data->numps; j++) {
-               if (data->powernow_table[j].frequency != CPUFREQ_ENTRY_INVALID)
-                       printk(KERN_INFO PFX "   %d : fid 0x%x (%d MHz), vid 0x%x (%d mV)\n", j,
+               if (data->powernow_table[j].frequency != CPUFREQ_ENTRY_INVALID) {
+                       if (cpu_family) {
+                       printk(KERN_INFO PFX "   %d : fid 0x%x did 0x%x (%d MHz)\n", j, (data->powernow_table[j].index & 0xff00) >> 8,
+                               (data->powernow_table[j].index & 0xff0000) >> 16,
+                               data->powernow_table[j].frequency/1000);
+                       } else {
+                       printk(KERN_INFO PFX "   %d : fid 0x%x (%d MHz), vid 0x%x\n", j,
                                data->powernow_table[j].index & 0xff,
                                data->powernow_table[j].frequency/1000,
-                               data->powernow_table[j].index >> 8,
-                               find_millivolts_from_vid(data, data->powernow_table[j].index >> 8));
+                               data->powernow_table[j].index >> 8);
+                       }
+               }
        }
        if (data->batps)
                printk(KERN_INFO PFX "Only %d pstates on battery\n", data->batps);
@@ -702,7 +758,7 @@ static int find_psb_table(struct powernow_k8_data *data)
 #ifdef CONFIG_X86_POWERNOW_K8_ACPI
 static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index)
 {
-       if (!data->acpi_data.state_count)
+       if (!data->acpi_data.state_count || cpu_family)
                return;
 
        data->irt = (data->acpi_data.states[index].control >> IRT_SHIFT) & IRT_MASK;
@@ -715,9 +771,8 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned
 
 static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
 {
-       int i;
-       int cntlofreq = 0;
        struct cpufreq_frequency_table *powernow_table;
+       int ret_val;
 
        if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
                dprintk("register performance failed: bad ACPI data\n");
@@ -746,6 +801,85 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
                goto err_out;
        }
 
+       if (cpu_family)
+               ret_val = fill_powernow_table_pstate(data, powernow_table);
+       else
+               ret_val = fill_powernow_table_fidvid(data, powernow_table);
+       if (ret_val)
+               goto err_out_mem;
+
+       powernow_table[data->acpi_data.state_count].frequency = CPUFREQ_TABLE_END;
+       powernow_table[data->acpi_data.state_count].index = 0;
+       data->powernow_table = powernow_table;
+
+       /* fill in data */
+       data->numps = data->acpi_data.state_count;
+       print_basics(data);
+       powernow_k8_acpi_pst_values(data, 0);
+
+       /* notify BIOS that we exist */
+       acpi_processor_notify_smm(THIS_MODULE);
+
+       return 0;
+
+err_out_mem:
+       kfree(powernow_table);
+
+err_out:
+       acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
+
+       /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */
+       data->acpi_data.state_count = 0;
+
+       return -ENODEV;
+}
+
+static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table)
+{
+       int i;
+
+       for (i = 0; i < data->acpi_data.state_count; i++) {
+               u32 index;
+               u32 hi = 0, lo = 0;
+               u32 fid;
+               u32 did;
+
+               index = data->acpi_data.states[i].control & HW_PSTATE_MASK;
+               if (index > MAX_HW_PSTATE) {
+                       printk(KERN_ERR PFX "invalid pstate %d - bad value %d.\n", i, index);
+                       printk(KERN_ERR PFX "Please report to BIOS manufacturer\n");
+               }
+               rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);
+               if (!(hi & HW_PSTATE_VALID_MASK)) {
+                       dprintk("invalid pstate %d, ignoring\n", index);
+                       powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
+                       continue;
+               }
+
+               fid = lo & HW_PSTATE_FID_MASK;
+               did = (lo & HW_PSTATE_DID_MASK) >> HW_PSTATE_DID_SHIFT;
+
+               dprintk("   %d : fid 0x%x, did 0x%x\n", index, fid, did);
+
+               powernow_table[i].index = index | (fid << HW_FID_INDEX_SHIFT) | (did << HW_DID_INDEX_SHIFT);
+
+               powernow_table[i].frequency = find_khz_freq_from_fiddid(fid, did);
+
+               if (powernow_table[i].frequency != (data->acpi_data.states[i].core_frequency * 1000)) {
+                       printk(KERN_INFO PFX "invalid freq entries %u kHz vs. %u kHz\n",
+                               powernow_table[i].frequency,
+                               (unsigned int) (data->acpi_data.states[i].core_frequency * 1000));
+                       powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
+                       continue;
+               }
+       }
+       return 0;
+}
+
+static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table)
+{
+       int i;
+       int cntlofreq = 0;
        for (i = 0; i < data->acpi_data.state_count; i++) {
                u32 fid;
                u32 vid;
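fill_powernow_table_pstate() above packs three fields into each cpufreq table index: the hardware pstate number in the low bits, the fid at HW_FID_INDEX_SHIFT, and the did at HW_DID_INDEX_SHIFT; print_basics() and transition_frequency_pstate() later unpack them with the matching masks. A minimal round-trip sketch (mask values copied from powernow-k8.h, field values hypothetical):

#include <stdio.h>

#define HW_PSTATE_MASK		0x00000007
#define HW_FID_INDEX_SHIFT	8
#define HW_FID_INDEX_MASK	0x0000ff00
#define HW_DID_INDEX_SHIFT	16
#define HW_DID_INDEX_MASK	0x00ff0000

int main(void)
{
	unsigned int pstate = 2, fid = 0x4, did = 1;	/* hypothetical */
	unsigned int index = pstate | (fid << HW_FID_INDEX_SHIFT) |
				      (did << HW_DID_INDEX_SHIFT);

	printf("index 0x%06x -> pstate %u, fid 0x%x, did 0x%x\n", index,
	       index & HW_PSTATE_MASK,
	       (index & HW_FID_INDEX_MASK) >> HW_FID_INDEX_SHIFT,
	       (index & HW_DID_INDEX_MASK) >> HW_DID_INDEX_SHIFT);
	return 0;
}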
@@ -786,7 +920,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
                                if ((powernow_table[i].frequency != powernow_table[cntlofreq].frequency) ||
                                    (powernow_table[i].index != powernow_table[cntlofreq].index)) {
                                        printk(KERN_ERR PFX "Too many lo freq table entries\n");
-                                       goto err_out_mem;
+                                       return 1;
                                }
 
                                dprintk("double low frequency table entry, ignoring it.\n");
@@ -804,31 +938,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
                        continue;
                }
        }
-
-       powernow_table[data->acpi_data.state_count].frequency = CPUFREQ_TABLE_END;
-       powernow_table[data->acpi_data.state_count].index = 0;
-       data->powernow_table = powernow_table;
-
-       /* fill in data */
-       data->numps = data->acpi_data.state_count;
-       print_basics(data);
-       powernow_k8_acpi_pst_values(data, 0);
-
-       /* notify BIOS that we exist */
-       acpi_processor_notify_smm(THIS_MODULE);
-
        return 0;
-
-err_out_mem:
-       kfree(powernow_table);
-
-err_out:
-       acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
-
-       /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */
-       data->acpi_data.state_count = 0;
-
-       return -ENODEV;
 }
 
 static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
@@ -844,20 +954,20 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned
 #endif /* CONFIG_X86_POWERNOW_K8_ACPI */
 
 /* Take a frequency, and issue the fid/vid transition command */
-static int transition_frequency(struct powernow_k8_data *data, unsigned int index)
+static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned int index)
 {
-       u32 fid;
-       u32 vid;
+       u32 fid = 0;
+       u32 vid = 0;
        int res, i;
        struct cpufreq_freqs freqs;
 
        dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
 
+       /* fid/vid correctness check for k8 */
        /* fid are the lower 8 bits of the index we stored into
-        * the cpufreq frequency table in find_psb_table, vid are
-        * the upper 8 bits.
+        * the cpufreq frequency table in find_psb_table, vid
+        * are the upper 8 bits.
         */
-
        fid = data->powernow_table[index].index & 0xFF;
        vid = (data->powernow_table[index].index & 0xFF00) >> 8;
 
@@ -881,22 +991,58 @@ static int transition_frequency(struct powernow_k8_data *data, unsigned int inde
 
        dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n",
                smp_processor_id(), fid, vid);
-
-       freqs.cpu = data->cpu;
        freqs.old = find_khz_freq_from_fid(data->currfid);
        freqs.new = find_khz_freq_from_fid(fid);
-       for_each_cpu_mask(i, cpu_core_map[data->cpu]) {
+
+       for_each_cpu_mask(i, *(data->available_cores)) {
                freqs.cpu = i;
                cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
        }
 
        res = transition_fid_vid(data, fid, vid);
-
        freqs.new = find_khz_freq_from_fid(data->currfid);
-       for_each_cpu_mask(i, cpu_core_map[data->cpu]) {
+
+       for_each_cpu_mask(i, *(data->available_cores)) {
                freqs.cpu = i;
                cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-        }
+       }
+       return res;
+}
+
+/* Take a frequency, and issue the hardware pstate transition command */
+static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned int index)
+{
+       u32 fid = 0;
+       u32 did = 0;
+       u32 pstate = 0;
+       int res, i;
+       struct cpufreq_freqs freqs;
+
+       dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
+
+       /* get fid did for hardware pstate transition */
+       pstate = index & HW_PSTATE_MASK;
+       if (pstate > MAX_HW_PSTATE)
+               return 0;
+       fid = (index & HW_FID_INDEX_MASK) >> HW_FID_INDEX_SHIFT;
+       did = (index & HW_DID_INDEX_MASK) >> HW_DID_INDEX_SHIFT;
+       freqs.old = find_khz_freq_from_fiddid(data->currfid, data->currdid);
+       freqs.new = find_khz_freq_from_fiddid(fid, did);
+
+       for_each_cpu_mask(i, *(data->available_cores)) {
+               freqs.cpu = i;
+               cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
+       }
+
+       res = transition_pstate(data, pstate);
+       data->currfid = find_fid_from_pstate(pstate);
+       data->currdid = find_did_from_pstate(pstate);
+       freqs.new = find_khz_freq_from_fiddid(data->currfid, data->currdid);
+
+       for_each_cpu_mask(i, *(data->available_cores)) {
+               freqs.cpu = i;
+               cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
+       }
        return res;
 }
 
@@ -933,18 +1079,21 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi
        dprintk("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n",
                pol->cpu, targfreq, pol->min, pol->max, relation);
 
-       if (query_current_values_with_pending_wait(data)) {
-               ret = -EIO;
+       if (query_current_values_with_pending_wait(data))
                goto err_out;
-       }
 
-       dprintk("targ: curr fid 0x%x, vid 0x%x\n",
+       if (cpu_family)
+               dprintk("targ: curr fid 0x%x, did 0x%x\n",
+                       data->currfid, data->currdid);
+       else {
+               dprintk("targ: curr fid 0x%x, vid 0x%x\n",
                data->currfid, data->currvid);
 
-       if ((checkvid != data->currvid) || (checkfid != data->currfid)) {
-               printk(KERN_INFO PFX
-                       "error - out of sync, fix 0x%x 0x%x, vid 0x%x 0x%x\n",
-                       checkfid, data->currfid, checkvid, data->currvid);
+               if ((checkvid != data->currvid) || (checkfid != data->currfid)) {
+                       printk(KERN_INFO PFX
+                               "error - out of sync, fix 0x%x 0x%x, vid 0x%x 0x%x\n",
+                               checkfid, data->currfid, checkvid, data->currvid);
+               }
        }
 
        if (cpufreq_frequency_table_target(pol, data->powernow_table, targfreq, relation, &newstate))
@@ -954,7 +1103,11 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi
 
        powernow_k8_acpi_pst_values(data, newstate);
 
-       if (transition_frequency(data, newstate)) {
+       if (cpu_family)
+               ret = transition_frequency_pstate(data, newstate);
+       else
+               ret = transition_frequency_fidvid(data, newstate);
+       if (ret) {
                printk(KERN_ERR PFX "transition frequency failed\n");
                ret = 1;
                mutex_unlock(&fidvid_mutex);
@@ -962,7 +1115,10 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi
        }
        mutex_unlock(&fidvid_mutex);
 
-       pol->cur = find_khz_freq_from_fid(data->currfid);
+       if (cpu_family)
+               pol->cur = find_khz_freq_from_fiddid(data->currfid, data->currdid);
+       else
+               pol->cur = find_khz_freq_from_fid(data->currfid);
        ret = 0;
 
 err_out:
@@ -1007,14 +1163,13 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
                 * Use the PSB BIOS structure. This is only available on
                 * an UP version, and is deprecated by AMD.
                 */
-
                if ((num_online_cpus() != 1) || (num_possible_cpus() != 1)) {
                        printk(KERN_ERR PFX "MP systems not supported by PSB BIOS structure\n");
                        kfree(data);
                        return -ENODEV;
                }
                if (pol->cpu != 0) {
-                       printk(KERN_ERR PFX "init not cpu 0\n");
+                       printk(KERN_ERR PFX "No _PSS objects for CPU other than CPU0\n");
                        kfree(data);
                        return -ENODEV;
                }
@@ -1042,20 +1197,28 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
        if (query_current_values_with_pending_wait(data))
                goto err_out;
 
-       fidvid_msr_init();
+       if (!cpu_family)
+               fidvid_msr_init();
 
        /* run on any CPU again */
        set_cpus_allowed(current, oldmask);
 
        pol->governor = CPUFREQ_DEFAULT_GOVERNOR;
-       pol->cpus = cpu_core_map[pol->cpu];
+       if (cpu_family)
+               pol->cpus = cpumask_of_cpu(pol->cpu);
+       else
+               pol->cpus = cpu_core_map[pol->cpu];
+       data->available_cores = &(pol->cpus);
 
        /* Take a crude guess here.
         * That guess was in microseconds, so multiply with 1000 */
        pol->cpuinfo.transition_latency = (((data->rvo + 8) * data->vstable * VST_UNITS_20US)
            + (3 * (1 << data->irt) * 10)) * 1000;
 
-       pol->cur = find_khz_freq_from_fid(data->currfid);
+       if (cpu_family)
+               pol->cur = find_khz_freq_from_fiddid(data->currfid, data->currdid);
+       else
+               pol->cur = find_khz_freq_from_fid(data->currfid);
        dprintk("policy current frequency %d kHz\n", pol->cur);
 
        /* min/max the cpu is capable of */
@@ -1069,8 +1232,12 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 
        cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
 
-       printk("cpu_init done, current fid 0x%x, vid 0x%x\n",
-              data->currfid, data->currvid);
+       if (cpu_family)
+               dprintk("cpu_init done, current fid 0x%x, did 0x%x\n",
+                       data->currfid, data->currdid);
+       else
+               dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n",
+                       data->currfid, data->currvid);
 
        powernow_data[pol->cpu] = data;
 
@@ -1156,8 +1323,9 @@ static int __cpuinit powernowk8_init(void)
        }
 
        if (supported_cpus == num_online_cpus()) {
-               printk(KERN_INFO PFX "Found %d AMD Athlon 64 / Opteron "
-                       "processors (" VERSION ")\n", supported_cpus);
+               printk(KERN_INFO PFX "Found %d %s "
+                       "processors (" VERSION ")\n", supported_cpus,
+                       boot_cpu_data.x86_model_id);
                return cpufreq_register_driver(&cpufreq_amd64_driver);
        }
 
index 79a7c5c..bf8ad9e 100644 (file)
@@ -1,5 +1,5 @@
 /*
- *  (c) 2003, 2004, 2005 Advanced Micro Devices, Inc.
+ *  (c) 2003-2006 Advanced Micro Devices, Inc.
  *  Your use of this code is subject to the terms and conditions of the
  *  GNU general public license version 2. See "COPYING" or
  *  http://www.gnu.org/licenses/gpl.html
@@ -21,8 +21,8 @@ struct powernow_k8_data {
        u32 plllock; /* pll lock time, units 1 us */
         u32 exttype; /* extended interface = 1 */
 
-       /* keep track of the current fid / vid */
-       u32 currvid, currfid;
+       /* keep track of the current fid / vid or did */
+       u32 currvid, currfid, currdid;
 
        /* the powernow_table includes all frequency and vid/fid pairings:
         * fid are the lower 8 bits of the index, vid are the upper 8 bits.
@@ -34,6 +34,10 @@ struct powernow_k8_data {
         * used to determine valid frequency/vid/fid states */
        struct acpi_processor_performance acpi_data;
 #endif
+       /* we need to keep track of associated cores, but let cpufreq
+        * handle hotplug events - so just point at cpufreq pol->cpus
+        * structure */
+       cpumask_t *available_cores;
 };
 
 
@@ -43,6 +47,7 @@ struct powernow_k8_data {
 #define CPUID_XFAM_K8                  0
 #define CPUID_XMOD                     0x000f0000      /* extended model */
 #define CPUID_XMOD_REV_G               0x00060000
+#define CPUID_XFAM_10H                         0x00100000      /* family 0x10 */
 #define CPUID_USE_XFAM_XMOD            0x00000f00
 #define CPUID_GET_MAX_CAPABILITIES     0x80000000
 #define CPUID_FREQ_VOLT_CAPABILITIES   0x80000007
@@ -79,6 +84,32 @@ struct powernow_k8_data {
 #define MSR_S_HI_CURRENT_VID      0x0000003f
 #define MSR_C_HI_STP_GNT_BENIGN          0x00000001
 
+
+/* Hardware Pstate _PSS and MSR definitions */
+#define USE_HW_PSTATE          0x00000080
+#define HW_PSTATE_FID_MASK     0x0000003f
+#define HW_PSTATE_DID_MASK     0x000001c0
+#define HW_PSTATE_DID_SHIFT    6
+#define HW_PSTATE_MASK                 0x00000007
+#define HW_PSTATE_VALID_MASK   0x80000000
+#define HW_FID_INDEX_SHIFT     8
+#define HW_FID_INDEX_MASK      0x0000ff00
+#define HW_DID_INDEX_SHIFT     16
+#define HW_DID_INDEX_MASK      0x00ff0000
+#define HW_WATTS_MASK          0xff
+#define HW_PWR_DVR_MASK                0x300
+#define HW_PWR_DVR_SHIFT       8
+#define HW_PWR_MAX_MULT                3
+#define MAX_HW_PSTATE          8       /* hw pstate supports up to 8 */
+#define MSR_PSTATE_DEF_BASE    0xc0010064 /* base of Pstate MSRs */
+#define MSR_PSTATE_STATUS      0xc0010063 /* Pstate Status MSR */
+#define MSR_PSTATE_CTRL        0xc0010062 /* Pstate control MSR */
+
+/* define the two driver architectures */
+#define CPU_OPTERON 0
+#define CPU_HW_PSTATE 1
+
+
 /*
  * There are restrictions frequencies have to follow:
  * - only 1 entry in the low fid table ( <=1.4GHz )
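These masks carve up each MSR_PSTATE_DEF_BASE register: in the low word, bits 0-5 hold the fid and bits 6-8 the did, while bit 31 of the high word flags the definition as valid. A decode sketch using the values above (the sample register contents are hypothetical):

#include <stdio.h>

#define HW_PSTATE_FID_MASK	0x0000003f
#define HW_PSTATE_DID_MASK	0x000001c0
#define HW_PSTATE_DID_SHIFT	6
#define HW_PSTATE_VALID_MASK	0x80000000

int main(void)
{
	unsigned int lo = 0x52, hi = 0x80000000;	/* hypothetical MSR halves */
	unsigned int fid = lo & HW_PSTATE_FID_MASK;
	unsigned int did = (lo & HW_PSTATE_DID_MASK) >> HW_PSTATE_DID_SHIFT;

	if (!(hi & HW_PSTATE_VALID_MASK)) {
		printf("pstate definition not valid\n");
		return 0;
	}
	printf("fid 0x%x, did 0x%x -> %u MHz\n",
	       fid, did, 100 * (fid + 0x10) >> did);	/* 0x52 -> 1700 MHz */
	return 0;
}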
@@ -182,6 +213,9 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
 
 static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index);
 
+static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
+static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
+
 #ifdef CONFIG_SMP
 static inline void define_siblings(int cpu, cpumask_t cpu_sharedcore_mask[])
 {
index b0ff907..ce54ff1 100644 (file)
@@ -250,7 +250,7 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy)
 
        if (model->cpu_id == NULL) {
                /* No match at all */
-               dprintk(KERN_INFO PFX "no support for CPU model \"%s\": "
+               dprintk("no support for CPU model \"%s\": "
                       "send /proc/cpuinfo to " MAINTAINER "\n",
                       cpu->x86_model_id);
                return -ENOENT;
@@ -258,10 +258,10 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy)
 
        if (model->op_points == NULL) {
                /* Matched a non-match */
-               dprintk(KERN_INFO PFX "no table support for CPU model \"%s\"\n",
+               dprintk("no table support for CPU model \"%s\"\n",
                       cpu->x86_model_id);
 #ifndef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI
-               dprintk(KERN_INFO PFX "try compiling with CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI enabled\n");
+               dprintk("try compiling with CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI enabled\n");
 #endif
                return -ENOENT;
        }
@@ -368,7 +368,7 @@ static int centrino_cpu_init_acpi(struct cpufreq_policy *policy)
 
        /* register with ACPI core */
        if (acpi_processor_register_performance(&p, cpu)) {
-               dprintk(KERN_INFO PFX "obtaining ACPI data failed\n");
+               dprintk("obtaining ACPI data failed\n");
                return -EIO;
        }
 
@@ -465,7 +465,7 @@ static int centrino_cpu_init_acpi(struct cpufreq_policy *policy)
        kfree(centrino_model[cpu]);
  err_unreg:
        acpi_processor_unregister_performance(&p, cpu);
-       dprintk(KERN_INFO PFX "invalid ACPI data\n");
+       dprintk("invalid ACPI data\n");
        return (result);
 }
 #else
@@ -499,7 +499,7 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
                        centrino_cpu[policy->cpu] = &cpu_ids[i];
 
                if (!centrino_cpu[policy->cpu]) {
-                       dprintk(KERN_INFO PFX "found unsupported CPU with "
+                       dprintk("found unsupported CPU with "
                        "Enhanced SpeedStep: send /proc/cpuinfo to "
                        MAINTAINER "\n");
                        return -ENODEV;
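The KERN_INFO PFX removals above are not cosmetic: the cpufreq dprintk macro already supplies a log level and module prefix of its own, so the old calls embedded a second level string and prefix in the middle of the message. Roughly (the macro body here is a simplified stand-in for the real cpufreq_debug_printk() plumbing):

#include <stdio.h>

#define KERN_INFO "<6>"
#define PFX "speedstep-centrino: "
/* simplified stand-in: the real dprintk prepends a level and prefix itself */
#define dprintk(fmt, ...) printf("<7>" PFX fmt, ##__VA_ARGS__)

int main(void)
{
	dprintk("invalid ACPI data\n");			/* new form: one prefix */
	dprintk(KERN_INFO PFX "invalid ACPI data\n");	/* old form: doubled */
	return 0;
}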
index aeb5ab2..8b11ceb 100644 (file)
@@ -72,4 +72,6 @@ source "drivers/edac/Kconfig"
 
 source "drivers/rtc/Kconfig"
 
+source "drivers/dma/Kconfig"
+
 endmenu
index 447d8e6..3c51703 100644 (file)
@@ -74,3 +74,4 @@ obj-$(CONFIG_SGI_SN)          += sn/
 obj-y                          += firmware/
 obj-$(CONFIG_CRYPTO)           += crypto/
 obj-$(CONFIG_SUPERH)           += sh/
+obj-$(CONFIG_DMA_ENGINE)       += dma/
index 07bc6df..8920e8c 100644 (file)
@@ -812,6 +812,9 @@ static int irqrouter_resume(struct sys_device *dev)
 
        ACPI_FUNCTION_TRACE("irqrouter_resume");
 
+       /* Make sure SCI is enabled again (Apple firmware bug?) */
+       acpi_set_register(ACPI_BITREG_SCI_ENABLE, 1, ACPI_MTX_DO_NOT_LOCK);
+
        acpi_in_resume = 1;
        list_for_each(node, &acpi_link.entries) {
                link = list_entry(node, struct acpi_pci_link, node);
index fdff774..c1434ed 100644 (file)
@@ -116,8 +116,7 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt,
        skb = skb_share_check(skb, GFP_ATOMIC);
        if (skb == NULL)
                return 0;
-       if (skb_is_nonlinear(skb))
-       if (skb_linearize(skb, GFP_ATOMIC) < 0)
+       if (skb_linearize(skb))
                goto exit;
        if (!is_aoe_netif(ifp))
                goto exit;
index 2b5838e..b4e00a3 100644 (file)
@@ -46,12 +46,6 @@ struct vm_operations_struct alpha_core_agp_vm_ops = {
 };
 
 
-static int alpha_core_agp_nop(void)
-{
-       /* just return success */
-       return 0;
-}
-
 static int alpha_core_agp_fetch_size(void)
 {
        return alpha_core_agp_sizes[0].size;
@@ -120,6 +114,11 @@ static int alpha_core_agp_remove_memory(struct agp_memory *mem, off_t pg_start,
        return status;
 }
 
+static int alpha_core_agp_create_free_gatt_table(struct agp_bridge_data *a)
+{
+       return 0;
+}
+
 struct agp_bridge_driver alpha_core_agp_driver = {
        .owner                  = THIS_MODULE,
        .aperture_sizes         = alpha_core_agp_sizes,
@@ -135,8 +134,8 @@ struct agp_bridge_driver alpha_core_agp_driver = {
        .tlb_flush              = alpha_core_agp_tlbflush,
        .mask_memory            = agp_generic_mask_memory,
        .cache_flush            = global_cache_flush,
-       .create_gatt_table      = alpha_core_agp_nop,
-       .free_gatt_table        = alpha_core_agp_nop,
+       .create_gatt_table      = alpha_core_agp_create_free_gatt_table,
+       .free_gatt_table        = alpha_core_agp_create_free_gatt_table,
        .insert_memory          = alpha_core_agp_insert_memory,
        .remove_memory          = alpha_core_agp_remove_memory,
        .alloc_by_type          = agp_generic_alloc_by_type,
index 4e1891e..a92ab53 100644 (file)
@@ -809,12 +809,10 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge)
                                case U32_APER_SIZE:
                                        bridge->current_size = A_IDX32(bridge);
                                        break;
-                                       /* This case will never really happen. */
+                               /* These cases will never really happen. */
                                case FIXED_APER_SIZE:
                                case LVL2_APER_SIZE:
                                default:
-                                       bridge->current_size =
-                                           bridge->current_size;
                                        break;
                                }
                                temp = bridge->current_size;
index bddcae5..61ac380 100644 (file)
@@ -736,7 +736,7 @@ static int intel_i915_remove_entries(struct agp_memory *mem,off_t pg_start,
 static int intel_i915_fetch_size(void)
 {
        struct aper_size_info_fixed *values;
-       u32 temp, offset = 0;
+       u32 temp, offset;
 
 #define I915_256MB_ADDRESS_MASK (1<<27)
 
index 9846def..1de1b12 100644 (file)
@@ -329,7 +329,7 @@ static int agp_uninorth_suspend(struct pci_dev *pdev)
        /* turn off AGP on the bridge */
        agp = pci_find_capability(pdev, PCI_CAP_ID_AGP);
        pci_read_config_dword(pdev, agp + PCI_AGP_COMMAND, &cmd);
-       bridge->dev_private_data = (void *)cmd;
+       bridge->dev_private_data = (void *)(long)cmd;
        if (cmd & PCI_AGP_COMMAND_AGP) {
                printk("uninorth-agp: disabling AGP on bridge %s\n",
                                pci_name(pdev));
@@ -351,7 +351,7 @@ static int agp_uninorth_resume(struct pci_dev *pdev)
        if (bridge == NULL)
                return -ENODEV;
 
-       command = (u32)bridge->dev_private_data;
+       command = (long)bridge->dev_private_data;
        bridge->dev_private_data = NULL;
        if (!(command & PCI_AGP_COMMAND_AGP))
                return 0;
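The added (long) in the uninorth suspend/resume path is the usual idiom for parking a 32-bit register value in a void * slot on 64-bit kernels: casting a u32 straight to a pointer draws a "cast to pointer from integer of different size" warning, so the value is widened through long (which matches pointer width on Linux) and narrowed back the same way. In miniature (the command word is hypothetical):

#include <stdio.h>

int main(void)
{
	unsigned int cmd = 0x12345678;	/* hypothetical AGP command word */
	void *priv;

	priv = (void *)(long)cmd;	/* widen via long: no size-mismatch warning */
	cmd = (unsigned int)(long)priv;	/* narrow back the same way */

	printf("round trip: 0x%x\n", cmd);
	return 0;
}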
index 9f2f00d..05f8ce2 100644 (file)
@@ -127,7 +127,7 @@ void cn_queue_del_callback(struct cn_queue_dev *dev, struct cb_id *id)
 
        if (found) {
                cn_queue_free_callback(cbq);
-               atomic_dec_and_test(&dev->refcnt);
+               atomic_dec(&dev->refcnt);
        }
 }
 
index 29b2fa5..44d1eca 100644 (file)
@@ -257,7 +257,7 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state)
                if (!(cpufreq_driver->flags & CPUFREQ_CONST_LOOPS)) {
                        if ((policy) && (policy->cpu == freqs->cpu) &&
                            (policy->cur) && (policy->cur != freqs->old)) {
-                               dprintk(KERN_WARNING "Warning: CPU frequency is"
+                               dprintk("Warning: CPU frequency is"
                                        " %u, cpufreq assumed %u kHz.\n",
                                        freqs->old, policy->cur);
                                freqs->old = policy->cur;
@@ -874,7 +874,7 @@ static void cpufreq_out_of_sync(unsigned int cpu, unsigned int old_freq, unsigne
 {
        struct cpufreq_freqs freqs;
 
-       dprintk(KERN_WARNING "Warning: CPU frequency out of sync: cpufreq and timing "
+       dprintk("Warning: CPU frequency out of sync: cpufreq and timing "
               "core thinks of %u, is %u kHz.\n", old_freq, new_freq);
 
        freqs.cpu = cpu;
@@ -1006,7 +1006,7 @@ static int cpufreq_suspend(struct sys_device * sysdev, pm_message_t pmsg)
                struct cpufreq_freqs freqs;
 
                if (!(cpufreq_driver->flags & CPUFREQ_PM_NO_WARN))
-                       dprintk(KERN_DEBUG "Warning: CPU frequency is %u, "
+                       dprintk("Warning: CPU frequency is %u, "
                               "cpufreq assumed %u kHz.\n",
                               cur_freq, cpu_policy->cur);
 
@@ -1087,7 +1087,7 @@ static int cpufreq_resume(struct sys_device * sysdev)
                        struct cpufreq_freqs freqs;
 
                        if (!(cpufreq_driver->flags & CPUFREQ_PM_NO_WARN))
-                               dprintk(KERN_WARNING "Warning: CPU frequency"
+                               dprintk("Warning: CPU frequency "
                                       "is %u, cpufreq assumed %u kHz.\n",
                                       cur_freq, cpu_policy->cur);
 
index 9694b6e..c576c0b 100644 (file)
@@ -74,7 +74,7 @@ static ssize_t
 show_total_trans(struct cpufreq_policy *policy, char *buf)
 {
        struct cpufreq_stats *stat = cpufreq_stats_table[policy->cpu];
-       if(!stat)
+       if (!stat)
                return 0;
        return sprintf(buf, "%d\n",
                        cpufreq_stats_table[stat->cpu]->total_trans);
@@ -86,7 +86,7 @@ show_time_in_state(struct cpufreq_policy *policy, char *buf)
        ssize_t len = 0;
        int i;
        struct cpufreq_stats *stat = cpufreq_stats_table[policy->cpu];
-       if(!stat)
+       if (!stat)
                return 0;
        cpufreq_stats_update(stat->cpu);
        for (i = 0; i < stat->state_num; i++) {
@@ -104,7 +104,7 @@ show_trans_table(struct cpufreq_policy *policy, char *buf)
        int i, j;
 
        struct cpufreq_stats *stat = cpufreq_stats_table[policy->cpu];
-       if(!stat)
+       if (!stat)
                return 0;
        cpufreq_stats_update(stat->cpu);
        len += snprintf(buf + len, PAGE_SIZE - len, "   From  :    To\n");
index a4818ce..551f4cc 100644 (file)
@@ -20,7 +20,7 @@ int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy,
 {
        unsigned int min_freq = ~0;
        unsigned int max_freq = 0;
-       unsigned int i = 0;
+       unsigned int i;
 
        for (i=0; (table[i].frequency != CPUFREQ_TABLE_END); i++) {
                unsigned int freq = table[i].frequency;
@@ -51,7 +51,7 @@ int cpufreq_frequency_table_verify(struct cpufreq_policy *policy,
                                   struct cpufreq_frequency_table *table)
 {
        unsigned int next_larger = ~0;
-       unsigned int i = 0;
+       unsigned int i;
        unsigned int count = 0;
 
        dprintk("request for verification of policy (%u - %u kHz) for cpu %u\n", policy->min, policy->max, policy->cpu);
@@ -91,20 +91,24 @@ int cpufreq_frequency_table_target(struct cpufreq_policy *policy,
                                   unsigned int relation,
                                   unsigned int *index)
 {
-       struct cpufreq_frequency_table optimal = { .index = ~0, };
-       struct cpufreq_frequency_table suboptimal = { .index = ~0, };
+       struct cpufreq_frequency_table optimal = {
+               .index = ~0,
+               .frequency = 0,
+       };
+       struct cpufreq_frequency_table suboptimal = {
+               .index = ~0,
+               .frequency = 0,
+       };
        unsigned int i;
 
        dprintk("request for target %u kHz (relation: %u) for cpu %u\n", target_freq, relation, policy->cpu);
 
        switch (relation) {
        case CPUFREQ_RELATION_H:
-               optimal.frequency = 0;
                suboptimal.frequency = ~0;
                break;
        case CPUFREQ_RELATION_L:
                optimal.frequency = ~0;
-               suboptimal.frequency = 0;
                break;
        }
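The freq_table rewrite above only hoists the shared zero-initialization into the initializers; each relation then overrides one seed. For CPUFREQ_RELATION_H (highest frequency at or below the target) the suboptimal seed becomes ~0, for CPUFREQ_RELATION_L (lowest at or above) the optimal seed becomes ~0, so any real table entry beats the seed. A simplified sketch of the selection rule, leaving out the invalid-entry and suboptimal-fallback handling of the real helper:

#include <stdio.h>

enum { RELATION_H, RELATION_L };

static unsigned int pick(const unsigned int *t, int n,
			 unsigned int target, int rel)
{
	unsigned int best = (rel == RELATION_L) ? ~0u : 0;	/* the seed */
	int i;

	for (i = 0; i < n; i++) {
		if (rel == RELATION_H && t[i] <= target && t[i] > best)
			best = t[i];
		if (rel == RELATION_L && t[i] >= target && t[i] < best)
			best = t[i];
	}
	return best;
}

int main(void)
{
	unsigned int table[] = { 800000, 1800000, 2000000 };	/* kHz, hypothetical */

	printf("H for 1900000: %u\n", pick(table, 3, 1900000, RELATION_H));
	printf("L for 1900000: %u\n", pick(table, 3, 1900000, RELATION_L));
	return 0;
}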
 
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
new file mode 100644 (file)
index 0000000..30d021d
--- /dev/null
@@ -0,0 +1,34 @@
+#
+# DMA engine configuration
+#
+
+menu "DMA Engine support"
+
+config DMA_ENGINE
+       bool "Support for DMA engines"
+       ---help---
+         DMA engines offload copy operations from the CPU to dedicated
+         hardware, allowing the copies to happen asynchronously.
+
+comment "DMA Clients"
+
+config NET_DMA
+       bool "Network: TCP receive copy offload"
+       depends on DMA_ENGINE && NET
+       default y
+       ---help---
+         This enables the use of DMA engines in the network stack to
+         offload receive copy-to-user operations, freeing CPU cycles.
+         Since this is the main user of the DMA engine, it should be enabled;
+         say Y here.
+
+comment "DMA Devices"
+
+config INTEL_IOATDMA
+       tristate "Intel I/OAT DMA support"
+       depends on DMA_ENGINE && PCI
+       default m
+       ---help---
+         Enable support for the Intel(R) I/OAT DMA engine.
+
+endmenu
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
new file mode 100644 (file)
index 0000000..bdcfdbd
--- /dev/null
@@ -0,0 +1,3 @@
+obj-$(CONFIG_DMA_ENGINE) += dmaengine.o
+obj-$(CONFIG_NET_DMA) += iovlock.o
+obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
new file mode 100644 (file)
index 0000000..5829143
--- /dev/null
@@ -0,0 +1,408 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This code implements the DMA subsystem. It provides a HW-neutral interface
+ * for other kernel code to use asynchronous memory copy capabilities,
+ * if present, and allows different HW DMA drivers to register as providing
+ * this capability.
+ *
+ * Because we are accelerating what is already a relatively fast
+ * operation, the code goes to great lengths to avoid additional overhead,
+ * such as locking.
+ *
+ * LOCKING:
+ *
+ * The subsystem keeps two global lists, dma_device_list and dma_client_list.
+ * Both of these are protected by a mutex, dma_list_mutex.
+ *
+ * Each device has a channels list, which runs unlocked but is never modified
+ * once the device is registered; it is simply set up by the driver.
+ *
+ * Each client has a channels list, it's only modified under the client->lock
+ * and in an RCU callback, so it's safe to read under rcu_read_lock().
+ *
+ * Each device has a kref, which is initialized to 1 when the device is
+ * registered. A kref_get is done for each class_device registered.  When the
+ * class_device is released, the corresponding kref_put is done in the release
+ * method. Every time one of the device's channels is allocated to a client,
+ * a kref_get occurs.  When the channel is freed, the corresponding kref_put
+ * happens. The device's release function does a completion, so
+ * unregister_device does a remove event, class_device_unregister, a kref_put
+ * for the first reference, then waits on the completion for all other
+ * references to finish.
+ *
+ * Each channel has an open-coded implementation of Rusty Russell's "bigref,"
+ * with a kref and a per_cpu local_t.  A single reference is taken on an
+ * ADDED event, and removed with a REMOVE event.  The net DMA client takes an
+ * extra reference per outstanding transaction.  The release function does a
+ * kref_put on the device. -ChrisL
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/dmaengine.h>
+#include <linux/hardirq.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/mutex.h>
+
+static DEFINE_MUTEX(dma_list_mutex);
+static LIST_HEAD(dma_device_list);
+static LIST_HEAD(dma_client_list);
+
+/* --- sysfs implementation --- */
+
+static ssize_t show_memcpy_count(struct class_device *cd, char *buf)
+{
+       struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+       unsigned long count = 0;
+       int i;
+
+       for_each_possible_cpu(i)
+               count += per_cpu_ptr(chan->local, i)->memcpy_count;
+
+       return sprintf(buf, "%lu\n", count);
+}
+
+static ssize_t show_bytes_transferred(struct class_device *cd, char *buf)
+{
+       struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+       unsigned long count = 0;
+       int i;
+
+       for_each_possible_cpu(i)
+               count += per_cpu_ptr(chan->local, i)->bytes_transferred;
+
+       return sprintf(buf, "%lu\n", count);
+}
+
+static ssize_t show_in_use(struct class_device *cd, char *buf)
+{
+       struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+
+       return sprintf(buf, "%d\n", (chan->client ? 1 : 0));
+}
+
+static struct class_device_attribute dma_class_attrs[] = {
+       __ATTR(memcpy_count, S_IRUGO, show_memcpy_count, NULL),
+       __ATTR(bytes_transferred, S_IRUGO, show_bytes_transferred, NULL),
+       __ATTR(in_use, S_IRUGO, show_in_use, NULL),
+       __ATTR_NULL
+};
+
+static void dma_async_device_cleanup(struct kref *kref);
+
+static void dma_class_dev_release(struct class_device *cd)
+{
+       struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+       kref_put(&chan->device->refcount, dma_async_device_cleanup);
+}
+
+static struct class dma_devclass = {
+       .name            = "dma",
+       .class_dev_attrs = dma_class_attrs,
+       .release = dma_class_dev_release,
+};
+
+/* --- client and device registration --- */
+
+/**
+ * dma_client_chan_alloc - try to allocate a channel to a client
+ * @client: &dma_client
+ *
+ * Called with dma_list_mutex held.
+ */
+static struct dma_chan *dma_client_chan_alloc(struct dma_client *client)
+{
+       struct dma_device *device;
+       struct dma_chan *chan;
+       unsigned long flags;
+       int desc;       /* allocated descriptor count */
+
+       /* Find a channel, any DMA engine will do */
+       list_for_each_entry(device, &dma_device_list, global_node) {
+               list_for_each_entry(chan, &device->channels, device_node) {
+                       if (chan->client)
+                               continue;
+
+                       desc = chan->device->device_alloc_chan_resources(chan);
+                       if (desc >= 0) {
+                               kref_get(&device->refcount);
+                               kref_init(&chan->refcount);
+                               chan->slow_ref = 0;
+                               INIT_RCU_HEAD(&chan->rcu);
+                               chan->client = client;
+                               spin_lock_irqsave(&client->lock, flags);
+                               list_add_tail_rcu(&chan->client_node,
+                                                 &client->channels);
+                               spin_unlock_irqrestore(&client->lock, flags);
+                               return chan;
+                       }
+               }
+       }
+
+       return NULL;
+}
+
+/**
+ * dma_chan_cleanup - release a DMA channel and drop its device reference
+ * @kref: channel reference count that has dropped to zero
+ */
+void dma_chan_cleanup(struct kref *kref)
+{
+       struct dma_chan *chan = container_of(kref, struct dma_chan, refcount);
+       chan->device->device_free_chan_resources(chan);
+       chan->client = NULL;
+       kref_put(&chan->device->refcount, dma_async_device_cleanup);
+}
+
+static void dma_chan_free_rcu(struct rcu_head *rcu)
+{
+       struct dma_chan *chan = container_of(rcu, struct dma_chan, rcu);
+       int bias = 0x7FFFFFFF;
+       int i;
+       for_each_possible_cpu(i)
+               bias -= local_read(&per_cpu_ptr(chan->local, i)->refcount);
+       atomic_sub(bias, &chan->refcount.refcount);
+       kref_put(&chan->refcount, dma_chan_cleanup);
+}
+
+static void dma_client_chan_free(struct dma_chan *chan)
+{
+       atomic_add(0x7FFFFFFF, &chan->refcount.refcount);
+       chan->slow_ref = 1;
+       call_rcu(&chan->rcu, dma_chan_free_rcu);
+}
+
+/**
+ * dma_chans_rebalance - reallocate channels to clients
+ *
+ * When the number of DMA channels in the system changes,
+ * channels need to be rebalanced among clients.
+ */
+static void dma_chans_rebalance(void)
+{
+       struct dma_client *client;
+       struct dma_chan *chan;
+       unsigned long flags;
+
+       mutex_lock(&dma_list_mutex);
+
+       list_for_each_entry(client, &dma_client_list, global_node) {
+               while (client->chans_desired > client->chan_count) {
+                       chan = dma_client_chan_alloc(client);
+                       if (!chan)
+                               break;
+                       client->chan_count++;
+                       client->event_callback(client,
+                                              chan,
+                                              DMA_RESOURCE_ADDED);
+               }
+               while (client->chans_desired < client->chan_count) {
+                       spin_lock_irqsave(&client->lock, flags);
+                       chan = list_entry(client->channels.next,
+                                         struct dma_chan,
+                                         client_node);
+                       list_del_rcu(&chan->client_node);
+                       spin_unlock_irqrestore(&client->lock, flags);
+                       client->chan_count--;
+                       client->event_callback(client,
+                                              chan,
+                                              DMA_RESOURCE_REMOVED);
+                       dma_client_chan_free(chan);
+               }
+       }
+
+       mutex_unlock(&dma_list_mutex);
+}
+
+/**
+ * dma_async_client_register - allocate and register a &dma_client
+ * @event_callback: callback for notification of channel addition/removal
+ */
+struct dma_client *dma_async_client_register(dma_event_callback event_callback)
+{
+       struct dma_client *client;
+
+       client = kzalloc(sizeof(*client), GFP_KERNEL);
+       if (!client)
+               return NULL;
+
+       INIT_LIST_HEAD(&client->channels);
+       spin_lock_init(&client->lock);
+       client->chans_desired = 0;
+       client->chan_count = 0;
+       client->event_callback = event_callback;
+
+       mutex_lock(&dma_list_mutex);
+       list_add_tail(&client->global_node, &dma_client_list);
+       mutex_unlock(&dma_list_mutex);
+
+       return client;
+}
+
+/**
+ * dma_async_client_unregister - unregister a client and free the &dma_client
+ * @client: &dma_client to unregister
+ *
+ * Force-frees any allocated DMA channels and frees the &dma_client memory.
+ */
+void dma_async_client_unregister(struct dma_client *client)
+{
+       struct dma_chan *chan;
+
+       if (!client)
+               return;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(chan, &client->channels, client_node)
+               dma_client_chan_free(chan);
+       rcu_read_unlock();
+
+       mutex_lock(&dma_list_mutex);
+       list_del(&client->global_node);
+       mutex_unlock(&dma_list_mutex);
+
+       kfree(client);
+       dma_chans_rebalance();
+}
+
+/**
+ * dma_async_client_chan_request - request DMA channels
+ * @client: &dma_client
+ * @number: count of DMA channels requested
+ *
+ * Clients call dma_async_client_chan_request() to specify how many
+ * DMA channels they need, 0 to free all currently allocated.
+ * The resulting allocations/frees are indicated to the client via the
+ * event callback.
+ */
+void dma_async_client_chan_request(struct dma_client *client,
+                       unsigned int number)
+{
+       client->chans_desired = number;
+       dma_chans_rebalance();
+}
+
+/**
+ * dma_async_device_register - register a &dma_device with the DMA subsystem
+ * @device: &dma_device to register
+ */
+int dma_async_device_register(struct dma_device *device)
+{
+       static int id;
+       int chancnt = 0;
+       struct dma_chan* chan;
+
+       if (!device)
+               return -ENODEV;
+
+       init_completion(&device->done);
+       kref_init(&device->refcount);
+       device->dev_id = id++;
+
+       /* represent channels in sysfs. Probably want devs too */
+       list_for_each_entry(chan, &device->channels, device_node) {
+               chan->local = alloc_percpu(typeof(*chan->local));
+               if (chan->local == NULL)
+                       continue;
+
+               chan->chan_id = chancnt++;
+               chan->class_dev.class = &dma_devclass;
+               chan->class_dev.dev = NULL;
+               snprintf(chan->class_dev.class_id, BUS_ID_SIZE, "dma%dchan%d",
+                        device->dev_id, chan->chan_id);
+
+               kref_get(&device->refcount);
+               class_device_register(&chan->class_dev);
+       }
+
+       mutex_lock(&dma_list_mutex);
+       list_add_tail(&device->global_node, &dma_device_list);
+       mutex_unlock(&dma_list_mutex);
+
+       dma_chans_rebalance();
+
+       return 0;
+}
+
+/**
+ * dma_async_device_cleanup - kref release callback for a &dma_device
+ * @kref: device reference count that has dropped to zero
+ */
+static void dma_async_device_cleanup(struct kref *kref)
+{
+       struct dma_device *device;
+
+       device = container_of(kref, struct dma_device, refcount);
+       complete(&device->done);
+}
+
+void dma_async_device_unregister(struct dma_device* device)
+{
+       struct dma_chan *chan;
+       unsigned long flags;
+
+       mutex_lock(&dma_list_mutex);
+       list_del(&device->global_node);
+       mutex_unlock(&dma_list_mutex);
+
+       list_for_each_entry(chan, &device->channels, device_node) {
+               if (chan->client) {
+                       spin_lock_irqsave(&chan->client->lock, flags);
+                       list_del(&chan->client_node);
+                       chan->client->chan_count--;
+                       spin_unlock_irqrestore(&chan->client->lock, flags);
+                       chan->client->event_callback(chan->client,
+                                                    chan,
+                                                    DMA_RESOURCE_REMOVED);
+                       dma_client_chan_free(chan);
+               }
+               class_device_unregister(&chan->class_dev);
+       }
+       dma_chans_rebalance();
+
+       kref_put(&device->refcount, dma_async_device_cleanup);
+       wait_for_completion(&device->done);
+}
+
+static int __init dma_bus_init(void)
+{
+       mutex_init(&dma_list_mutex);
+       return class_register(&dma_devclass);
+}
+
+subsys_initcall(dma_bus_init);
+
+EXPORT_SYMBOL(dma_async_client_register);
+EXPORT_SYMBOL(dma_async_client_unregister);
+EXPORT_SYMBOL(dma_async_client_chan_request);
+EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf);
+EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg);
+EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg);
+EXPORT_SYMBOL(dma_async_memcpy_complete);
+EXPORT_SYMBOL(dma_async_memcpy_issue_pending);
+EXPORT_SYMBOL(dma_async_device_register);
+EXPORT_SYMBOL(dma_async_device_unregister);
+EXPORT_SYMBOL(dma_chan_cleanup);
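The locking comment at the top of this file describes the channel lifetime scheme only briefly; the moving parts are dma_client_chan_free(), which adds a 0x7FFFFFFF bias to the kref and sets slow_ref, and dma_chan_free_rcu(), which subtracts the bias minus the per-cpu fast-path counts once readers are done. A single-threaded user-space miniature of the bias arithmetic, with plain integers standing in for kref and local_t and hypothetical per-cpu counts (long long is used only to dodge signed overflow in the demo):

#include <stdio.h>

#define BIAS 0x7FFFFFFFLL
#define NCPUS 4

int main(void)
{
	long long central = 1;			/* chan->refcount at register time */
	int percpu[NCPUS] = { 3, 0, 5, 1 };	/* hypothetical fast-path counts */
	long long bias = BIAS;
	int i;

	central += BIAS;	/* dma_client_chan_free(): flip to the slow path */

	/* dma_chan_free_rcu(): fold the per-cpu counts back into the kref */
	for (i = 0; i < NCPUS; i++)
		bias -= percpu[i];
	central -= bias;

	printf("remaining refs: %lld\n", central);	/* 1 + sum(percpu) = 10 */
	return 0;
}

The real dma_chan_free_rcu() then drops one more reference via kref_put(), after which only the outstanding per-transaction references remain; the channel is destroyed once those drain.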
diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
new file mode 100644 (file)
index 0000000..0fdf7fb
--- /dev/null
@@ -0,0 +1,840 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This driver supports an Intel I/OAT DMA engine, which does asynchronous
+ * copy operations.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/dmaengine.h>
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include "ioatdma.h"
+#include "ioatdma_io.h"
+#include "ioatdma_registers.h"
+#include "ioatdma_hw.h"
+
+#define to_ioat_chan(chan) container_of(chan, struct ioat_dma_chan, common)
+#define to_ioat_device(dev) container_of(dev, struct ioat_device, common)
+#define to_ioat_desc(lh) container_of(lh, struct ioat_desc_sw, node)
+
+/* internal functions */
+static int __devinit ioat_probe(struct pci_dev *pdev, const struct pci_device_id *ent);
+static void __devexit ioat_remove(struct pci_dev *pdev);
+
+static int enumerate_dma_channels(struct ioat_device *device)
+{
+       u8 xfercap_scale;
+       u32 xfercap;
+       int i;
+       struct ioat_dma_chan *ioat_chan;
+
+       device->common.chancnt = ioatdma_read8(device, IOAT_CHANCNT_OFFSET);
+       xfercap_scale = ioatdma_read8(device, IOAT_XFERCAP_OFFSET);
+       xfercap = (xfercap_scale == 0 ? -1 : (1UL << xfercap_scale));
+
+       for (i = 0; i < device->common.chancnt; i++) {
+               ioat_chan = kzalloc(sizeof(*ioat_chan), GFP_KERNEL);
+               if (!ioat_chan) {
+                       device->common.chancnt = i;
+                       break;
+               }
+
+               ioat_chan->device = device;
+               ioat_chan->reg_base = device->reg_base + (0x80 * (i + 1));
+               ioat_chan->xfercap = xfercap;
+               spin_lock_init(&ioat_chan->cleanup_lock);
+               spin_lock_init(&ioat_chan->desc_lock);
+               INIT_LIST_HEAD(&ioat_chan->free_desc);
+               INIT_LIST_HEAD(&ioat_chan->used_desc);
+               /* This should be made common somewhere in dmaengine.c */
+               ioat_chan->common.device = &device->common;
+               ioat_chan->common.client = NULL;
+               list_add_tail(&ioat_chan->common.device_node,
+                             &device->common.channels);
+       }
+       return device->common.chancnt;
+}
+
+static struct ioat_desc_sw *ioat_dma_alloc_descriptor(
+       struct ioat_dma_chan *ioat_chan,
+       int flags)
+{
+       struct ioat_dma_descriptor *desc;
+       struct ioat_desc_sw *desc_sw;
+       struct ioat_device *ioat_device;
+       dma_addr_t phys;
+
+       ioat_device = to_ioat_device(ioat_chan->common.device);
+       desc = pci_pool_alloc(ioat_device->dma_pool, flags, &phys);
+       if (unlikely(!desc))
+               return NULL;
+
+       desc_sw = kzalloc(sizeof(*desc_sw), flags);
+       if (unlikely(!desc_sw)) {
+               pci_pool_free(ioat_device->dma_pool, desc, phys);
+               return NULL;
+       }
+
+       memset(desc, 0, sizeof(*desc));
+       desc_sw->hw = desc;
+       desc_sw->phys = phys;
+
+       return desc_sw;
+}
+
+#define INITIAL_IOAT_DESC_COUNT 128
+
+static void ioat_start_null_desc(struct ioat_dma_chan *ioat_chan);
+
+/* returns the actual number of allocated descriptors */
+static int ioat_dma_alloc_chan_resources(struct dma_chan *chan)
+{
+       struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
+       struct ioat_desc_sw *desc = NULL;
+       u16 chanctrl;
+       u32 chanerr;
+       int i;
+       LIST_HEAD(tmp_list);
+
+       /*
+        * In-use bit automatically set by reading chanctrl
+        * If 0, we got it, if 1, someone else did
+        */
+       chanctrl = ioatdma_chan_read16(ioat_chan, IOAT_CHANCTRL_OFFSET);
+       if (chanctrl & IOAT_CHANCTRL_CHANNEL_IN_USE)
+               return -EBUSY;
+
+       /* Set up the register to interrupt and write completion status on error */
+       chanctrl = IOAT_CHANCTRL_CHANNEL_IN_USE |
+               IOAT_CHANCTRL_ERR_INT_EN |
+               IOAT_CHANCTRL_ANY_ERR_ABORT_EN |
+               IOAT_CHANCTRL_ERR_COMPLETION_EN;
+       ioatdma_chan_write16(ioat_chan, IOAT_CHANCTRL_OFFSET, chanctrl);
+
+       chanerr = ioatdma_chan_read32(ioat_chan, IOAT_CHANERR_OFFSET);
+       if (chanerr) {
+               printk(KERN_ERR "IOAT: CHANERR = %x, clearing\n", chanerr);
+               ioatdma_chan_write32(ioat_chan, IOAT_CHANERR_OFFSET, chanerr);
+       }
+
+       /* Allocate descriptors */
+       for (i = 0; i < INITIAL_IOAT_DESC_COUNT; i++) {
+               desc = ioat_dma_alloc_descriptor(ioat_chan, GFP_KERNEL);
+               if (!desc) {
+                       printk(KERN_ERR "IOAT: Only %d initial descriptors\n", i);
+                       break;
+               }
+               list_add_tail(&desc->node, &tmp_list);
+       }
+       spin_lock_bh(&ioat_chan->desc_lock);
+       list_splice(&tmp_list, &ioat_chan->free_desc);
+       spin_unlock_bh(&ioat_chan->desc_lock);
+
+       /* allocate a completion writeback area */
+       /* doing 2 32-bit writes to mmio since 1 64-bit write doesn't work */
+       ioat_chan->completion_virt =
+               pci_pool_alloc(ioat_chan->device->completion_pool,
+                              GFP_KERNEL,
+                              &ioat_chan->completion_addr);
+       memset(ioat_chan->completion_virt, 0,
+              sizeof(*ioat_chan->completion_virt));
+       ioatdma_chan_write32(ioat_chan, IOAT_CHANCMP_OFFSET_LOW,
+                      ((u64) ioat_chan->completion_addr) & 0x00000000FFFFFFFF);
+       ioatdma_chan_write32(ioat_chan, IOAT_CHANCMP_OFFSET_HIGH,
+                      ((u64) ioat_chan->completion_addr) >> 32);
+
+       ioat_start_null_desc(ioat_chan);
+       return i;
+}
+
+static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *ioat_chan);
+
+static void ioat_dma_free_chan_resources(struct dma_chan *chan)
+{
+       struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
+       struct ioat_device *ioat_device = to_ioat_device(chan->device);
+       struct ioat_desc_sw *desc, *_desc;
+       u16 chanctrl;
+       int in_use_descs = 0;
+
+       ioat_dma_memcpy_cleanup(ioat_chan);
+
+       ioatdma_chan_write8(ioat_chan, IOAT_CHANCMD_OFFSET, IOAT_CHANCMD_RESET);
+
+       spin_lock_bh(&ioat_chan->desc_lock);
+       list_for_each_entry_safe(desc, _desc, &ioat_chan->used_desc, node) {
+               in_use_descs++;
+               list_del(&desc->node);
+               pci_pool_free(ioat_device->dma_pool, desc->hw, desc->phys);
+               kfree(desc);
+       }
+       list_for_each_entry_safe(desc, _desc, &ioat_chan->free_desc, node) {
+               list_del(&desc->node);
+               pci_pool_free(ioat_device->dma_pool, desc->hw, desc->phys);
+               kfree(desc);
+       }
+       spin_unlock_bh(&ioat_chan->desc_lock);
+
+       pci_pool_free(ioat_device->completion_pool,
+                     ioat_chan->completion_virt,
+                     ioat_chan->completion_addr);
+
+       /* one is ok since we left it on the chain on purpose */
+       if (in_use_descs > 1)
+               printk(KERN_ERR "IOAT: Freeing %d in use descriptors!\n",
+                       in_use_descs - 1);
+
+       ioat_chan->last_completion = ioat_chan->completion_addr = 0;
+
+       /* Tell hw the chan is free */
+       chanctrl = ioatdma_chan_read16(ioat_chan, IOAT_CHANCTRL_OFFSET);
+       chanctrl &= ~IOAT_CHANCTRL_CHANNEL_IN_USE;
+       ioatdma_chan_write16(ioat_chan, IOAT_CHANCTRL_OFFSET, chanctrl);
+}
+
+/**
+ * do_ioat_dma_memcpy - actual function that initiates an IOAT DMA transaction
+ * @ioat_chan: IOAT DMA channel handle
+ * @dest: DMA destination address
+ * @src: DMA source address
+ * @len: transaction length in bytes
+ */
+
+static dma_cookie_t do_ioat_dma_memcpy(struct ioat_dma_chan *ioat_chan,
+                                       dma_addr_t dest,
+                                       dma_addr_t src,
+                                       size_t len)
+{
+       struct ioat_desc_sw *first;
+       struct ioat_desc_sw *prev;
+       struct ioat_desc_sw *new;
+       dma_cookie_t cookie;
+       LIST_HEAD(new_chain);
+       u32 copy;
+       size_t orig_len;
+       dma_addr_t orig_src, orig_dst;
+       unsigned int desc_count = 0;
+       unsigned int append = 0;
+
+       if (!ioat_chan || !dest || !src)
+               return -EFAULT;
+
+       if (!len)
+               return ioat_chan->common.cookie;
+
+       orig_len = len;
+       orig_src = src;
+       orig_dst = dest;
+
+       first = NULL;
+       prev = NULL;
+
+       spin_lock_bh(&ioat_chan->desc_lock);
+
+       while (len) {
+               if (!list_empty(&ioat_chan->free_desc)) {
+                       new = to_ioat_desc(ioat_chan->free_desc.next);
+                       list_del(&new->node);
+               } else {
+                       /* try to get another desc */
+                       new = ioat_dma_alloc_descriptor(ioat_chan, GFP_ATOMIC);
+                       /* will this ever happen? */
+                       /* TODO add upper limit on these */
+                       BUG_ON(!new);
+               }
+
+               copy = min((u32) len, ioat_chan->xfercap);
+
+               new->hw->size = copy;
+               new->hw->ctl = 0;
+               new->hw->src_addr = src;
+               new->hw->dst_addr = dest;
+               new->cookie = 0;
+
+               /* chain together the physical address list for the HW */
+               if (!first)
+                       first = new;
+               else
+                       prev->hw->next = (u64) new->phys;
+
+               prev = new;
+
+               len  -= copy;
+               dest += copy;
+               src  += copy;
+
+               list_add_tail(&new->node, &new_chain);
+               desc_count++;
+       }
+       new->hw->ctl = IOAT_DMA_DESCRIPTOR_CTL_CP_STS;
+       new->hw->next = 0;
+
+       /* cookie incr and addition to used_list must be atomic */
+
+       cookie = ioat_chan->common.cookie;
+       cookie++;
+       if (cookie < 0)
+               cookie = 1;
+       ioat_chan->common.cookie = new->cookie = cookie;
+
+       pci_unmap_addr_set(new, src, orig_src);
+       pci_unmap_addr_set(new, dst, orig_dst);
+       pci_unmap_len_set(new, src_len, orig_len);
+       pci_unmap_len_set(new, dst_len, orig_len);
+
+       /* write address into NextDescriptor field of last desc in chain */
+       to_ioat_desc(ioat_chan->used_desc.prev)->hw->next = first->phys;
+       list_splice_init(&new_chain, ioat_chan->used_desc.prev);
+
+       ioat_chan->pending += desc_count;
+       if (ioat_chan->pending >= 20) {
+               append = 1;
+               ioat_chan->pending = 0;
+       }
+
+       spin_unlock_bh(&ioat_chan->desc_lock);
+
+       if (append)
+               ioatdma_chan_write8(ioat_chan,
+                                   IOAT_CHANCMD_OFFSET,
+                                   IOAT_CHANCMD_APPEND);
+       return cookie;
+}
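
Two details above are worth calling out: descriptors are batched, with the APPEND doorbell rung only once 20 or more are pending (or when the client calls ioat_dma_memcpy_issue_pending), and cookies are signed sequence numbers in which 0 means "no cookie". A minimal sketch of the wrap rule, assuming dma_cookie_t is a signed 32-bit type:

        static dma_cookie_t next_cookie(dma_cookie_t last)
        {
                dma_cookie_t c = last + 1;

                if (c < 0)      /* signed wrap: skip 0 and negative values */
                        c = 1;
                return c;
        }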
+
+/**
+ * ioat_dma_memcpy_buf_to_buf - wrapper that takes src & dest bufs
+ * @chan: IOAT DMA channel handle
+ * @dest: DMA destination address
+ * @src: DMA source address
+ * @len: transaction length in bytes
+ */
+
+static dma_cookie_t ioat_dma_memcpy_buf_to_buf(struct dma_chan *chan,
+                                               void *dest,
+                                               void *src,
+                                               size_t len)
+{
+       dma_addr_t dest_addr;
+       dma_addr_t src_addr;
+       struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
+
+       dest_addr = pci_map_single(ioat_chan->device->pdev,
+               dest, len, PCI_DMA_FROMDEVICE);
+       src_addr = pci_map_single(ioat_chan->device->pdev,
+               src, len, PCI_DMA_TODEVICE);
+
+       return do_ioat_dma_memcpy(ioat_chan, dest_addr, src_addr, len);
+}
+
+/**
+ * ioat_dma_memcpy_buf_to_pg - wrapper, copying from a buf to a page
+ * @chan: IOAT DMA channel handle
+ * @page: pointer to the page to copy to
+ * @offset: offset into that page
+ * @src: DMA source address
+ * @len: transaction length in bytes
+ */
+
+static dma_cookie_t ioat_dma_memcpy_buf_to_pg(struct dma_chan *chan,
+                                              struct page *page,
+                                              unsigned int offset,
+                                              void *src,
+                                              size_t len)
+{
+       dma_addr_t dest_addr;
+       dma_addr_t src_addr;
+       struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
+
+       dest_addr = pci_map_page(ioat_chan->device->pdev,
+               page, offset, len, PCI_DMA_FROMDEVICE);
+       src_addr = pci_map_single(ioat_chan->device->pdev,
+               src, len, PCI_DMA_TODEVICE);
+
+       return do_ioat_dma_memcpy(ioat_chan, dest_addr, src_addr, len);
+}
+
+/**
+ * ioat_dma_memcpy_pg_to_pg - wrapper, copying between two pages
+ * @chan: IOAT DMA channel handle
+ * @dest_pg: pointer to the page to copy to
+ * @dest_off: offset into that page
+ * @src_pg: pointer to the page to copy from
+ * @src_off: offset into that page
+ * @len: transaction length in bytes. This is guaranteed to not make a copy
+ *      across a page boundary.
+ */
+
+static dma_cookie_t ioat_dma_memcpy_pg_to_pg(struct dma_chan *chan,
+                                             struct page *dest_pg,
+                                             unsigned int dest_off,
+                                             struct page *src_pg,
+                                             unsigned int src_off,
+                                             size_t len)
+{
+       dma_addr_t dest_addr;
+       dma_addr_t src_addr;
+       struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
+
+       dest_addr = pci_map_page(ioat_chan->device->pdev,
+               dest_pg, dest_off, len, PCI_DMA_FROMDEVICE);
+       src_addr = pci_map_page(ioat_chan->device->pdev,
+               src_pg, src_off, len, PCI_DMA_TODEVICE);
+
+       return do_ioat_dma_memcpy(ioat_chan, dest_addr, src_addr, len);
+}
+
+/**
+ * ioat_dma_memcpy_issue_pending - push potentially unrecognized appended descriptors to hw
+ * @chan: DMA channel handle
+ */
+
+static void ioat_dma_memcpy_issue_pending(struct dma_chan *chan)
+{
+       struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
+
+       if (ioat_chan->pending != 0) {
+               ioat_chan->pending = 0;
+               ioatdma_chan_write8(ioat_chan,
+                                   IOAT_CHANCMD_OFFSET,
+                                   IOAT_CHANCMD_APPEND);
+       }
+}
+
+static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *chan)
+{
+       unsigned long phys_complete;
+       struct ioat_desc_sw *desc, *_desc;
+       dma_cookie_t cookie = 0;
+
+       prefetch(chan->completion_virt);
+
+       if (!spin_trylock(&chan->cleanup_lock))
+               return;
+
+       /*
+        * The completion writeback can happen at any time, so reads by the
+        * driver need to be atomic operations.  The descriptor physical
+        * addresses are limited to 32 bits when the CPU can only do a
+        * 32-bit mov.
+        */
+
+#if (BITS_PER_LONG == 64)
+       phys_complete =
+       chan->completion_virt->full & IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR;
+#else
+       phys_complete = chan->completion_virt->low & IOAT_LOW_COMPLETION_MASK;
+#endif
+
+       if ((chan->completion_virt->full & IOAT_CHANSTS_DMA_TRANSFER_STATUS) ==
+               IOAT_CHANSTS_DMA_TRANSFER_STATUS_HALTED) {
+               printk("IOAT: Channel halted, chanerr = %x\n",
+                       ioatdma_chan_read32(chan, IOAT_CHANERR_OFFSET));
+
+               /* TODO do something to salvage the situation */
+       }
+
+       if (phys_complete == chan->last_completion) {
+               spin_unlock(&chan->cleanup_lock);
+               return;
+       }
+
+       spin_lock_bh(&chan->desc_lock);
+       list_for_each_entry_safe(desc, _desc, &chan->used_desc, node) {
+
+               /*
+                * Incoming DMA requests may use multiple descriptors, due to
+                * exceeding xfercap, perhaps. If so, only the last one will
+                * have a cookie, and require unmapping.
+                */
+               if (desc->cookie) {
+                       cookie = desc->cookie;
+
+                       /* yes we are unmapping both _page and _single alloc'd
+                          regions with unmap_page. Is this *really* that bad?
+                       */
+                       pci_unmap_page(chan->device->pdev,
+                                       pci_unmap_addr(desc, dst),
+                                       pci_unmap_len(desc, dst_len),
+                                       PCI_DMA_FROMDEVICE);
+                       pci_unmap_page(chan->device->pdev,
+                                       pci_unmap_addr(desc, src),
+                                       pci_unmap_len(desc, src_len),
+                                       PCI_DMA_TODEVICE);
+               }
+
+               if (desc->phys != phys_complete) {
+                       /* a completed entry, but not the last, so cleanup */
+                       list_del(&desc->node);
+                       list_add_tail(&desc->node, &chan->free_desc);
+               } else {
+                       /* last used desc. Do not remove, so we can append from
+                          it, but don't look at it next time, either */
+                       desc->cookie = 0;
+
+                       /* TODO check status bits? */
+                       break;
+               }
+       }
+
+       spin_unlock_bh(&chan->desc_lock);
+
+       chan->last_completion = phys_complete;
+       if (cookie != 0)
+               chan->completed_cookie = cookie;
+
+       spin_unlock(&chan->cleanup_lock);
+}
+
+/**
+ * ioat_dma_is_complete - poll the status of an IOAT DMA transaction
+ * @chan: IOAT DMA channel handle
+ * @cookie: DMA transaction identifier
+ * @done: if not %NULL, updated with the last completed cookie
+ * @used: if not %NULL, updated with the last issued cookie
+ */
+
+static enum dma_status ioat_dma_is_complete(struct dma_chan *chan,
+                                            dma_cookie_t cookie,
+                                            dma_cookie_t *done,
+                                            dma_cookie_t *used)
+{
+       struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
+       dma_cookie_t last_used;
+       dma_cookie_t last_complete;
+       enum dma_status ret;
+
+       last_used = chan->cookie;
+       last_complete = ioat_chan->completed_cookie;
+
+       if (done)
+               *done = last_complete;
+       if (used)
+               *used = last_used;
+
+       ret = dma_async_is_complete(cookie, last_complete, last_used);
+       if (ret == DMA_SUCCESS)
+               return ret;
+
+       ioat_dma_memcpy_cleanup(ioat_chan);
+
+       last_used = chan->cookie;
+       last_complete = ioat_chan->completed_cookie;
+
+       if (done)
+               *done = last_complete;
+       if (used)
+               *used = last_used;
+
+       return dma_async_is_complete(cookie, last_complete, last_used);
+}
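
A hedged usage sketch of the submit/flush/poll flow these routines implement, assuming chan, dest, src and len were set up elsewhere and that enum dma_status from linux/dmaengine.h provides DMA_IN_PROGRESS:

        dma_cookie_t cookie;

        cookie = ioat_dma_memcpy_buf_to_buf(chan, dest, src, len);
        ioat_dma_memcpy_issue_pending(chan);

        /* busy-poll until the copy leaves the in-flight state */
        while (ioat_dma_is_complete(chan, cookie, NULL, NULL) == DMA_IN_PROGRESS)
                cpu_relax();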
+
+/* PCI API */
+
+static struct pci_device_id ioat_pci_tbl[] = {
+       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT) },
+       { 0, }
+};
+
+static struct pci_driver ioat_pci_drv = {
+       .name   = "ioatdma",
+       .id_table = ioat_pci_tbl,
+       .probe  = ioat_probe,
+       .remove = __devexit_p(ioat_remove),
+};
+
+static irqreturn_t ioat_do_interrupt(int irq, void *data, struct pt_regs *regs)
+{
+       struct ioat_device *instance = data;
+       unsigned long attnstatus;
+       u8 intrctrl;
+
+       intrctrl = ioatdma_read8(instance, IOAT_INTRCTRL_OFFSET);
+
+       if (!(intrctrl & IOAT_INTRCTRL_MASTER_INT_EN))
+               return IRQ_NONE;
+
+       if (!(intrctrl & IOAT_INTRCTRL_INT_STATUS)) {
+               ioatdma_write8(instance, IOAT_INTRCTRL_OFFSET, intrctrl);
+               return IRQ_NONE;
+       }
+
+       attnstatus = ioatdma_read32(instance, IOAT_ATTNSTATUS_OFFSET);
+
+       printk(KERN_ERR "ioatdma error: interrupt! status %lx\n", attnstatus);
+
+       ioatdma_write8(instance, IOAT_INTRCTRL_OFFSET, intrctrl);
+       return IRQ_HANDLED;
+}
+
+static void ioat_start_null_desc(struct ioat_dma_chan *ioat_chan)
+{
+       struct ioat_desc_sw *desc;
+
+       spin_lock_bh(&ioat_chan->desc_lock);
+
+       if (!list_empty(&ioat_chan->free_desc)) {
+               desc = to_ioat_desc(ioat_chan->free_desc.next);
+               list_del(&desc->node);
+       } else {
+               /* try to get another desc */
+               spin_unlock_bh(&ioat_chan->desc_lock);
+               desc = ioat_dma_alloc_descriptor(ioat_chan, GFP_KERNEL);
+               spin_lock_bh(&ioat_chan->desc_lock);
+               /* will this ever happen? */
+               BUG_ON(!desc);
+       }
+
+       desc->hw->ctl = IOAT_DMA_DESCRIPTOR_NUL;
+       desc->hw->next = 0;
+
+       list_add_tail(&desc->node, &ioat_chan->used_desc);
+       spin_unlock_bh(&ioat_chan->desc_lock);
+
+#if (BITS_PER_LONG == 64)
+       ioatdma_chan_write64(ioat_chan, IOAT_CHAINADDR_OFFSET, desc->phys);
+#else
+       ioatdma_chan_write32(ioat_chan,
+                            IOAT_CHAINADDR_OFFSET_LOW,
+                            (u32) desc->phys);
+       ioatdma_chan_write32(ioat_chan, IOAT_CHAINADDR_OFFSET_HIGH, 0);
+#endif
+       ioatdma_chan_write8(ioat_chan, IOAT_CHANCMD_OFFSET, IOAT_CHANCMD_START);
+}
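
For illustration, the chain state the NULL descriptor establishes; d1 and d2 stand for hypothetical descriptors queued later by do_ioat_dma_memcpy, which hooks its chain onto used_desc.prev->hw->next:

        /* after ioat_start_null_desc():
         *   used_desc: [NUL]                   hw chain: NUL -> 0
         * after do_ioat_dma_memcpy() queues d1 and d2:
         *   used_desc: [NUL] -> [d1] -> [d2]   hw chain: NUL -> d1 -> d2 -> 0
         */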
+
+/*
+ * Perform an IOAT transaction to verify the HW works.
+ */
+#define IOAT_TEST_SIZE 2000
+
+static int ioat_self_test(struct ioat_device *device)
+{
+       int i;
+       u8 *src;
+       u8 *dest;
+       struct dma_chan *dma_chan;
+       dma_cookie_t cookie;
+       int err = 0;
+
+       src = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, SLAB_KERNEL);
+       if (!src)
+               return -ENOMEM;
+       dest = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, SLAB_KERNEL);
+       if (!dest) {
+               kfree(src);
+               return -ENOMEM;
+       }
+
+       /* Fill in src buffer */
+       for (i = 0; i < IOAT_TEST_SIZE; i++)
+               src[i] = (u8)i;
+
+       /* Start copy, using first DMA channel */
+       dma_chan = container_of(device->common.channels.next,
+                               struct dma_chan,
+                               device_node);
+       if (ioat_dma_alloc_chan_resources(dma_chan) < 1) {
+               err = -ENODEV;
+               goto out;
+       }
+
+       cookie = ioat_dma_memcpy_buf_to_buf(dma_chan, dest, src, IOAT_TEST_SIZE);
+       ioat_dma_memcpy_issue_pending(dma_chan);
+       msleep(1);
+
+       if (ioat_dma_is_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) {
+               printk(KERN_ERR "ioatdma: Self-test copy timed out, disabling\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+       if (memcmp(src, dest, IOAT_TEST_SIZE)) {
+               printk(KERN_ERR "ioatdma: Self-test copy failed compare, disabling\n");
+               err = -ENODEV;
+               goto free_resources;
+       }
+
+free_resources:
+       ioat_dma_free_chan_resources(dma_chan);
+out:
+       kfree(src);
+       kfree(dest);
+       return err;
+}
+
+static int __devinit ioat_probe(struct pci_dev *pdev,
+                                const struct pci_device_id *ent)
+{
+       int err;
+       unsigned long mmio_start, mmio_len;
+       void *reg_base;
+       struct ioat_device *device;
+
+       err = pci_enable_device(pdev);
+       if (err)
+               goto err_enable_device;
+
+       err = pci_set_dma_mask(pdev, DMA_64BIT_MASK);
+       if (err)
+               err = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
+       if (err)
+               goto err_set_dma_mask;
+
+       err = pci_request_regions(pdev, ioat_pci_drv.name);
+       if (err)
+               goto err_request_regions;
+
+       mmio_start = pci_resource_start(pdev, 0);
+       mmio_len = pci_resource_len(pdev, 0);
+
+       reg_base = ioremap(mmio_start, mmio_len);
+       if (!reg_base) {
+               err = -ENOMEM;
+               goto err_ioremap;
+       }
+
+       device = kzalloc(sizeof(*device), GFP_KERNEL);
+       if (!device) {
+               err = -ENOMEM;
+               goto err_kzalloc;
+       }
+
+       /* DMA coherent memory pool for DMA descriptor allocations */
+       device->dma_pool = pci_pool_create("dma_desc_pool", pdev,
+               sizeof(struct ioat_dma_descriptor), 64, 0);
+       if (!device->dma_pool) {
+               err = -ENOMEM;
+               goto err_dma_pool;
+       }
+
+       device->completion_pool = pci_pool_create("completion_pool", pdev,
+               sizeof(u64), SMP_CACHE_BYTES, SMP_CACHE_BYTES);
+       if (!device->completion_pool) {
+               err = -ENOMEM;
+               goto err_completion_pool;
+       }
+
+       device->pdev = pdev;
+       pci_set_drvdata(pdev, device);
+#ifdef CONFIG_PCI_MSI
+       if (pci_enable_msi(pdev) == 0)
+               device->msi = 1;
+       else
+               device->msi = 0;
+#endif
+       err = request_irq(pdev->irq, &ioat_do_interrupt, SA_SHIRQ, "ioat",
+               device);
+       if (err)
+               goto err_irq;
+
+       device->reg_base = reg_base;
+
+       ioatdma_write8(device, IOAT_INTRCTRL_OFFSET, IOAT_INTRCTRL_MASTER_INT_EN);
+       pci_set_master(pdev);
+
+       INIT_LIST_HEAD(&device->common.channels);
+       enumerate_dma_channels(device);
+
+       device->common.device_alloc_chan_resources = ioat_dma_alloc_chan_resources;
+       device->common.device_free_chan_resources = ioat_dma_free_chan_resources;
+       device->common.device_memcpy_buf_to_buf = ioat_dma_memcpy_buf_to_buf;
+       device->common.device_memcpy_buf_to_pg = ioat_dma_memcpy_buf_to_pg;
+       device->common.device_memcpy_pg_to_pg = ioat_dma_memcpy_pg_to_pg;
+       device->common.device_memcpy_complete = ioat_dma_is_complete;
+       device->common.device_memcpy_issue_pending = ioat_dma_memcpy_issue_pending;
+       printk(KERN_INFO "Intel(R) I/OAT DMA Engine found, %d channels\n",
+               device->common.chancnt);
+
+       err = ioat_self_test(device);
+       if (err)
+               goto err_self_test;
+
+       dma_async_device_register(&device->common);
+
+       return 0;
+
+err_self_test:
+err_irq:
+       pci_pool_destroy(device->completion_pool);
+err_completion_pool:
+       pci_pool_destroy(device->dma_pool);
+err_dma_pool:
+       kfree(device);
+err_kzalloc:
+       iounmap(reg_base);
+err_ioremap:
+       pci_release_regions(pdev);
+err_request_regions:
+err_set_dma_mask:
+       pci_disable_device(pdev);
+err_enable_device:
+       return err;
+}
+
+static void __devexit ioat_remove(struct pci_dev *pdev)
+{
+       struct ioat_device *device;
+       struct dma_chan *chan, *_chan;
+       struct ioat_dma_chan *ioat_chan;
+
+       device = pci_get_drvdata(pdev);
+       dma_async_device_unregister(&device->common);
+
+       free_irq(device->pdev->irq, device);
+#ifdef CONFIG_PCI_MSI
+       if (device->msi)
+               pci_disable_msi(device->pdev);
+#endif
+       pci_pool_destroy(device->dma_pool);
+       pci_pool_destroy(device->completion_pool);
+       iounmap(device->reg_base);
+       pci_release_regions(pdev);
+       pci_disable_device(pdev);
+       list_for_each_entry_safe(chan, _chan, &device->common.channels, device_node) {
+               ioat_chan = to_ioat_chan(chan);
+               list_del(&chan->device_node);
+               kfree(ioat_chan);
+       }
+       kfree(device);
+}
+
+/* MODULE API */
+MODULE_VERSION("1.7");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Intel Corporation");
+
+static int __init ioat_init_module(void)
+{
+       /* it's currently unsafe to unload this module */
+       /* if forced, worst case is that rmmod hangs */
+       if (THIS_MODULE != NULL)
+               THIS_MODULE->unsafe = 1;
+
+       return pci_module_init(&ioat_pci_drv);
+}
+
+module_init(ioat_init_module);
+
+static void __exit ioat_exit_module(void)
+{
+       pci_unregister_driver(&ioat_pci_drv);
+}
+
+module_exit(ioat_exit_module);
diff --git a/drivers/dma/ioatdma.h b/drivers/dma/ioatdma.h
new file mode 100644 (file)
index 0000000..a5d3b36
--- /dev/null
@@ -0,0 +1,125 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef IOATDMA_H
+#define IOATDMA_H
+
+#include <linux/dmaengine.h>
+#include "ioatdma_hw.h"
+#include <linux/init.h>
+#include <linux/dmapool.h>
+#include <linux/cache.h>
+#include <linux/pci_ids.h>
+
+#define IOAT_LOW_COMPLETION_MASK       0xffffffc0
+
+extern struct list_head dma_device_list;
+extern struct list_head dma_client_list;
+
+/**
+ * struct ioat_device - internal representation of an IOAT device
+ * @pdev: PCI-Express device
+ * @reg_base: MMIO register space base address
+ * @dma_pool: for allocating DMA descriptors
+ * @completion_pool: for allocating completion writeback areas
+ * @common: embedded struct dma_device
+ * @msi: set if Message Signaled Interrupts are in use
+ */
+
+struct ioat_device {
+       struct pci_dev *pdev;
+       void *reg_base;
+       struct pci_pool *dma_pool;
+       struct pci_pool *completion_pool;
+
+       struct dma_device common;
+       u8 msi;
+};
+
+/**
+ * struct ioat_dma_chan - internal representation of a DMA channel
+ * @reg_base: per-channel MMIO register space base address
+ * @completed_cookie: last cookie seen completed on cleanup
+ * @last_completion: completion address of the last descriptor cleaned up
+ * @xfercap: XFERCAP register value expanded out
+ * @cleanup_lock: serializes the cleanup path
+ * @desc_lock: protects the descriptor lists
+ * @free_desc: descriptors available for reuse
+ * @used_desc: descriptors handed to the hardware
+ * @pending: descriptors queued since the last APPEND command
+ * @device: parent struct ioat_device
+ * @common: embedded struct dma_chan
+ * @completion_addr: DMA address of the completion writeback area
+ * @completion_virt: kernel virtual address of the completion writeback area
+ */
+
+struct ioat_dma_chan {
+
+       void *reg_base;
+
+       dma_cookie_t completed_cookie;
+       unsigned long last_completion;
+
+       u32 xfercap;    /* XFERCAP register value expanded out */
+
+       spinlock_t cleanup_lock;
+       spinlock_t desc_lock;
+       struct list_head free_desc;
+       struct list_head used_desc;
+
+       int pending;
+
+       struct ioat_device *device;
+       struct dma_chan common;
+
+       dma_addr_t completion_addr;
+       union {
+               u64 full; /* HW completion writeback */
+               struct {
+                       u32 low;
+                       u32 high;
+               };
+       } *completion_virt;
+};
+
+/**
+ * struct ioat_desc_sw - wrapper around the hardware descriptor format,
+ *     plus additional software fields
+ * @hw: hardware DMA descriptor
+ * @node: entry on a channel's free_desc or used_desc list
+ * @cookie: cookie for this transaction; set only on the last descriptor
+ *     of a multi-descriptor copy
+ * @phys: DMA address of the hardware descriptor
+ */
+
+struct ioat_desc_sw {
+       struct ioat_dma_descriptor *hw;
+       struct list_head node;
+       dma_cookie_t cookie;
+       dma_addr_t phys;
+       DECLARE_PCI_UNMAP_ADDR(src)
+       DECLARE_PCI_UNMAP_LEN(src_len)
+       DECLARE_PCI_UNMAP_ADDR(dst)
+       DECLARE_PCI_UNMAP_LEN(dst_len)
+};
+
+#endif /* IOATDMA_H */
+
diff --git a/drivers/dma/ioatdma_hw.h b/drivers/dma/ioatdma_hw.h
new file mode 100644 (file)
index 0000000..4d7a128
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef _IOAT_HW_H_
+#define _IOAT_HW_H_
+
+/* PCI Configuration Space Values */
+#define IOAT_PCI_VID                   0x8086
+#define IOAT_PCI_DID                   0x1A38
+#define IOAT_PCI_RID                   0x00
+#define IOAT_PCI_SVID                  0x8086
+#define IOAT_PCI_SID                   0x8086
+#define IOAT_VER                       0x12    /* Version 1.2 */
+
+struct ioat_dma_descriptor {
+       uint32_t        size;
+       uint32_t        ctl;
+       uint64_t        src_addr;
+       uint64_t        dst_addr;
+       uint64_t        next;
+       uint64_t        rsv1;
+       uint64_t        rsv2;
+       uint64_t        user1;
+       uint64_t        user2;
+};
+
+#define IOAT_DMA_DESCRIPTOR_CTL_INT_GN 0x00000001
+#define IOAT_DMA_DESCRIPTOR_CTL_SRC_SN 0x00000002
+#define IOAT_DMA_DESCRIPTOR_CTL_DST_SN 0x00000004
+#define IOAT_DMA_DESCRIPTOR_CTL_CP_STS 0x00000008
+#define IOAT_DMA_DESCRIPTOR_CTL_FRAME  0x00000010
+#define IOAT_DMA_DESCRIPTOR_NUL                0x00000020
+#define IOAT_DMA_DESCRIPTOR_OPCODE     0xFF000000
+
+#endif
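
For illustration, how the driver above fills one of these descriptors for the last entry in a memcpy chain; src_phys and dst_phys are hypothetical bus addresses obtained from the pci_map_* calls:

        struct ioat_dma_descriptor d = {
                .size     = 4096,                           /* bytes to copy */
                .ctl      = IOAT_DMA_DESCRIPTOR_CTL_CP_STS, /* status writeback, last desc only */
                .src_addr = src_phys,
                .dst_addr = dst_phys,
                .next     = 0,                              /* 0 terminates the chain */
        };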
diff --git a/drivers/dma/ioatdma_io.h b/drivers/dma/ioatdma_io.h
new file mode 100644 (file)
index 0000000..c0b4bf6
--- /dev/null
@@ -0,0 +1,118 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef IOATDMA_IO_H
+#define IOATDMA_IO_H
+
+#include <asm/io.h>
+
+/*
+ * device and per-channel MMIO register read and write functions
+ * this is a lot of annoying inline functions, but it's typesafe
+ */
+
+static inline u8 ioatdma_read8(struct ioat_device *device,
+                               unsigned int offset)
+{
+       return readb(device->reg_base + offset);
+}
+
+static inline u16 ioatdma_read16(struct ioat_device *device,
+                                 unsigned int offset)
+{
+       return readw(device->reg_base + offset);
+}
+
+static inline u32 ioatdma_read32(struct ioat_device *device,
+                                 unsigned int offset)
+{
+       return readl(device->reg_base + offset);
+}
+
+static inline void ioatdma_write8(struct ioat_device *device,
+                                  unsigned int offset, u8 value)
+{
+       writeb(value, device->reg_base + offset);
+}
+
+static inline void ioatdma_write16(struct ioat_device *device,
+                                   unsigned int offset, u16 value)
+{
+       writew(value, device->reg_base + offset);
+}
+
+static inline void ioatdma_write32(struct ioat_device *device,
+                                   unsigned int offset, u32 value)
+{
+       writel(value, device->reg_base + offset);
+}
+
+static inline u8 ioatdma_chan_read8(struct ioat_dma_chan *chan,
+                                    unsigned int offset)
+{
+       return readb(chan->reg_base + offset);
+}
+
+static inline u16 ioatdma_chan_read16(struct ioat_dma_chan *chan,
+                                      unsigned int offset)
+{
+       return readw(chan->reg_base + offset);
+}
+
+static inline u32 ioatdma_chan_read32(struct ioat_dma_chan *chan,
+                                      unsigned int offset)
+{
+       return readl(chan->reg_base + offset);
+}
+
+static inline void ioatdma_chan_write8(struct ioat_dma_chan *chan,
+                                       unsigned int offset, u8 value)
+{
+       writeb(value, chan->reg_base + offset);
+}
+
+static inline void ioatdma_chan_write16(struct ioat_dma_chan *chan,
+                                        unsigned int offset, u16 value)
+{
+       writew(value, chan->reg_base + offset);
+}
+
+static inline void ioatdma_chan_write32(struct ioat_dma_chan *chan,
+                                        unsigned int offset, u32 value)
+{
+       writel(value, chan->reg_base + offset);
+}
+
+#if (BITS_PER_LONG == 64)
+static inline u64 ioatdma_chan_read64(struct ioat_dma_chan *chan,
+                                      unsigned int offset)
+{
+       return readq(chan->reg_base + offset);
+}
+
+static inline void ioatdma_chan_write64(struct ioat_dma_chan *chan,
+                                        unsigned int offset, u64 value)
+{
+       writeq(value, chan->reg_base + offset);
+}
+#endif
+
+#endif /* IOATDMA_IO_H */
+
diff --git a/drivers/dma/ioatdma_registers.h b/drivers/dma/ioatdma_registers.h
new file mode 100644 (file)
index 0000000..41a21ab
--- /dev/null
@@ -0,0 +1,126 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef _IOAT_REGISTERS_H_
+#define _IOAT_REGISTERS_H_
+
+
+/* MMIO Device Registers */
+#define IOAT_CHANCNT_OFFSET                    0x00    /*  8-bit */
+
+#define IOAT_XFERCAP_OFFSET                    0x01    /*  8-bit */
+#define IOAT_XFERCAP_4KB                       12
+#define IOAT_XFERCAP_8KB                       13
+#define IOAT_XFERCAP_16KB                      14
+#define IOAT_XFERCAP_32KB                      15
+#define IOAT_XFERCAP_32GB                      0
+
+#define IOAT_GENCTRL_OFFSET                    0x02    /*  8-bit */
+#define IOAT_GENCTRL_DEBUG_EN                  0x01
+
+#define IOAT_INTRCTRL_OFFSET                   0x03    /*  8-bit */
+#define IOAT_INTRCTRL_MASTER_INT_EN            0x01    /* Master Interrupt Enable */
+#define IOAT_INTRCTRL_INT_STATUS               0x02    /* ATTNSTATUS -or- Channel Int */
+#define IOAT_INTRCTRL_INT                      0x04    /* INT_STATUS -and- MASTER_INT_EN */
+
+#define IOAT_ATTNSTATUS_OFFSET                 0x04    /* Each bit is a channel */
+
+#define IOAT_VER_OFFSET                                0x08    /*  8-bit */
+#define IOAT_VER_MAJOR_MASK                    0xF0
+#define IOAT_VER_MINOR_MASK                    0x0F
+#define GET_IOAT_VER_MAJOR(x)                  ((x) & IOAT_VER_MAJOR_MASK)
+#define GET_IOAT_VER_MINOR(x)                  ((x) & IOAT_VER_MINOR_MASK)
+
+#define IOAT_PERPORTOFFSET_OFFSET              0x0A    /* 16-bit */
+
+#define IOAT_INTRDELAY_OFFSET                  0x0C    /* 16-bit */
+#define IOAT_INTRDELAY_INT_DELAY_MASK          0x3FFF  /* Interrupt Delay Time */
+#define IOAT_INTRDELAY_COALESE_SUPPORT         0x8000  /* Interrupt Coalescing Supported */
+
+#define IOAT_DEVICE_STATUS_OFFSET              0x0E    /* 16-bit */
+#define IOAT_DEVICE_STATUS_DEGRADED_MODE       0x0001
+
+
+#define IOAT_CHANNEL_MMIO_SIZE                 0x80    /* Each Channel MMIO space is this size */
+
+/* DMA Channel Registers */
+#define IOAT_CHANCTRL_OFFSET                   0x00    /* 16-bit Channel Control Register */
+#define IOAT_CHANCTRL_CHANNEL_PRIORITY_MASK    0xF000
+#define IOAT_CHANCTRL_CHANNEL_IN_USE           0x0100
+#define IOAT_CHANCTRL_DESCRIPTOR_ADDR_SNOOP_CONTROL    0x0020
+#define IOAT_CHANCTRL_ERR_INT_EN               0x0010
+#define IOAT_CHANCTRL_ANY_ERR_ABORT_EN         0x0008
+#define IOAT_CHANCTRL_ERR_COMPLETION_EN                0x0004
+#define IOAT_CHANCTRL_INT_DISABLE              0x0001
+
+#define IOAT_DMA_COMP_OFFSET                   0x02    /* 16-bit DMA channel compatibility */
+#define IOAT_DMA_COMP_V1                       0x0001  /* Compatibility with DMA version 1 */
+
+#define IOAT_CHANSTS_OFFSET                    0x04    /* 64-bit Channel Status Register */
+#define IOAT_CHANSTS_OFFSET_LOW                        0x04
+#define IOAT_CHANSTS_OFFSET_HIGH               0x08
+#define IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR 0xFFFFFFFFFFFFFFC0
+#define IOAT_CHANSTS_SOFT_ERR                  0x0000000000000010
+#define IOAT_CHANSTS_DMA_TRANSFER_STATUS       0x0000000000000007
+#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE        0x0
+#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_DONE  0x1
+#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_SUSPENDED     0x2
+#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_HALTED        0x3
+
+#define IOAT_CHAINADDR_OFFSET                  0x0C    /* 64-bit Descriptor Chain Address Register */
+#define IOAT_CHAINADDR_OFFSET_LOW              0x0C
+#define IOAT_CHAINADDR_OFFSET_HIGH             0x10
+
+#define IOAT_CHANCMD_OFFSET                    0x14    /*  8-bit DMA Channel Command Register */
+#define IOAT_CHANCMD_RESET                     0x20
+#define IOAT_CHANCMD_RESUME                    0x10
+#define IOAT_CHANCMD_ABORT                     0x08
+#define IOAT_CHANCMD_SUSPEND                   0x04
+#define IOAT_CHANCMD_APPEND                    0x02
+#define IOAT_CHANCMD_START                     0x01
+
+#define IOAT_CHANCMP_OFFSET                    0x18    /* 64-bit Channel Completion Address Register */
+#define IOAT_CHANCMP_OFFSET_LOW                        0x18
+#define IOAT_CHANCMP_OFFSET_HIGH               0x1C
+
+#define IOAT_CDAR_OFFSET                       0x20    /* 64-bit Current Descriptor Address Register */
+#define IOAT_CDAR_OFFSET_LOW                   0x20
+#define IOAT_CDAR_OFFSET_HIGH                  0x24
+
+#define IOAT_CHANERR_OFFSET                    0x28    /* 32-bit Channel Error Register */
+#define IOAT_CHANERR_DMA_TRANSFER_SRC_ADDR_ERR 0x0001
+#define IOAT_CHANERR_DMA_TRANSFER_DEST_ADDR_ERR        0x0002
+#define IOAT_CHANERR_NEXT_DESCRIPTOR_ADDR_ERR  0x0004
+#define IOAT_CHANERR_NEXT_DESCRIPTOR_ALIGNMENT_ERR     0x0008
+#define IOAT_CHANERR_CHAIN_ADDR_VALUE_ERR      0x0010
+#define IOAT_CHANERR_CHANCMD_ERR               0x0020
+#define IOAT_CHANERR_CHIPSET_UNCORRECTABLE_DATA_INTEGRITY_ERR  0x0040
+#define IOAT_CHANERR_DMA_UNCORRECTABLE_DATA_INTEGRITY_ERR      0x0080
+#define IOAT_CHANERR_READ_DATA_ERR             0x0100
+#define IOAT_CHANERR_WRITE_DATA_ERR            0x0200
+#define IOAT_CHANERR_DESCRIPTOR_CONTROL_ERR    0x0400
+#define IOAT_CHANERR_DESCRIPTOR_LENGTH_ERR     0x0800
+#define IOAT_CHANERR_COMPLETION_ADDR_ERR       0x1000
+#define IOAT_CHANERR_INT_CONFIGURATION_ERR     0x2000
+#define IOAT_CHANERR_SOFT_ERR                  0x4000
+
+#define IOAT_CHANERR_MASK_OFFSET               0x2C    /* 32-bit Channel Error Mask Register */
+
+#endif /* _IOAT_REGISTERS_H_ */
diff --git a/drivers/dma/iovlock.c b/drivers/dma/iovlock.c
new file mode 100644 (file)
index 0000000..5ed327e
--- /dev/null
@@ -0,0 +1,301 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ * Portions based on net/core/datagram.c and copyrighted by their authors.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This code allows the net stack to make use of a DMA engine for
+ * skb to iovec copies.
+ */
+
+#include <linux/dmaengine.h>
+#include <linux/pagemap.h>
+#include <net/tcp.h> /* for memcpy_toiovec */
+#include <asm/io.h>
+#include <asm/uaccess.h>
+
+int num_pages_spanned(struct iovec *iov)
+{
+       return (PAGE_ALIGN((unsigned long)iov->iov_base + iov->iov_len) -
+               ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT;
+}
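
A worked example with 4 KB pages, taking iov_base = 0x1ffc and iov_len = 16:

        /* PAGE_ALIGN(0x1ffc + 16)  = 0x3000
         * 0x1ffc & PAGE_MASK       = 0x1000
         * (0x3000 - 0x1000) >> 12  = 2 pages spanned
         */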
+
+/*
+ * Pin down all the iovec pages needed for len bytes.
+ * Return a struct dma_pinned_list to keep track of pages pinned down.
+ *
+ * We allocate a single chunk of memory and then carve it up into three
+ * sections; the sizes of the latter two depend on the number of iovecs
+ * and the total number of pages, respectively (see the layout sketch
+ * after this function).
+ */
+struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len)
+{
+       struct dma_pinned_list *local_list;
+       struct page **pages;
+       int i;
+       int ret;
+       int nr_iovecs = 0;
+       int iovec_len_used = 0;
+       int iovec_pages_used = 0;
+       long err;
+
+       /* don't pin down non-user-based iovecs */
+       if (segment_eq(get_fs(), KERNEL_DS))
+               return NULL;
+
+       /* determine how many iovecs/pages there are, up front */
+       do {
+               iovec_len_used += iov[nr_iovecs].iov_len;
+               iovec_pages_used += num_pages_spanned(&iov[nr_iovecs]);
+               nr_iovecs++;
+       } while (iovec_len_used < len);
+
+       /* single kmalloc for pinned list, page_list[], and the page arrays */
+       local_list = kmalloc(sizeof(*local_list)
+               + (nr_iovecs * sizeof (struct dma_page_list))
+               + (iovec_pages_used * sizeof (struct page*)), GFP_KERNEL);
+       if (!local_list) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       /* list of pages starts right after the page list array */
+       pages = (struct page **) &local_list->page_list[nr_iovecs];
+
+       for (i = 0; i < nr_iovecs; i++) {
+               struct dma_page_list *page_list = &local_list->page_list[i];
+
+               len -= iov[i].iov_len;
+
+               if (!access_ok(VERIFY_WRITE, iov[i].iov_base, iov[i].iov_len)) {
+                       err = -EFAULT;
+                       goto unpin;
+               }
+
+               page_list->nr_pages = num_pages_spanned(&iov[i]);
+               page_list->base_address = iov[i].iov_base;
+
+               page_list->pages = pages;
+               pages += page_list->nr_pages;
+
+               /* pin pages down */
+               down_read(&current->mm->mmap_sem);
+               ret = get_user_pages(
+                       current,
+                       current->mm,
+                       (unsigned long) iov[i].iov_base,
+                       page_list->nr_pages,
+                       1,      /* write */
+                       0,      /* force */
+                       page_list->pages,
+                       NULL);
+               up_read(&current->mm->mmap_sem);
+
+               if (ret != page_list->nr_pages) {
+                       err = -ENOMEM;
+                       goto unpin;
+               }
+
+               local_list->nr_iovecs = i + 1;
+       }
+
+       return local_list;
+
+unpin:
+       dma_unpin_iovec_pages(local_list);
+out:
+       return ERR_PTR(err);
+}
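
The layout sketch referenced in the comment above the function; the three sections match the three terms of the kmalloc size:

        /*
         * local_list:
         * +------------------------+----------------------+---------------------------+
         * | struct dma_pinned_list | page_list[nr_iovecs] | struct page *[pages_used] |
         * +------------------------+----------------------+---------------------------+
         * each page_list[i].pages points at its slice of the trailing array
         */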
+
+void dma_unpin_iovec_pages(struct dma_pinned_list *pinned_list)
+{
+       int i, j;
+
+       if (!pinned_list)
+               return;
+
+       for (i = 0; i < pinned_list->nr_iovecs; i++) {
+               struct dma_page_list *page_list = &pinned_list->page_list[i];
+               for (j = 0; j < page_list->nr_pages; j++) {
+                       set_page_dirty_lock(page_list->pages[j]);
+                       page_cache_release(page_list->pages[j]);
+               }
+       }
+
+       kfree(pinned_list);
+}
+
+static dma_cookie_t dma_memcpy_to_kernel_iovec(struct dma_chan *chan,
+       struct iovec *iov, unsigned char *kdata, size_t len)
+{
+       dma_cookie_t dma_cookie = 0;
+
+       while (len > 0) {
+               if (iov->iov_len) {
+                       int copy = min_t(unsigned int, iov->iov_len, len);
+                       dma_cookie = dma_async_memcpy_buf_to_buf(
+                                       chan,
+                                       iov->iov_base,
+                                       kdata,
+                                       copy);
+                       kdata += copy;
+                       len -= copy;
+                       iov->iov_len -= copy;
+                       iov->iov_base += copy;
+               }
+               iov++;
+       }
+
+       return dma_cookie;
+}
+
+/*
+ * We have already pinned down the pages we will be using in the iovecs.
+ * Each entry in the iov array has a corresponding entry in
+ * pinned_list->page_list; array indexing keeps iov[] and page_list[]
+ * in sync.  Initial elements of the iov array will have iov_len == 0
+ * if an earlier call has already copied into them.  The remaining iov
+ * array length is guaranteed to be bigger than len.
+ */
+dma_cookie_t dma_memcpy_to_iovec(struct dma_chan *chan, struct iovec *iov,
+       struct dma_pinned_list *pinned_list, unsigned char *kdata, size_t len)
+{
+       int iov_byte_offset;
+       int copy;
+       dma_cookie_t dma_cookie = 0;
+       int iovec_idx;
+       int page_idx;
+
+       if (!chan)
+               return memcpy_toiovec(iov, kdata, len);
+
+       /* -> kernel copies (e.g. smbfs) */
+       if (!pinned_list)
+               return dma_memcpy_to_kernel_iovec(chan, iov, kdata, len);
+
+       iovec_idx = 0;
+       while (iovec_idx < pinned_list->nr_iovecs) {
+               struct dma_page_list *page_list;
+
+               /* skip already used-up iovecs */
+               while (!iov[iovec_idx].iov_len)
+                       iovec_idx++;
+
+               page_list = &pinned_list->page_list[iovec_idx];
+
+               iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK);
+               page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK)
+                        - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT;
+
+               /* break up copies to not cross page boundary */
+               while (iov[iovec_idx].iov_len) {
+                       copy = min_t(int, PAGE_SIZE - iov_byte_offset, len);
+                       copy = min_t(int, copy, iov[iovec_idx].iov_len);
+
+                       dma_cookie = dma_async_memcpy_buf_to_pg(chan,
+                                       page_list->pages[page_idx],
+                                       iov_byte_offset,
+                                       kdata,
+                                       copy);
+
+                       len -= copy;
+                       iov[iovec_idx].iov_len -= copy;
+                       iov[iovec_idx].iov_base += copy;
+
+                       if (!len)
+                               return dma_cookie;
+
+                       kdata += copy;
+                       iov_byte_offset = 0;
+                       page_idx++;
+               }
+               iovec_idx++;
+       }
+
+       /* really bad if we ever run out of iovecs */
+       BUG();
+       return -EFAULT;
+}
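
A worked example of the page-boundary splitting above, with PAGE_SIZE = 4096 and a sufficiently long iovec:

        /* iov_byte_offset = 4092, len = 16:
         *   copy = min(4096 - 4092, 16) = 4    -> finishes the current page
         *   iov_byte_offset = 0, page_idx++
         *   copy = min(4096 - 0, 12)    = 12   -> lands in the next page
         */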
+
+dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov,
+       struct dma_pinned_list *pinned_list, struct page *page,
+       unsigned int offset, size_t len)
+{
+       int iov_byte_offset;
+       int copy;
+       dma_cookie_t dma_cookie = 0;
+       int iovec_idx;
+       int page_idx;
+       int err;
+
+       /* this needs as-yet-unimplemented buf-to-buf, so punt. */
+       /* TODO: use dma for this */
+       if (!chan || !pinned_list) {
+               u8 *vaddr = kmap(page);
+               err = memcpy_toiovec(iov, vaddr + offset, len);
+               kunmap(page);
+               return err;
+       }
+
+       iovec_idx = 0;
+       while (iovec_idx < pinned_list->nr_iovecs) {
+               struct dma_page_list *page_list;
+
+               /* skip already used-up iovecs */
+               while (!iov[iovec_idx].iov_len)
+                       iovec_idx++;
+
+               page_list = &pinned_list->page_list[iovec_idx];
+
+               iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK);
+               page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK)
+                        - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT;
+
+               /* break up copies to not cross page boundary */
+               while (iov[iovec_idx].iov_len) {
+                       copy = min_t(int, PAGE_SIZE - iov_byte_offset, len);
+                       copy = min_t(int, copy, iov[iovec_idx].iov_len);
+
+                       dma_cookie = dma_async_memcpy_pg_to_pg(chan,
+                                       page_list->pages[page_idx],
+                                       iov_byte_offset,
+                                       page,
+                                       offset,
+                                       copy);
+
+                       len -= copy;
+                       iov[iovec_idx].iov_len -= copy;
+                       iov[iovec_idx].iov_base += copy;
+
+                       if (!len)
+                               return dma_cookie;
+
+                       offset += copy;
+                       iov_byte_offset = 0;
+                       page_idx++;
+               }
+               iovec_idx++;
+       }
+
+       /* really bad if we ever run out of iovecs */
+       BUG();
+       return -EFAULT;
+}
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index afc612b..ba2d650 100644 (file)
@@ -29,6 +29,11 @@ config INFINIBAND_USER_ACCESS
          libibverbs, libibcm and a hardware driver library from
          <http://www.openib.org>.
 
+config INFINIBAND_ADDR_TRANS
+       bool
+       depends on INFINIBAND && INET
+       default y
+
 source "drivers/infiniband/hw/mthca/Kconfig"
 source "drivers/infiniband/hw/ipath/Kconfig"
 
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index ec3353f..68e73ec 100644 (file)
@@ -1,5 +1,7 @@
+infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS)     := ib_addr.o rdma_cm.o
+
 obj-$(CONFIG_INFINIBAND) +=            ib_core.o ib_mad.o ib_sa.o \
-                                       ib_cm.o
+                                       ib_cm.o $(infiniband-y)
 obj-$(CONFIG_INFINIBAND_USER_MAD) +=   ib_umad.o
 obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=        ib_uverbs.o ib_ucm.o
 
@@ -12,8 +14,13 @@ ib_sa-y :=                   sa_query.o
 
 ib_cm-y :=                     cm.o
 
+rdma_cm-y :=                   cma.o
+
+ib_addr-y :=                   addr.o
+
 ib_umad-y :=                   user_mad.o
 
 ib_ucm-y :=                    ucm.o
 
-ib_uverbs-y :=                 uverbs_main.o uverbs_cmd.o uverbs_mem.o
+ib_uverbs-y :=                 uverbs_main.o uverbs_cmd.o uverbs_mem.o \
+                               uverbs_marshall.o
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
new file mode 100644 (file)
index 0000000..d294bbc
--- /dev/null
@@ -0,0 +1,367 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc.  All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation.  All rights reserved.
+ *
+ * This Software is licensed under one of the following licenses:
+ *
+ * 1) under the terms of the "Common Public License 1.0" a copy of which is
+ *    available from the Open Source Initiative, see
+ *    http://www.opensource.org/licenses/cpl.php.
+ *
+ * 2) under the terms of the "The BSD License" a copy of which is
+ *    available from the Open Source Initiative, see
+ *    http://www.opensource.org/licenses/bsd-license.php.
+ *
+ * 3) under the terms of the "GNU General Public License (GPL) Version 2" a
+ *    copy of which is available from the Open Source Initiative, see
+ *    http://www.opensource.org/licenses/gpl-license.php.
+ *
+ * Licensee has the right to choose one of the above licenses.
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice and one of the license notices.
+ *
+ * Redistributions in binary form must reproduce both the above copyright
+ * notice, one of the license notices in the documentation
+ * and/or other materials provided with the distribution.
+ */
+
+#include <linux/mutex.h>
+#include <linux/inetdevice.h>
+#include <linux/workqueue.h>
+#include <linux/if_arp.h>
+#include <net/arp.h>
+#include <net/neighbour.h>
+#include <net/route.h>
+#include <rdma/ib_addr.h>
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("IB Address Translation");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct addr_req {
+       struct list_head list;
+       struct sockaddr src_addr;
+       struct sockaddr dst_addr;
+       struct rdma_dev_addr *addr;
+       void *context;
+       void (*callback)(int status, struct sockaddr *src_addr,
+                        struct rdma_dev_addr *addr, void *context);
+       unsigned long timeout;
+       int status;
+};
+
+static void process_req(void *data);
+
+static DEFINE_MUTEX(lock);
+static LIST_HEAD(req_list);
+static DECLARE_WORK(work, process_req, NULL);
+static struct workqueue_struct *addr_wq;
+
+static int copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
+                    unsigned char *dst_dev_addr)
+{
+       switch (dev->type) {
+       case ARPHRD_INFINIBAND:
+               dev_addr->dev_type = IB_NODE_CA;
+               break;
+       default:
+               return -EADDRNOTAVAIL;
+       }
+
+       memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
+       memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN);
+       if (dst_dev_addr)
+               memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN);
+       return 0;
+}
+
+int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr)
+{
+       struct net_device *dev;
+       u32 ip = ((struct sockaddr_in *) addr)->sin_addr.s_addr;
+       int ret;
+
+       dev = ip_dev_find(ip);
+       if (!dev)
+               return -EADDRNOTAVAIL;
+
+       ret = copy_addr(dev_addr, dev, NULL);
+       dev_put(dev);
+       return ret;
+}
+EXPORT_SYMBOL(rdma_translate_ip);
+
+static void set_timeout(unsigned long time)
+{
+       unsigned long delay;
+
+       cancel_delayed_work(&work);
+
+       delay = time - jiffies;
+       if ((long)delay <= 0)
+               delay = 1;
+
+       queue_delayed_work(addr_wq, &work, delay);
+}
+
+static void queue_req(struct addr_req *req)
+{
+       struct addr_req *temp_req;
+
+       mutex_lock(&lock);
+       list_for_each_entry_reverse(temp_req, &req_list, list) {
+               if (time_after(req->timeout, temp_req->timeout))
+                       break;
+       }
+
+       list_add(&req->list, &temp_req->list);
+
+       if (req_list.next == &req->list)
+               set_timeout(req->timeout);
+       mutex_unlock(&lock);
+}
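
The request list is kept sorted by ascending timeout: the reverse walk above finds the last entry that expires before the new one, inserts after it, and re-arms the delayed work only when the new request becomes the head. A small sketch of the invariant, with hypothetical timeouts in jiffies:

        /* req_list (ascending timeout):  [t=100] -> [t=250] -> [t=400]
         * queue_req(t=300): reverse walk stops at t=250, insert after it:
         *                   [t=100] -> [t=250] -> [t=300] -> [t=400]
         * queue_req(t=50):  nothing earlier found, becomes the new head,
         *                   so set_timeout(50) re-arms the work queue.
         */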
+
+static void addr_send_arp(struct sockaddr_in *dst_in)
+{
+       struct rtable *rt;
+       struct flowi fl;
+       u32 dst_ip = dst_in->sin_addr.s_addr;
+
+       memset(&fl, 0, sizeof fl);
+       fl.nl_u.ip4_u.daddr = dst_ip;
+       if (ip_route_output_key(&rt, &fl))
+               return;
+
+       arp_send(ARPOP_REQUEST, ETH_P_ARP, rt->rt_gateway, rt->idev->dev,
+                rt->rt_src, NULL, rt->idev->dev->dev_addr, NULL);
+       ip_rt_put(rt);
+}
+
+static int addr_resolve_remote(struct sockaddr_in *src_in,
+                              struct sockaddr_in *dst_in,
+                              struct rdma_dev_addr *addr)
+{
+       u32 src_ip = src_in->sin_addr.s_addr;
+       u32 dst_ip = dst_in->sin_addr.s_addr;
+       struct flowi fl;
+       struct rtable *rt;
+       struct neighbour *neigh;
+       int ret;
+
+       memset(&fl, 0, sizeof fl);
+       fl.nl_u.ip4_u.daddr = dst_ip;
+       fl.nl_u.ip4_u.saddr = src_ip;
+       ret = ip_route_output_key(&rt, &fl);
+       if (ret)
+               goto out;
+
+       /* If the device does ARP internally, return 'done' */
+       if (rt->idev->dev->flags & IFF_NOARP) {
+               copy_addr(addr, rt->idev->dev, NULL);
+               goto put;
+       }
+
+       neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, rt->idev->dev);
+       if (!neigh) {
+               ret = -ENODATA;
+               goto put;
+       }
+
+       if (!(neigh->nud_state & NUD_VALID)) {
+               ret = -ENODATA;
+               goto release;
+       }
+
+       if (!src_ip) {
+               src_in->sin_family = dst_in->sin_family;
+               src_in->sin_addr.s_addr = rt->rt_src;
+       }
+
+       ret = copy_addr(addr, neigh->dev, neigh->ha);
+release:
+       neigh_release(neigh);
+put:
+       ip_rt_put(rt);
+out:
+       return ret;
+}
+
+static void process_req(void *data)
+{
+       struct addr_req *req, *temp_req;
+       struct sockaddr_in *src_in, *dst_in;
+       struct list_head done_list;
+
+       INIT_LIST_HEAD(&done_list);
+
+       mutex_lock(&lock);
+       list_for_each_entry_safe(req, temp_req, &req_list, list) {
+               if (req->status) {
+                       src_in = (struct sockaddr_in *) &req->src_addr;
+                       dst_in = (struct sockaddr_in *) &req->dst_addr;
+                       req->status = addr_resolve_remote(src_in, dst_in,
+                                                         req->addr);
+               }
+               if (req->status && time_after(jiffies, req->timeout))
+                       req->status = -ETIMEDOUT;
+               else if (req->status == -ENODATA)
+                       continue;
+
+               list_del(&req->list);
+               list_add_tail(&req->list, &done_list);
+       }
+
+       if (!list_empty(&req_list)) {
+               req = list_entry(req_list.next, struct addr_req, list);
+               set_timeout(req->timeout);
+       }
+       mutex_unlock(&lock);
+
+       list_for_each_entry_safe(req, temp_req, &done_list, list) {
+               list_del(&req->list);
+               req->callback(req->status, &req->src_addr, req->addr,
+                             req->context);
+               kfree(req);
+       }
+}
+
+static int addr_resolve_local(struct sockaddr_in *src_in,
+                             struct sockaddr_in *dst_in,
+                             struct rdma_dev_addr *addr)
+{
+       struct net_device *dev;
+       u32 src_ip = src_in->sin_addr.s_addr;
+       u32 dst_ip = dst_in->sin_addr.s_addr;
+       int ret;
+
+       dev = ip_dev_find(dst_ip);
+       if (!dev)
+               return -EADDRNOTAVAIL;
+
+       if (ZERONET(src_ip)) {
+               src_in->sin_family = dst_in->sin_family;
+               src_in->sin_addr.s_addr = dst_ip;
+               ret = copy_addr(addr, dev, dev->dev_addr);
+       } else if (LOOPBACK(src_ip)) {
+               ret = rdma_translate_ip((struct sockaddr *)dst_in, addr);
+               if (!ret)
+                       memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
+       } else {
+               ret = rdma_translate_ip((struct sockaddr *)src_in, addr);
+               if (!ret)
+                       memcpy(addr->dst_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
+       }
+
+       dev_put(dev);
+       return ret;
+}
+
+int rdma_resolve_ip(struct sockaddr *src_addr, struct sockaddr *dst_addr,
+                   struct rdma_dev_addr *addr, int timeout_ms,
+                   void (*callback)(int status, struct sockaddr *src_addr,
+                                    struct rdma_dev_addr *addr, void *context),
+                   void *context)
+{
+       struct sockaddr_in *src_in, *dst_in;
+       struct addr_req *req;
+       int ret = 0;
+
+       req = kzalloc(sizeof *req, GFP_KERNEL);
+       if (!req)
+               return -ENOMEM;
+
+       if (src_addr)
+               memcpy(&req->src_addr, src_addr, ip_addr_size(src_addr));
+       memcpy(&req->dst_addr, dst_addr, ip_addr_size(dst_addr));
+       req->addr = addr;
+       req->callback = callback;
+       req->context = context;
+
+       src_in = (struct sockaddr_in *) &req->src_addr;
+       dst_in = (struct sockaddr_in *) &req->dst_addr;
+
+       req->status = addr_resolve_local(src_in, dst_in, addr);
+       if (req->status == -EADDRNOTAVAIL)
+               req->status = addr_resolve_remote(src_in, dst_in, addr);
+
+       switch (req->status) {
+       case 0:
+               req->timeout = jiffies;
+               queue_req(req);
+               break;
+       case -ENODATA:
+               req->timeout = msecs_to_jiffies(timeout_ms) + jiffies;
+               queue_req(req);
+               addr_send_arp(dst_in);
+               break;
+       default:
+               ret = req->status;
+               kfree(req);
+               break;
+       }
+       return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_ip);
+
+void rdma_addr_cancel(struct rdma_dev_addr *addr)
+{
+       struct addr_req *req, *temp_req;
+
+       mutex_lock(&lock);
+       list_for_each_entry_safe(req, temp_req, &req_list, list) {
+               if (req->addr == addr) {
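+                       /*
+                        * Mark it canceled and requeue it at the head with
+                        * an expired timeout so process_req() finishes it
+                        * on the worker's next pass.
+                        */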
+                       req->status = -ECANCELED;
+                       req->timeout = jiffies;
+                       list_del(&req->list);
+                       list_add(&req->list, &req_list);
+                       set_timeout(req->timeout);
+                       break;
+               }
+       }
+       mutex_unlock(&lock);
+}
+EXPORT_SYMBOL(rdma_addr_cancel);
+
+static int addr_arp_recv(struct sk_buff *skb, struct net_device *dev,
+                        struct packet_type *pkt, struct net_device *orig_dev)
+{
+       struct arphdr *arp_hdr;
+
+       arp_hdr = (struct arphdr *) skb->nh.raw;
+
+       if (arp_hdr->ar_op == htons(ARPOP_REQUEST) ||
+           arp_hdr->ar_op == htons(ARPOP_REPLY))
+               set_timeout(jiffies);
+
+       kfree_skb(skb);
+       return 0;
+}
+
+static struct packet_type addr_arp = {
+       .type           = __constant_htons(ETH_P_ARP),
+       .func           = addr_arp_recv,
+       .af_packet_priv = (void *) 1,
+};
+
+static int addr_init(void)
+{
+       addr_wq = create_singlethread_workqueue("ib_addr_wq");
+       if (!addr_wq)
+               return -ENOMEM;
+
+       dev_add_pack(&addr_arp);
+       return 0;
+}
+
+static void addr_cleanup(void)
+{
+       dev_remove_pack(&addr_arp);
+       destroy_workqueue(addr_wq);
+}
+
+module_init(addr_init);
+module_exit(addr_cleanup);
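
A minimal sketch of how a consumer would drive this asynchronous API; the
struct my_conn, my_addr_handler() and my_connect() names are hypothetical,
and only rdma_resolve_ip() and its callback signature come from the file
above:

    struct my_conn {
            struct rdma_dev_addr dev_addr;
    };

    /* Runs from the ib_addr workqueue once resolution finishes or fails. */
    static void my_addr_handler(int status, struct sockaddr *src_addr,
                                struct rdma_dev_addr *addr, void *context)
    {
            struct my_conn *conn = context;

            if (status)     /* e.g. -ETIMEDOUT or -EADDRNOTAVAIL */
                    return;
            /* addr->src_dev_addr/dst_dev_addr now hold the HW addresses. */
    }

    static int my_connect(struct my_conn *conn, __be32 dst_ip, __be16 dst_port)
    {
            struct sockaddr_in dst = {
                    .sin_family = AF_INET,
                    .sin_port   = dst_port,
                    .sin_addr   = { .s_addr = dst_ip },
            };

            /* A NULL source address lets the routing code choose one. */
            return rdma_resolve_ip(NULL, (struct sockaddr *) &dst,
                                   &conn->dev_addr, 2000 /* ms */,
                                   my_addr_handler, conn);
    }
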
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 50364c0..e05ca2c 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -191,6 +191,24 @@ int ib_find_cached_pkey(struct ib_device *device,
 }
 EXPORT_SYMBOL(ib_find_cached_pkey);
 
+int ib_get_cached_lmc(struct ib_device *device,
+                     u8                port_num,
+                     u8                *lmc)
+{
+       unsigned long flags;
+       int ret = 0;
+
+       if (port_num < start_port(device) || port_num > end_port(device))
+               return -EINVAL;
+
+       read_lock_irqsave(&device->cache.lock, flags);
+       *lmc = device->cache.lmc_cache[port_num - start_port(device)];
+       read_unlock_irqrestore(&device->cache.lock, flags);
+
+       return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_lmc);
+
 static void ib_cache_update(struct ib_device *device,
                            u8                port)
 {
@@ -251,6 +269,8 @@ static void ib_cache_update(struct ib_device *device,
        device->cache.pkey_cache[port - start_port(device)] = pkey_cache;
        device->cache.gid_cache [port - start_port(device)] = gid_cache;
 
+       device->cache.lmc_cache[port - start_port(device)] = tprops->lmc;
+
        write_unlock_irq(&device->cache.lock);
 
        kfree(old_pkey_cache);
@@ -305,7 +325,13 @@ static void ib_cache_setup_one(struct ib_device *device)
                kmalloc(sizeof *device->cache.gid_cache *
                        (end_port(device) - start_port(device) + 1), GFP_KERNEL);
 
-       if (!device->cache.pkey_cache || !device->cache.gid_cache) {
+       device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache *
+                                         (end_port(device) -
+                                          start_port(device) + 1),
+                                         GFP_KERNEL);
+
+       if (!device->cache.pkey_cache || !device->cache.gid_cache ||
+           !device->cache.lmc_cache) {
                printk(KERN_WARNING "Couldn't allocate cache "
                       "for %s\n", device->name);
                goto err;
@@ -333,6 +359,7 @@ err_cache:
 err:
        kfree(device->cache.pkey_cache);
        kfree(device->cache.gid_cache);
+       kfree(device->cache.lmc_cache);
 }
 
 static void ib_cache_cleanup_one(struct ib_device *device)
@@ -349,6 +376,7 @@ static void ib_cache_cleanup_one(struct ib_device *device)
 
        kfree(device->cache.pkey_cache);
        kfree(device->cache.gid_cache);
+       kfree(device->cache.lmc_cache);
 }
 
 static struct ib_client cache_client = {
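
The new LMC entry is read like the existing pkey/gid caches; a hypothetical
caller (device and port supplied by the surrounding driver) might do:

    u8 lmc;

    if (!ib_get_cached_lmc(device, port, &lmc))
            printk(KERN_DEBUG "port %d answers to %d LIDs (LMC %d)\n",
                   port, 1 << lmc, lmc);
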
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 86fee43..450adfe 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -32,7 +32,7 @@
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  *
- * $Id: cm.c 2821 2005-07-08 17:07:28Z sean.hefty $
+ * $Id: cm.c 4311 2005-12-05 18:42:01Z sean.hefty $
  */
 
 #include <linux/completion.h>
@@ -132,6 +132,7 @@ struct cm_id_private {
        /* todo: use alternate port on send failure */
        struct cm_av av;
        struct cm_av alt_av;
+       struct ib_cm_compare_data *compare_data;
 
        void *private_data;
        __be64 tid;
@@ -253,23 +254,13 @@ static void cm_set_private_data(struct cm_id_private *cm_id_priv,
        cm_id_priv->private_data_len = private_data_len;
 }
 
-static void cm_set_ah_attr(struct ib_ah_attr *ah_attr, u8 port_num,
-                          u16 dlid, u8 sl, u16 src_path_bits)
-{
-       memset(ah_attr, 0, sizeof ah_attr);
-       ah_attr->dlid = dlid;
-       ah_attr->sl = sl;
-       ah_attr->src_path_bits = src_path_bits;
-       ah_attr->port_num = port_num;
-}
-
-static void cm_init_av_for_response(struct cm_port *port,
-                                   struct ib_wc *wc, struct cm_av *av)
+static void cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc,
+                                   struct ib_grh *grh, struct cm_av *av)
 {
        av->port = port;
        av->pkey_index = wc->pkey_index;
-       cm_set_ah_attr(&av->ah_attr, port->port_num, wc->slid,
-                      wc->sl, wc->dlid_path_bits);
+       ib_init_ah_from_wc(port->cm_dev->device, port->port_num, wc,
+                          grh, &av->ah_attr);
 }
 
 static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
@@ -299,9 +290,8 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
                return ret;
 
        av->port = port;
-       cm_set_ah_attr(&av->ah_attr, av->port->port_num,
-                      be16_to_cpu(path->dlid), path->sl,
-                      be16_to_cpu(path->slid) & 0x7F);
+       ib_init_ah_from_path(cm_dev->device, port->port_num, path,
+                            &av->ah_attr);
        av->packet_life_time = path->packet_life_time;
        return 0;
 }
@@ -357,6 +347,41 @@ static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id)
        return cm_id_priv;
 }
 
+static void cm_mask_copy(u8 *dst, u8 *src, u8 *mask)
+{
+       int i;
+
+       for (i = 0; i < IB_CM_COMPARE_SIZE / sizeof(unsigned long); i++)
+               ((unsigned long *) dst)[i] = ((unsigned long *) src)[i] &
+                                            ((unsigned long *) mask)[i];
+}
+
+static int cm_compare_data(struct ib_cm_compare_data *src_data,
+                          struct ib_cm_compare_data *dst_data)
+{
+       u8 src[IB_CM_COMPARE_SIZE];
+       u8 dst[IB_CM_COMPARE_SIZE];
+
+       if (!src_data || !dst_data)
+               return 0;
+
+       cm_mask_copy(src, src_data->data, dst_data->mask);
+       cm_mask_copy(dst, dst_data->data, src_data->mask);
+       return memcmp(src, dst, IB_CM_COMPARE_SIZE);
+}
+
+static int cm_compare_private_data(u8 *private_data,
+                                  struct ib_cm_compare_data *dst_data)
+{
+       u8 src[IB_CM_COMPARE_SIZE];
+
+       if (!dst_data)
+               return 0;
+
+       cm_mask_copy(src, private_data, dst_data->mask);
+       return memcmp(src, dst_data->data, IB_CM_COMPARE_SIZE);
+}
+
 static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
 {
        struct rb_node **link = &cm.listen_service_table.rb_node;
@@ -364,14 +389,18 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
        struct cm_id_private *cur_cm_id_priv;
        __be64 service_id = cm_id_priv->id.service_id;
        __be64 service_mask = cm_id_priv->id.service_mask;
+       int data_cmp;
 
        while (*link) {
                parent = *link;
                cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
                                          service_node);
+               data_cmp = cm_compare_data(cm_id_priv->compare_data,
+                                          cur_cm_id_priv->compare_data);
                if ((cur_cm_id_priv->id.service_mask & service_id) ==
                    (service_mask & cur_cm_id_priv->id.service_id) &&
-                   (cm_id_priv->id.device == cur_cm_id_priv->id.device))
+                   (cm_id_priv->id.device == cur_cm_id_priv->id.device) &&
+                   !data_cmp)
                        return cur_cm_id_priv;
 
                if (cm_id_priv->id.device < cur_cm_id_priv->id.device)
@@ -380,6 +409,10 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
                        link = &(*link)->rb_right;
                else if (service_id < cur_cm_id_priv->id.service_id)
                        link = &(*link)->rb_left;
+               else if (service_id > cur_cm_id_priv->id.service_id)
+                       link = &(*link)->rb_right;
+               else if (data_cmp < 0)
+                       link = &(*link)->rb_left;
                else
                        link = &(*link)->rb_right;
        }
@@ -389,16 +422,20 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
 }
 
 static struct cm_id_private * cm_find_listen(struct ib_device *device,
-                                            __be64 service_id)
+                                            __be64 service_id,
+                                            u8 *private_data)
 {
        struct rb_node *node = cm.listen_service_table.rb_node;
        struct cm_id_private *cm_id_priv;
+       int data_cmp;
 
        while (node) {
                cm_id_priv = rb_entry(node, struct cm_id_private, service_node);
+               data_cmp = cm_compare_private_data(private_data,
+                                                  cm_id_priv->compare_data);
                if ((cm_id_priv->id.service_mask & service_id) ==
                     cm_id_priv->id.service_id &&
-                   (cm_id_priv->id.device == device))
+                   (cm_id_priv->id.device == device) && !data_cmp)
                        return cm_id_priv;
 
                if (device < cm_id_priv->id.device)
@@ -407,6 +444,10 @@ static struct cm_id_private * cm_find_listen(struct ib_device *device,
                        node = node->rb_right;
                else if (service_id < cm_id_priv->id.service_id)
                        node = node->rb_left;
+               else if (service_id > cm_id_priv->id.service_id)
+                       node = node->rb_right;
+               else if (data_cmp < 0)
+                       node = node->rb_left;
                else
                        node = node->rb_right;
        }
@@ -730,15 +771,14 @@ retest:
        wait_for_completion(&cm_id_priv->comp);
        while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
                cm_free_work(work);
-       if (cm_id_priv->private_data && cm_id_priv->private_data_len)
-               kfree(cm_id_priv->private_data);
+       kfree(cm_id_priv->compare_data);
+       kfree(cm_id_priv->private_data);
        kfree(cm_id_priv);
 }
 EXPORT_SYMBOL(ib_destroy_cm_id);
 
-int ib_cm_listen(struct ib_cm_id *cm_id,
-                __be64 service_id,
-                __be64 service_mask)
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
+                struct ib_cm_compare_data *compare_data)
 {
        struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
        unsigned long flags;
@@ -752,7 +792,19 @@ int ib_cm_listen(struct ib_cm_id *cm_id,
                return -EINVAL;
 
        cm_id_priv = container_of(cm_id, struct cm_id_private, id);
-       BUG_ON(cm_id->state != IB_CM_IDLE);
+       if (cm_id->state != IB_CM_IDLE)
+               return -EINVAL;
+
+       if (compare_data) {
+               cm_id_priv->compare_data = kzalloc(sizeof *compare_data,
+                                                  GFP_KERNEL);
+               if (!cm_id_priv->compare_data)
+                       return -ENOMEM;
+               cm_mask_copy(cm_id_priv->compare_data->data,
+                            compare_data->data, compare_data->mask);
+               memcpy(cm_id_priv->compare_data->mask, compare_data->mask,
+                      IB_CM_COMPARE_SIZE);
+       }
 
        cm_id->state = IB_CM_LISTEN;
 
@@ -769,6 +821,8 @@ int ib_cm_listen(struct ib_cm_id *cm_id,
 
        if (cur_cm_id_priv) {
                cm_id->state = IB_CM_IDLE;
+               kfree(cm_id_priv->compare_data);
+               cm_id_priv->compare_data = NULL;
                ret = -EBUSY;
        }
        return ret;
@@ -1241,7 +1295,8 @@ static struct cm_id_private * cm_match_req(struct cm_work *work,
 
        /* Find matching listen request. */
        listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device,
-                                          req_msg->service_id);
+                                          req_msg->service_id,
+                                          req_msg->private_data);
        if (!listen_cm_id_priv) {
                spin_unlock_irqrestore(&cm.lock, flags);
                cm_issue_rej(work->port, work->mad_recv_wc,
@@ -1276,6 +1331,7 @@ static int cm_req_handler(struct cm_work *work)
        cm_id_priv = container_of(cm_id, struct cm_id_private, id);
        cm_id_priv->id.remote_id = req_msg->local_comm_id;
        cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+                               work->mad_recv_wc->recv_buf.grh,
                                &cm_id_priv->av);
        cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv->
                                                            id.local_id);
@@ -2549,7 +2605,7 @@ static void cm_format_sidr_req(struct cm_sidr_req_msg *sidr_req_msg,
        cm_format_mad_hdr(&sidr_req_msg->hdr, CM_SIDR_REQ_ATTR_ID,
                          cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_SIDR));
        sidr_req_msg->request_id = cm_id_priv->id.local_id;
-       sidr_req_msg->pkey = cpu_to_be16(param->pkey);
+       sidr_req_msg->pkey = cpu_to_be16(param->path->pkey);
        sidr_req_msg->service_id = param->service_id;
 
        if (param->private_data && param->private_data_len)
@@ -2641,6 +2697,7 @@ static int cm_sidr_req_handler(struct cm_work *work)
        cm_id_priv->av.dgid.global.subnet_prefix = cpu_to_be64(wc->slid);
        cm_id_priv->av.dgid.global.interface_id = 0;
        cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+                               work->mad_recv_wc->recv_buf.grh,
                                &cm_id_priv->av);
        cm_id_priv->id.remote_id = sidr_req_msg->request_id;
        cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD;
@@ -2654,7 +2711,8 @@ static int cm_sidr_req_handler(struct cm_work *work)
                goto out; /* Duplicate message. */
        }
        cur_cm_id_priv = cm_find_listen(cm_id->device,
-                                       sidr_req_msg->service_id);
+                                       sidr_req_msg->service_id,
+                                       sidr_req_msg->private_data);
        if (!cur_cm_id_priv) {
                rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
                spin_unlock_irqrestore(&cm.lock, flags);
@@ -3291,7 +3349,6 @@ error:
 
 static void __exit ib_cm_cleanup(void)
 {
-       flush_workqueue(cm.wq);
        destroy_workqueue(cm.wq);
        ib_unregister_client(&cm_client);
        idr_destroy(&cm.local_id_table);
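
With this change ib_cm_listen() takes an optional ib_cm_compare_data, letting
several listeners share one service ID and be demultiplexed on masked REQ
private data; this is what the CMA below uses to fold IP addressing into the
IB CM. A sketch, assuming cm_id and svc_id are already set up and matching on
byte 0 of the private data only:

    struct ib_cm_compare_data cmp;
    int ret;

    memset(&cmp, 0, sizeof cmp);
    cmp.data[0] = 0x42;     /* value a REQ must carry in byte 0 */
    cmp.mask[0] = 0xFF;     /* compare only byte 0; rest ignored */

    ret = ib_cm_listen(cm_id, svc_id, 0, &cmp);     /* mask 0: exact svc_id */
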
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
new file mode 100644
index 0000000..a76834e
--- /dev/null
+++ b/drivers/infiniband/core/cma.c
@@ -0,0 +1,1927 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc.  All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Intel Corporation.  All rights reserved.
+ *
+ * This Software is licensed under one of the following licenses:
+ *
+ * 1) under the terms of the "Common Public License 1.0" a copy of which is
+ *    available from the Open Source Initiative, see
+ *    http://www.opensource.org/licenses/cpl.php.
+ *
+ * 2) under the terms of the "The BSD License" a copy of which is
+ *    available from the Open Source Initiative, see
+ *    http://www.opensource.org/licenses/bsd-license.php.
+ *
+ * 3) under the terms of the "GNU General Public License (GPL) Version 2" a
+ *    copy of which is available from the Open Source Initiative, see
+ *    http://www.opensource.org/licenses/gpl-license.php.
+ *
+ * Licensee has the right to choose one of the above licenses.
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice and one of the license notices.
+ *
+ * Redistributions in binary form must reproduce both the above copyright
+ * notice, one of the license notices in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ */
+
+#include <linux/completion.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/mutex.h>
+#include <linux/random.h>
+#include <linux/idr.h>
+
+#include <net/tcp.h>
+
+#include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_sa.h>
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("Generic RDMA CM Agent");
+MODULE_LICENSE("Dual BSD/GPL");
+
+#define CMA_CM_RESPONSE_TIMEOUT 20
+#define CMA_MAX_CM_RETRIES 3
+
+static void cma_add_one(struct ib_device *device);
+static void cma_remove_one(struct ib_device *device);
+
+static struct ib_client cma_client = {
+       .name   = "cma",
+       .add    = cma_add_one,
+       .remove = cma_remove_one
+};
+
+static LIST_HEAD(dev_list);
+static LIST_HEAD(listen_any_list);
+static DEFINE_MUTEX(lock);
+static struct workqueue_struct *cma_wq;
+static DEFINE_IDR(sdp_ps);
+static DEFINE_IDR(tcp_ps);
+
+struct cma_device {
+       struct list_head        list;
+       struct ib_device        *device;
+       __be64                  node_guid;
+       struct completion       comp;
+       atomic_t                refcount;
+       struct list_head        id_list;
+};
+
+enum cma_state {
+       CMA_IDLE,
+       CMA_ADDR_QUERY,
+       CMA_ADDR_RESOLVED,
+       CMA_ROUTE_QUERY,
+       CMA_ROUTE_RESOLVED,
+       CMA_CONNECT,
+       CMA_DISCONNECT,
+       CMA_ADDR_BOUND,
+       CMA_LISTEN,
+       CMA_DEVICE_REMOVAL,
+       CMA_DESTROYING
+};
+
+struct rdma_bind_list {
+       struct idr              *ps;
+       struct hlist_head       owners;
+       unsigned short          port;
+};
+
+/*
+ * Device removal can occur at any time, so we need extra handling to
+ * serialize notifying the user of device removal with other callbacks.
+ * We do this by disabling removal notification while a callback is in process,
+ * and reporting it after the callback completes.
+ */
+struct rdma_id_private {
+       struct rdma_cm_id       id;
+
+       struct rdma_bind_list   *bind_list;
+       struct hlist_node       node;
+       struct list_head        list;
+       struct list_head        listen_list;
+       struct cma_device       *cma_dev;
+
+       enum cma_state          state;
+       spinlock_t              lock;
+       struct completion       comp;
+       atomic_t                refcount;
+       wait_queue_head_t       wait_remove;
+       atomic_t                dev_remove;
+
+       int                     backlog;
+       int                     timeout_ms;
+       struct ib_sa_query      *query;
+       int                     query_id;
+       union {
+               struct ib_cm_id *ib;
+       } cm_id;
+
+       u32                     seq_num;
+       u32                     qp_num;
+       enum ib_qp_type         qp_type;
+       u8                      srq;
+};
+
+struct cma_work {
+       struct work_struct      work;
+       struct rdma_id_private  *id;
+       enum cma_state          old_state;
+       enum cma_state          new_state;
+       struct rdma_cm_event    event;
+};
+
+union cma_ip_addr {
+       struct in6_addr ip6;
+       struct {
+               __u32 pad[3];
+               __u32 addr;
+       } ip4;
+};
+
+struct cma_hdr {
+       u8 cma_version;
+       u8 ip_version;  /* IP version: 7:4 */
+       __u16 port;
+       union cma_ip_addr src_addr;
+       union cma_ip_addr dst_addr;
+};
+
+struct sdp_hh {
+       u8 bsdh[16];
+       u8 sdp_version; /* Major version: 7:4 */
+       u8 ip_version;  /* IP version: 7:4 */
+       u8 sdp_specific1[10];
+       __u16 port;
+       __u16 sdp_specific2;
+       union cma_ip_addr src_addr;
+       union cma_ip_addr dst_addr;
+};
+
+struct sdp_hah {
+       u8 bsdh[16];
+       u8 sdp_version;
+};
+
+#define CMA_VERSION 0x00
+#define SDP_MAJ_VERSION 0x2
+
+static int cma_comp(struct rdma_id_private *id_priv, enum cma_state comp)
+{
+       unsigned long flags;
+       int ret;
+
+       spin_lock_irqsave(&id_priv->lock, flags);
+       ret = (id_priv->state == comp);
+       spin_unlock_irqrestore(&id_priv->lock, flags);
+       return ret;
+}
+
+static int cma_comp_exch(struct rdma_id_private *id_priv,
+                        enum cma_state comp, enum cma_state exch)
+{
+       unsigned long flags;
+       int ret;
+
+       spin_lock_irqsave(&id_priv->lock, flags);
+       if ((ret = (id_priv->state == comp)))
+               id_priv->state = exch;
+       spin_unlock_irqrestore(&id_priv->lock, flags);
+       return ret;
+}
+
+static enum cma_state cma_exch(struct rdma_id_private *id_priv,
+                              enum cma_state exch)
+{
+       unsigned long flags;
+       enum cma_state old;
+
+       spin_lock_irqsave(&id_priv->lock, flags);
+       old = id_priv->state;
+       id_priv->state = exch;
+       spin_unlock_irqrestore(&id_priv->lock, flags);
+       return old;
+}
+
+static inline u8 cma_get_ip_ver(struct cma_hdr *hdr)
+{
+       return hdr->ip_version >> 4;
+}
+
+static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
+{
+       hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF);
+}
+
+static inline u8 sdp_get_majv(u8 sdp_version)
+{
+       return sdp_version >> 4;
+}
+
+static inline u8 sdp_get_ip_ver(struct sdp_hh *hh)
+{
+       return hh->ip_version >> 4;
+}
+
+static inline void sdp_set_ip_ver(struct sdp_hh *hh, u8 ip_ver)
+{
+       hh->ip_version = (ip_ver << 4) | (hh->ip_version & 0xF);
+}
+
+static void cma_attach_to_dev(struct rdma_id_private *id_priv,
+                             struct cma_device *cma_dev)
+{
+       atomic_inc(&cma_dev->refcount);
+       id_priv->cma_dev = cma_dev;
+       id_priv->id.device = cma_dev->device;
+       list_add_tail(&id_priv->list, &cma_dev->id_list);
+}
+
+static inline void cma_deref_dev(struct cma_device *cma_dev)
+{
+       if (atomic_dec_and_test(&cma_dev->refcount))
+               complete(&cma_dev->comp);
+}
+
+static void cma_detach_from_dev(struct rdma_id_private *id_priv)
+{
+       list_del(&id_priv->list);
+       cma_deref_dev(id_priv->cma_dev);
+       id_priv->cma_dev = NULL;
+}
+
+static int cma_acquire_ib_dev(struct rdma_id_private *id_priv)
+{
+       struct cma_device *cma_dev;
+       union ib_gid *gid;
+       int ret = -ENODEV;
+
+       gid = ib_addr_get_sgid(&id_priv->id.route.addr.dev_addr);
+
+       mutex_lock(&lock);
+       list_for_each_entry(cma_dev, &dev_list, list) {
+               ret = ib_find_cached_gid(cma_dev->device, gid,
+                                        &id_priv->id.port_num, NULL);
+               if (!ret) {
+                       cma_attach_to_dev(id_priv, cma_dev);
+                       break;
+               }
+       }
+       mutex_unlock(&lock);
+       return ret;
+}
+
+static int cma_acquire_dev(struct rdma_id_private *id_priv)
+{
+       switch (id_priv->id.route.addr.dev_addr.dev_type) {
+       case IB_NODE_CA:
+               return cma_acquire_ib_dev(id_priv);
+       default:
+               return -ENODEV;
+       }
+}
+
+static void cma_deref_id(struct rdma_id_private *id_priv)
+{
+       if (atomic_dec_and_test(&id_priv->refcount))
+               complete(&id_priv->comp);
+}
+
+static void cma_release_remove(struct rdma_id_private *id_priv)
+{
+       if (atomic_dec_and_test(&id_priv->dev_remove))
+               wake_up(&id_priv->wait_remove);
+}
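+
+/*
+ * Every event handler in this file brackets its work with this pair:
+ *
+ *     atomic_inc(&id_priv->dev_remove);    hold off removal reporting
+ *     ... handle the event ...
+ *     cma_release_remove(id_priv);         wake any waiting removal path
+ *
+ * so that device removal cannot be reported mid-callback.
+ */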
+
+struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
+                                 void *context, enum rdma_port_space ps)
+{
+       struct rdma_id_private *id_priv;
+
+       id_priv = kzalloc(sizeof *id_priv, GFP_KERNEL);
+       if (!id_priv)
+               return ERR_PTR(-ENOMEM);
+
+       id_priv->state = CMA_IDLE;
+       id_priv->id.context = context;
+       id_priv->id.event_handler = event_handler;
+       id_priv->id.ps = ps;
+       spin_lock_init(&id_priv->lock);
+       init_completion(&id_priv->comp);
+       atomic_set(&id_priv->refcount, 1);
+       init_waitqueue_head(&id_priv->wait_remove);
+       atomic_set(&id_priv->dev_remove, 0);
+       INIT_LIST_HEAD(&id_priv->listen_list);
+       get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num);
+
+       return &id_priv->id;
+}
+EXPORT_SYMBOL(rdma_create_id);
+
+static int cma_init_ib_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
+{
+       struct ib_qp_attr qp_attr;
+       struct rdma_dev_addr *dev_addr;
+       int ret;
+
+       dev_addr = &id_priv->id.route.addr.dev_addr;
+       ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num,
+                                 ib_addr_get_pkey(dev_addr),
+                                 &qp_attr.pkey_index);
+       if (ret)
+               return ret;
+
+       qp_attr.qp_state = IB_QPS_INIT;
+       qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE;
+       qp_attr.port_num = id_priv->id.port_num;
+       return ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_ACCESS_FLAGS |
+                                         IB_QP_PKEY_INDEX | IB_QP_PORT);
+}
+
+int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
+                  struct ib_qp_init_attr *qp_init_attr)
+{
+       struct rdma_id_private *id_priv;
+       struct ib_qp *qp;
+       int ret;
+
+       id_priv = container_of(id, struct rdma_id_private, id);
+       if (id->device != pd->device)
+               return -EINVAL;
+
+       qp = ib_create_qp(pd, qp_init_attr);
+       if (IS_ERR(qp))
+               return PTR_ERR(qp);
+
+       switch (id->device->node_type) {
+       case IB_NODE_CA:
+               ret = cma_init_ib_qp(id_priv, qp);
+               break;
+       default:
+               ret = -ENOSYS;
+               break;
+       }
+
+       if (ret)
+               goto err;
+
+       id->qp = qp;
+       id_priv->qp_num = qp->qp_num;
+       id_priv->qp_type = qp->qp_type;
+       id_priv->srq = (qp->srq != NULL);
+       return 0;
+err:
+       ib_destroy_qp(qp);
+       return ret;
+}
+EXPORT_SYMBOL(rdma_create_qp);
+
+void rdma_destroy_qp(struct rdma_cm_id *id)
+{
+       ib_destroy_qp(id->qp);
+}
+EXPORT_SYMBOL(rdma_destroy_qp);
+
+static int cma_modify_qp_rtr(struct rdma_cm_id *id)
+{
+       struct ib_qp_attr qp_attr;
+       int qp_attr_mask, ret;
+
+       if (!id->qp)
+               return 0;
+
+       /* Need to update QP attributes from default values. */
+       qp_attr.qp_state = IB_QPS_INIT;
+       ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
+       if (ret)
+               return ret;
+
+       ret = ib_modify_qp(id->qp, &qp_attr, qp_attr_mask);
+       if (ret)
+               return ret;
+
+       qp_attr.qp_state = IB_QPS_RTR;
+       ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
+       if (ret)
+               return ret;
+
+       return ib_modify_qp(id->qp, &qp_attr, qp_attr_mask);
+}
+
+static int cma_modify_qp_rts(struct rdma_cm_id *id)
+{
+       struct ib_qp_attr qp_attr;
+       int qp_attr_mask, ret;
+
+       if (!id->qp)
+               return 0;
+
+       qp_attr.qp_state = IB_QPS_RTS;
+       ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
+       if (ret)
+               return ret;
+
+       return ib_modify_qp(id->qp, &qp_attr, qp_attr_mask);
+}
+
+static int cma_modify_qp_err(struct rdma_cm_id *id)
+{
+       struct ib_qp_attr qp_attr;
+
+       if (!id->qp)
+               return 0;
+
+       qp_attr.qp_state = IB_QPS_ERR;
+       return ib_modify_qp(id->qp, &qp_attr, IB_QP_STATE);
+}
+
+int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
+                      int *qp_attr_mask)
+{
+       struct rdma_id_private *id_priv;
+       int ret;
+
+       id_priv = container_of(id, struct rdma_id_private, id);
+       switch (id_priv->id.device->node_type) {
+       case IB_NODE_CA:
+               ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr,
+                                        qp_attr_mask);
+               if (qp_attr->qp_state == IB_QPS_RTR)
+                       qp_attr->rq_psn = id_priv->seq_num;
+               break;
+       default:
+               ret = -ENOSYS;
+               break;
+       }
+
+       return ret;
+}
+EXPORT_SYMBOL(rdma_init_qp_attr);
+
+static inline int cma_zero_addr(struct sockaddr *addr)
+{
+       struct in6_addr *ip6;
+
+       if (addr->sa_family == AF_INET)
+               return ZERONET(((struct sockaddr_in *) addr)->sin_addr.s_addr);
+       else {
+               ip6 = &((struct sockaddr_in6 *) addr)->sin6_addr;
+               return (ip6->s6_addr32[0] | ip6->s6_addr32[1] |
+                       ip6->s6_addr32[2] | ip6->s6_addr32[3]) == 0;
+       }
+}
+
+static inline int cma_loopback_addr(struct sockaddr *addr)
+{
+       return LOOPBACK(((struct sockaddr_in *) addr)->sin_addr.s_addr);
+}
+
+static inline int cma_any_addr(struct sockaddr *addr)
+{
+       return cma_zero_addr(addr) || cma_loopback_addr(addr);
+}
+
+static inline int cma_any_port(struct sockaddr *addr)
+{
+       return !((struct sockaddr_in *) addr)->sin_port;
+}
+
+static int cma_get_net_info(void *hdr, enum rdma_port_space ps,
+                           u8 *ip_ver, __u16 *port,
+                           union cma_ip_addr **src, union cma_ip_addr **dst)
+{
+       switch (ps) {
+       case RDMA_PS_SDP:
+               if (sdp_get_majv(((struct sdp_hh *) hdr)->sdp_version) !=
+                   SDP_MAJ_VERSION)
+                       return -EINVAL;
+
+               *ip_ver = sdp_get_ip_ver(hdr);
+               *port   = ((struct sdp_hh *) hdr)->port;
+               *src    = &((struct sdp_hh *) hdr)->src_addr;
+               *dst    = &((struct sdp_hh *) hdr)->dst_addr;
+               break;
+       default:
+               if (((struct cma_hdr *) hdr)->cma_version != CMA_VERSION)
+                       return -EINVAL;
+
+               *ip_ver = cma_get_ip_ver(hdr);
+               *port   = ((struct cma_hdr *) hdr)->port;
+               *src    = &((struct cma_hdr *) hdr)->src_addr;
+               *dst    = &((struct cma_hdr *) hdr)->dst_addr;
+               break;
+       }
+
+       if (*ip_ver != 4 && *ip_ver != 6)
+               return -EINVAL;
+       return 0;
+}
+
+static void cma_save_net_info(struct rdma_addr *addr,
+                             struct rdma_addr *listen_addr,
+                             u8 ip_ver, __u16 port,
+                             union cma_ip_addr *src, union cma_ip_addr *dst)
+{
+       struct sockaddr_in *listen4, *ip4;
+       struct sockaddr_in6 *listen6, *ip6;
+
+       switch (ip_ver) {
+       case 4:
+               listen4 = (struct sockaddr_in *) &listen_addr->src_addr;
+               ip4 = (struct sockaddr_in *) &addr->src_addr;
+               ip4->sin_family = listen4->sin_family;
+               ip4->sin_addr.s_addr = dst->ip4.addr;
+               ip4->sin_port = listen4->sin_port;
+
+               ip4 = (struct sockaddr_in *) &addr->dst_addr;
+               ip4->sin_family = listen4->sin_family;
+               ip4->sin_addr.s_addr = src->ip4.addr;
+               ip4->sin_port = port;
+               break;
+       case 6:
+               listen6 = (struct sockaddr_in6 *) &listen_addr->src_addr;
+               ip6 = (struct sockaddr_in6 *) &addr->src_addr;
+               ip6->sin6_family = listen6->sin6_family;
+               ip6->sin6_addr = dst->ip6;
+               ip6->sin6_port = listen6->sin6_port;
+
+               ip6 = (struct sockaddr_in6 *) &addr->dst_addr;
+               ip6->sin6_family = listen6->sin6_family;
+               ip6->sin6_addr = src->ip6;
+               ip6->sin6_port = port;
+               break;
+       default:
+               break;
+       }
+}
+
+static inline int cma_user_data_offset(enum rdma_port_space ps)
+{
+       switch (ps) {
+       case RDMA_PS_SDP:
+               return 0;
+       default:
+               return sizeof(struct cma_hdr);
+       }
+}
+
+static int cma_notify_user(struct rdma_id_private *id_priv,
+                          enum rdma_cm_event_type type, int status,
+                          void *data, u8 data_len)
+{
+       struct rdma_cm_event event;
+
+       event.event = type;
+       event.status = status;
+       event.private_data = data;
+       event.private_data_len = data_len;
+
+       return id_priv->id.event_handler(&id_priv->id, &event);
+}
+
+static void cma_cancel_route(struct rdma_id_private *id_priv)
+{
+       switch (id_priv->id.device->node_type) {
+       case IB_NODE_CA:
+               if (id_priv->query)
+                       ib_sa_cancel_query(id_priv->query_id, id_priv->query);
+               break;
+       default:
+               break;
+       }
+}
+
+static inline int cma_internal_listen(struct rdma_id_private *id_priv)
+{
+       return (id_priv->state == CMA_LISTEN) && id_priv->cma_dev &&
+              cma_any_addr(&id_priv->id.route.addr.src_addr);
+}
+
+static void cma_destroy_listen(struct rdma_id_private *id_priv)
+{
+       cma_exch(id_priv, CMA_DESTROYING);
+
+       if (id_priv->cma_dev) {
+               switch (id_priv->id.device->node_type) {
+               case IB_NODE_CA:
+                       if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib))
+                               ib_destroy_cm_id(id_priv->cm_id.ib);
+                       break;
+               default:
+                       break;
+               }
+               cma_detach_from_dev(id_priv);
+       }
+       list_del(&id_priv->listen_list);
+
+       cma_deref_id(id_priv);
+       wait_for_completion(&id_priv->comp);
+
+       kfree(id_priv);
+}
+
+static void cma_cancel_listens(struct rdma_id_private *id_priv)
+{
+       struct rdma_id_private *dev_id_priv;
+
+       mutex_lock(&lock);
+       list_del(&id_priv->list);
+
+       while (!list_empty(&id_priv->listen_list)) {
+               dev_id_priv = list_entry(id_priv->listen_list.next,
+                                        struct rdma_id_private, listen_list);
+               cma_destroy_listen(dev_id_priv);
+       }
+       mutex_unlock(&lock);
+}
+
+static void cma_cancel_operation(struct rdma_id_private *id_priv,
+                                enum cma_state state)
+{
+       switch (state) {
+       case CMA_ADDR_QUERY:
+               rdma_addr_cancel(&id_priv->id.route.addr.dev_addr);
+               break;
+       case CMA_ROUTE_QUERY:
+               cma_cancel_route(id_priv);
+               break;
+       case CMA_LISTEN:
+               if (cma_any_addr(&id_priv->id.route.addr.src_addr) &&
+                   !id_priv->cma_dev)
+                       cma_cancel_listens(id_priv);
+               break;
+       default:
+               break;
+       }
+}
+
+static void cma_release_port(struct rdma_id_private *id_priv)
+{
+       struct rdma_bind_list *bind_list = id_priv->bind_list;
+
+       if (!bind_list)
+               return;
+
+       mutex_lock(&lock);
+       hlist_del(&id_priv->node);
+       if (hlist_empty(&bind_list->owners)) {
+               idr_remove(bind_list->ps, bind_list->port);
+               kfree(bind_list);
+       }
+       mutex_unlock(&lock);
+}
+
+void rdma_destroy_id(struct rdma_cm_id *id)
+{
+       struct rdma_id_private *id_priv;
+       enum cma_state state;
+
+       id_priv = container_of(id, struct rdma_id_private, id);
+       state = cma_exch(id_priv, CMA_DESTROYING);
+       cma_cancel_operation(id_priv, state);
+
+       if (id_priv->cma_dev) {
+               switch (id->device->node_type) {
+               case IB_NODE_CA:
+                       if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib))
+                               ib_destroy_cm_id(id_priv->cm_id.ib);
+                       break;
+               default:
+                       break;
+               }
+               mutex_lock(&lock);
+               cma_detach_from_dev(id_priv);
+               mutex_unlock(&lock);
+       }
+
+       cma_release_port(id_priv);
+       cma_deref_id(id_priv);
+       wait_for_completion(&id_priv->comp);
+
+       kfree(id_priv->id.route.path_rec);
+       kfree(id_priv);
+}
+EXPORT_SYMBOL(rdma_destroy_id);
+
+static int cma_rep_recv(struct rdma_id_private *id_priv)
+{
+       int ret;
+
+       ret = cma_modify_qp_rtr(&id_priv->id);
+       if (ret)
+               goto reject;
+
+       ret = cma_modify_qp_rts(&id_priv->id);
+       if (ret)
+               goto reject;
+
+       ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0);
+       if (ret)
+               goto reject;
+
+       return 0;
+reject:
+       cma_modify_qp_err(&id_priv->id);
+       ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED,
+                      NULL, 0, NULL, 0);
+       return ret;
+}
+
+static int cma_verify_rep(struct rdma_id_private *id_priv, void *data)
+{
+       if (id_priv->id.ps == RDMA_PS_SDP &&
+           sdp_get_majv(((struct sdp_hah *) data)->sdp_version) !=
+           SDP_MAJ_VERSION)
+               return -EINVAL;
+
+       return 0;
+}
+
+static int cma_rtu_recv(struct rdma_id_private *id_priv)
+{
+       int ret;
+
+       ret = cma_modify_qp_rts(&id_priv->id);
+       if (ret)
+               goto reject;
+
+       return 0;
+reject:
+       cma_modify_qp_err(&id_priv->id);
+       ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED,
+                      NULL, 0, NULL, 0);
+       return ret;
+}
+
+static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
+{
+       struct rdma_id_private *id_priv = cm_id->context;
+       enum rdma_cm_event_type event;
+       u8 private_data_len = 0;
+       int ret = 0, status = 0;
+
+       atomic_inc(&id_priv->dev_remove);
+       if (!cma_comp(id_priv, CMA_CONNECT))
+               goto out;
+
+       switch (ib_event->event) {
+       case IB_CM_REQ_ERROR:
+       case IB_CM_REP_ERROR:
+               event = RDMA_CM_EVENT_UNREACHABLE;
+               status = -ETIMEDOUT;
+               break;
+       case IB_CM_REP_RECEIVED:
+               status = cma_verify_rep(id_priv, ib_event->private_data);
+               if (status)
+                       event = RDMA_CM_EVENT_CONNECT_ERROR;
+               else if (id_priv->id.qp && id_priv->id.ps != RDMA_PS_SDP) {
+                       status = cma_rep_recv(id_priv);
+                       event = status ? RDMA_CM_EVENT_CONNECT_ERROR :
+                                        RDMA_CM_EVENT_ESTABLISHED;
+               } else
+                       event = RDMA_CM_EVENT_CONNECT_RESPONSE;
+               private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
+               break;
+       case IB_CM_RTU_RECEIVED:
+               status = cma_rtu_recv(id_priv);
+               event = status ? RDMA_CM_EVENT_CONNECT_ERROR :
+                                RDMA_CM_EVENT_ESTABLISHED;
+               break;
+       case IB_CM_DREQ_ERROR:
+               status = -ETIMEDOUT; /* fall through */
+       case IB_CM_DREQ_RECEIVED:
+       case IB_CM_DREP_RECEIVED:
+               if (!cma_comp_exch(id_priv, CMA_CONNECT, CMA_DISCONNECT))
+                       goto out;
+               event = RDMA_CM_EVENT_DISCONNECTED;
+               break;
+       case IB_CM_TIMEWAIT_EXIT:
+       case IB_CM_MRA_RECEIVED:
+               /* ignore event */
+               goto out;
+       case IB_CM_REJ_RECEIVED:
+               cma_modify_qp_err(&id_priv->id);
+               status = ib_event->param.rej_rcvd.reason;
+               event = RDMA_CM_EVENT_REJECTED;
+               break;
+       default:
+               printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d",
+                      ib_event->event);
+               goto out;
+       }
+
+       ret = cma_notify_user(id_priv, event, status, ib_event->private_data,
+                             private_data_len);
+       if (ret) {
+               /* Destroy the CM ID by returning a non-zero value. */
+               id_priv->cm_id.ib = NULL;
+               cma_exch(id_priv, CMA_DESTROYING);
+               cma_release_remove(id_priv);
+               rdma_destroy_id(&id_priv->id);
+               return ret;
+       }
+out:
+       cma_release_remove(id_priv);
+       return ret;
+}
+
+static struct rdma_id_private *cma_new_id(struct rdma_cm_id *listen_id,
+                                         struct ib_cm_event *ib_event)
+{
+       struct rdma_id_private *id_priv;
+       struct rdma_cm_id *id;
+       struct rdma_route *rt;
+       union cma_ip_addr *src, *dst;
+       __u16 port;
+       u8 ip_ver;
+
+       id = rdma_create_id(listen_id->event_handler, listen_id->context,
+                           listen_id->ps);
+       if (IS_ERR(id))
+               return NULL;
+
+       rt = &id->route;
+       rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1;
+       rt->path_rec = kmalloc(sizeof *rt->path_rec * rt->num_paths, GFP_KERNEL);
+       if (!rt->path_rec)
+               goto err;
+
+       if (cma_get_net_info(ib_event->private_data, listen_id->ps,
+                            &ip_ver, &port, &src, &dst))
+               goto err;
+
+       cma_save_net_info(&id->route.addr, &listen_id->route.addr,
+                         ip_ver, port, src, dst);
+       rt->path_rec[0] = *ib_event->param.req_rcvd.primary_path;
+       if (rt->num_paths == 2)
+               rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
+
+       ib_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
+       ib_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
+       ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
+       rt->addr.dev_addr.dev_type = IB_NODE_CA;
+
+       id_priv = container_of(id, struct rdma_id_private, id);
+       id_priv->state = CMA_CONNECT;
+       return id_priv;
+err:
+       rdma_destroy_id(id);
+       return NULL;
+}
+
+static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
+{
+       struct rdma_id_private *listen_id, *conn_id;
+       int offset, ret;
+
+       listen_id = cm_id->context;
+       atomic_inc(&listen_id->dev_remove);
+       if (!cma_comp(listen_id, CMA_LISTEN)) {
+               ret = -ECONNABORTED;
+               goto out;
+       }
+
+       conn_id = cma_new_id(&listen_id->id, ib_event);
+       if (!conn_id) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       atomic_inc(&conn_id->dev_remove);
+       ret = cma_acquire_ib_dev(conn_id);
+       if (ret) {
+               ret = -ENODEV;
+               cma_release_remove(conn_id);
+               rdma_destroy_id(&conn_id->id);
+               goto out;
+       }
+
+       conn_id->cm_id.ib = cm_id;
+       cm_id->context = conn_id;
+       cm_id->cm_handler = cma_ib_handler;
+
+       offset = cma_user_data_offset(listen_id->id.ps);
+       ret = cma_notify_user(conn_id, RDMA_CM_EVENT_CONNECT_REQUEST, 0,
+                             ib_event->private_data + offset,
+                             IB_CM_REQ_PRIVATE_DATA_SIZE - offset);
+       if (ret) {
+               /* Destroy the CM ID by returning a non-zero value. */
+               conn_id->cm_id.ib = NULL;
+               cma_exch(conn_id, CMA_DESTROYING);
+               cma_release_remove(conn_id);
+               rdma_destroy_id(&conn_id->id);
+       }
+out:
+       cma_release_remove(listen_id);
+       return ret;
+}
+
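+/*
+ * Service IDs embed the port space and port: sid = (ps << 16) + port,
+ * as a big-endian 64-bit value.  E.g. RDMA_PS_TCP (0x0106) with port 80
+ * gives 0x0000000001060050.
+ */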
+static __be64 cma_get_service_id(enum rdma_port_space ps, struct sockaddr *addr)
+{
+       return cpu_to_be64(((u64)ps << 16) +
+              be16_to_cpu(((struct sockaddr_in *) addr)->sin_port));
+}
+
+static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr,
+                                struct ib_cm_compare_data *compare)
+{
+       struct cma_hdr *cma_data, *cma_mask;
+       struct sdp_hh *sdp_data, *sdp_mask;
+       __u32 ip4_addr;
+       struct in6_addr ip6_addr;
+
+       memset(compare, 0, sizeof *compare);
+       cma_data = (void *) compare->data;
+       cma_mask = (void *) compare->mask;
+       sdp_data = (void *) compare->data;
+       sdp_mask = (void *) compare->mask;
+
+       switch (addr->sa_family) {
+       case AF_INET:
+               ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr;
+               if (ps == RDMA_PS_SDP) {
+                       sdp_set_ip_ver(sdp_data, 4);
+                       sdp_set_ip_ver(sdp_mask, 0xF);
+                       sdp_data->dst_addr.ip4.addr = ip4_addr;
+                       sdp_mask->dst_addr.ip4.addr = ~0;
+               } else {
+                       cma_set_ip_ver(cma_data, 4);
+                       cma_set_ip_ver(cma_mask, 0xF);
+                       cma_data->dst_addr.ip4.addr = ip4_addr;
+                       cma_mask->dst_addr.ip4.addr = ~0;
+               }
+               break;
+       case AF_INET6:
+               ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr;
+               if (ps == RDMA_PS_SDP) {
+                       sdp_set_ip_ver(sdp_data, 6);
+                       sdp_set_ip_ver(sdp_mask, 0xF);
+                       sdp_data->dst_addr.ip6 = ip6_addr;
+                       memset(&sdp_mask->dst_addr.ip6, 0xFF,
+                              sizeof sdp_mask->dst_addr.ip6);
+               } else {
+                       cma_set_ip_ver(cma_data, 6);
+                       cma_set_ip_ver(cma_mask, 0xF);
+                       cma_data->dst_addr.ip6 = ip6_addr;
+                       memset(&cma_mask->dst_addr.ip6, 0xFF,
+                              sizeof cma_mask->dst_addr.ip6);
+               }
+               break;
+       default:
+               break;
+       }
+}
+
+static int cma_ib_listen(struct rdma_id_private *id_priv)
+{
+       struct ib_cm_compare_data compare_data;
+       struct sockaddr *addr;
+       __be64 svc_id;
+       int ret;
+
+       id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, cma_req_handler,
+                                           id_priv);
+       if (IS_ERR(id_priv->cm_id.ib))
+               return PTR_ERR(id_priv->cm_id.ib);
+
+       addr = &id_priv->id.route.addr.src_addr;
+       svc_id = cma_get_service_id(id_priv->id.ps, addr);
+       if (cma_any_addr(addr))
+               ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL);
+       else {
+               cma_set_compare_data(id_priv->id.ps, addr, &compare_data);
+               ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data);
+       }
+
+       if (ret) {
+               ib_destroy_cm_id(id_priv->cm_id.ib);
+               id_priv->cm_id.ib = NULL;
+       }
+
+       return ret;
+}
+
+static int cma_listen_handler(struct rdma_cm_id *id,
+                             struct rdma_cm_event *event)
+{
+       struct rdma_id_private *id_priv = id->context;
+
+       id->context = id_priv->id.context;
+       id->event_handler = id_priv->id.event_handler;
+       return id_priv->id.event_handler(id, event);
+}
+
+static void cma_listen_on_dev(struct rdma_id_private *id_priv,
+                             struct cma_device *cma_dev)
+{
+       struct rdma_id_private *dev_id_priv;
+       struct rdma_cm_id *id;
+       int ret;
+
+       id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps);
+       if (IS_ERR(id))
+               return;
+
+       dev_id_priv = container_of(id, struct rdma_id_private, id);
+
+       dev_id_priv->state = CMA_ADDR_BOUND;
+       memcpy(&id->route.addr.src_addr, &id_priv->id.route.addr.src_addr,
+              ip_addr_size(&id_priv->id.route.addr.src_addr));
+
+       cma_attach_to_dev(dev_id_priv, cma_dev);
+       list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list);
+
+       ret = rdma_listen(id, id_priv->backlog);
+       if (ret)
+               goto err;
+
+       return;
+err:
+       cma_destroy_listen(dev_id_priv);
+}
+
+static void cma_listen_on_all(struct rdma_id_private *id_priv)
+{
+       struct cma_device *cma_dev;
+
+       mutex_lock(&lock);
+       list_add_tail(&id_priv->list, &listen_any_list);
+       list_for_each_entry(cma_dev, &dev_list, list)
+               cma_listen_on_dev(id_priv, cma_dev);
+       mutex_unlock(&lock);
+}
+
+static int cma_bind_any(struct rdma_cm_id *id, sa_family_t af)
+{
+       struct sockaddr_in addr_in;
+
+       memset(&addr_in, 0, sizeof addr_in);
+       addr_in.sin_family = af;
+       return rdma_bind_addr(id, (struct sockaddr *) &addr_in);
+}
+
+int rdma_listen(struct rdma_cm_id *id, int backlog)
+{
+       struct rdma_id_private *id_priv;
+       int ret;
+
+       id_priv = container_of(id, struct rdma_id_private, id);
+       if (id_priv->state == CMA_IDLE) {
+               ret = cma_bind_any(id, AF_INET);
+               if (ret)
+                       return ret;
+       }
+
+       if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_LISTEN))
+               return -EINVAL;
+
+       id_priv->backlog = backlog;
+       if (id->device) {
+               switch (id->device->node_type) {
+               case IB_NODE_CA:
+                       ret = cma_ib_listen(id_priv);
+                       if (ret)
+                               goto err;
+                       break;
+               default:
+                       ret = -ENOSYS;
+                       goto err;
+               }
+       } else
+               cma_listen_on_all(id_priv);
+
+       return 0;
+err:
+       id_priv->backlog = 0;
+       cma_comp_exch(id_priv, CMA_LISTEN, CMA_ADDR_BOUND);
+       return ret;
+}
+EXPORT_SYMBOL(rdma_listen);
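+
+/*
+ * Passive-side usage, condensed (error handling elided; my_handler, ctx
+ * and my_sin are caller-supplied):
+ *
+ *     id = rdma_create_id(my_handler, ctx, RDMA_PS_TCP);
+ *     rdma_bind_addr(id, (struct sockaddr *) &my_sin);
+ *     rdma_listen(id, 10);
+ *
+ * rdma_bind_addr() is defined later in this file; binding the wildcard
+ * address (or not binding at all) makes rdma_listen() listen on every
+ * RDMA device in the system.
+ */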
+
+static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec,
+                             void *context)
+{
+       struct cma_work *work = context;
+       struct rdma_route *route;
+
+       route = &work->id->id.route;
+
+       if (!status) {
+               route->num_paths = 1;
+               *route->path_rec = *path_rec;
+       } else {
+               work->old_state = CMA_ROUTE_QUERY;
+               work->new_state = CMA_ADDR_RESOLVED;
+               work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
+       }
+
+       queue_work(cma_wq, &work->work);
+}
+
+static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms,
+                             struct cma_work *work)
+{
+       struct rdma_dev_addr *addr = &id_priv->id.route.addr.dev_addr;
+       struct ib_sa_path_rec path_rec;
+
+       memset(&path_rec, 0, sizeof path_rec);
+       path_rec.sgid = *ib_addr_get_sgid(addr);
+       path_rec.dgid = *ib_addr_get_dgid(addr);
+       path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(addr));
+       path_rec.numb_path = 1;
+
+       id_priv->query_id = ib_sa_path_rec_get(id_priv->id.device,
+                               id_priv->id.port_num, &path_rec,
+                               IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
+                               IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH,
+                               timeout_ms, GFP_KERNEL,
+                               cma_query_handler, work, &id_priv->query);
+
+       return (id_priv->query_id < 0) ? id_priv->query_id : 0;
+}
+
+static void cma_work_handler(void *data)
+{
+       struct cma_work *work = data;
+       struct rdma_id_private *id_priv = work->id;
+       int destroy = 0;
+
+       atomic_inc(&id_priv->dev_remove);
+       if (!cma_comp_exch(id_priv, work->old_state, work->new_state))
+               goto out;
+
+       if (id_priv->id.event_handler(&id_priv->id, &work->event)) {
+               cma_exch(id_priv, CMA_DESTROYING);
+               destroy = 1;
+       }
+out:
+       cma_release_remove(id_priv);
+       cma_deref_id(id_priv);
+       if (destroy)
+               rdma_destroy_id(&id_priv->id);
+       kfree(work);
+}
+
+static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms)
+{
+       struct rdma_route *route = &id_priv->id.route;
+       struct cma_work *work;
+       int ret;
+
+       work = kzalloc(sizeof *work, GFP_KERNEL);
+       if (!work)
+               return -ENOMEM;
+
+       work->id = id_priv;
+       INIT_WORK(&work->work, cma_work_handler, work);
+       work->old_state = CMA_ROUTE_QUERY;
+       work->new_state = CMA_ROUTE_RESOLVED;
+       work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+
+       route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL);
+       if (!route->path_rec) {
+               ret = -ENOMEM;
+               goto err1;
+       }
+
+       ret = cma_query_ib_route(id_priv, timeout_ms, work);
+       if (ret)
+               goto err2;
+
+       return 0;
+err2:
+       kfree(route->path_rec);
+       route->path_rec = NULL;
+err1:
+       kfree(work);
+       return ret;
+}
+
+int rdma_set_ib_paths(struct rdma_cm_id *id,
+                     struct ib_sa_path_rec *path_rec, int num_paths)
+{
+       struct rdma_id_private *id_priv;
+       int ret;
+
+       id_priv = container_of(id, struct rdma_id_private, id);
+       if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_RESOLVED))
+               return -EINVAL;
+
+       id->route.path_rec = kmalloc(sizeof *path_rec * num_paths, GFP_KERNEL);
+       if (!id->route.path_rec) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       memcpy(id->route.path_rec, path_rec, sizeof *path_rec * num_paths);
+       return 0;
+err:
+       cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_ADDR_RESOLVED);
+       return ret;
+}
+EXPORT_SYMBOL(rdma_set_ib_paths);
+
+int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)