x86, sparseirq: move irq_desc according to smp_affinity, v7
Yinghai Lu [Thu, 11 Dec 2008 08:15:01 +0000 (00:15 -0800)]
Impact: improve NUMA handling by migrating irq_desc on smp_affinity changes

if CONFIG_NUMA_MIGRATE_IRQ_DESC is set:

-  make irq_desc to go with affinity aka irq_desc moving etc
-  call move_irq_desc in irq_complete_move()
-  legacy irq_desc is not moved, because they are allocated via static array

for logical apic mode, need to add move_desc_in_progress_in_same_domain,
otherwise it will not be moved ==> also could need two phases to get
irq_desc moved.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

arch/x86/Kconfig
arch/x86/kernel/io_apic.c
include/linux/irq.h
kernel/irq/Makefile
kernel/irq/chip.c
kernel/irq/handle.c
kernel/irq/internals.h
kernel/irq/numa_migrate.c [new file with mode: 0644]

index 8943c13..2907353 100644 (file)
@@ -248,6 +248,15 @@ config SPARSE_IRQ
 
          If you don't know what to do here, say Y.
 
+config NUMA_MIGRATE_IRQ_DESC
+       bool "Move irq desc when changing irq smp_affinity"
+       depends on SPARSE_IRQ && SMP
+       default n
+       help
+         This enables moving irq_desc to cpu/node that irq will use handled.
+
+         If you don't know what to do here, say N.
+
 config X86_FIND_SMP_CONFIG
        def_bool y
        depends on X86_MPPARSE || X86_VOYAGER
index a1a2e07..bfe1245 100644 (file)
@@ -141,6 +141,9 @@ struct irq_cfg {
        unsigned move_cleanup_count;
        u8 vector;
        u8 move_in_progress : 1;
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+       u8 move_desc_pending : 1;
+#endif
 };
 
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
@@ -223,6 +226,121 @@ void arch_init_chip_data(struct irq_desc *desc, int cpu)
        }
 }
 
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+
+static void
+init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
+{
+       struct irq_pin_list *old_entry, *head, *tail, *entry;
+
+       cfg->irq_2_pin = NULL;
+       old_entry = old_cfg->irq_2_pin;
+       if (!old_entry)
+               return;
+
+       entry = get_one_free_irq_2_pin(cpu);
+       if (!entry)
+               return;
+
+       entry->apic     = old_entry->apic;
+       entry->pin      = old_entry->pin;
+       head            = entry;
+       tail            = entry;
+       old_entry       = old_entry->next;
+       while (old_entry) {
+               entry = get_one_free_irq_2_pin(cpu);
+               if (!entry) {
+                       entry = head;
+                       while (entry) {
+                               head = entry->next;
+                               kfree(entry);
+                               entry = head;
+                       }
+                       /* still use the old one */
+                       return;
+               }
+               entry->apic     = old_entry->apic;
+               entry->pin      = old_entry->pin;
+               tail->next      = entry;
+               tail            = entry;
+               old_entry       = old_entry->next;
+       }
+
+       tail->next = NULL;
+       cfg->irq_2_pin = head;
+}
+
+static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
+{
+       struct irq_pin_list *entry, *next;
+
+       if (old_cfg->irq_2_pin == cfg->irq_2_pin)
+               return;
+
+       entry = old_cfg->irq_2_pin;
+
+       while (entry) {
+               next = entry->next;
+               kfree(entry);
+               entry = next;
+       }
+       old_cfg->irq_2_pin = NULL;
+}
+
+void arch_init_copy_chip_data(struct irq_desc *old_desc,
+                                struct irq_desc *desc, int cpu)
+{
+       struct irq_cfg *cfg;
+       struct irq_cfg *old_cfg;
+
+       cfg = get_one_free_irq_cfg(cpu);
+
+       if (!cfg)
+               return;
+
+       desc->chip_data = cfg;
+
+       old_cfg = old_desc->chip_data;
+
+       memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
+
+       init_copy_irq_2_pin(old_cfg, cfg, cpu);
+}
+
+static void free_irq_cfg(struct irq_cfg *old_cfg)
+{
+       kfree(old_cfg);
+}
+
+void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+       struct irq_cfg *old_cfg, *cfg;
+
+       old_cfg = old_desc->chip_data;
+       cfg = desc->chip_data;
+
+       if (old_cfg == cfg)
+               return;
+
+       if (old_cfg) {
+               free_irq_2_pin(old_cfg, cfg);
+               free_irq_cfg(old_cfg);
+               old_desc->chip_data = NULL;
+       }
+}
+
+static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+{
+       struct irq_cfg *cfg = desc->chip_data;
+
+       if (!cfg->move_in_progress) {
+               /* it means that domain is not changed */
+               if (!cpus_intersects(desc->affinity, mask))
+                       cfg->move_desc_pending = 1;
+       }
+}
+#endif
+
 #else
 static struct irq_cfg *irq_cfg(unsigned int irq)
 {
@@ -231,9 +349,11 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
 
 #endif
 
+#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
 static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
 {
 }
+#endif
 
 struct io_apic {
        unsigned int index;
@@ -2346,14 +2466,34 @@ static void irq_complete_move(struct irq_desc **descp)
        struct irq_cfg *cfg = desc->chip_data;
        unsigned vector, me;
 
-       if (likely(!cfg->move_in_progress))
+       if (likely(!cfg->move_in_progress)) {
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+               if (likely(!cfg->move_desc_pending))
+                       return;
+
+               /* domain is not change, but affinity is changed */
+               me = smp_processor_id();
+               if (cpu_isset(me, desc->affinity)) {
+                       *descp = desc = move_irq_desc(desc, me);
+                       /* get the new one */
+                       cfg = desc->chip_data;
+                       cfg->move_desc_pending = 0;
+               }
+#endif
                return;
+       }
 
        vector = ~get_irq_regs()->orig_ax;
        me = smp_processor_id();
        if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
                cpumask_t cleanup_mask;
 
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+               *descp = desc = move_irq_desc(desc, me);
+               /* get the new one */
+               cfg = desc->chip_data;
+#endif
+
                cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
                cfg->move_cleanup_count = cpus_weight(cleanup_mask);
                send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
index b5749db..36a0157 100644 (file)
@@ -227,6 +227,16 @@ extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
 
 #endif
 
+static inline struct irq_desc *
+irq_remap_to_desc(unsigned int irq, struct irq_desc *desc)
+{
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+       return irq_to_desc(irq);
+#else
+       return desc;
+#endif
+}
+
 /*
  * Migration helpers for obsolete names, they will go away:
  */
index 681c52d..4dd5b1e 100644 (file)
@@ -3,3 +3,4 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
 obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
+obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o
index 8e4fce4..de210f4 100644 (file)
@@ -353,6 +353,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
 
        spin_lock(&desc->lock);
        mask_ack_irq(desc, irq);
+       desc = irq_remap_to_desc(irq, desc);
 
        if (unlikely(desc->status & IRQ_INPROGRESS))
                goto out_unlock;
@@ -430,6 +431,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
        desc->status &= ~IRQ_INPROGRESS;
 out:
        desc->chip->eoi(irq);
+       desc = irq_remap_to_desc(irq, desc);
 
        spin_unlock(&desc->lock);
 }
@@ -466,12 +468,14 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
                    !desc->action)) {
                desc->status |= (IRQ_PENDING | IRQ_MASKED);
                mask_ack_irq(desc, irq);
+               desc = irq_remap_to_desc(irq, desc);
                goto out_unlock;
        }
        kstat_incr_irqs_this_cpu(irq, desc);
 
        /* Start handling the irq */
        desc->chip->ack(irq);
+       desc = irq_remap_to_desc(irq, desc);
 
        /* Mark the IRQ currently in progress.*/
        desc->status |= IRQ_INPROGRESS;
@@ -532,8 +536,10 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
        if (!noirqdebug)
                note_interrupt(irq, desc, action_ret);
 
-       if (desc->chip->eoi)
+       if (desc->chip->eoi) {
                desc->chip->eoi(irq);
+               desc = irq_remap_to_desc(irq, desc);
+       }
 }
 
 void
@@ -568,8 +574,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 
        /* Uninstall? */
        if (handle == handle_bad_irq) {
-               if (desc->chip != &no_irq_chip)
+               if (desc->chip != &no_irq_chip) {
                        mask_ack_irq(desc, irq);
+                       desc = irq_remap_to_desc(irq, desc);
+               }
                desc->status |= IRQ_DISABLED;
                desc->depth = 1;
        }
index 8aa0954..f1a2306 100644 (file)
@@ -23,7 +23,7 @@
 /*
  * lockdep: we want to handle all irq_desc locks as a single lock-class:
  */
-static struct lock_class_key irq_desc_lock_class;
+struct lock_class_key irq_desc_lock_class;
 
 /**
  * handle_bad_irq - handle spurious and unhandled irqs
@@ -73,7 +73,7 @@ static struct irq_desc irq_desc_init = {
 #endif
 };
 
-static void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
+void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
 {
        unsigned long bytes;
        char *ptr;
@@ -113,7 +113,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
 /*
  * Protect the sparse_irqs:
  */
-static DEFINE_SPINLOCK(sparse_irq_lock);
+DEFINE_SPINLOCK(sparse_irq_lock);
 
 struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly;
 
@@ -337,8 +337,11 @@ unsigned int __do_IRQ(unsigned int irq)
                /*
                 * No locking required for CPU-local interrupts:
                 */
-               if (desc->chip->ack)
+               if (desc->chip->ack) {
                        desc->chip->ack(irq);
+                       /* get new one */
+                       desc = irq_remap_to_desc(irq, desc);
+               }
                if (likely(!(desc->status & IRQ_DISABLED))) {
                        action_ret = handle_IRQ_event(irq, desc->action);
                        if (!noirqdebug)
@@ -349,8 +352,10 @@ unsigned int __do_IRQ(unsigned int irq)
        }
 
        spin_lock(&desc->lock);
-       if (desc->chip->ack)
+       if (desc->chip->ack) {
                desc->chip->ack(irq);
+               desc = irq_remap_to_desc(irq, desc);
+       }
        /*
         * REPLAY is when Linux resends an IRQ that was dropped earlier
         * WAITING is used by probe to mark irqs that are being tested
index 64c1c72..e6d0a43 100644 (file)
@@ -13,6 +13,11 @@ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
 extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
                unsigned long flags);
 
+extern struct lock_class_key irq_desc_lock_class;
+extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
+extern spinlock_t sparse_irq_lock;
+extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
+
 #ifdef CONFIG_PROC_FS
 extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
 extern void register_handler_proc(unsigned int irq, struct irqaction *action);
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
new file mode 100644 (file)
index 0000000..0178e22
--- /dev/null
@@ -0,0 +1,127 @@
+/*
+ * linux/kernel/irq/handle.c
+ *
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
+ *
+ * This file contains the core interrupt handling code.
+ *
+ * Detailed information is available in Documentation/DocBook/genericirq
+ *
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+
+#include "internals.h"
+
+static void init_copy_kstat_irqs(struct irq_desc *old_desc,
+                                struct irq_desc *desc,
+                                int cpu, int nr)
+{
+       unsigned long bytes;
+
+       init_kstat_irqs(desc, cpu, nr);
+
+       if (desc->kstat_irqs != old_desc->kstat_irqs) {
+               /* Compute how many bytes we need per irq and allocate them */
+               bytes = nr * sizeof(unsigned int);
+
+               memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
+       }
+}
+
+static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+       if (old_desc->kstat_irqs == desc->kstat_irqs)
+               return;
+
+       kfree(old_desc->kstat_irqs);
+       old_desc->kstat_irqs = NULL;
+}
+
+static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
+                struct irq_desc *desc, int cpu)
+{
+       memcpy(desc, old_desc, sizeof(struct irq_desc));
+       desc->cpu = cpu;
+       lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+       init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+       arch_init_copy_chip_data(old_desc, desc, cpu);
+}
+
+static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+       free_kstat_irqs(old_desc, desc);
+       arch_free_chip_data(old_desc, desc);
+}
+
+static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
+                                               int cpu)
+{
+       struct irq_desc *desc;
+       unsigned int irq;
+       unsigned long flags;
+       int node;
+
+       irq = old_desc->irq;
+
+       spin_lock_irqsave(&sparse_irq_lock, flags);
+
+       /* We have to check it to avoid races with another CPU */
+       desc = irq_desc_ptrs[irq];
+
+       if (desc && old_desc != desc)
+                       goto out_unlock;
+
+       node = cpu_to_node(cpu);
+       desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
+       printk(KERN_DEBUG "  move irq_desc for %d to cpu %d node %d\n",
+                irq, cpu, node);
+       if (!desc) {
+               printk(KERN_ERR "can not get new irq_desc for moving\n");
+               /* still use old one */
+               desc = old_desc;
+               goto out_unlock;
+       }
+       init_copy_one_irq_desc(irq, old_desc, desc, cpu);
+
+       irq_desc_ptrs[irq] = desc;
+
+       /* free the old one */
+       free_one_irq_desc(old_desc, desc);
+       kfree(old_desc);
+
+out_unlock:
+       spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+       return desc;
+}
+
+struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
+{
+       int old_cpu;
+       int node, old_node;
+
+       /* those all static, do move them */
+       if (desc->irq < NR_IRQS_LEGACY)
+               return desc;
+
+       old_cpu = desc->cpu;
+       printk(KERN_DEBUG
+                "try to move irq_desc from cpu %d to %d\n", old_cpu, cpu);
+       if (old_cpu != cpu) {
+               node = cpu_to_node(cpu);
+               old_node = cpu_to_node(old_cpu);
+               if (old_node != node)
+                       desc = __real_move_irq_desc(desc, cpu);
+               else
+                       desc->cpu = cpu;
+       }
+
+       return desc;
+}
+