[IA64] - Avoid slow TLB purges on SGI Altix systems
[linux-2.6.git] / arch / ia64 / sn / kernel / sn2 / sn2_smp.c
1 /*
2  * SN2 Platform specific SMP Support
3  *
4  * This file is subject to the terms and conditions of the GNU General Public
5  * License.  See the file "COPYING" in the main directory of this archive
6  * for more details.
7  *
8  * Copyright (C) 2000-2005 Silicon Graphics, Inc. All rights reserved.
9  */
10
11 #include <linux/init.h>
12 #include <linux/kernel.h>
13 #include <linux/spinlock.h>
14 #include <linux/threads.h>
15 #include <linux/sched.h>
16 #include <linux/smp.h>
17 #include <linux/interrupt.h>
18 #include <linux/irq.h>
19 #include <linux/mmzone.h>
20 #include <linux/module.h>
21 #include <linux/bitops.h>
22 #include <linux/nodemask.h>
23 #include <linux/proc_fs.h>
24 #include <linux/seq_file.h>
25
26 #include <asm/processor.h>
27 #include <asm/irq.h>
28 #include <asm/sal.h>
29 #include <asm/system.h>
30 #include <asm/delay.h>
31 #include <asm/io.h>
32 #include <asm/smp.h>
33 #include <asm/tlb.h>
34 #include <asm/numa.h>
35 #include <asm/hw_irq.h>
36 #include <asm/current.h>
37 #include <asm/sn/sn_cpuid.h>
38 #include <asm/sn/sn_sal.h>
39 #include <asm/sn/addrs.h>
40 #include <asm/sn/shub_mmr.h>
41 #include <asm/sn/nodepda.h>
42 #include <asm/sn/rw_mmr.h>
43
/* Per-cpu TLB-purge statistics, published via /proc (see sn2_ptc_seq_show). */
DEFINE_PER_CPU(struct ptc_stats, ptcstats);
/*
 * NOTE(review): this DECLARE_PER_CPU is redundant -- the DEFINE_PER_CPU
 * directly above already declares and defines the variable in this
 * translation unit.  Harmless, but a candidate for removal.
 */
DECLARE_PER_CPU(struct ptc_stats, ptcstats);

/* Serializes MMR-based global TLB purges (and the shub 1.1 IPI workaround). */
static  __cacheline_aligned DEFINE_SPINLOCK(sn2_global_ptc_lock);

/* Forward declaration; definition follows sn2_global_tlb_purge() below. */
void sn2_ptc_deadlock_recovery(short *, short, int, volatile unsigned long *, unsigned long data0,
	volatile unsigned long *, unsigned long data1);
51
#ifdef DEBUG_PTC
/*
 * ptctest:
 *
 *	xyz - 3 digit hex number:
 *		x - Force PTC purges to use shub:
 *			0 - no force
 *			1 - force
 *		y - interrupt enable
 *			0 - disable interrupts
 *			1 - leave interrupts enabled
 *		z - type of lock:
 *			0 - global lock
 *			1 - node local lock
 *			2 - no lock
 *
 *	Note: on shub1, only ptctest == 0 is supported. Don't try other values!
 */

static unsigned int sn2_ptctest = 0;

/* Parse the "ptctest=" boot-command-line option into sn2_ptctest. */
static int __init ptc_test(char *str)
{
	get_option(&str, &sn2_ptctest);
	return 1;
}
__setup("ptctest=", ptc_test);

/*
 * Acquire the PTC lock variant selected by the low byte of sn2_ptctest
 * (the "y"/"z" digits documented above).  Returns the selected option so
 * the matching ptc_unlock() releases exactly what was taken.
 */
static inline int ptc_lock(unsigned long *flagp)
{
	unsigned long opt = sn2_ptctest & 255;

	switch (opt) {
	case 0x00:		/* global lock, interrupts disabled */
		spin_lock_irqsave(&sn2_global_ptc_lock, *flagp);
		break;
	case 0x01:		/* node-local lock, interrupts disabled */
		spin_lock_irqsave(&sn_nodepda->ptc_lock, *flagp);
		break;
	case 0x02:		/* no lock, interrupts disabled */
		local_irq_save(*flagp);
		break;
	case 0x10:		/* global lock, interrupts left enabled */
		spin_lock(&sn2_global_ptc_lock);
		break;
	case 0x11:		/* node-local lock, interrupts left enabled */
		spin_lock(&sn_nodepda->ptc_lock);
		break;
	case 0x12:		/* no locking at all */
		break;
	default:
		BUG();
	}
	return opt;
}

/* Release whatever ptc_lock() acquired; @opt is its return value. */
static inline void ptc_unlock(unsigned long flags, int opt)
{
	switch (opt) {
	case 0x00:
		spin_unlock_irqrestore(&sn2_global_ptc_lock, flags);
		break;
	case 0x01:
		spin_unlock_irqrestore(&sn_nodepda->ptc_lock, flags);
		break;
	case 0x02:
		local_irq_restore(flags);
		break;
	case 0x10:
		spin_unlock(&sn2_global_ptc_lock);
		break;
	case 0x11:
		spin_unlock(&sn_nodepda->ptc_lock);
		break;
	case 0x12:
		break;
	default:
		BUG();
	}
}
#else

#define sn2_ptctest	0

/* Production variant: always the global PTC lock with interrupts disabled. */
static inline int ptc_lock(unsigned long *flagp)
{
	spin_lock_irqsave(&sn2_global_ptc_lock, *flagp);
	return 0;
}

static inline void ptc_unlock(unsigned long flags, int opt)
{
	spin_unlock_irqrestore(&sn2_global_ptc_lock, flags);
}
#endif
147
/*
 * Per-cpu TLB-purge statistics.  Clock fields are raw itc ticks; they are
 * converted to nanoseconds when reported through /proc.
 */
struct ptc_stats {
	unsigned long ptc_l;			/* fast-path local ptc.l purges */
	unsigned long change_rid;		/* purges satisfied via context (RID) change */
	unsigned long shub_ptc_flushes;		/* global flushes issued via shub MMRs */
	unsigned long nodes_flushed;		/* total nodes targeted by those flushes */
	unsigned long deadlocks;		/* PTC deadlock recoveries triggered */
	unsigned long lock_itc_clocks;		/* itc ticks spent acquiring the ptc lock */
	unsigned long shub_itc_clocks;		/* itc ticks spent inside shub flushes */
	unsigned long shub_itc_clocks_max;	/* worst single-flush time, itc ticks */
};
158
159 static inline unsigned long wait_piowc(void)
160 {
161         volatile unsigned long *piows, zeroval;
162         unsigned long ws;
163
164         piows = pda->pio_write_status_addr;
165         zeroval = pda->pio_write_status_val;
166         do {
167                 cpu_relax();
168         } while (((ws = *piows) & SH_PIO_WRITE_STATUS_PENDING_WRITE_COUNT_MASK) != zeroval);
169         return ws;
170 }
171
172 void sn_tlb_migrate_finish(struct mm_struct *mm)
173 {
174         if (mm == current->mm)
175                 flush_tlb_mm(mm);
176 }
177
/**
 * sn2_global_tlb_purge - globally purge translation cache of virtual address range
 * @mm: mm_struct containing virtual address range
 * @start: start of virtual address range
 * @end: end of virtual address range
 * @nbits: specifies number of bytes to purge per instruction (num = 1<<(nbits & 0xfc))
 *
 * Purges the translation caches of all processors of the given virtual address
 * range.
 *
 * Note:
 *	- cpu_vm_mask is a bit mask that indicates which cpus have loaded the context.
 *	- cpu_vm_mask is converted into a nodemask of the nodes containing the
 *	  cpus in cpu_vm_mask.
 *	- if only one bit is set in cpu_vm_mask & it is the current cpu & the
 *	  process is purging its own virtual address range, then only the
 *	  local TLB needs to be flushed. This flushing can be done using
 *	  ptc.l. This is the common case & avoids the global spinlock.
 *	- if multiple cpus have loaded the context, then flushing has to be
 *	  done with ptc.g/MMRs under protection of the global ptc_lock.
 */

void
sn2_global_tlb_purge(struct mm_struct *mm, unsigned long start,
		     unsigned long end, unsigned long nbits)
{
	int i, opt, shub1, cnode, mynasid, cpu, lcpu = 0, nasid, flushed = 0;
	int mymm = (mm == current->active_mm);
	volatile unsigned long *ptc0, *ptc1;
	unsigned long itc, itc2, flags, data0 = 0, data1 = 0, rr_value;
	short nasids[MAX_NUMNODES], nix;
	nodemask_t nodes_flushed;

	/* Collect the set of nodes whose cpus have this context loaded. */
	nodes_clear(nodes_flushed);
	i = 0;

	for_each_cpu_mask(cpu, mm->cpu_vm_mask) {
		cnode = cpu_to_node(cpu);
		node_set(cnode, nodes_flushed);
		lcpu = cpu;
		i++;
	}

	if (i == 0)
		return;

	preempt_disable();

	/*
	 * Fast path: the context is loaded only on this cpu and we are
	 * purging our own address space - a local ptc.l loop suffices and
	 * the global lock is avoided entirely.
	 */
	if (likely(i == 1 && lcpu == smp_processor_id() && mymm)) {
		do {
			ia64_ptcl(start, nbits << 2);
			start += (1UL << nbits);
		} while (start < end);
		ia64_srlz_i();
		__get_cpu_var(ptcstats).ptc_l++;
		preempt_enable();
		return;
	}

	/*
	 * Single-user mm purged by its owner: cheaper to get a new region
	 * id (flush_tlb_mm) than to purge the range on every node.
	 */
	if (atomic_read(&mm->mm_users) == 1 && mymm) {
		flush_tlb_mm(mm);
		__get_cpu_var(ptcstats).change_rid++;
		preempt_enable();
		return;
	}

	/* NOTE(review): this itc read is a dead store - overwritten below before use. */
	itc = ia64_get_itc();
	nix = 0;
	for_each_node_mask(cnode, nodes_flushed)
		nasids[nix++] = cnodeid_to_nasid(cnode);

	rr_value = (mm->context << 3) | REGION_NUMBER(start);

	/* Build the shub-model-specific PTC command words and MMR addresses. */
	shub1 = is_shub1();
	if (shub1) {
		data0 = (1UL << SH1_PTC_0_A_SHFT) |
			(nbits << SH1_PTC_0_PS_SHFT) |
			(rr_value << SH1_PTC_0_RID_SHFT) |
			(1UL << SH1_PTC_0_START_SHFT);
		ptc0 = (long *)GLOBAL_MMR_PHYS_ADDR(0, SH1_PTC_0);
		ptc1 = (long *)GLOBAL_MMR_PHYS_ADDR(0, SH1_PTC_1);
	} else {
		data0 = (1UL << SH2_PTC_A_SHFT) |
			(nbits << SH2_PTC_PS_SHFT) |
			(1UL << SH2_PTC_START_SHFT);
		ptc0 = (long *)GLOBAL_MMR_PHYS_ADDR(0, SH2_PTC + 
			(rr_value << SH2_PTC_RID_SHFT));
		ptc1 = NULL;	/* shub2 takes the purge in a single MMR write */
	}
	

	mynasid = get_nasid();

	itc = ia64_get_itc();
	opt = ptc_lock(&flags);
	itc2 = ia64_get_itc();
	__get_cpu_var(ptcstats).lock_itc_clocks += itc2 - itc;
	__get_cpu_var(ptcstats).shub_ptc_flushes++;
	__get_cpu_var(ptcstats).nodes_flushed += nix;

	/* Walk the range, issuing one purge per page-size step per node. */
	do {
		if (shub1)
			data1 = start | (1UL << SH1_PTC_1_START_SHFT);
		else
			data0 = (data0 & ~SH2_PTC_ADDR_MASK) | (start & SH2_PTC_ADDR_MASK);
		for (i = 0; i < nix; i++) {
			nasid = nasids[i];
			/* The local nasid can use the cheaper ptc.ga instruction. */
			if ((!(sn2_ptctest & 3)) && unlikely(nasid == mynasid && mymm)) {
				ia64_ptcga(start, nbits << 2);
				ia64_srlz_i();
			} else {
				ptc0 = CHANGE_NASID(nasid, ptc0);
				if (ptc1)
					ptc1 = CHANGE_NASID(nasid, ptc1);
				pio_atomic_phys_write_mmrs(ptc0, data0, ptc1,
							   data1);
				flushed = 1;
			}
		}
		/* Drain the MMR writes; recover if the shub reports a PTC deadlock. */
		if (flushed
		    && (wait_piowc() &
				(SH_PIO_WRITE_STATUS_WRITE_DEADLOCK_MASK))) {
			sn2_ptc_deadlock_recovery(nasids, nix, mynasid, ptc0, data0, ptc1, data1);
		}

		start += (1UL << nbits);

	} while (start < end);

	itc2 = ia64_get_itc() - itc2;
	__get_cpu_var(ptcstats).shub_itc_clocks += itc2;
	if (itc2 > __get_cpu_var(ptcstats).shub_itc_clocks_max)
		__get_cpu_var(ptcstats).shub_itc_clocks_max = itc2;

	ptc_unlock(flags, opt);

	preempt_enable();
}
316
/*
 * sn2_ptc_deadlock_recovery
 *
 * Recover from PTC deadlock conditions. Recovery requires stepping thru each 
 * TLB flush transaction.  The recovery sequence is somewhat tricky & is
 * coded in assembly language.
 */
void sn2_ptc_deadlock_recovery(short *nasids, short nix, int mynasid, volatile unsigned long *ptc0, unsigned long data0,
	volatile unsigned long *ptc1, unsigned long data1)
{
	/* Assembly helper that replays one PTC transaction safely. */
	extern void sn2_ptc_deadlock_recovery_core(volatile unsigned long *, unsigned long,
		volatile unsigned long *, unsigned long, volatile unsigned long *, unsigned long);
	short nasid, i;
	unsigned long *piows, zeroval;

	__get_cpu_var(ptcstats).deadlocks++;

	piows = (unsigned long *) pda->pio_write_status_addr;
	zeroval = pda->pio_write_status_val;

	for (i=0; i < nix; i++) {
		nasid = nasids[i];
		/* The local nasid was purged with ptc.ga - nothing to replay. */
		if (!(sn2_ptctest & 3) && nasid == mynasid)
			continue;
		ptc0 = CHANGE_NASID(nasid, ptc0);
		if (ptc1)
			ptc1 = CHANGE_NASID(nasid, ptc1);
		sn2_ptc_deadlock_recovery_core(ptc0, data0, ptc1, data1, piows, zeroval);
	}

}
348
/**
 * sn_send_IPI_phys - send an IPI to a Nasid and slice
 * @nasid: nasid to receive the interrupt (may be outside partition)
 * @physid: physical cpuid to receive the interrupt.
 * @vector: command to send
 * @delivery_mode: delivery mechanism
 *
 * Sends an IPI (interprocessor interrupt) to the processor specified by
 * @physid
 *
 * @delivery_mode can be one of the following
 *
 * %IA64_IPI_DM_INT - pend an interrupt
 * %IA64_IPI_DM_PMI - pend a PMI
 * %IA64_IPI_DM_NMI - pend an NMI
 * %IA64_IPI_DM_INIT - pend an INIT interrupt
 */
void sn_send_IPI_phys(int nasid, long physid, int vector, int delivery_mode)
{
	long val;
	unsigned long flags = 0;
	volatile long *p;

	/* Compose the SH_IPI_INT MMR value: send bit, target pid, type, vector. */
	p = (long *)GLOBAL_MMR_PHYS_ADDR(nasid, SH_IPI_INT);
	val = (1UL << SH_IPI_INT_SEND_SHFT) |
	    (physid << SH_IPI_INT_PID_SHFT) |
	    ((long)delivery_mode << SH_IPI_INT_TYPE_SHFT) |
	    ((long)vector << SH_IPI_INT_IDX_SHFT) |
	    (0x000feeUL << SH_IPI_INT_BASE_SHFT);

	/* Order all prior stores before the IPI MMR write becomes visible. */
	mb();
	/* shub 1.1 workaround: serialize the IPI write against PTC traffic. */
	if (enable_shub_wars_1_1()) {
		spin_lock_irqsave(&sn2_global_ptc_lock, flags);
	}
	pio_phys_write_mmr(p, val);
	if (enable_shub_wars_1_1()) {
		wait_piowc();
		spin_unlock_irqrestore(&sn2_global_ptc_lock, flags);
	}

}

EXPORT_SYMBOL(sn_send_IPI_phys);
392
393 /**
394  * sn2_send_IPI - send an IPI to a processor
395  * @cpuid: target of the IPI
396  * @vector: command to send
397  * @delivery_mode: delivery mechanism
398  * @redirect: redirect the IPI?
399  *
400  * Sends an IPI (InterProcessor Interrupt) to the processor specified by
401  * @cpuid.  @vector specifies the command to send, while @delivery_mode can 
402  * be one of the following
403  *
404  * %IA64_IPI_DM_INT - pend an interrupt
405  * %IA64_IPI_DM_PMI - pend a PMI
406  * %IA64_IPI_DM_NMI - pend an NMI
407  * %IA64_IPI_DM_INIT - pend an INIT interrupt
408  */
409 void sn2_send_IPI(int cpuid, int vector, int delivery_mode, int redirect)
410 {
411         long physid;
412         int nasid;
413
414         physid = cpu_physical_id(cpuid);
415         nasid = cpuid_to_nasid(cpuid);
416
417         /* the following is used only when starting cpus at boot time */
418         if (unlikely(nasid == -1))
419                 ia64_sn_get_sapic_info(physid, &nasid, NULL, NULL);
420
421         sn_send_IPI_phys(nasid, physid, vector, delivery_mode);
422 }
423
#ifdef CONFIG_PROC_FS

/* /proc path under which the per-cpu ptc statistics are published. */
#define PTC_BASENAME	"sgi_sn/ptc_statistics"
427
428 static void *sn2_ptc_seq_start(struct seq_file *file, loff_t * offset)
429 {
430         if (*offset < NR_CPUS)
431                 return offset;
432         return NULL;
433 }
434
435 static void *sn2_ptc_seq_next(struct seq_file *file, void *data, loff_t * offset)
436 {
437         (*offset)++;
438         if (*offset < NR_CPUS)
439                 return offset;
440         return NULL;
441 }
442
/* seq_file stop: nothing was allocated in start/next, so nothing to release. */
static void sn2_ptc_seq_stop(struct seq_file *file, void *data)
{
}
446
447 static int sn2_ptc_seq_show(struct seq_file *file, void *data)
448 {
449         struct ptc_stats *stat;
450         int cpu;
451
452         cpu = *(loff_t *) data;
453
454         if (!cpu) {
455                 seq_printf(file, "# ptc_l change_rid shub_ptc_flushes shub_nodes_flushed deadlocks lock_nsec shub_nsec shub_nsec_max\n");
456                 seq_printf(file, "# ptctest %d\n", sn2_ptctest);
457         }
458
459         if (cpu < NR_CPUS && cpu_online(cpu)) {
460                 stat = &per_cpu(ptcstats, cpu);
461                 seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld\n", cpu, stat->ptc_l,
462                                 stat->change_rid, stat->shub_ptc_flushes, stat->nodes_flushed,
463                                 stat->deadlocks,
464                                 1000 * stat->lock_itc_clocks / per_cpu(cpu_info, cpu).cyc_per_usec,
465                                 1000 * stat->shub_itc_clocks / per_cpu(cpu_info, cpu).cyc_per_usec,
466                                 1000 * stat->shub_itc_clocks_max / per_cpu(cpu_info, cpu).cyc_per_usec);
467         }
468
469         return 0;
470 }
471
/* seq_file iterator that walks cpu numbers 0..NR_CPUS-1. */
static struct seq_operations sn2_ptc_seq_ops = {
	.start = sn2_ptc_seq_start,
	.next = sn2_ptc_seq_next,
	.stop = sn2_ptc_seq_stop,
	.show = sn2_ptc_seq_show
};
478
479 int sn2_ptc_proc_open(struct inode *inode, struct file *file)
480 {
481         return seq_open(file, &sn2_ptc_seq_ops);
482 }
483
/* File operations for the ptc statistics /proc entry (seq_file based). */
static struct file_operations proc_sn2_ptc_operations = {
	.open = sn2_ptc_proc_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release,
};

/* Handle for the created /proc entry; used only at init time. */
static struct proc_dir_entry *proc_sn2_ptc;
492
493 static int __init sn2_ptc_init(void)
494 {
495         if (!(proc_sn2_ptc = create_proc_entry(PTC_BASENAME, 0444, NULL))) {
496                 printk(KERN_ERR "unable to create %s proc entry", PTC_BASENAME);
497                 return -EINVAL;
498         }
499         proc_sn2_ptc->proc_fops = &proc_sn2_ptc_operations;
500         spin_lock_init(&sn2_global_ptc_lock);
501         return 0;
502 }
503
/* Remove the /proc entry created by sn2_ptc_init(). */
static void __exit sn2_ptc_exit(void)
{
	remove_proc_entry(PTC_BASENAME, NULL);
}

module_init(sn2_ptc_init);
module_exit(sn2_ptc_exit);
#endif /* CONFIG_PROC_FS */
512