drbd: Make sure tl_restart(, resend) can not get called multiple times for a new...
[linux-2.6.git] / drivers / block / drbd / drbd_main.c
1 /*
2    drbd.c
3
4    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10    Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11    from Logicworks, Inc. for making SDP replication support possible.
12
13    drbd is free software; you can redistribute it and/or modify
14    it under the terms of the GNU General Public License as published by
15    the Free Software Foundation; either version 2, or (at your option)
16    any later version.
17
18    drbd is distributed in the hope that it will be useful,
19    but WITHOUT ANY WARRANTY; without even the implied warranty of
20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21    GNU General Public License for more details.
22
23    You should have received a copy of the GNU General Public License
24    along with drbd; see the file COPYING.  If not, write to
25    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27  */
28
29 #include <linux/module.h>
30 #include <linux/drbd.h>
31 #include <asm/uaccess.h>
32 #include <asm/types.h>
33 #include <net/sock.h>
34 #include <linux/ctype.h>
35 #include <linux/smp_lock.h>
36 #include <linux/fs.h>
37 #include <linux/file.h>
38 #include <linux/proc_fs.h>
39 #include <linux/init.h>
40 #include <linux/mm.h>
41 #include <linux/memcontrol.h>
42 #include <linux/mm_inline.h>
43 #include <linux/slab.h>
44 #include <linux/random.h>
45 #include <linux/reboot.h>
46 #include <linux/notifier.h>
47 #include <linux/kthread.h>
48
49 #define __KERNEL_SYSCALLS__
50 #include <linux/unistd.h>
51 #include <linux/vmalloc.h>
52
53 #include <linux/drbd_limits.h>
54 #include "drbd_int.h"
55 #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57 #include "drbd_vli.h"
58
/* Work item that carries one state change: the state before (os), the state
 * after (ns), the flags the change was requested with, and an optional
 * completion.  NOTE(review): presumably queued for w_after_state_ch() and
 * "done" completed there -- the queuing code is not part of this excerpt. */
59 struct after_state_chg_work {
60         struct drbd_work w;              /* embedded generic work item */
61         union drbd_state os;             /* old state */
62         union drbd_state ns;             /* new state */
63         enum chg_state_flags flags;      /* flags of the state change request */
64         struct completion *done;         /* may be NULL -- TODO confirm against the user */
65 };
66
/* Functions taking a struct drbd_thread; presumably the thread main
 * functions of receiver, worker and asender -- defined outside this excerpt. */
67 int drbdd_init(struct drbd_thread *);
68 int drbd_worker(struct drbd_thread *);
69 int drbd_asender(struct drbd_thread *);
70
/* Forward declarations for functions referenced before their definition.
 * drbd_open() and drbd_release() are installed in drbd_ops below; the w_*()
 * functions share the (mdev, work, unused) signature. */
71 int drbd_init(void);
72 static int drbd_open(struct block_device *bdev, fmode_t mode);
73 static int drbd_release(struct gendisk *gd, fmode_t mode);
74 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
75 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
76                            union drbd_state ns, enum chg_state_flags flags);
77 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
78 static void md_sync_timer_fn(unsigned long data);
79 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
80
/* Module metadata.  REL_VERSION is pasted into the description string and
 * also used as the module version. */
81 MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
82               "Lars Ellenberg <lars@linbit.com>");
83 MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
84 MODULE_VERSION(REL_VERSION);
85 MODULE_LICENSE("GPL");
86 MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
87 MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
88
89 #include <linux/moduleparam.h>
90 /* allow_oos is short for allow_open_on_secondary */
91 MODULE_PARM_DESC(allow_oos, "DONT USE!");
92 /* thanks to these macros, if compiled into the kernel (not-module),
93  * this becomes the boot parameter drbd.minor_count
   *
   * The third argument is the permission mask of the parameter: 0444 is
   * read-only, 0644 is additionally writable by the owner, and 0 (allow_oos)
   * grants no permissions at all.
   * NOTE(review): disable_sendpage and allow_oos are registered as "bool"
   * here, but both variables are defined as plain "int" further down --
   * confirm the kernel version in use accepts that combination. */
94 module_param(minor_count, uint, 0444);
95 module_param(disable_sendpage, bool, 0644);
96 module_param(allow_oos, bool, 0);
97 module_param(cn_idx, uint, 0444);
98 module_param(proc_details, int, 0644);
99
/* Fault injection knobs; compiled in only with CONFIG_DRBD_FAULT_INJECTION.
 * fault_count is the only one with file-local (static) linkage. */
100 #ifdef CONFIG_DRBD_FAULT_INJECTION
101 int enable_faults;
102 int fault_rate;
103 static int fault_count;
104 int fault_devs;
105 /* bitmap of enabled faults */
106 module_param(enable_faults, int, 0664);
107 /* fault rate % value - applies to all enabled faults */
108 module_param(fault_rate, int, 0664);
109 /* count of faults inserted */
110 module_param(fault_count, int, 0664);
111 /* bitmap of devices to insert faults on.
    * NOTE(review): registered with mode 0644 while the three parameters above
    * use 0664 -- confirm the difference is intended. */
112 module_param(fault_devs, int, 0644);
113 #endif
114
115 /* module parameters, defined; they are registered with module_param() above */
116 unsigned int minor_count = 32;          /* default; description says 1-255 */
117 int disable_sendpage;                   /* zero-initialized: off by default */
118 int allow_oos;                          /* allow_open_on_secondary; off by default */
119 unsigned int cn_idx = CN_IDX_DRBD;
120 int proc_details;       /* Detail level in proc drbd*/
121
122 /* Module parameter for setting the user mode helper program
123  * to run. Default is /sbin/drbdadm.  The buffer holds at most 80 bytes
    * including the terminating NUL; sizeof(usermode_helper) is passed as the
    * length to module_param_string() below. */
124 char usermode_helper[80] = "/sbin/drbdadm";
125
126 module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
127
128 /* in 2.6.x, our device mapping and config info contains our virtual gendisks
129  * as member "struct gendisk *vdisk;"
    *
    * minor_table is an array of struct drbd_conf pointers.
    * NOTE(review): presumably indexed by device minor and sized by
    * minor_count -- the allocation is not part of this excerpt.
130  */
131 struct drbd_conf **minor_table;
132
    /* slab caches and mempools; set up outside this excerpt */
133 struct kmem_cache *drbd_request_cache;
134 struct kmem_cache *drbd_ee_cache;       /* epoch entries */
135 struct kmem_cache *drbd_bm_ext_cache;   /* bitmap extents */
136 struct kmem_cache *drbd_al_ext_cache;   /* activity log extents */
137 mempool_t *drbd_request_mempool;
138 mempool_t *drbd_ee_mempool;
139
140 /* I do not use a standard mempool, because:
141    1) I want to hand out the pre-allocated objects first.
142    2) I want to be able to interrupt sleeping allocation with a signal.
143    Note: This is a single linked list, the next pointer is the private
144          member of struct page.
145  */
146 struct page *drbd_pp_pool;
147 spinlock_t   drbd_pp_lock;
148 int          drbd_pp_vacant;
149 wait_queue_head_t drbd_pp_wait;
150
    /* rate limit state with an interval of 5 * HZ jiffies and a burst of 5 */
151 DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
152
153 static const struct block_device_operations drbd_ops = {
154         .owner =   THIS_MODULE,
155         .open =    drbd_open,
156         .release = drbd_release,
157 };
158
/* Number of elements of the array A.  Only meaningful for real arrays, not
 * for pointers (an array passed as a function argument has already decayed). */
#define ARRY_SIZE(A) (sizeof(A) / sizeof((A)[0]))
160
#ifdef __CHECKER__
/*
 * Out-of-line variant, built only when checking with sparse: as an inline
 * function this produces lots of false positives, as a real function sparse
 * copes with it.
 *
 * Takes a reference on the local disk (local_cnt) and keeps it only if the
 * disk state is at least @mins.  Returns 1 with the reference held, or 0
 * with the reference dropped again; dropping the last reference wakes up
 * waiters on misc_wait.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	atomic_inc(&mdev->local_cnt);

	if (mdev->state.disk >= mins)
		return 1;

	/* disk state too low: give the reference back */
	if (atomic_dec_and_test(&mdev->local_cnt))
		wake_up(&mdev->misc_wait);

	return 0;
}

#endif
179
180 /**
181  * DOC: The transfer log
182  *
183  * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
184  * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
185  * of the list. There is always at least one &struct drbd_tl_epoch object.
186  *
187  * Each &struct drbd_tl_epoch has a circular double linked list of requests
188  * attached.
189  */
190 static int tl_init(struct drbd_conf *mdev)
191 {
192         struct drbd_tl_epoch *b;
193
194         /* during device minor initialization, we may well use GFP_KERNEL */
195         b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
196         if (!b)
197                 return 0;
198         INIT_LIST_HEAD(&b->requests);
199         INIT_LIST_HEAD(&b->w.list);
200         b->next = NULL;
201         b->br_number = 4711;
202         b->n_writes = 0;
203         b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
204
205         mdev->oldest_tle = b;
206         mdev->newest_tle = b;
207         INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
208
209         mdev->tl_hash = NULL;
210         mdev->tl_hash_s = 0;
211
212         return 1;
213 }
214
215 static void tl_cleanup(struct drbd_conf *mdev)
216 {
217         D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
218         D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
219         kfree(mdev->oldest_tle);
220         mdev->oldest_tle = NULL;
221         kfree(mdev->unused_spare_tle);
222         mdev->unused_spare_tle = NULL;
223         kfree(mdev->tl_hash);
224         mdev->tl_hash = NULL;
225         mdev->tl_hash_s = 0;
226 }
227
228 /**
229  * _tl_add_barrier() - Adds a barrier to the transfer log
230  * @mdev:       DRBD device.
231  * @new:        Barrier to be added before the current head of the TL.
232  *
233  * The caller must hold the req_lock.
234  */
235 void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
236 {
237         struct drbd_tl_epoch *newest_before;
238
239         INIT_LIST_HEAD(&new->requests);
240         INIT_LIST_HEAD(&new->w.list);
241         new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
242         new->next = NULL;
243         new->n_writes = 0;
244
245         newest_before = mdev->newest_tle;
246         /* never send a barrier number == 0, because that is special-cased
247          * when using TCQ for our write ordering code */
248         new->br_number = (newest_before->br_number+1) ?: 1;
249         if (mdev->newest_tle != new) {
250                 mdev->newest_tle->next = new;
251                 mdev->newest_tle = new;
252         }
253 }
254
255 /**
256  * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
257  * @mdev:       DRBD device.
258  * @barrier_nr: Expected identifier of the DRBD write barrier packet.
259  * @set_size:   Expected number of requests before that barrier.
260  *
261  * In case the passed barrier_nr or set_size does not match the oldest
262  * &struct drbd_tl_epoch objects this function will cause a termination
263  * of the connection.
264  */
265 void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
266                        unsigned int set_size)
267 {
268         struct drbd_tl_epoch *b, *nob; /* next old barrier */
269         struct list_head *le, *tle;
270         struct drbd_request *r;
271
272         spin_lock_irq(&mdev->req_lock);
273
274         b = mdev->oldest_tle;
275
276         /* first some paranoia code */
277         if (b == NULL) {
278                 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
279                         barrier_nr);
280                 goto bail;
281         }
282         if (b->br_number != barrier_nr) {
283                 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
284                         barrier_nr, b->br_number);
285                 goto bail;
286         }
287         if (b->n_writes != set_size) {
288                 dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
289                         barrier_nr, set_size, b->n_writes);
290                 goto bail;
291         }
292
293         /* Clean up list of requests processed during current epoch */
294         list_for_each_safe(le, tle, &b->requests) {
295                 r = list_entry(le, struct drbd_request, tl_requests);
296                 _req_mod(r, barrier_acked);
297         }
298         /* There could be requests on the list waiting for completion
299            of the write to the local disk. To avoid corruptions of
300            slab's data structures we have to remove the lists head.
301
302            Also there could have been a barrier ack out of sequence, overtaking
303            the write acks - which would be a bug and violating write ordering.
304            To not deadlock in case we lose connection while such requests are
305            still pending, we need some way to find them for the
306            _req_mode(connection_lost_while_pending).
307
308            These have been list_move'd to the out_of_sequence_requests list in
309            _req_mod(, barrier_acked) above.
310            */
311         list_del_init(&b->requests);
312
313         nob = b->next;
314         if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
315                 _tl_add_barrier(mdev, b);
316                 if (nob)
317                         mdev->oldest_tle = nob;
318                 /* if nob == NULL b was the only barrier, and becomes the new
319                    barrier. Therefore mdev->oldest_tle points already to b */
320         } else {
321                 D_ASSERT(nob != NULL);
322                 mdev->oldest_tle = nob;
323                 kfree(b);
324         }
325
326         spin_unlock_irq(&mdev->req_lock);
327         dec_ap_pending(mdev);
328
329         return;
330
331 bail:
332         spin_unlock_irq(&mdev->req_lock);
333         drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
334 }
335
336 /**
337  * _tl_restart() - Walks the transfer log, and applies an action to all requests
338  * @mdev:       DRBD device.
339  * @what:       The action/event to perform with all request objects
340  *
341  * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
342  * restart_frozen_disk_io.
343  */
344 static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
345 {
346         struct drbd_tl_epoch *b, *tmp, **pn;
347         struct list_head *le, *tle, carry_reads;
348         struct drbd_request *req;
349         int rv, n_writes, n_reads;
350
351         b = mdev->oldest_tle;
352         pn = &mdev->oldest_tle;
353         while (b) {
354                 n_writes = 0;
355                 n_reads = 0;
356                 INIT_LIST_HEAD(&carry_reads);
357                 list_for_each_safe(le, tle, &b->requests) {
358                         req = list_entry(le, struct drbd_request, tl_requests);
359                         rv = _req_mod(req, what);
360
361                         n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
362                         n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
363                 }
364                 tmp = b->next;
365
366                 if (n_writes) {
367                         if (what == resend) {
368                                 b->n_writes = n_writes;
369                                 if (b->w.cb == NULL) {
370                                         b->w.cb = w_send_barrier;
371                                         inc_ap_pending(mdev);
372                                         set_bit(CREATE_BARRIER, &mdev->flags);
373                                 }
374
375                                 drbd_queue_work(&mdev->data.work, &b->w);
376                         }
377                         pn = &b->next;
378                 } else {
379                         if (n_reads)
380                                 list_add(&carry_reads, &b->requests);
381                         /* there could still be requests on that ring list,
382                          * in case local io is still pending */
383                         list_del(&b->requests);
384
385                         /* dec_ap_pending corresponding to queue_barrier.
386                          * the newest barrier may not have been queued yet,
387                          * in which case w.cb is still NULL. */
388                         if (b->w.cb != NULL)
389                                 dec_ap_pending(mdev);
390
391                         if (b == mdev->newest_tle) {
392                                 /* recycle, but reinit! */
393                                 D_ASSERT(tmp == NULL);
394                                 INIT_LIST_HEAD(&b->requests);
395                                 list_splice(&carry_reads, &b->requests);
396                                 INIT_LIST_HEAD(&b->w.list);
397                                 b->w.cb = NULL;
398                                 b->br_number = net_random();
399                                 b->n_writes = 0;
400
401                                 *pn = b;
402                                 break;
403                         }
404                         *pn = tmp;
405                         kfree(b);
406                 }
407                 b = tmp;
408                 list_splice(&carry_reads, &b->requests);
409         }
410 }
411
412
413 /**
414  * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
415  * @mdev:       DRBD device.
416  *
417  * This is called after the connection to the peer was lost. The storage covered
418  * by the requests on the transfer gets marked as our of sync. Called from the
419  * receiver thread and the worker thread.
420  */
421 void tl_clear(struct drbd_conf *mdev)
422 {
423         struct list_head *le, *tle;
424         struct drbd_request *r;
425
426         spin_lock_irq(&mdev->req_lock);
427
428         _tl_restart(mdev, connection_lost_while_pending);
429
430         /* we expect this list to be empty. */
431         D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
432
433         /* but just in case, clean it up anyways! */
434         list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
435                 r = list_entry(le, struct drbd_request, tl_requests);
436                 /* It would be nice to complete outside of spinlock.
437                  * But this is easier for now. */
438                 _req_mod(r, connection_lost_while_pending);
439         }
440
441         /* ensure bit indicating barrier is required is clear */
442         clear_bit(CREATE_BARRIER, &mdev->flags);
443
444         memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
445
446         spin_unlock_irq(&mdev->req_lock);
447 }
448
449 void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
450 {
451         spin_lock_irq(&mdev->req_lock);
452         _tl_restart(mdev, what);
453         spin_unlock_irq(&mdev->req_lock);
454 }
455
456 /**
457  * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
458  * @mdev:       DRBD device.
459  * @os:         old (current) state.
460  * @ns:         new (wanted) state.
461  */
462 static int cl_wide_st_chg(struct drbd_conf *mdev,
463                           union drbd_state os, union drbd_state ns)
464 {
465         return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
466                  ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
467                   (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
468                   (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
469                   (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
470                 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
471                 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
472 }
473
474 int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
475                       union drbd_state mask, union drbd_state val)
476 {
477         unsigned long flags;
478         union drbd_state os, ns;
479         int rv;
480
481         spin_lock_irqsave(&mdev->req_lock, flags);
482         os = mdev->state;
483         ns.i = (os.i & ~mask.i) | val.i;
484         rv = _drbd_set_state(mdev, ns, f, NULL);
485         ns = mdev->state;
486         spin_unlock_irqrestore(&mdev->req_lock, flags);
487
488         return rv;
489 }
490
491 /**
492  * drbd_force_state() - Impose a change which happens outside our control on our state
493  * @mdev:       DRBD device.
494  * @mask:       mask of state bits to change.
495  * @val:        value of new state bits.
496  */
497 void drbd_force_state(struct drbd_conf *mdev,
498         union drbd_state mask, union drbd_state val)
499 {
500         drbd_change_state(mdev, CS_HARD, mask, val);
501 }
502
/* Forward declarations: the state validation helpers are used by
 * _req_st_cond() and drbd_req_state() before they are defined further down.
 * Note the argument order: is_valid_state_transition() takes (ns, os),
 * sanitize_state() takes (os, ns). */
503 static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
504 static int is_valid_state_transition(struct drbd_conf *,
505                                      union drbd_state, union drbd_state);
506 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
507                                        union drbd_state ns, int *warn_sync_abort);
    /* takes (mdev, mask, val); defined outside this excerpt */
508 int drbd_send_state_req(struct drbd_conf *,
509                         union drbd_state, union drbd_state);
510
511 static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
512                                     union drbd_state mask, union drbd_state val)
513 {
514         union drbd_state os, ns;
515         unsigned long flags;
516         int rv;
517
518         if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
519                 return SS_CW_SUCCESS;
520
521         if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
522                 return SS_CW_FAILED_BY_PEER;
523
524         rv = 0;
525         spin_lock_irqsave(&mdev->req_lock, flags);
526         os = mdev->state;
527         ns.i = (os.i & ~mask.i) | val.i;
528         ns = sanitize_state(mdev, os, ns, NULL);
529
530         if (!cl_wide_st_chg(mdev, os, ns))
531                 rv = SS_CW_NO_NEED;
532         if (!rv) {
533                 rv = is_valid_state(mdev, ns);
534                 if (rv == SS_SUCCESS) {
535                         rv = is_valid_state_transition(mdev, ns, os);
536                         if (rv == SS_SUCCESS)
537                                 rv = 0; /* cont waiting, otherwise fail. */
538                 }
539         }
540         spin_unlock_irqrestore(&mdev->req_lock, flags);
541
542         return rv;
543 }
544
545 /**
546  * drbd_req_state() - Perform an eventually cluster wide state change
547  * @mdev:       DRBD device.
548  * @mask:       mask of state bits to change.
549  * @val:        value of new state bits.
550  * @f:          flags
551  *
552  * Should not be called directly, use drbd_request_state() or
553  * _drbd_request_state().
    *
    * A change that cl_wide_st_chg() classifies as cluster wide is validated
    * locally, sent to the peer with drbd_send_state_req(), and applied only
    * after _req_st_cond() reported an outcome.  Any other change is applied
    * right away.  With CS_SERIALIZE the state_mutex is held for the whole
    * call; with CS_WAIT_COMPLETE a successful call waits for the "done"
    * completion handed to _drbd_set_state().
    *
    * Returns the resulting state change code; the code below treats values
    * smaller than SS_SUCCESS as failures.
554  */
555 static int drbd_req_state(struct drbd_conf *mdev,
556                           union drbd_state mask, union drbd_state val,
557                           enum chg_state_flags f)
558 {
559         struct completion done;
560         unsigned long flags;
561         union drbd_state os, ns;
562         int rv;
563
564         init_completion(&done);
565
566         if (f & CS_SERIALIZE)
567                 mutex_lock(&mdev->state_mutex);
568
569         spin_lock_irqsave(&mdev->req_lock, flags);
570         os = mdev->state;
571         ns.i = (os.i & ~mask.i) | val.i;
572         ns = sanitize_state(mdev, os, ns, NULL);
573
574         if (cl_wide_st_chg(mdev, os, ns)) {
                    /* validate locally before asking the peer */
575                 rv = is_valid_state(mdev, ns);
576                 if (rv == SS_SUCCESS)
577                         rv = is_valid_state_transition(mdev, ns, os);
                    /* the req_lock is dropped for the exchange with the peer */
578                 spin_unlock_irqrestore(&mdev->req_lock, flags);
579
580                 if (rv < SS_SUCCESS) {
581                         if (f & CS_VERBOSE)
582                                 print_st_err(mdev, os, ns, rv);
583                         goto abort;
584                 }
585
586                 drbd_state_lock(mdev);
587                 if (!drbd_send_state_req(mdev, mask, val)) {
588                         drbd_state_unlock(mdev);
589                         rv = SS_CW_FAILED_BY_PEER;
590                         if (f & CS_VERBOSE)
591                                 print_st_err(mdev, os, ns, rv);
592                         goto abort;
593                 }
594
                    /* _req_st_cond() returns 0 for as long as we have to wait */
595                 wait_event(mdev->state_wait,
596                         (rv = _req_st_cond(mdev, mask, val)));
597
598                 if (rv < SS_SUCCESS) {
599                         drbd_state_unlock(mdev);
600                         if (f & CS_VERBOSE)
601                                 print_st_err(mdev, os, ns, rv);
602                         goto abort;
603                 }
                    /* the state may have changed while the lock was dropped:
                     * rebuild ns from the current state before applying it */
604                 spin_lock_irqsave(&mdev->req_lock, flags);
605                 os = mdev->state;
606                 ns.i = (os.i & ~mask.i) | val.i;
607                 rv = _drbd_set_state(mdev, ns, f, &done);
608                 drbd_state_unlock(mdev);
609         } else {
                    /* not cluster wide: apply directly, req_lock is still held */
610                 rv = _drbd_set_state(mdev, ns, f, &done);
611         }
612
            /* both branches above arrive here with the req_lock held */
613         spin_unlock_irqrestore(&mdev->req_lock, flags);
614
615         if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
                    /* NOTE(review): presumably the worker is what completes
                     * "done", so waiting here from the worker itself would
                     * never return -- confirm against the completing side */
616                 D_ASSERT(current != mdev->worker.task);
617                 wait_for_completion(&done);
618         }
619
    /* every "goto abort" above happens with the req_lock already released */
620 abort:
621         if (f & CS_SERIALIZE)
622                 mutex_unlock(&mdev->state_mutex);
623
624         return rv;
625 }
626
627 /**
628  * _drbd_request_state() - Request a state change (with flags)
629  * @mdev:       DRBD device.
630  * @mask:       mask of state bits to change.
631  * @val:        value of new state bits.
632  * @f:          flags
633  *
634  * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
635  * flag, or when logging of failed state change requests is not desired.
636  */
637 int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
638                         union drbd_state val,   enum chg_state_flags f)
639 {
640         int rv;
641
642         wait_event(mdev->state_wait,
643                    (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
644
645         return rv;
646 }
647
648 static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
649 {
650         dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
651             name,
652             drbd_conn_str(ns.conn),
653             drbd_role_str(ns.role),
654             drbd_role_str(ns.peer),
655             drbd_disk_str(ns.disk),
656             drbd_disk_str(ns.pdsk),
657             ns.susp ? 's' : 'r',
658             ns.aftr_isp ? 'a' : '-',
659             ns.peer_isp ? 'p' : '-',
660             ns.user_isp ? 'u' : '-'
661             );
662 }
663
664 void print_st_err(struct drbd_conf *mdev,
665         union drbd_state os, union drbd_state ns, int err)
666 {
667         if (err == SS_IN_TRANSIENT_STATE)
668                 return;
669         dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
670         print_st(mdev, " state", os);
671         print_st(mdev, "wanted", ns);
672 }
673
674
/* Helpers for the PSC() macro below, which pastes the name of a state field
 * into drbd_<field>_str().  The peer role and the peer disk reuse the role
 * and disk string functions; the flag fields print as "1" or "0". */
675 #define drbd_peer_str drbd_role_str
676 #define drbd_pdsk_str drbd_disk_str
677
678 #define drbd_susp_str(A)     ((A) ? "1" : "0")
679 #define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
680 #define drbd_peer_isp_str(A) ((A) ? "1" : "0")
681 #define drbd_user_isp_str(A) ((A) ? "1" : "0")
682
/* PSC(field): if the field differs between os and ns, append
 * "field( old -> new ) " at pbp and advance pbp.  Relies on variables named
 * os, ns and pbp being in scope at the place of use; uses an unbounded
 * sprintf(), so the caller has to provide a large enough buffer. */
683 #define PSC(A) \
684         ({ if (ns.A != os.A) { \
685                 pbp += sprintf(pbp, #A "( %s -> %s ) ", \
686                               drbd_##A##_str(os.A), \
687                               drbd_##A##_str(ns.A)); \
688         } })
689
690 /**
691  * is_valid_state() - Returns an SS_ error code if ns is not valid
692  * @mdev:       DRBD device.
693  * @ns:         State to consider.
694  */
695 static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
696 {
697         /* See drbd_state_sw_errors in drbd_strings.c */
698
699         enum drbd_fencing_p fp;
700         int rv = SS_SUCCESS;
701
702         fp = FP_DONT_CARE;
703         if (get_ldev(mdev)) {
704                 fp = mdev->ldev->dc.fencing;
705                 put_ldev(mdev);
706         }
707
708         if (get_net_conf(mdev)) {
709                 if (!mdev->net_conf->two_primaries &&
710                     ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
711                         rv = SS_TWO_PRIMARIES;
712                 put_net_conf(mdev);
713         }
714
715         if (rv <= 0)
716                 /* already found a reason to abort */;
717         else if (ns.role == R_SECONDARY && mdev->open_cnt)
718                 rv = SS_DEVICE_IN_USE;
719
720         else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
721                 rv = SS_NO_UP_TO_DATE_DISK;
722
723         else if (fp >= FP_RESOURCE &&
724                  ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
725                 rv = SS_PRIMARY_NOP;
726
727         else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
728                 rv = SS_NO_UP_TO_DATE_DISK;
729
730         else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
731                 rv = SS_NO_LOCAL_DISK;
732
733         else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
734                 rv = SS_NO_REMOTE_DISK;
735
736         else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
737                 rv = SS_NO_UP_TO_DATE_DISK;
738
739         else if ((ns.conn == C_CONNECTED ||
740                   ns.conn == C_WF_BITMAP_S ||
741                   ns.conn == C_SYNC_SOURCE ||
742                   ns.conn == C_PAUSED_SYNC_S) &&
743                   ns.disk == D_OUTDATED)
744                 rv = SS_CONNECTED_OUTDATES;
745
746         else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
747                  (mdev->sync_conf.verify_alg[0] == 0))
748                 rv = SS_NO_VERIFY_ALG;
749
750         else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
751                   mdev->agreed_pro_version < 88)
752                 rv = SS_NOT_SUPPORTED;
753
754         return rv;
755 }
756
757 /**
758  * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
759  * @mdev:       DRBD device.
760  * @ns:         new state.
761  * @os:         old state.
762  */
763 static int is_valid_state_transition(struct drbd_conf *mdev,
764                                      union drbd_state ns, union drbd_state os)
765 {
766         int rv = SS_SUCCESS;
767
768         if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
769             os.conn > C_CONNECTED)
770                 rv = SS_RESYNC_RUNNING;
771
772         if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
773                 rv = SS_ALREADY_STANDALONE;
774
775         if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
776                 rv = SS_IS_DISKLESS;
777
778         if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
779                 rv = SS_NO_NET_CONFIG;
780
781         if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
782                 rv = SS_LOWER_THAN_OUTDATED;
783
784         if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
785                 rv = SS_IN_TRANSIENT_STATE;
786
787         if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
788                 rv = SS_IN_TRANSIENT_STATE;
789
790         if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
791                 rv = SS_NEED_CONNECTION;
792
793         if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
794             ns.conn != os.conn && os.conn > C_CONNECTED)
795                 rv = SS_RESYNC_RUNNING;
796
797         if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
798             os.conn < C_CONNECTED)
799                 rv = SS_NEED_CONNECTION;
800
801         return rv;
802 }
803
804 /**
805  * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
806  * @mdev:       DRBD device.
807  * @os:         old state.
808  * @ns:         new state.
809  * @warn_sync_abort:
810  *
811  * When we lose connection, we have to set the state of the peer's disk (pdsk)
812  * to D_UNKNOWN. This rule and many more along those lines are in this function.
813  */
814 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
815                                        union drbd_state ns, int *warn_sync_abort)
816 {
817         enum drbd_fencing_p fp;
818
819         fp = FP_DONT_CARE;
820         if (get_ldev(mdev)) {
821                 fp = mdev->ldev->dc.fencing;
822                 put_ldev(mdev);
823         }
824
825         /* Disallow Network errors to configure a device's network part */
826         if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
827             os.conn <= C_DISCONNECTING)
828                 ns.conn = os.conn;
829
830         /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */
831         if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
832             ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING)
833                 ns.conn = os.conn;
834
835         /* After C_DISCONNECTING only C_STANDALONE may follow */
836         if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
837                 ns.conn = os.conn;
838
839         if (ns.conn < C_CONNECTED) {
840                 ns.peer_isp = 0;
841                 ns.peer = R_UNKNOWN;
842                 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
843                         ns.pdsk = D_UNKNOWN;
844         }
845
846         /* Clear the aftr_isp when becoming unconfigured */
847         if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
848                 ns.aftr_isp = 0;
849
850         /* Abort resync if a disk fails/detaches */
851         if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
852             (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
853                 if (warn_sync_abort)
854                         *warn_sync_abort = 1;
855                 ns.conn = C_CONNECTED;
856         }
857
858         if (ns.conn >= C_CONNECTED &&
859             ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) ||
860              (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) {
861                 switch (ns.conn) {
862                 case C_WF_BITMAP_T:
863                 case C_PAUSED_SYNC_T:
864                         ns.disk = D_OUTDATED;
865                         break;
866                 case C_CONNECTED:
867                 case C_WF_BITMAP_S:
868                 case C_SYNC_SOURCE:
869                 case C_PAUSED_SYNC_S:
870                         ns.disk = D_UP_TO_DATE;
871                         break;
872                 case C_SYNC_TARGET:
873                         ns.disk = D_INCONSISTENT;
874                         dev_warn(DEV, "Implicitly set disk state Inconsistent!\n");
875                         break;
876                 }
877                 if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE)
878                         dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n");
879         }
880
881         if (ns.conn >= C_CONNECTED &&
882             (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) {
883                 switch (ns.conn) {
884                 case C_CONNECTED:
885                 case C_WF_BITMAP_T:
886                 case C_PAUSED_SYNC_T:
887                 case C_SYNC_TARGET:
888                         ns.pdsk = D_UP_TO_DATE;
889                         break;
890                 case C_WF_BITMAP_S:
891                 case C_PAUSED_SYNC_S:
892                         /* remap any consistent state to D_OUTDATED,
893                          * but disallow "upgrade" of not even consistent states.
894                          */
895                         ns.pdsk =
896                                 (D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED)
897                                 ? os.pdsk : D_OUTDATED;
898                         break;
899                 case C_SYNC_SOURCE:
900                         ns.pdsk = D_INCONSISTENT;
901                         dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n");
902                         break;
903                 }
904                 if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE)
905                         dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n");
906         }
907
908         /* Connection breaks down before we finished "Negotiating" */
909         if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
910             get_ldev_if_state(mdev, D_NEGOTIATING)) {
911                 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
912                         ns.disk = mdev->new_state_tmp.disk;
913                         ns.pdsk = mdev->new_state_tmp.pdsk;
914                 } else {
915                         dev_alert(DEV, "Connection lost while negotiating, no data!\n");
916                         ns.disk = D_DISKLESS;
917                         ns.pdsk = D_UNKNOWN;
918                 }
919                 put_ldev(mdev);
920         }
921
922         if (fp == FP_STONITH &&
923             (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
924             !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
925                 ns.susp = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
926
927         if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
928             (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
929             !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
930                 ns.susp = 1; /* Suspend IO while no data available (no accessible data available) */
931
932         if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
933                 if (ns.conn == C_SYNC_SOURCE)
934                         ns.conn = C_PAUSED_SYNC_S;
935                 if (ns.conn == C_SYNC_TARGET)
936                         ns.conn = C_PAUSED_SYNC_T;
937         } else {
938                 if (ns.conn == C_PAUSED_SYNC_S)
939                         ns.conn = C_SYNC_SOURCE;
940                 if (ns.conn == C_PAUSED_SYNC_T)
941                         ns.conn = C_SYNC_TARGET;
942         }
943
944         return ns;
945 }
946
947 /* helper for __drbd_set_state */
948 static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
949 {
950         if (cs == C_VERIFY_T) {
951                 /* starting online verify from an arbitrary position
952                  * does not fit well into the existing protocol.
953                  * on C_VERIFY_T, we initialize ov_left and friends
954                  * implicitly in receive_DataRequest once the
955                  * first P_OV_REQUEST is received */
956                 mdev->ov_start_sector = ~(sector_t)0;
957         } else {
958                 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
959                 if (bit >= mdev->rs_total)
960                         mdev->ov_start_sector =
961                                 BM_BIT_TO_SECT(mdev->rs_total - 1);
962                 mdev->ov_position = mdev->ov_start_sector;
963         }
964 }
965
966 /**
967  * __drbd_set_state() - Set a new DRBD state
968  * @mdev:       DRBD device.
969  * @ns:         new state.
970  * @flags:      Flags
971  * @done:       Optional completion, that will get completed after the after_state_ch() finished
972  *
973  * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
974  */
975 int __drbd_set_state(struct drbd_conf *mdev,
976                     union drbd_state ns, enum chg_state_flags flags,
977                     struct completion *done)
978 {
979         union drbd_state os;
980         int rv = SS_SUCCESS;
981         int warn_sync_abort = 0;
982         struct after_state_chg_work *ascw;
983
984         os = mdev->state;
985
986         ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
987
988         if (ns.i == os.i)
989                 return SS_NOTHING_TO_DO;
990
991         if (!(flags & CS_HARD)) {
992                 /*  pre-state-change checks ; only look at ns  */
993                 /* See drbd_state_sw_errors in drbd_strings.c */
994
995                 rv = is_valid_state(mdev, ns);
996                 if (rv < SS_SUCCESS) {
997                         /* If the old state was illegal as well, then let
998                            this happen...*/
999
1000                         if (is_valid_state(mdev, os) == rv)
1001                                 rv = is_valid_state_transition(mdev, ns, os);
1002                 } else
1003                         rv = is_valid_state_transition(mdev, ns, os);
1004         }
1005
1006         if (rv < SS_SUCCESS) {
1007                 if (flags & CS_VERBOSE)
1008                         print_st_err(mdev, os, ns, rv);
1009                 return rv;
1010         }
1011
1012         if (warn_sync_abort)
1013                 dev_warn(DEV, "Resync aborted.\n");
1014
1015         {
1016                 char *pbp, pb[300];
1017                 pbp = pb;
1018                 *pbp = 0;
1019                 PSC(role);
1020                 PSC(peer);
1021                 PSC(conn);
1022                 PSC(disk);
1023                 PSC(pdsk);
1024                 PSC(susp);
1025                 PSC(aftr_isp);
1026                 PSC(peer_isp);
1027                 PSC(user_isp);
1028                 dev_info(DEV, "%s\n", pb);
1029         }
1030
1031         /* solve the race between becoming unconfigured,
1032          * worker doing the cleanup, and
1033          * admin reconfiguring us:
1034          * on (re)configure, first set CONFIG_PENDING,
1035          * then wait for a potentially exiting worker,
1036          * start the worker, and schedule one no_op.
1037          * then proceed with configuration.
1038          */
1039         if (ns.disk == D_DISKLESS &&
1040             ns.conn == C_STANDALONE &&
1041             ns.role == R_SECONDARY &&
1042             !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1043                 set_bit(DEVICE_DYING, &mdev->flags);
1044
1045         mdev->state.i = ns.i;
1046         wake_up(&mdev->misc_wait);
1047         wake_up(&mdev->state_wait);
1048
1049         /*   post-state-change actions   */
1050         if (os.conn >= C_SYNC_SOURCE   && ns.conn <= C_CONNECTED) {
1051                 set_bit(STOP_SYNC_TIMER, &mdev->flags);
1052                 mod_timer(&mdev->resync_timer, jiffies);
1053         }
1054
1055         /* aborted verify run. log the last position */
1056         if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1057             ns.conn < C_CONNECTED) {
1058                 mdev->ov_start_sector =
1059                         BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left);
1060                 dev_info(DEV, "Online Verify reached sector %llu\n",
1061                         (unsigned long long)mdev->ov_start_sector);
1062         }
1063
1064         if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1065             (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
1066                 dev_info(DEV, "Syncer continues.\n");
1067                 mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time;
1068                 if (ns.conn == C_SYNC_TARGET) {
1069                         if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))
1070                                 mod_timer(&mdev->resync_timer, jiffies);
1071                         /* This if (!test_bit) is only needed for the case
1072                            that a device that has ceased to used its timer,
1073                            i.e. it is already in drbd_resync_finished() gets
1074                            paused and resumed. */
1075                 }
1076         }
1077
1078         if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
1079             (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1080                 dev_info(DEV, "Resync suspended\n");
1081                 mdev->rs_mark_time = jiffies;
1082                 if (ns.conn == C_PAUSED_SYNC_T)
1083                         set_bit(STOP_SYNC_TIMER, &mdev->flags);
1084         }
1085
1086         if (os.conn == C_CONNECTED &&
1087             (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1088                 mdev->ov_position = 0;
1089                 mdev->rs_total =
1090                 mdev->rs_mark_left = drbd_bm_bits(mdev);
1091                 if (mdev->agreed_pro_version >= 90)
1092                         set_ov_position(mdev, ns.conn);
1093                 else
1094                         mdev->ov_start_sector = 0;
1095                 mdev->ov_left = mdev->rs_total
1096                               - BM_SECT_TO_BIT(mdev->ov_position);
1097                 mdev->rs_start     =
1098                 mdev->rs_mark_time = jiffies;
1099                 mdev->ov_last_oos_size = 0;
1100                 mdev->ov_last_oos_start = 0;
1101
1102                 if (ns.conn == C_VERIFY_S) {
1103                         dev_info(DEV, "Starting Online Verify from sector %llu\n",
1104                                         (unsigned long long)mdev->ov_position);
1105                         mod_timer(&mdev->resync_timer, jiffies);
1106                 }
1107         }
1108
1109         if (get_ldev(mdev)) {
1110                 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1111                                                  MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1112                                                  MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1113
1114                 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1115                         mdf |= MDF_CRASHED_PRIMARY;
1116                 if (mdev->state.role == R_PRIMARY ||
1117                     (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1118                         mdf |= MDF_PRIMARY_IND;
1119                 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1120                         mdf |= MDF_CONNECTED_IND;
1121                 if (mdev->state.disk > D_INCONSISTENT)
1122                         mdf |= MDF_CONSISTENT;
1123                 if (mdev->state.disk > D_OUTDATED)
1124                         mdf |= MDF_WAS_UP_TO_DATE;
1125                 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1126                         mdf |= MDF_PEER_OUT_DATED;
1127                 if (mdf != mdev->ldev->md.flags) {
1128                         mdev->ldev->md.flags = mdf;
1129                         drbd_md_mark_dirty(mdev);
1130                 }
1131                 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1132                         drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1133                 put_ldev(mdev);
1134         }
1135
1136         /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1137         if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1138             os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1139                 set_bit(CONSIDER_RESYNC, &mdev->flags);
1140
1141         /* Receiver should clean up itself */
1142         if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1143                 drbd_thread_stop_nowait(&mdev->receiver);
1144
1145         /* Now the receiver finished cleaning up itself, it should die */
1146         if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1147                 drbd_thread_stop_nowait(&mdev->receiver);
1148
1149         /* Upon network failure, we need to restart the receiver. */
1150         if (os.conn > C_TEAR_DOWN &&
1151             ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1152                 drbd_thread_restart_nowait(&mdev->receiver);
1153
1154         ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1155         if (ascw) {
1156                 ascw->os = os;
1157                 ascw->ns = ns;
1158                 ascw->flags = flags;
1159                 ascw->w.cb = w_after_state_ch;
1160                 ascw->done = done;
1161                 drbd_queue_work(&mdev->data.work, &ascw->w);
1162         } else {
1163                 dev_warn(DEV, "Could not kmalloc an ascw\n");
1164         }
1165
1166         return rv;
1167 }
1168
1169 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1170 {
1171         struct after_state_chg_work *ascw =
1172                 container_of(w, struct after_state_chg_work, w);
1173         after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1174         if (ascw->flags & CS_WAIT_COMPLETE) {
1175                 D_ASSERT(ascw->done != NULL);
1176                 complete(ascw->done);
1177         }
1178         kfree(ascw);
1179
1180         return 1;
1181 }
1182
1183 static void abw_start_sync(struct drbd_conf *mdev, int rv)
1184 {
1185         if (rv) {
1186                 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1187                 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1188                 return;
1189         }
1190
1191         switch (mdev->state.conn) {
1192         case C_STARTING_SYNC_T:
1193                 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1194                 break;
1195         case C_STARTING_SYNC_S:
1196                 drbd_start_resync(mdev, C_SYNC_SOURCE);
1197                 break;
1198         }
1199 }
1200
1201 /**
1202  * after_state_ch() - Perform after state change actions that may sleep
1203  * @mdev:       DRBD device.
1204  * @os:         old state.
1205  * @ns:         new state.
1206  * @flags:      Flags
1207  */
1208 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1209                            union drbd_state ns, enum chg_state_flags flags)
1210 {
1211         enum drbd_fencing_p fp;
1212         enum drbd_req_event what = nothing;
1213
1214         if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1215                 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1216                 if (mdev->p_uuid)
1217                         mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1218         }
1219
1220         fp = FP_DONT_CARE;
1221         if (get_ldev(mdev)) {
1222                 fp = mdev->ldev->dc.fencing;
1223                 put_ldev(mdev);
1224         }
1225
1226         /* Inform userspace about the change... */
1227         drbd_bcast_state(mdev, ns);
1228
1229         if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1230             (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1231                 drbd_khelper(mdev, "pri-on-incon-degr");
1232
1233         /* Here we have the actions that are performed after a
1234            state change. This function might sleep */
1235
1236         if (os.susp && ns.susp && mdev->sync_conf.on_no_data == OND_SUSPEND_IO) {
1237                 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1238                         if (ns.conn == C_CONNECTED)
1239                                 what = resend;
1240                         else /* ns.conn > C_CONNECTED */
1241                                 dev_err(DEV, "Unexpected Resynd going on!\n");
1242                 }
1243
1244                 if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
1245                         what = restart_frozen_disk_io;
1246         }
1247
1248         if (fp == FP_STONITH && ns.susp) {
1249                 /* case1: The outdate peer handler is successful: */
1250                 if (os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) {
1251                         tl_clear(mdev);
1252                         if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1253                                 drbd_uuid_new_current(mdev);
1254                                 clear_bit(NEW_CUR_UUID, &mdev->flags);
1255                                 drbd_md_sync(mdev);
1256                         }
1257                         spin_lock_irq(&mdev->req_lock);
1258                         _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
1259                         spin_unlock_irq(&mdev->req_lock);
1260                 }
1261                 /* case2: The connection was established again: */
1262                 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1263                         clear_bit(NEW_CUR_UUID, &mdev->flags);
1264                         what = resend;
1265                 }
1266         }
1267
1268         if (what != nothing) {
1269                 spin_lock_irq(&mdev->req_lock);
1270                 _tl_restart(mdev, what);
1271                 _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
1272                 spin_unlock_irq(&mdev->req_lock);
1273         }
1274
1275         /* Do not change the order of the if above and the two below... */
1276         if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
1277                 drbd_send_uuids(mdev);
1278                 drbd_send_state(mdev);
1279         }
1280         if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
1281                 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
1282
1283         /* Lost contact to peer's copy of the data */
1284         if ((os.pdsk >= D_INCONSISTENT &&
1285              os.pdsk != D_UNKNOWN &&
1286              os.pdsk != D_OUTDATED)
1287         &&  (ns.pdsk < D_INCONSISTENT ||
1288              ns.pdsk == D_UNKNOWN ||
1289              ns.pdsk == D_OUTDATED)) {
1290                 if (get_ldev(mdev)) {
1291                         if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1292                             mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1293                                 if (mdev->state.susp) {
1294                                         set_bit(NEW_CUR_UUID, &mdev->flags);
1295                                 } else {
1296                                         drbd_uuid_new_current(mdev);
1297                                         drbd_send_uuids(mdev);
1298                                 }
1299                         }
1300                         put_ldev(mdev);
1301                 }
1302         }
1303
1304         if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1305                 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
1306                         drbd_uuid_new_current(mdev);
1307                         drbd_send_uuids(mdev);
1308                 }
1309
1310                 /* D_DISKLESS Peer becomes secondary */
1311                 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1312                         drbd_al_to_on_disk_bm(mdev);
1313                 put_ldev(mdev);
1314         }
1315
1316         /* Last part of the attaching process ... */
1317         if (ns.conn >= C_CONNECTED &&
1318             os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1319                 drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
1320                 drbd_send_uuids(mdev);
1321                 drbd_send_state(mdev);
1322         }
1323
1324         /* We want to pause/continue resync, tell peer. */
1325         if (ns.conn >= C_CONNECTED &&
1326              ((os.aftr_isp != ns.aftr_isp) ||
1327               (os.user_isp != ns.user_isp)))
1328                 drbd_send_state(mdev);
1329
1330         /* In case one of the isp bits got set, suspend other devices. */
1331         if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1332             (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1333                 suspend_other_sg(mdev);
1334
1335         /* Make sure the peer gets informed about eventual state
1336            changes (ISP bits) while we were in WFReportParams. */
1337         if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1338                 drbd_send_state(mdev);
1339
1340         /* We are in the progress to start a full sync... */
1341         if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1342             (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1343                 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
1344
1345         /* We are invalidating our self... */
1346         if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1347             os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1348                 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1349
1350         if (os.disk > D_FAILED && ns.disk == D_FAILED) {
1351                 enum drbd_io_error_p eh;
1352
1353                 eh = EP_PASS_ON;
1354                 if (get_ldev_if_state(mdev, D_FAILED)) {
1355                         eh = mdev->ldev->dc.on_io_error;
1356                         put_ldev(mdev);
1357                 }
1358
1359                 drbd_rs_cancel_all(mdev);
1360                 /* since get_ldev() only works as long as disk>=D_INCONSISTENT,
1361                    and it is D_DISKLESS here, local_cnt can only go down, it can
1362                    not increase... It will reach zero */
1363                 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1364                 mdev->rs_total = 0;
1365                 mdev->rs_failed = 0;
1366                 atomic_set(&mdev->rs_pending_cnt, 0);
1367
1368                 spin_lock_irq(&mdev->req_lock);
1369                 _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL);
1370                 spin_unlock_irq(&mdev->req_lock);
1371
1372                 if (eh == EP_CALL_HELPER)
1373                         drbd_khelper(mdev, "local-io-error");
1374         }
1375
1376         if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
1377
1378                 if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ {
1379                         if (drbd_send_state(mdev))
1380                                 dev_warn(DEV, "Notified peer that my disk is broken.\n");
1381                         else
1382                                 dev_err(DEV, "Sending state in drbd_io_error() failed\n");
1383                 }
1384
1385                 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1386                 lc_destroy(mdev->resync);
1387                 mdev->resync = NULL;
1388                 lc_destroy(mdev->act_log);
1389                 mdev->act_log = NULL;
1390                 __no_warn(local,
1391                         drbd_free_bc(mdev->ldev);
1392                         mdev->ldev = NULL;);
1393
1394                 if (mdev->md_io_tmpp)
1395                         __free_page(mdev->md_io_tmpp);
1396         }
1397
1398         /* Disks got bigger while they were detached */
1399         if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1400             test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1401                 if (ns.conn == C_CONNECTED)
1402                         resync_after_online_grow(mdev);
1403         }
1404
1405         /* A resync finished or aborted, wake paused devices... */
1406         if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1407             (os.peer_isp && !ns.peer_isp) ||
1408             (os.user_isp && !ns.user_isp))
1409                 resume_next_sg(mdev);
1410
1411         /* free tl_hash if we Got thawed and are C_STANDALONE */
1412         if (ns.conn == C_STANDALONE && ns.susp == 0 && mdev->tl_hash)
1413                 drbd_free_tl_hash(mdev);
1414
1415         /* Upon network connection, we need to start the receiver */
1416         if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1417                 drbd_thread_start(&mdev->receiver);
1418
1419         /* Terminate worker thread if we are unconfigured - it will be
1420            restarted as needed... */
1421         if (ns.disk == D_DISKLESS &&
1422             ns.conn == C_STANDALONE &&
1423             ns.role == R_SECONDARY) {
1424                 if (os.aftr_isp != ns.aftr_isp)
1425                         resume_next_sg(mdev);
1426                 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1427                 if (test_bit(DEVICE_DYING, &mdev->flags))
1428                         drbd_thread_stop_nowait(&mdev->worker);
1429         }
1430
1431         drbd_md_sync(mdev);
1432 }
1433
1434
1435 static int drbd_thread_setup(void *arg)
1436 {
1437         struct drbd_thread *thi = (struct drbd_thread *) arg;
1438         struct drbd_conf *mdev = thi->mdev;
1439         unsigned long flags;
1440         int retval;
1441
1442 restart:
1443         retval = thi->function(thi);
1444
1445         spin_lock_irqsave(&thi->t_lock, flags);
1446
1447         /* if the receiver has been "Exiting", the last thing it did
1448          * was set the conn state to "StandAlone",
1449          * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1450          * and receiver thread will be "started".
1451          * drbd_thread_start needs to set "Restarting" in that case.
1452          * t_state check and assignment needs to be within the same spinlock,
1453          * so either thread_start sees Exiting, and can remap to Restarting,
1454          * or thread_start see None, and can proceed as normal.
1455          */
1456
1457         if (thi->t_state == Restarting) {
1458                 dev_info(DEV, "Restarting %s\n", current->comm);
1459                 thi->t_state = Running;
1460                 spin_unlock_irqrestore(&thi->t_lock, flags);
1461                 goto restart;
1462         }
1463
1464         thi->task = NULL;
1465         thi->t_state = None;
1466         smp_mb();
1467         complete(&thi->stop);
1468         spin_unlock_irqrestore(&thi->t_lock, flags);
1469
1470         dev_info(DEV, "Terminating %s\n", current->comm);
1471
1472         /* Release mod reference taken when thread was started */
1473         module_put(THIS_MODULE);
1474         return retval;
1475 }
1476
1477 static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1478                       int (*func) (struct drbd_thread *))
1479 {
1480         spin_lock_init(&thi->t_lock);
1481         thi->task    = NULL;
1482         thi->t_state = None;
1483         thi->function = func;
1484         thi->mdev = mdev;
1485 }
1486
1487 int drbd_thread_start(struct drbd_thread *thi)
1488 {
1489         struct drbd_conf *mdev = thi->mdev;
1490         struct task_struct *nt;
1491         unsigned long flags;
1492
1493         const char *me =
1494                 thi == &mdev->receiver ? "receiver" :
1495                 thi == &mdev->asender  ? "asender"  :
1496                 thi == &mdev->worker   ? "worker"   : "NONSENSE";
1497
1498         /* is used from state engine doing drbd_thread_stop_nowait,
1499          * while holding the req lock irqsave */
1500         spin_lock_irqsave(&thi->t_lock, flags);
1501
1502         switch (thi->t_state) {
1503         case None:
1504                 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1505                                 me, current->comm, current->pid);
1506
1507                 /* Get ref on module for thread - this is released when thread exits */
1508                 if (!try_module_get(THIS_MODULE)) {
1509                         dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1510                         spin_unlock_irqrestore(&thi->t_lock, flags);
1511                         return FALSE;
1512                 }
1513
1514                 init_completion(&thi->stop);
1515                 D_ASSERT(thi->task == NULL);
1516                 thi->reset_cpu_mask = 1;
1517                 thi->t_state = Running;
1518                 spin_unlock_irqrestore(&thi->t_lock, flags);
1519                 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1520
1521                 nt = kthread_create(drbd_thread_setup, (void *) thi,
1522                                     "drbd%d_%s", mdev_to_minor(mdev), me);
1523
1524                 if (IS_ERR(nt)) {
1525                         dev_err(DEV, "Couldn't start thread\n");
1526
1527                         module_put(THIS_MODULE);
1528                         return FALSE;
1529                 }
1530                 spin_lock_irqsave(&thi->t_lock, flags);
1531                 thi->task = nt;
1532                 thi->t_state = Running;
1533                 spin_unlock_irqrestore(&thi->t_lock, flags);
1534                 wake_up_process(nt);
1535                 break;
1536         case Exiting:
1537                 thi->t_state = Restarting;
1538                 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1539                                 me, current->comm, current->pid);
1540                 /* fall through */
1541         case Running:
1542         case Restarting:
1543         default:
1544                 spin_unlock_irqrestore(&thi->t_lock, flags);
1545                 break;
1546         }
1547
1548         return TRUE;
1549 }
1550
1551
1552 void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1553 {
1554         unsigned long flags;
1555
1556         enum drbd_thread_state ns = restart ? Restarting : Exiting;
1557
1558         /* may be called from state engine, holding the req lock irqsave */
1559         spin_lock_irqsave(&thi->t_lock, flags);
1560
1561         if (thi->t_state == None) {
1562                 spin_unlock_irqrestore(&thi->t_lock, flags);
1563                 if (restart)
1564                         drbd_thread_start(thi);
1565                 return;
1566         }
1567
1568         if (thi->t_state != ns) {
1569                 if (thi->task == NULL) {
1570                         spin_unlock_irqrestore(&thi->t_lock, flags);
1571                         return;
1572                 }
1573
1574                 thi->t_state = ns;
1575                 smp_mb();
1576                 init_completion(&thi->stop);
1577                 if (thi->task != current)
1578                         force_sig(DRBD_SIGKILL, thi->task);
1579
1580         }
1581
1582         spin_unlock_irqrestore(&thi->t_lock, flags);
1583
1584         if (wait)
1585                 wait_for_completion(&thi->stop);
1586 }
1587
1588 #ifdef CONFIG_SMP
1589 /**
1590  * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1591  * @mdev:       DRBD device.
1592  *
1593  * Forces all threads of a device onto the same CPU. This is beneficial for
1594  * DRBD's performance. May be overwritten by user's configuration.
1595  */
1596 void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1597 {
1598         int ord, cpu;
1599
1600         /* user override. */
1601         if (cpumask_weight(mdev->cpu_mask))
1602                 return;
1603
1604         ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1605         for_each_online_cpu(cpu) {
1606                 if (ord-- == 0) {
1607                         cpumask_set_cpu(cpu, mdev->cpu_mask);
1608                         return;
1609                 }
1610         }
1611         /* should not be reached */
1612         cpumask_setall(mdev->cpu_mask);
1613 }
1614
1615 /**
1616  * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1617  * @mdev:       DRBD device.
1618  *
1619  * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1620  * prematurely.
1621  */
1622 void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1623 {
1624         struct task_struct *p = current;
1625         struct drbd_thread *thi =
1626                 p == mdev->asender.task  ? &mdev->asender  :
1627                 p == mdev->receiver.task ? &mdev->receiver :
1628                 p == mdev->worker.task   ? &mdev->worker   :
1629                 NULL;
1630         ERR_IF(thi == NULL)
1631                 return;
1632         if (!thi->reset_cpu_mask)
1633                 return;
1634         thi->reset_cpu_mask = 0;
1635         set_cpus_allowed_ptr(p, mdev->cpu_mask);
1636 }
1637 #endif
1638
1639 /* the appropriate socket mutex must be held already */
1640 int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1641                           enum drbd_packets cmd, struct p_header *h,
1642                           size_t size, unsigned msg_flags)
1643 {
1644         int sent, ok;
1645
1646         ERR_IF(!h) return FALSE;
1647         ERR_IF(!size) return FALSE;
1648
1649         h->magic   = BE_DRBD_MAGIC;
1650         h->command = cpu_to_be16(cmd);
1651         h->length  = cpu_to_be16(size-sizeof(struct p_header));
1652
1653         sent = drbd_send(mdev, sock, h, size, msg_flags);
1654
1655         ok = (sent == size);
1656         if (!ok)
1657                 dev_err(DEV, "short sent %s size=%d sent=%d\n",
1658                     cmdname(cmd), (int)size, sent);
1659         return ok;
1660 }
1661
1662 /* don't pass the socket. we may only look at it
1663  * when we hold the appropriate socket mutex.
1664  */
1665 int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1666                   enum drbd_packets cmd, struct p_header *h, size_t size)
1667 {
1668         int ok = 0;
1669         struct socket *sock;
1670
1671         if (use_data_socket) {
1672                 mutex_lock(&mdev->data.mutex);
1673                 sock = mdev->data.socket;
1674         } else {
1675                 mutex_lock(&mdev->meta.mutex);
1676                 sock = mdev->meta.socket;
1677         }
1678
1679         /* drbd_disconnect() could have called drbd_free_sock()
1680          * while we were waiting in down()... */
1681         if (likely(sock != NULL))
1682                 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1683
1684         if (use_data_socket)
1685                 mutex_unlock(&mdev->data.mutex);
1686         else
1687                 mutex_unlock(&mdev->meta.mutex);
1688         return ok;
1689 }
1690
1691 int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1692                    size_t size)
1693 {
1694         struct p_header h;
1695         int ok;
1696
1697         h.magic   = BE_DRBD_MAGIC;
1698         h.command = cpu_to_be16(cmd);
1699         h.length  = cpu_to_be16(size);
1700
1701         if (!drbd_get_data_sock(mdev))
1702                 return 0;
1703
1704         ok = (sizeof(h) ==
1705                 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1706         ok = ok && (size ==
1707                 drbd_send(mdev, mdev->data.socket, data, size, 0));
1708
1709         drbd_put_data_sock(mdev);
1710
1711         return ok;
1712 }
1713
1714 int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1715 {
1716         struct p_rs_param_89 *p;
1717         struct socket *sock;
1718         int size, rv;
1719         const int apv = mdev->agreed_pro_version;
1720
1721         size = apv <= 87 ? sizeof(struct p_rs_param)
1722                 : apv == 88 ? sizeof(struct p_rs_param)
1723                         + strlen(mdev->sync_conf.verify_alg) + 1
1724                 : /* 89 */    sizeof(struct p_rs_param_89);
1725
1726         /* used from admin command context and receiver/worker context.
1727          * to avoid kmalloc, grab the socket right here,
1728          * then use the pre-allocated sbuf there */
1729         mutex_lock(&mdev->data.mutex);
1730         sock = mdev->data.socket;
1731
1732         if (likely(sock != NULL)) {
1733                 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1734
1735                 p = &mdev->data.sbuf.rs_param_89;
1736
1737                 /* initialize verify_alg and csums_alg */
1738                 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1739
1740                 p->rate = cpu_to_be32(sc->rate);
1741
1742                 if (apv >= 88)
1743                         strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1744                 if (apv >= 89)
1745                         strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1746
1747                 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1748         } else
1749                 rv = 0; /* not ok */
1750
1751         mutex_unlock(&mdev->data.mutex);
1752
1753         return rv;
1754 }
1755
1756 int drbd_send_protocol(struct drbd_conf *mdev)
1757 {
1758         struct p_protocol *p;
1759         int size, cf, rv;
1760
1761         size = sizeof(struct p_protocol);
1762
1763         if (mdev->agreed_pro_version >= 87)
1764                 size += strlen(mdev->net_conf->integrity_alg) + 1;
1765
1766         /* we must not recurse into our own queue,
1767          * as that is blocked during handshake */
1768         p = kmalloc(size, GFP_NOIO);
1769         if (p == NULL)
1770                 return 0;
1771
1772         p->protocol      = cpu_to_be32(mdev->net_conf->wire_protocol);
1773         p->after_sb_0p   = cpu_to_be32(mdev->net_conf->after_sb_0p);
1774         p->after_sb_1p   = cpu_to_be32(mdev->net_conf->after_sb_1p);
1775         p->after_sb_2p   = cpu_to_be32(mdev->net_conf->after_sb_2p);
1776         p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1777
1778         cf = 0;
1779         if (mdev->net_conf->want_lose)
1780                 cf |= CF_WANT_LOSE;
1781         if (mdev->net_conf->dry_run) {
1782                 if (mdev->agreed_pro_version >= 92)
1783                         cf |= CF_DRY_RUN;
1784                 else {
1785                         dev_err(DEV, "--dry-run is not supported by peer");
1786                         kfree(p);
1787                         return 0;
1788                 }
1789         }
1790         p->conn_flags    = cpu_to_be32(cf);
1791
1792         if (mdev->agreed_pro_version >= 87)
1793                 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1794
1795         rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
1796                            (struct p_header *)p, size);
1797         kfree(p);
1798         return rv;
1799 }
1800
1801 int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1802 {
1803         struct p_uuids p;
1804         int i;
1805
1806         if (!get_ldev_if_state(mdev, D_NEGOTIATING))
1807                 return 1;
1808
1809         for (i = UI_CURRENT; i < UI_SIZE; i++)
1810                 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
1811
1812         mdev->comm_bm_set = drbd_bm_total_weight(mdev);
1813         p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
1814         uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
1815         uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
1816         uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
1817         p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
1818
1819         put_ldev(mdev);
1820
1821         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
1822                              (struct p_header *)&p, sizeof(p));
1823 }
1824
1825 int drbd_send_uuids(struct drbd_conf *mdev)
1826 {
1827         return _drbd_send_uuids(mdev, 0);
1828 }
1829
1830 int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1831 {
1832         return _drbd_send_uuids(mdev, 8);
1833 }
1834
1835
1836 int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1837 {
1838         struct p_rs_uuid p;
1839
1840         p.uuid = cpu_to_be64(val);
1841
1842         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
1843                              (struct p_header *)&p, sizeof(p));
1844 }
1845
1846 int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
1847 {
1848         struct p_sizes p;
1849         sector_t d_size, u_size;
1850         int q_order_type;
1851         int ok;
1852
1853         if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1854                 D_ASSERT(mdev->ldev->backing_bdev);
1855                 d_size = drbd_get_max_capacity(mdev->ldev);
1856                 u_size = mdev->ldev->dc.disk_size;
1857                 q_order_type = drbd_queue_order_type(mdev);
1858                 put_ldev(mdev);
1859         } else {
1860                 d_size = 0;
1861                 u_size = 0;
1862                 q_order_type = QUEUE_ORDERED_NONE;
1863         }
1864
1865         p.d_size = cpu_to_be64(d_size);
1866         p.u_size = cpu_to_be64(u_size);
1867         p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1868         p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
1869         p.queue_order_type = cpu_to_be16(q_order_type);
1870         p.dds_flags = cpu_to_be16(flags);
1871
1872         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
1873                            (struct p_header *)&p, sizeof(p));
1874         return ok;
1875 }
1876
1877 /**
1878  * drbd_send_state() - Sends the drbd state to the peer
1879  * @mdev:       DRBD device.
1880  */
1881 int drbd_send_state(struct drbd_conf *mdev)
1882 {
1883         struct socket *sock;
1884         struct p_state p;
1885         int ok = 0;
1886
1887         /* Grab state lock so we wont send state if we're in the middle
1888          * of a cluster wide state change on another thread */
1889         drbd_state_lock(mdev);
1890
1891         mutex_lock(&mdev->data.mutex);
1892
1893         p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
1894         sock = mdev->data.socket;
1895
1896         if (likely(sock != NULL)) {
1897                 ok = _drbd_send_cmd(mdev, sock, P_STATE,
1898                                     (struct p_header *)&p, sizeof(p), 0);
1899         }
1900
1901         mutex_unlock(&mdev->data.mutex);
1902
1903         drbd_state_unlock(mdev);
1904         return ok;
1905 }
1906
1907 int drbd_send_state_req(struct drbd_conf *mdev,
1908         union drbd_state mask, union drbd_state val)
1909 {
1910         struct p_req_state p;
1911
1912         p.mask    = cpu_to_be32(mask.i);
1913         p.val     = cpu_to_be32(val.i);
1914
1915         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
1916                              (struct p_header *)&p, sizeof(p));
1917 }
1918
1919 int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
1920 {
1921         struct p_req_state_reply p;
1922
1923         p.retcode    = cpu_to_be32(retcode);
1924
1925         return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
1926                              (struct p_header *)&p, sizeof(p));
1927 }
1928
/*
 * fill_bitmap_rle_bits() - encode the next stretch of the bitmap as run lengths
 * @mdev:       DRBD device.
 * @p:          compressed bitmap packet; p->code receives the encoded bits
 * @c:          transfer context; bit_offset and word_offset are advanced
 *
 * Starting at c->bit_offset, alternating run lengths are determined via
 * _drbd_bm_find_next() / _drbd_bm_find_next_zero() and stored with
 * vli_encode_bits() into p->code, using at most BM_PACKET_VLI_BYTES_MAX
 * bytes.
 *
 * Returns
 *   > 0: number of bytes used in p->code,
 *     0: nothing was encoded: the feature is disabled or the agreed protocol
 *        version is below 90, there is nothing left to do, the encoder
 *        reported an error, or the result was not smaller than the plain
 *        bits it covers,
 *    -1: an unexpected zero run length was found.
 */
int fill_bitmap_rle_bits(struct drbd_conf *mdev,
        struct p_compressed_bm *p,
        struct bm_xfer_ctx *c)
{
        struct bitstream bs;
        unsigned long plain_bits;
        unsigned long tmp;
        unsigned long rl;
        unsigned len;
        unsigned toggle;
        int bits;

        /* may we use this feature? */
        if ((mdev->sync_conf.use_rle == 0) ||
                (mdev->agreed_pro_version < 90))
                        return 0;

        if (c->bit_offset >= c->bm_bits)
                return 0; /* nothing to do. */

        /* use at most this many bytes */
        bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
        memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
        /* plain bits covered in this code string */
        plain_bits = 0;

        /* p->encoding & 0x80 stores whether the first run length is set.
         * bit offset is implicit.
         * start with toggle == 2 to be able to tell the first iteration */
        toggle = 2;

        /* see how much plain bits we can stuff into one packet
         * using RLE and VLI. */
        do {
                /* toggle == 0: look for the end of a run of set bits,
                 * otherwise (1, or 2 on the first pass): look for the
                 * end of a run of clear bits. */
                tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
                                    : _drbd_bm_find_next(mdev, c->bit_offset);
                /* nothing found: the run extends to the end of the bitmap */
                if (tmp == -1UL)
                        tmp = c->bm_bits;
                rl = tmp - c->bit_offset;

                if (toggle == 2) { /* first iteration */
                        if (rl == 0) {
                                /* the first checked bit was set,
                                 * store start value, */
                                DCBP_set_start(p, 1);
                                /* but skip encoding of zero run length */
                                toggle = !toggle; /* 2 becomes 0 */
                                continue;
                        }
                        DCBP_set_start(p, 0);
                }

                /* paranoia: catch zero runlength.
                 * can only happen if bitmap is modified while we scan it. */
                if (rl == 0) {
                        dev_err(DEV, "unexpected zero runlength while encoding bitmap "
                            "t:%u bo:%lu\n", toggle, c->bit_offset);
                        return -1;
                }

                bits = vli_encode_bits(&bs, rl);
                if (bits == -ENOBUFS) /* buffer full */
                        break;
                if (bits <= 0) {
                        /* NOTE: c->bit_offset is not rewound on this path */
                        dev_err(DEV, "error while encoding bitmap: %d\n", bits);
                        return 0;
                }

                /* the run was stored; account for it and move on */
                toggle = !toggle;
                plain_bits += rl;
                c->bit_offset = tmp;
        } while (c->bit_offset < c->bm_bits);

        /* bytes used in p->code; a partially used byte counts as a whole one */
        len = bs.cur.b - p->code + !!bs.cur.bit;

        if (plain_bits < (len << 3)) {
                /* incompressible with this method.
                 * we need to rewind both word and bit position. */
                c->bit_offset -= plain_bits;
                bm_xfer_ctx_bit_to_word_offset(c);
                c->bit_offset = c->word_offset * BITS_PER_LONG;
                return 0;
        }

        /* RLE + VLI was able to compress it just fine.
         * update c->word_offset. */
        bm_xfer_ctx_bit_to_word_offset(c);

        /* store pad_bits */
        DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);

        return len;
}
2022
/*
 * send_bitmap_rle_or_plain() - send the next bitmap packet
 * @mdev:       DRBD device.
 * @h:          packet buffer, used for either packet type
 * @c:          transfer context (offsets and statistics), updated here
 *
 * Tries fill_bitmap_rle_bits() first; if that yields a code string, a
 * P_COMPRESSED_BITMAP packet is sent.  Otherwise up to BM_PACKET_WORDS
 * plain bitmap words are sent as P_BITMAP.  Uses mdev->data.socket
 * directly; no locking is done here.
 *
 * Returns OK if a packet was sent and more is to follow, DONE if the
 * last packet was sent, FAILED if encoding or sending failed.
 */
enum { OK, FAILED, DONE }
send_bitmap_rle_or_plain(struct drbd_conf *mdev,
        struct p_header *h, struct bm_xfer_ctx *c)
{
        struct p_compressed_bm *p = (void*)h;
        unsigned long num_words;
        int len;
        int ok;

        len = fill_bitmap_rle_bits(mdev, p, c);

        if (len < 0)
                return FAILED;

        if (len) {
                DCBP_set_code(p, RLE_VLI_Bits);
                ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
                        sizeof(*p) + len, 0);

                /* index 0 of the statistics counts compressed packets */
                c->packets[0]++;
                c->bytes[0] += sizeof(*p) + len;

                /* from here on, len == 0 only serves as the "done" marker */
                if (c->bit_offset >= c->bm_bits)
                        len = 0; /* DONE */
        } else {
                /* was not compressible.
                 * send a buffer full of plain text bits instead. */
                num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
                len = num_words * sizeof(long);
                /* with no words left, a P_BITMAP packet without payload is
                 * still sent below, and len == 0 then results in DONE */
                if (len)
                        drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
                ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
                                   h, sizeof(struct p_header) + len, 0);
                c->word_offset += num_words;
                c->bit_offset = c->word_offset * BITS_PER_LONG;

                /* index 1 of the statistics counts plain packets */
                c->packets[1]++;
                c->bytes[1] += sizeof(struct p_header) + len;

                if (c->bit_offset > c->bm_bits)
                        c->bit_offset = c->bm_bits;
        }
        /* ok is reused: send result in, OK/DONE/FAILED out */
        ok = ok ? ((len == 0) ? DONE : OK) : FAILED;

        if (ok == DONE)
                INFO_bm_xfer_stats(mdev, "send", c);
        return ok;
}
2071
2072 /* See the comment at receive_bitmap() */
2073 int _drbd_send_bitmap(struct drbd_conf *mdev)
2074 {
2075         struct bm_xfer_ctx c;
2076         struct p_header *p;
2077         int ret;
2078
2079         ERR_IF(!mdev->bitmap) return FALSE;
2080
2081         /* maybe we should use some per thread scratch page,
2082          * and allocate that during initial device creation? */
2083         p = (struct p_header *) __get_free_page(GFP_NOIO);
2084         if (!p) {
2085                 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2086                 return FALSE;
2087         }
2088
2089         if (get_ldev(mdev)) {
2090                 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2091                         dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2092                         drbd_bm_set_all(mdev);
2093                         if (drbd_bm_write(mdev)) {
2094                                 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2095                                  * but otherwise process as per normal - need to tell other
2096                                  * side that a full resync is required! */
2097                                 dev_err(DEV, "Failed to write bitmap to disk!\n");
2098                         } else {
2099                                 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2100                                 drbd_md_sync(mdev);
2101                         }
2102                 }
2103                 put_ldev(mdev);
2104         }
2105
2106         c = (struct bm_xfer_ctx) {
2107                 .bm_bits = drbd_bm_bits(mdev),
2108                 .bm_words = drbd_bm_words(mdev),
2109         };
2110
2111         do {
2112                 ret = send_bitmap_rle_or_plain(mdev, p, &c);
2113         } while (ret == OK);
2114
2115         free_page((unsigned long) p);
2116         return (ret == DONE);
2117 }
2118
/*
 * Get the data socket, send the bitmap, put the data socket.
 *
 * Returns -1 if the data socket could not be obtained, 0 if
 * _drbd_send_bitmap() succeeded, 1 if it failed.
 */
int drbd_send_bitmap(struct drbd_conf *mdev)
{
        int sent_ok;

        if (!drbd_get_data_sock(mdev))
                return -1;

        sent_ok = _drbd_send_bitmap(mdev);
        drbd_put_data_sock(mdev);

        return !sent_ok;
}
2129
2130 int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2131 {
2132         int ok;
2133         struct p_barrier_ack p;
2134
2135         p.barrier  = barrier_nr;
2136         p.set_size = cpu_to_be32(set_size);
2137
2138         if (mdev->state.conn < C_CONNECTED)
2139                 return FALSE;
2140         ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2141                         (struct p_header *)&p, sizeof(p));
2142         return ok;
2143 }
2144
2145 /**
2146  * _drbd_send_ack() - Sends an ack packet
2147  * @mdev:       DRBD device.
2148  * @cmd:        Packet command code.
2149  * @sector:     sector, needs to be in big endian byte order
2150  * @blksize:    size in byte, needs to be in big endian byte order
2151  * @block_id:   Id, big endian byte order
2152  */
2153 static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2154                           u64 sector,
2155                           u32 blksize,
2156                           u64 block_id)
2157 {
2158         int ok;
2159         struct p_block_ack p;
2160
2161         p.sector   = sector;
2162         p.block_id = block_id;
2163         p.blksize  = blksize;
2164         p.seq_num  = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2165
2166         if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2167                 return FALSE;
2168         ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2169                                 (struct p_header *)&p, sizeof(p));
2170         return ok;
2171 }
2172
2173 int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2174                      struct p_data *dp)
2175 {
2176         const int header_size = sizeof(struct p_data)
2177                               - sizeof(struct p_header);
2178         int data_size  = ((struct p_header *)dp)->length - header_size;
2179
2180         return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2181                               dp->block_id);
2182 }
2183
2184 int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2185                      struct p_block_req *rp)
2186 {
2187         return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2188 }
2189
2190 /**
2191  * drbd_send_ack() - Sends an ack packet
2192  * @mdev:       DRBD device.
2193  * @cmd:        Packet command code.
2194  * @e:          Epoch entry.
2195  */
2196 int drbd_send_ack(struct drbd_conf *mdev,
2197         enum drbd_packets cmd, struct drbd_epoch_entry *e)
2198 {
2199         return _drbd_send_ack(mdev, cmd,
2200                               cpu_to_be64(e->sector),
2201                               cpu_to_be32(e->size),
2202                               e->block_id);
2203 }
2204
2205 /* This function misuses the block_id field to signal if the blocks
2206  * are is sync or not. */
2207 int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2208                      sector_t sector, int blksize, u64 block_id)
2209 {
2210         return _drbd_send_ack(mdev, cmd,
2211                               cpu_to_be64(sector),
2212                               cpu_to_be32(blksize),
2213                               cpu_to_be64(block_id));
2214 }
2215
2216 int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2217                        sector_t sector, int size, u64 block_id)
2218 {
2219         int ok;
2220         struct p_block_req p;
2221
2222         p.sector   = cpu_to_be64(sector);
2223         p.block_id = block_id;
2224         p.blksize  = cpu_to_be32(size);
2225
2226         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2227                                 (struct p_header *)&p, sizeof(p));
2228         return ok;
2229 }
2230
2231 int drbd_send_drequest_csum(struct drbd_conf *mdev,
2232                             sector_t sector, int size,
2233                             void *digest, int digest_size,
2234                             enum drbd_packets cmd)
2235 {
2236         int ok;
2237         struct p_block_req p;
2238
2239         p.sector   = cpu_to_be64(sector);
2240         p.block_id = BE_DRBD_MAGIC + 0xbeef;
2241         p.blksize  = cpu_to_be32(size);
2242
2243         p.head.magic   = BE_DRBD_MAGIC;
2244         p.head.command = cpu_to_be16(cmd);
2245         p.head.length  = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size);
2246
2247         mutex_lock(&mdev->data.mutex);
2248
2249         ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2250         ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2251
2252         mutex_unlock(&mdev->data.mutex);
2253
2254         return ok;
2255 }
2256
2257 int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2258 {
2259         int ok;
2260         struct p_block_req p;
2261
2262         p.sector   = cpu_to_be64(sector);
2263         p.block_id = BE_DRBD_MAGIC + 0xbabe;
2264         p.blksize  = cpu_to_be32(size);
2265
2266         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2267                            (struct p_header *)&p, sizeof(p));
2268         return ok;
2269 }
2270
2271 /* called on sndtimeo
2272  * returns FALSE if we should retry,
2273  * TRUE if we think connection is dead
2274  */
2275 static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2276 {
2277         int drop_it;
2278         /* long elapsed = (long)(jiffies - mdev->last_received); */
2279
2280         drop_it =   mdev->meta.socket == sock
2281                 || !mdev->asender.task
2282                 || get_t_state(&mdev->asender) != Running
2283                 || mdev->state.conn < C_CONNECTED;
2284
2285         if (drop_it)
2286                 return TRUE;
2287
2288         drop_it = !--mdev->ko_count;
2289         if (!drop_it) {
2290                 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2291                        current->comm, current->pid, mdev->ko_count);
2292                 request_ping(mdev);
2293         }
2294
2295         return drop_it; /* && (mdev->state == R_PRIMARY) */;
2296 }
2297
2298 /* The idea of sendpage seems to be to put some kind of reference
2299  * to the page into the skb, and to hand it over to the NIC. In
2300  * this process get_page() gets called.
2301  *
2302  * As soon as the page was really sent over the network put_page()
2303  * gets called by some part of the network layer. [ NIC driver? ]
2304  *
2305  * [ get_page() / put_page() increment/decrement the count. If count
2306  *   reaches 0 the page will be freed. ]
2307  *
2308  * This works nicely with pages from FSs.
2309  * But this means that in protocol A we might signal IO completion too early!
2310  *
2311  * In order not to corrupt data during a resync we must make sure
2312  * that we do not reuse our own buffer pages (EEs) to early, therefore
2313  * we have the net_ee list.
2314  *
2315  * XFS seems to have problems, still, it submits pages with page_count == 0!
2316  * As a workaround, we disable sendpage on pages
2317  * with page_count == 0 or PageSlab.
2318  */
2319 static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2320                    int offset, size_t size, unsigned msg_flags)
2321 {
2322         int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
2323         kunmap(page);
2324         if (sent == size)
2325                 mdev->send_cnt += size>>9;
2326         return sent == size;
2327 }
2328
/*
 * _drbd_send_page() - send part of a page on the data socket via ->sendpage()
 * @mdev:       DRBD device.
 * @page:       page holding the data
 * @offset:     offset of the data within @page
 * @size:       number of bytes to send
 * @msg_flags:  flags for the send; MSG_NOSIGNAL is added here
 *
 * Falls back to _drbd_no_send_page() if sendpage is disabled, or for pages
 * with a page_count below 1 or with PageSlab set.
 *
 * Returns non-zero if all @size bytes were sent, zero otherwise.
 */
static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
                    int offset, size_t size, unsigned msg_flags)
{
        mm_segment_t oldfs = get_fs();
        int sent, ok;
        int len = size; /* bytes still to be sent */

        /* e.g. XFS meta- & log-data is in slab pages, which have a
         * page_count of 0 and/or have PageSlab() set.
         * we cannot use send_page for those, as that does get_page();
         * put_page(); and would cause either a VM_BUG directly, or
         * __page_cache_release a page that would actually still be referenced
         * by someone, leading to some obscure delayed Oops somewhere else. */
        if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
                return _drbd_no_send_page(mdev, page, offset, size, msg_flags);

        msg_flags |= MSG_NOSIGNAL;
        drbd_update_congested(mdev);
        set_fs(KERNEL_DS);
        do {
                sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
                                                        offset, len,
                                                        msg_flags);
                if (sent == -EAGAIN) {
                        /* either give up, or retry with len and offset
                         * unchanged ("continue" re-checks len > 0 first) */
                        if (we_should_drop_the_connection(mdev,
                                                          mdev->data.socket))
                                break;
                        else
                                continue;
                }
                if (sent <= 0) {
                        dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
                             __func__, (int)size, len, sent);
                        break;
                }
                /* partial progress: advance behind what was sent */
                len    -= sent;
                offset += sent;
        } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
        set_fs(oldfs);
        clear_bit(NET_CONGESTED, &mdev->flags);

        /* success only if nothing is left to be sent */
        ok = (len == 0);
        if (likely(ok))
                mdev->send_cnt += size>>9;
        return ok;
}
2375
2376 static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2377 {
2378         struct bio_vec *bvec;
2379         int i;
2380         /* hint all but last page with MSG_MORE */
2381         __bio_for_each_segment(bvec, bio, i, 0) {
2382                 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2383                                      bvec->bv_offset, bvec->bv_len,
2384                                      i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2385                         return 0;
2386         }
2387         return 1;
2388 }
2389
2390 static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2391 {
2392         struct bio_vec *bvec;
2393         int i;
2394         /* hint all but last page with MSG_MORE */
2395         __bio_for_each_segment(bvec, bio, i, 0) {
2396                 if (!_drbd_send_page(mdev, bvec->bv_page,
2397                                      bvec->bv_offset, bvec->bv_len,
2398                                      i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2399                         return 0;
2400         }
2401         return 1;
2402 }
2403
2404 static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2405 {
2406         struct page *page = e->pages;
2407         unsigned len = e->size;
2408         /* hint all but last page with MSG_MORE */
2409         page_chain_for_each(page) {
2410                 unsigned l = min_t(unsigned, len, PAGE_SIZE);
2411                 if (!_drbd_send_page(mdev, page, 0, l,
2412                                 page_chain_next(page) ? MSG_MORE : 0))
2413                         return 0;
2414                 len -= l;
2415         }
2416         return 1;
2417 }
2418
2419 /* Used to send write requests
2420  * R_PRIMARY -> Peer    (P_DATA)
2421  */
2422 int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2423 {
2424         int ok = 1;
2425         struct p_data p;
2426         unsigned int dp_flags = 0;
2427         void *dgb;
2428         int dgs;
2429
2430         if (!drbd_get_data_sock(mdev))
2431                 return 0;
2432
2433         dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2434                 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2435
2436         p.head.magic   = BE_DRBD_MAGIC;
2437         p.head.command = cpu_to_be16(P_DATA);
2438         p.head.length  =
2439                 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size);
2440
2441         p.sector   = cpu_to_be64(req->sector);
2442         p.block_id = (unsigned long)req;
2443         p.seq_num  = cpu_to_be32(req->seq_num =
2444                                  atomic_add_return(1, &mdev->packet_seq));
2445         dp_flags = 0;
2446
2447         /* NOTE: no need to check if barriers supported here as we would
2448          *       not pass the test in make_request_common in that case
2449          */
2450         if (req->master_bio->bi_rw & REQ_HARDBARRIER) {
2451                 dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n");
2452                 /* dp_flags |= DP_HARDBARRIER; */
2453         }
2454         if (req->master_bio->bi_rw & REQ_SYNC)
2455                 dp_flags |= DP_RW_SYNC;
2456         /* for now handle SYNCIO and UNPLUG
2457          * as if they still were one and the same flag */
2458         if (req->master_bio->bi_rw & REQ_UNPLUG)
2459                 dp_flags |= DP_RW_SYNC;
2460         if (mdev->state.conn >= C_SYNC_SOURCE &&
2461             mdev->state.conn <= C_PAUSED_SYNC_T)
2462                 dp_flags |= DP_MAY_SET_IN_SYNC;
2463
2464         p.dp_flags = cpu_to_be32(dp_flags);
2465         set_bit(UNPLUG_REMOTE, &mdev->flags);
2466         ok = (sizeof(p) ==
2467                 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
2468         if (ok && dgs) {
2469                 dgb = mdev->int_dig_out;
2470                 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2471                 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2472         }
2473         if (ok) {
2474                 if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
2475                         ok = _drbd_send_bio(mdev, req->master_bio);
2476                 else
2477                         ok = _drbd_send_zc_bio(mdev, req->master_bio);
2478         }
2479
2480         drbd_put_data_sock(mdev);
2481
2482         return ok;
2483 }
2484
2485 /* answer packet, used to send data back for read requests:
2486  *  Peer       -> (diskless) R_PRIMARY   (P_DATA_REPLY)
2487  *  C_SYNC_SOURCE -> C_SYNC_TARGET         (P_RS_DATA_REPLY)
2488  */
2489 int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2490                     struct drbd_epoch_entry *e)
2491 {
2492         int ok;
2493         struct p_data p;
2494         void *dgb;
2495         int dgs;
2496
2497         dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2498                 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2499
2500         p.head.magic   = BE_DRBD_MAGIC;
2501         p.head.command = cpu_to_be16(cmd);
2502         p.head.length  =
2503                 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size);
2504
2505         p.sector   = cpu_to_be64(e->sector);
2506         p.block_id = e->block_id;
2507         /* p.seq_num  = 0;    No sequence numbers here.. */
2508
2509         /* Only called by our kernel thread.
2510          * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2511          * in response to admin command or module unload.
2512          */
2513         if (!drbd_get_data_sock(mdev))
2514                 return 0;
2515
2516         ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p,
2517                                         sizeof(p), dgs ? MSG_MORE : 0);
2518         if (ok && dgs) {
2519                 dgb = mdev->int_dig_out;
2520                 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
2521                 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2522         }
2523         if (ok)
2524                 ok = _drbd_send_zc_ee(mdev, e);
2525
2526         drbd_put_data_sock(mdev);
2527
2528         return ok;
2529 }
2530
2531 /*
2532   drbd_send distinguishes two cases:
2533
2534   Packets sent via the data socket "sock"
2535   and packets sent via the meta data socket "msock"
2536
2537                     sock                      msock
2538   -----------------+-------------------------+------------------------------
2539   timeout           conf.timeout / 2          conf.timeout / 2
2540   timeout action    send a ping via msock     Abort communication
2541                                               and close all sockets
2542 */
2543
2544 /*
2545  * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2546  */
2547 int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2548               void *buf, size_t size, unsigned msg_flags)
2549 {
2550         struct kvec iov;
2551         struct msghdr msg;
2552         int rv, sent = 0;
2553
2554         if (!sock)
2555                 return -1000;
2556
2557         /* THINK  if (signal_pending) return ... ? */
2558
2559         iov.iov_base = buf;
2560         iov.iov_len  = size;
2561
2562         msg.msg_name       = NULL;
2563         msg.msg_namelen    = 0;
2564         msg.msg_control    = NULL;
2565         msg.msg_controllen = 0;
2566         msg.msg_flags      = msg_flags | MSG_NOSIGNAL;
2567
2568         if (sock == mdev->data.socket) {
2569                 mdev->ko_count = mdev->net_conf->ko_count;
2570                 drbd_update_congested(mdev);
2571         }
2572         do {
2573                 /* STRANGE
2574                  * tcp_sendmsg does _not_ use its size parameter at all ?
2575                  *
2576                  * -EAGAIN on timeout, -EINTR on signal.
2577                  */
2578 /* THINK
2579  * do we need to block DRBD_SIG if sock == &meta.socket ??
2580  * otherwise wake_asender() might interrupt some send_*Ack !
2581  */
2582                 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2583                 if (rv == -EAGAIN) {
2584                         if (we_should_drop_the_connection(mdev, sock))
2585                                 break;
2586                         else
2587                                 continue;
2588                 }
2589                 D_ASSERT(rv != 0);
2590                 if (rv == -EINTR) {
2591                         flush_signals(current);
2592                         rv = 0;
2593                 }
2594                 if (rv < 0)
2595                         break;
2596                 sent += rv;
2597                 iov.iov_base += rv;
2598                 iov.iov_len  -= rv;
2599         } while (sent < size);
2600
2601         if (sock == mdev->data.socket)
2602                 clear_bit(NET_CONGESTED, &mdev->flags);
2603
2604         if (rv <= 0) {
2605                 if (rv != -EAGAIN) {
2606                         dev_err(DEV, "%s_sendmsg returned %d\n",
2607                             sock == mdev->meta.socket ? "msock" : "sock",
2608                             rv);
2609                         drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2610                 } else
2611                         drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2612         }
2613
2614         return sent;
2615 }
2616
2617 static int drbd_open(struct block_device *bdev, fmode_t mode)
2618 {
2619         struct drbd_conf *mdev = bdev->bd_disk->private_data;
2620         unsigned long flags;
2621         int rv = 0;
2622
2623         lock_kernel();
2624         spin_lock_irqsave(&mdev->req_lock, flags);
2625         /* to have a stable mdev->state.role
2626          * and no race with updating open_cnt */
2627
2628         if (mdev->state.role != R_PRIMARY) {
2629                 if (mode & FMODE_WRITE)
2630                         rv = -EROFS;
2631                 else if (!allow_oos)
2632                         rv = -EMEDIUMTYPE;
2633         }
2634
2635         if (!rv)
2636                 mdev->open_cnt++;
2637         spin_unlock_irqrestore(&mdev->req_lock, flags);
2638         unlock_kernel();
2639
2640         return rv;
2641 }
2642
2643 static int drbd_release(struct gendisk *gd, fmode_t mode)
2644 {
2645         struct drbd_conf *mdev = gd->private_data;
2646         lock_kernel();
2647         mdev->open_cnt--;
2648         unlock_kernel();
2649         return 0;
2650 }
2651
2652 static void drbd_unplug_fn(struct request_queue *q)
2653 {
2654         struct drbd_conf *mdev = q->queuedata;
2655
2656         /* unplug FIRST */
2657         spin_lock_irq(q->queue_lock);
2658         blk_remove_plug(q);
2659         spin_unlock_irq(q->queue_lock);
2660
2661         /* only if connected */
2662         spin_lock_irq(&mdev->req_lock);
2663         if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) {
2664                 D_ASSERT(mdev->state.role == R_PRIMARY);
2665                 if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) {
2666                         /* add to the data.work queue,
2667                          * unless already queued.
2668                          * XXX this might be a good addition to drbd_queue_work
2669                          * anyways, to detect "double queuing" ... */
2670                         if (list_empty(&mdev->unplug_work.list))
2671                                 drbd_queue_work(&mdev->data.work,
2672                                                 &mdev->unplug_work);
2673                 }
2674         }
2675         spin_unlock_irq(&mdev->req_lock);
2676
2677         if (mdev->state.disk >= D_INCONSISTENT)
2678                 drbd_kick_lo(mdev);
2679 }
2680
2681 static void drbd_set_defaults(struct drbd_conf *mdev)
2682 {
2683         /* This way we get a compile error when sync_conf grows,
2684            and we forgot to initialize it here */
2685         mdev->sync_conf = (struct syncer_conf) {
2686                 /* .rate = */           DRBD_RATE_DEF,
2687                 /* .after = */          DRBD_AFTER_DEF,
2688                 /* .al_extents = */     DRBD_AL_EXTENTS_DEF,
2689                 /* .verify_alg = */     {}, 0,
2690                 /* .cpu_mask = */       {}, 0,
2691                 /* .csums_alg = */      {}, 0,
2692                 /* .use_rle = */        0
2693         };
2694
2695         /* Have to use that way, because the layout differs between
2696            big endian and little endian */
2697         mdev->state = (union drbd_state) {
2698                 { .role = R_SECONDARY,
2699                   .peer = R_UNKNOWN,
2700                   .conn = C_STANDALONE,
2701                   .disk = D_DISKLESS,
2702                   .pdsk = D_UNKNOWN,
2703                   .susp = 0
2704                 } };
2705 }
2706
2707 void drbd_init_set_defaults(struct drbd_conf *mdev)
2708 {
2709         /* the memset(,0,) did most of this.
2710          * note: only assignments, no allocation in here */
2711
2712         drbd_set_defaults(mdev);
2713
2714         /* for now, we do NOT yet support it,
2715          * even though we start some framework
2716          * to eventually support barriers */
2717         set_bit(NO_BARRIER_SUPP, &mdev->flags);
2718
2719         atomic_set(&mdev->ap_bio_cnt, 0);
2720         atomic_set(&mdev->ap_pending_cnt, 0);
2721         atomic_set(&mdev->rs_pending_cnt, 0);
2722         atomic_set(&mdev->unacked_cnt, 0);
2723         atomic_set(&mdev->local_cnt, 0);
2724         atomic_set(&mdev->net_cnt, 0);
2725         atomic_set(&mdev->packet_seq, 0);
2726         atomic_set(&mdev->pp_in_use, 0);
2727
2728         mutex_init(&mdev->md_io_mutex);
2729         mutex_init(&mdev->data.mutex);
2730         mutex_init(&mdev->meta.mutex);
2731         sema_init(&mdev->data.work.s, 0);
2732         sema_init(&mdev->meta.work.s, 0);
2733         mutex_init(&mdev->state_mutex);
2734
2735         spin_lock_init(&mdev->data.work.q_lock);
2736         spin_lock_init(&mdev->meta.work.q_lock);
2737
2738         spin_lock_init(&mdev->al_lock);
2739         spin_lock_init(&mdev->req_lock);
2740         spin_lock_init(&mdev->peer_seq_lock);
2741         spin_lock_init(&mdev->epoch_lock);
2742
2743         INIT_LIST_HEAD(&mdev->active_ee);
2744         INIT_LIST_HEAD(&mdev->sync_ee);
2745         INIT_LIST_HEAD(&mdev->done_ee);
2746         INIT_LIST_HEAD(&mdev->read_ee);
2747         INIT_LIST_HEAD(&mdev->net_ee);
2748         INIT_LIST_HEAD(&mdev->resync_reads);
2749         INIT_LIST_HEAD(&mdev->data.work.q);
2750         INIT_LIST_HEAD(&mdev->meta.work.q);
2751         INIT_LIST_HEAD(&mdev->resync_work.list);
2752         INIT_LIST_HEAD(&mdev->unplug_work.list);
2753         INIT_LIST_HEAD(&mdev->md_sync_work.list);
2754         INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
2755
2756         mdev->resync_work.cb  = w_resync_inactive;
2757         mdev->unplug_work.cb  = w_send_write_hint;
2758         mdev->md_sync_work.cb = w_md_sync;
2759         mdev->bm_io_work.w.cb = w_bitmap_io;
2760         init_timer(&mdev->resync_timer);
2761         init_timer(&mdev->md_sync_timer);
2762         mdev->resync_timer.function = resync_timer_fn;
2763         mdev->resync_timer.data = (unsigned long) mdev;
2764         mdev->md_sync_timer.function = md_sync_timer_fn;
2765         mdev->md_sync_timer.data = (unsigned long) mdev;
2766
2767         init_waitqueue_head(&mdev->misc_wait);
2768         init_waitqueue_head(&mdev->state_wait);
2769         init_waitqueue_head(&mdev->net_cnt_wait);
2770         init_waitqueue_head(&mdev->ee_wait);
2771         init_waitqueue_head(&mdev->al_wait);
2772         init_waitqueue_head(&mdev->seq_wait);
2773
2774         drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2775         drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2776         drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2777
2778         mdev->agreed_pro_version = PRO_VERSION_MAX;
2779         mdev->write_ordering = WO_bio_barrier;
2780         mdev->resync_wenr = LC_FREE;
2781 }
2782
2783 void drbd_mdev_cleanup(struct drbd_conf *mdev)
2784 {
2785         if (mdev->receiver.t_state != None)
2786                 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2787                                 mdev->receiver.t_state);
2788
2789         /* no need to lock it, I'm the only thread alive */
2790         if (atomic_read(&mdev->current_epoch->epoch_size) !=  0)
2791                 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2792         mdev->al_writ_cnt  =
2793         mdev->bm_writ_cnt  =
2794         mdev->read_cnt     =
2795         mdev->recv_cnt     =
2796         mdev->send_cnt     =
2797         mdev->writ_cnt     =
2798         mdev->p_size       =
2799         mdev->rs_start     =
2800         mdev->rs_total     =
2801         mdev->rs_failed    =
2802         mdev->rs_mark_left =
2803         mdev->rs_mark_time = 0;
2804         D_ASSERT(mdev->net_conf == NULL);
2805
2806         drbd_set_my_capacity(mdev, 0);
2807         if (mdev->bitmap) {
2808                 /* maybe never allocated. */
2809                 drbd_bm_resize(mdev, 0, 1);
2810                 drbd_bm_cleanup(mdev);
2811         }
2812
2813         drbd_free_resources(mdev);
2814
2815         /*
2816          * currently we drbd_init_ee only on module load, so
2817          * we may do drbd_release_ee only on module unload!
2818          */
2819         D_ASSERT(list_empty(&mdev->active_ee));
2820         D_ASSERT(list_empty(&mdev->sync_ee));
2821         D_ASSERT(list_empty(&mdev->done_ee));
2822         D_ASSERT(list_empty(&mdev->read_ee));
2823         D_ASSERT(list_empty(&mdev->net_ee));
2824         D_ASSERT(list_empty(&mdev->resync_reads));
2825         D_ASSERT(list_empty(&mdev->data.work.q));
2826         D_ASSERT(list_empty(&mdev->meta.work.q));
2827         D_ASSERT(list_empty(&mdev->resync_work.list));
2828         D_ASSERT(list_empty(&mdev->unplug_work.list));
2829
2830 }
2831
2832
2833 static void drbd_destroy_mempools(void)
2834 {
2835         struct page *page;
2836
2837         while (drbd_pp_pool) {
2838                 page = drbd_pp_pool;
2839                 drbd_pp_pool = (struct page *)page_private(page);
2840                 __free_page(page);
2841                 drbd_pp_vacant--;
2842         }
2843
2844         /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2845
2846         if (drbd_ee_mempool)
2847                 mempool_destroy(drbd_ee_mempool);
2848         if (drbd_request_mempool)
2849                 mempool_destroy(drbd_request_mempool);
2850         if (drbd_ee_cache)
2851                 kmem_cache_destroy(drbd_ee_cache);
2852         if (drbd_request_cache)
2853                 kmem_cache_destroy(drbd_request_cache);
2854         if (drbd_bm_ext_cache)
2855                 kmem_cache_destroy(drbd_bm_ext_cache);
2856         if (drbd_al_ext_cache)
2857                 kmem_cache_destroy(drbd_al_ext_cache);
2858
2859         drbd_ee_mempool      = NULL;
2860         drbd_request_mempool = NULL;
2861         drbd_ee_cache        = NULL;
2862         drbd_request_cache   = NULL;
2863         drbd_bm_ext_cache    = NULL;
2864         drbd_al_ext_cache    = NULL;
2865
2866         return;
2867 }
2868
2869 static int drbd_create_mempools(void)
2870 {
2871         struct page *page;
2872         const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count;
2873         int i;
2874
2875         /* prepare our caches and mempools */
2876         drbd_request_mempool = NULL;
2877         drbd_ee_cache        = NULL;
2878         drbd_request_cache   = NULL;
2879         drbd_bm_ext_cache    = NULL;
2880         drbd_al_ext_cache    = NULL;
2881         drbd_pp_pool         = NULL;
2882
2883         /* caches */
2884         drbd_request_cache = kmem_cache_create(
2885                 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
2886         if (drbd_request_cache == NULL)
2887                 goto Enomem;
2888
2889         drbd_ee_cache = kmem_cache_create(
2890                 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
2891         if (drbd_ee_cache == NULL)
2892                 goto Enomem;
2893
2894         drbd_bm_ext_cache = kmem_cache_create(
2895                 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
2896         if (drbd_bm_ext_cache == NULL)
2897                 goto Enomem;
2898
2899         drbd_al_ext_cache = kmem_cache_create(
2900                 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
2901         if (drbd_al_ext_cache == NULL)
2902                 goto Enomem;
2903
2904         /* mempools */
2905         drbd_request_mempool = mempool_create(number,
2906                 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
2907         if (drbd_request_mempool == NULL)
2908                 goto Enomem;
2909
2910         drbd_ee_mempool = mempool_create(number,
2911                 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
2912         if (drbd_request_mempool == NULL)
2913                 goto Enomem;
2914
2915         /* drbd's page pool */
2916         spin_lock_init(&drbd_pp_lock);
2917
2918         for (i = 0; i < number; i++) {
2919                 page = alloc_page(GFP_HIGHUSER);
2920                 if (!page)
2921                         goto Enomem;
2922                 set_page_private(page, (unsigned long)drbd_pp_pool);
2923                 drbd_pp_pool = page;
2924         }
2925         drbd_pp_vacant = number;
2926
2927         return 0;
2928
2929 Enomem:
2930         drbd_destroy_mempools(); /* in case we allocated some */
2931         return -ENOMEM;
2932 }
2933
2934 static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
2935         void *unused)
2936 {
2937         /* just so we have it.  you never know what interesting things we
2938          * might want to do here some day...
2939          */
2940
2941         return NOTIFY_DONE;
2942 }
2943
2944 static struct notifier_block drbd_notifier = {
2945         .notifier_call = drbd_notify_sys,
2946 };
2947
2948 static void drbd_release_ee_lists(struct drbd_conf *mdev)
2949 {
2950         int rr;
2951
2952         rr = drbd_release_ee(mdev, &mdev->active_ee);
2953         if (rr)
2954                 dev_err(DEV, "%d EEs in active list found!\n", rr);
2955
2956         rr = drbd_release_ee(mdev, &mdev->sync_ee);
2957         if (rr)
2958                 dev_err(DEV, "%d EEs in sync list found!\n", rr);
2959
2960         rr = drbd_release_ee(mdev, &mdev->read_ee);
2961         if (rr)
2962                 dev_err(DEV, "%d EEs in read list found!\n", rr);
2963
2964         rr = drbd_release_ee(mdev, &mdev->done_ee);
2965         if (rr)
2966                 dev_err(DEV, "%d EEs in done list found!\n", rr);
2967
2968         rr = drbd_release_ee(mdev, &mdev->net_ee);
2969         if (rr)
2970                 dev_err(DEV, "%d EEs in net list found!\n", rr);
2971 }
2972
2973 /* caution. no locking.
2974  * currently only used from module cleanup code. */
2975 static void drbd_delete_device(unsigned int minor)
2976 {
2977         struct drbd_conf *mdev = minor_to_mdev(minor);
2978
2979         if (!mdev)
2980                 return;
2981
2982         /* paranoia asserts */
2983         if (mdev->open_cnt != 0)
2984                 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
2985                                 __FILE__ , __LINE__);
2986
2987         ERR_IF (!list_empty(&mdev->data.work.q)) {
2988                 struct list_head *lp;
2989                 list_for_each(lp, &mdev->data.work.q) {
2990                         dev_err(DEV, "lp = %p\n", lp);
2991                 }
2992         };
2993         /* end paranoia asserts */
2994
2995         del_gendisk(mdev->vdisk);
2996
2997         /* cleanup stuff that may have been allocated during
2998          * device (re-)configuration or state changes */
2999
3000         if (mdev->this_bdev)
3001                 bdput(mdev->this_bdev);
3002
3003         drbd_free_resources(mdev);
3004
3005         drbd_release_ee_lists(mdev);
3006
3007         /* should be free'd on disconnect? */
3008         kfree(mdev->ee_hash);
3009         /*
3010         mdev->ee_hash_s = 0;
3011         mdev->ee_hash = NULL;
3012         */
3013
3014         lc_destroy(mdev->act_log);
3015         lc_destroy(mdev->resync);
3016
3017         kfree(mdev->p_uuid);
3018         /* mdev->p_uuid = NULL; */
3019
3020         kfree(mdev->int_dig_out);
3021         kfree(mdev->int_dig_in);
3022         kfree(mdev->int_dig_vv);
3023
3024         /* cleanup the rest that has been
3025          * allocated from drbd_new_device
3026          * and actually free the mdev itself */
3027         drbd_free_mdev(mdev);
3028 }
3029
3030 static void drbd_cleanup(void)
3031 {
3032         unsigned int i;
3033
3034         unregister_reboot_notifier(&drbd_notifier);
3035
3036         drbd_nl_cleanup();
3037
3038         if (minor_table) {
3039                 if (drbd_proc)
3040                         remove_proc_entry("drbd", NULL);
3041                 i = minor_count;
3042                 while (i--)
3043                         drbd_delete_device(i);
3044                 drbd_destroy_mempools();
3045         }
3046
3047         kfree(minor_table);
3048
3049         unregister_blkdev(DRBD_MAJOR, "drbd");
3050
3051         printk(KERN_INFO "drbd: module cleanup done.\n");
3052 }
3053
3054 /**
3055  * drbd_congested() - Callback for pdflush
3056  * @congested_data:     User data
3057  * @bdi_bits:           Bits pdflush is currently interested in
3058  *
3059  * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3060  */
3061 static int drbd_congested(void *congested_data, int bdi_bits)
3062 {
3063         struct drbd_conf *mdev = congested_data;
3064         struct request_queue *q;
3065         char reason = '-';
3066         int r = 0;
3067
3068         if (!__inc_ap_bio_cond(mdev)) {
3069                 /* DRBD has frozen IO */
3070                 r = bdi_bits;
3071                 reason = 'd';
3072                 goto out;
3073         }
3074
3075         if (get_ldev(mdev)) {
3076                 q = bdev_get_queue(mdev->ldev->backing_bdev);
3077                 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3078                 put_ldev(mdev);
3079                 if (r)
3080                         reason = 'b';
3081         }
3082
3083         if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3084                 r |= (1 << BDI_async_congested);
3085                 reason = reason == 'b' ? 'a' : 'n';
3086         }
3087
3088 out:
3089         mdev->congestion_reason = reason;
3090         return r;
3091 }
3092
3093 struct drbd_conf *drbd_new_device(unsigned int minor)
3094 {
3095         struct drbd_conf *mdev;
3096         struct gendisk *disk;
3097         struct request_queue *q;
3098
3099         /* GFP_KERNEL, we are outside of all write-out paths */
3100         mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3101         if (!mdev)
3102                 return NULL;
3103         if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3104                 goto out_no_cpumask;
3105
3106         mdev->minor = minor;
3107
3108         drbd_init_set_defaults(mdev);
3109
3110         q = blk_alloc_queue(GFP_KERNEL);
3111         if (!q)
3112                 goto out_no_q;
3113         mdev->rq_queue = q;
3114         q->queuedata   = mdev;
3115
3116         disk = alloc_disk(1);
3117         if (!disk)
3118                 goto out_no_disk;
3119         mdev->vdisk = disk;
3120
3121         set_disk_ro(disk, TRUE);
3122
3123         disk->queue = q;
3124         disk->major = DRBD_MAJOR;
3125         disk->first_minor = minor;
3126         disk->fops = &drbd_ops;
3127         sprintf(disk->disk_name, "drbd%d", minor);
3128         disk->private_data = mdev;
3129
3130         mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3131         /* we have no partitions. we contain only ourselves. */
3132         mdev->this_bdev->bd_contains = mdev->this_bdev;
3133
3134         q->backing_dev_info.congested_fn = drbd_congested;
3135         q->backing_dev_info.congested_data = mdev;
3136
3137         blk_queue_make_request(q, drbd_make_request_26);
3138         blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE);
3139         blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3140         blk_queue_merge_bvec(q, drbd_merge_bvec);
3141         q->queue_lock = &mdev->req_lock; /* needed since we use */
3142                 /* plugging on a queue, that actually has no requests! */
3143         q->unplug_fn = drbd_unplug_fn;
3144
3145         mdev->md_io_page = alloc_page(GFP_KERNEL);
3146         if (!mdev->md_io_page)
3147                 goto out_no_io_page;
3148
3149         if (drbd_bm_init(mdev))
3150                 goto out_no_bitmap;
3151         /* no need to lock access, we are still initializing this minor device. */
3152         if (!tl_init(mdev))
3153                 goto out_no_tl;
3154
3155         mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3156         if (!mdev->app_reads_hash)
3157                 goto out_no_app_reads;
3158
3159         mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3160         if (!mdev->current_epoch)
3161                 goto out_no_epoch;
3162
3163         INIT_LIST_HEAD(&mdev->current_epoch->list);
3164         mdev->epochs = 1;
3165
3166         return mdev;
3167
3168 /* out_whatever_else:
3169         kfree(mdev->current_epoch); */
3170 out_no_epoch:
3171         kfree(mdev->app_reads_hash);
3172 out_no_app_reads:
3173         tl_cleanup(mdev);
3174 out_no_tl:
3175         drbd_bm_cleanup(mdev);
3176 out_no_bitmap:
3177         __free_page(mdev->md_io_page);
3178 out_no_io_page:
3179         put_disk(disk);
3180 out_no_disk:
3181         blk_cleanup_queue(q);
3182 out_no_q:
3183         free_cpumask_var(mdev->cpu_mask);
3184 out_no_cpumask:
3185         kfree(mdev);
3186         return NULL;
3187 }
3188
3189 /* counterpart of drbd_new_device.
3190  * last part of drbd_delete_device. */
3191 void drbd_free_mdev(struct drbd_conf *mdev)
3192 {
3193         kfree(mdev->current_epoch);
3194         kfree(mdev->app_reads_hash);
3195         tl_cleanup(mdev);
3196         if (mdev->bitmap) /* should no longer be there. */
3197                 drbd_bm_cleanup(mdev);
3198         __free_page(mdev->md_io_page);
3199         put_disk(mdev->vdisk);
3200         blk_cleanup_queue(mdev->rq_queue);
3201         free_cpumask_var(mdev->cpu_mask);
3202         kfree(mdev);
3203 }
3204
3205
3206 int __init drbd_init(void)
3207 {
3208         int err;
3209
3210         if (sizeof(struct p_handshake) != 80) {
3211                 printk(KERN_ERR
3212                        "drbd: never change the size or layout "
3213                        "of the HandShake packet.\n");
3214                 return -EINVAL;
3215         }
3216
3217         if (1 > minor_count || minor_count > 255) {
3218                 printk(KERN_ERR
3219                         "drbd: invalid minor_count (%d)\n", minor_count);
3220 #ifdef MODULE
3221                 return -EINVAL;
3222 #else
3223                 minor_count = 8;
3224 #endif
3225         }
3226
3227         err = drbd_nl_init();
3228         if (err)
3229                 return err;
3230
3231         err = register_blkdev(DRBD_MAJOR, "drbd");
3232         if (err) {
3233                 printk(KERN_ERR
3234                        "drbd: unable to register block device major %d\n",
3235                        DRBD_MAJOR);
3236                 return err;
3237         }
3238
3239         register_reboot_notifier(&drbd_notifier);
3240
3241         /*
3242          * allocate all necessary structs
3243          */
3244         err = -ENOMEM;
3245
3246         init_waitqueue_head(&drbd_pp_wait);
3247
3248         drbd_proc = NULL; /* play safe for drbd_cleanup */
3249         minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3250                                 GFP_KERNEL);
3251         if (!minor_table)
3252                 goto Enomem;
3253
3254         err = drbd_create_mempools();
3255         if (err)
3256                 goto Enomem;
3257
3258         drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
3259         if (!drbd_proc) {
3260                 printk(KERN_ERR "drbd: unable to register proc file\n");
3261                 goto Enomem;
3262         }
3263
3264         rwlock_init(&global_state_lock);
3265
3266         printk(KERN_INFO "drbd: initialized. "
3267                "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3268                API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3269         printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3270         printk(KERN_INFO "drbd: registered as block device major %d\n",
3271                 DRBD_MAJOR);
3272         printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3273
3274         return 0; /* Success! */
3275
3276 Enomem:
3277         drbd_cleanup();
3278         if (err == -ENOMEM)
3279                 /* currently always the case */
3280                 printk(KERN_ERR "drbd: ran out of memory\n");
3281         else
3282                 printk(KERN_ERR "drbd: initialization failure\n");
3283         return err;
3284 }
3285
3286 void drbd_free_bc(struct drbd_backing_dev *ldev)
3287 {
3288         if (ldev == NULL)
3289                 return;
3290
3291         bd_release(ldev->backing_bdev);
3292         bd_release(ldev->md_bdev);
3293
3294         fput(ldev->lo_file);
3295         fput(ldev->md_file);
3296
3297         kfree(ldev);
3298 }
3299
3300 void drbd_free_sock(struct drbd_conf *mdev)
3301 {
3302         if (mdev->data.socket) {
3303                 mutex_lock(&mdev->data.mutex);
3304                 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3305                 sock_release(mdev->data.socket);
3306                 mdev->data.socket = NULL;
3307                 mutex_unlock(&mdev->data.mutex);
3308         }
3309         if (mdev->meta.socket) {
3310                 mutex_lock(&mdev->meta.mutex);
3311                 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3312                 sock_release(mdev->meta.socket);
3313                 mdev->meta.socket = NULL;
3314                 mutex_unlock(&mdev->meta.mutex);
3315         }
3316 }
3317
3318
3319 void drbd_free_resources(struct drbd_conf *mdev)
3320 {
3321         crypto_free_hash(mdev->csums_tfm);
3322         mdev->csums_tfm = NULL;
3323         crypto_free_hash(mdev->verify_tfm);
3324         mdev->verify_tfm = NULL;
3325         crypto_free_hash(mdev->cram_hmac_tfm);
3326         mdev->cram_hmac_tfm = NULL;
3327         crypto_free_hash(mdev->integrity_w_tfm);
3328         mdev->integrity_w_tfm = NULL;
3329         crypto_free_hash(mdev->integrity_r_tfm);
3330         mdev->integrity_r_tfm = NULL;
3331
3332         drbd_free_sock(mdev);
3333
3334         __no_warn(local,
3335                   drbd_free_bc(mdev->ldev);
3336                   mdev->ldev = NULL;);
3337 }
3338
3339 /* meta data management */
3340
3341 struct meta_data_on_disk {
3342         u64 la_size;           /* last agreed size. */
3343         u64 uuid[UI_SIZE];   /* UUIDs. */
3344         u64 device_uuid;
3345         u64 reserved_u64_1;
3346         u32 flags;             /* MDF */
3347         u32 magic;
3348         u32 md_size_sect;
3349         u32 al_offset;         /* offset to this block */
3350         u32 al_nr_extents;     /* important for restoring the AL */
3351               /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3352         u32 bm_offset;         /* offset to the bitmap, from here */
3353         u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
3354         u32 reserved_u32[4];
3355
3356 } __packed;
3357
3358 /**
3359  * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3360  * @mdev:       DRBD device.
3361  */
3362 void drbd_md_sync(struct drbd_conf *mdev)
3363 {
3364         struct meta_data_on_disk *buffer;
3365         sector_t sector;
3366         int i;
3367
3368         if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3369                 return;
3370         del_timer(&mdev->md_sync_timer);
3371
3372         /* We use here D_FAILED and not D_ATTACHING because we try to write
3373          * metadata even if we detach due to a disk failure! */
3374         if (!get_ldev_if_state(mdev, D_FAILED))
3375                 return;
3376
3377         mutex_lock(&mdev->md_io_mutex);
3378         buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3379         memset(buffer, 0, 512);
3380
3381         buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3382         for (i = UI_CURRENT; i < UI_SIZE; i++)
3383                 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3384         buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3385         buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3386
3387         buffer->md_size_sect  = cpu_to_be32(mdev->ldev->md.md_size_sect);
3388         buffer->al_offset     = cpu_to_be32(mdev->ldev->md.al_offset);
3389         buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3390         buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3391         buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3392
3393         buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3394
3395         D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3396         sector = mdev->ldev->md.md_offset;
3397
3398         if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3399                 clear_bit(MD_DIRTY, &mdev->flags);
3400         } else {
3401                 /* this was a try anyways ... */
3402                 dev_err(DEV, "meta data update failed!\n");
3403
3404                 drbd_chk_io_error(mdev, 1, TRUE);
3405         }
3406
3407         /* Update mdev->ldev->md.la_size_sect,
3408          * since we updated it on metadata. */
3409         mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3410
3411         mutex_unlock(&mdev->md_io_mutex);
3412         put_ldev(mdev);
3413 }
3414
3415 /**
3416  * drbd_md_read() - Reads in the meta data super block
3417  * @mdev:       DRBD device.
3418  * @bdev:       Device from which the meta data should be read in.
3419  *
3420  * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
3421  * something goes wrong.  Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3422  */
3423 int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3424 {
3425         struct meta_data_on_disk *buffer;
3426         int i, rv = NO_ERROR;
3427
3428         if (!get_ldev_if_state(mdev, D_ATTACHING))
3429                 return ERR_IO_MD_DISK;
3430
3431         mutex_lock(&mdev->md_io_mutex);
3432         buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3433
3434         if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3435                 /* NOTE: cant do normal error processing here as this is
3436                    called BEFORE disk is attached */
3437                 dev_err(DEV, "Error while reading metadata.\n");
3438                 rv = ERR_IO_MD_DISK;
3439                 goto err;
3440         }
3441
3442         if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3443                 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3444                 rv = ERR_MD_INVALID;
3445                 goto err;
3446         }
3447         if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3448                 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3449                     be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3450                 rv = ERR_MD_INVALID;
3451                 goto err;
3452         }
3453         if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3454                 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3455                     be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3456                 rv = ERR_MD_INVALID;
3457                 goto err;
3458         }
3459         if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3460                 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3461                     be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3462                 rv = ERR_MD_INVALID;
3463                 goto err;
3464         }
3465
3466         if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3467                 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3468                     be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3469                 rv = ERR_MD_INVALID;
3470                 goto err;
3471         }
3472
3473         bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3474         for (i = UI_CURRENT; i < UI_SIZE; i++)
3475                 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3476         bdev->md.flags = be32_to_cpu(buffer->flags);
3477         mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3478         bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3479
3480         if (mdev->sync_conf.al_extents < 7)
3481                 mdev->sync_conf.al_extents = 127;
3482
3483  err:
3484         mutex_unlock(&mdev->md_io_mutex);
3485         put_ldev(mdev);
3486
3487         return rv;
3488 }
3489
3490 /**
3491  * drbd_md_mark_dirty() - Mark meta data super block as dirty
3492  * @mdev:       DRBD device.
3493  *
3494  * Call this function if you change anything that should be written to
3495  * the meta-data super block. This function sets MD_DIRTY, and starts a
3496  * timer that ensures that within five seconds you have to call drbd_md_sync().
3497  */
3498 void drbd_md_mark_dirty(struct drbd_conf *mdev)
3499 {
3500         set_bit(MD_DIRTY, &mdev->flags);
3501         mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3502 }
3503
3504
3505 static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3506 {
3507         int i;
3508
3509         for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
3510                 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3511 }
3512
3513 void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3514 {
3515         if (idx == UI_CURRENT) {
3516                 if (mdev->state.role == R_PRIMARY)
3517                         val |= 1;
3518                 else
3519                         val &= ~((u64)1);
3520
3521                 drbd_set_ed_uuid(mdev, val);
3522         }
3523
3524         mdev->ldev->md.uuid[idx] = val;
3525         drbd_md_mark_dirty(mdev);
3526 }
3527
3528
3529 void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3530 {
3531         if (mdev->ldev->md.uuid[idx]) {
3532                 drbd_uuid_move_history(mdev);
3533                 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
3534         }
3535         _drbd_uuid_set(mdev, idx, val);
3536 }
3537
3538 /**
3539  * drbd_uuid_new_current() - Creates a new current UUID
3540  * @mdev:       DRBD device.
3541  *
3542  * Creates a new current UUID, and rotates the old current UUID into
3543  * the bitmap slot. Causes an incremental resync upon next connect.
3544  */
3545 void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3546 {
3547         u64 val;
3548
3549         dev_info(DEV, "Creating new current UUID\n");
3550         D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3551         mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
3552
3553         get_random_bytes(&val, sizeof(u64));
3554         _drbd_uuid_set(mdev, UI_CURRENT, val);
3555 }
3556
3557 void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3558 {
3559         if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3560                 return;
3561
3562         if (val == 0) {
3563                 drbd_uuid_move_history(mdev);
3564                 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3565                 mdev->ldev->md.uuid[UI_BITMAP] = 0;
3566         } else {
3567                 if (mdev->ldev->md.uuid[UI_BITMAP])
3568                         dev_warn(DEV, "bm UUID already set");
3569
3570                 mdev->ldev->md.uuid[UI_BITMAP] = val;
3571                 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3572
3573         }
3574         drbd_md_mark_dirty(mdev);
3575 }
3576
3577 /**
3578  * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3579  * @mdev:       DRBD device.
3580  *
3581  * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3582  */
3583 int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3584 {
3585         int rv = -EIO;
3586
3587         if (get_ldev_if_state(mdev, D_ATTACHING)) {
3588                 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3589                 drbd_md_sync(mdev);
3590                 drbd_bm_set_all(mdev);
3591
3592                 rv = drbd_bm_write(mdev);
3593
3594                 if (!rv) {
3595                         drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3596                         drbd_md_sync(mdev);
3597                 }
3598
3599                 put_ldev(mdev);
3600         }
3601
3602         return rv;
3603 }
3604
3605 /**
3606  * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3607  * @mdev:       DRBD device.
3608  *
3609  * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3610  */
3611 int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3612 {
3613         int rv = -EIO;
3614
3615         if (get_ldev_if_state(mdev, D_ATTACHING)) {
3616                 drbd_bm_clear_all(mdev);
3617                 rv = drbd_bm_write(mdev);
3618                 put_ldev(mdev);
3619         }
3620
3621         return rv;
3622 }
3623
3624 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3625 {
3626         struct bm_io_work *work = container_of(w, struct bm_io_work, w);
3627         int rv;
3628
3629         D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3630
3631         drbd_bm_lock(mdev, work->why);
3632         rv = work->io_fn(mdev);
3633         drbd_bm_unlock(mdev);
3634
3635         clear_bit(BITMAP_IO, &mdev->flags);
3636         wake_up(&mdev->misc_wait);
3637
3638         if (work->done)
3639                 work->done(mdev, rv);
3640
3641         clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3642         work->why = NULL;
3643
3644         return 1;
3645 }
3646
3647 /**
3648  * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3649  * @mdev:       DRBD device.
3650  * @io_fn:      IO callback to be called when bitmap IO is possible
3651  * @done:       callback to be called after the bitmap IO was performed
3652  * @why:        Descriptive text of the reason for doing the IO
3653  *
3654  * While IO on the bitmap happens we freeze application IO thus we ensure
3655  * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
3656  * called from worker context. It MUST NOT be used while a previous such
3657  * work is still pending!
3658  */
3659 void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3660                           int (*io_fn)(struct drbd_conf *),
3661                           void (*done)(struct drbd_conf *, int),
3662                           char *why)
3663 {
3664         D_ASSERT(current == mdev->worker.task);
3665
3666         D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3667         D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3668         D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3669         if (mdev->bm_io_work.why)
3670                 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3671                         why, mdev->bm_io_work.why);
3672
3673         mdev->bm_io_work.io_fn = io_fn;
3674         mdev->bm_io_work.done = done;
3675         mdev->bm_io_work.why = why;
3676
3677         set_bit(BITMAP_IO, &mdev->flags);
3678         if (atomic_read(&mdev->ap_bio_cnt) == 0) {
3679                 if (list_empty(&mdev->bm_io_work.w.list)) {
3680                         set_bit(BITMAP_IO_QUEUED, &mdev->flags);
3681                         drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
3682                 } else
3683                         dev_err(DEV, "FIXME avoided double queuing bm_io_work\n");
3684         }
3685 }
3686
3687 /**
3688  * drbd_bitmap_io() -  Does an IO operation on the whole bitmap
3689  * @mdev:       DRBD device.
3690  * @io_fn:      IO callback to be called when bitmap IO is possible
3691  * @why:        Descriptive text of the reason for doing the IO
3692  *
3693  * freezes application IO while that the actual IO operations runs. This
3694  * functions MAY NOT be called from worker context.
3695  */
3696 int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
3697 {
3698         int rv;
3699
3700         D_ASSERT(current != mdev->worker.task);
3701
3702         drbd_suspend_io(mdev);
3703
3704         drbd_bm_lock(mdev, why);
3705         rv = io_fn(mdev);
3706         drbd_bm_unlock(mdev);
3707
3708         drbd_resume_io(mdev);
3709
3710         return rv;
3711 }
3712
3713 void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3714 {
3715         if ((mdev->ldev->md.flags & flag) != flag) {
3716                 drbd_md_mark_dirty(mdev);
3717                 mdev->ldev->md.flags |= flag;
3718         }
3719 }
3720
3721 void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3722 {
3723         if ((mdev->ldev->md.flags & flag) != 0) {
3724                 drbd_md_mark_dirty(mdev);
3725                 mdev->ldev->md.flags &= ~flag;
3726         }
3727 }
3728 int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3729 {
3730         return (bdev->md.flags & flag) != 0;
3731 }
3732
3733 static void md_sync_timer_fn(unsigned long data)
3734 {
3735         struct drbd_conf *mdev = (struct drbd_conf *) data;
3736
3737         drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
3738 }
3739
3740 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3741 {
3742         dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
3743         drbd_md_sync(mdev);
3744
3745         return 1;
3746 }
3747
3748 #ifdef CONFIG_DRBD_FAULT_INJECTION
/* Fault insertion support including random number generator shamelessly
 * stolen from kernel/rcutorture.c */

/* State of the fault injection random number generator. */
struct fault_random_state {
        unsigned long state;    /* current generator value */
        unsigned long count;    /* calls left until the next reseed */
};

#define FAULT_RANDOM_MULT 39916801  /* prime */
#define FAULT_RANDOM_ADD        479001701 /* prime */
#define FAULT_RANDOM_REFRESH 10000

/*
 * Crude but fast random-number generator.  Uses a linear congruential
 * generator, with occasional help from get_random_bytes(): whenever the
 * call counter has run down to zero, fresh random bytes are mixed into the
 * state and the counter is reset to FAULT_RANDOM_REFRESH.
 */
static unsigned long
_drbd_fault_random(struct fault_random_state *rsp)
{
        unsigned long calls_left = rsp->count;
        long refresh;

        rsp->count = calls_left - 1;
        if (calls_left == 0) {
                get_random_bytes(&refresh, sizeof(refresh));
                rsp->state += refresh;
                rsp->count = FAULT_RANDOM_REFRESH;
        }

        rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;

        return swahw32(rsp->state);
}
3777
3778 static char *
3779 _drbd_fault_str(unsigned int type) {
3780         static char *_faults[] = {
3781                 [DRBD_FAULT_MD_WR] = "Meta-data write",
3782                 [DRBD_FAULT_MD_RD] = "Meta-data read",
3783                 [DRBD_FAULT_RS_WR] = "Resync write",
3784                 [DRBD_FAULT_RS_RD] = "Resync read",
3785                 [DRBD_FAULT_DT_WR] = "Data write",
3786                 [DRBD_FAULT_DT_RD] = "Data read",
3787                 [DRBD_FAULT_DT_RA] = "Data read ahead",
3788                 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
3789                 [DRBD_FAULT_AL_EE] = "EE allocation",
3790                 [DRBD_FAULT_RECEIVE] = "receive data corruption",
3791         };
3792
3793         return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
3794 }
3795
3796 unsigned int
3797 _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
3798 {
3799         static struct fault_random_state rrs = {0, 0};
3800
3801         unsigned int ret = (
3802                 (fault_devs == 0 ||
3803                         ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
3804                 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
3805
3806         if (ret) {
3807                 fault_count++;
3808
3809                 if (__ratelimit(&drbd_ratelimit_state))
3810                         dev_warn(DEV, "***Simulating %s failure\n",
3811                                 _drbd_fault_str(type));
3812         }
3813
3814         return ret;
3815 }
3816 #endif
3817
/*
 * drbd_buildtag() - text identifying this build
 *
 * The text is generated on first use and kept in a static buffer: either
 * "srcversion: <srcversion of this module>" or "built-in".
 *
 * Returns a pointer to that static buffer; the caller must not free it.
 */
const char *drbd_buildtag(void)
{
        /* DRBD built from external sources has here a reference to the
           git hash of the source code. */

        /* A leading NUL marks the buffer as "not generated yet". */
        static char buildtag[38] = "\0uilt-in";

        if (buildtag[0] == 0) {
#ifdef CONFIG_MODULES
                if (THIS_MODULE != NULL)
                        /* bounded, so an unexpectedly long srcversion is
                         * truncated instead of overrunning the buffer */
                        snprintf(buildtag, sizeof(buildtag),
                                 "srcversion: %-24s", THIS_MODULE->srcversion);
                else
#endif
                        buildtag[0] = 'b';
        }

        return buildtag;
}
3836
3837 module_init(drbd_init)
3838 module_exit(drbd_cleanup)
3839
3840 EXPORT_SYMBOL(drbd_conn_str);
3841 EXPORT_SYMBOL(drbd_role_str);
3842 EXPORT_SYMBOL(drbd_disk_str);
3843 EXPORT_SYMBOL(drbd_set_st_err_str);