drbd: The new, smarter resync speed controller
[linux-2.6.git] / drivers / block / drbd / drbd_main.c
1 /*
2    drbd.c
3
4    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10    Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11    from Logicworks, Inc. for making SDP replication support possible.
12
13    drbd is free software; you can redistribute it and/or modify
14    it under the terms of the GNU General Public License as published by
15    the Free Software Foundation; either version 2, or (at your option)
16    any later version.
17
18    drbd is distributed in the hope that it will be useful,
19    but WITHOUT ANY WARRANTY; without even the implied warranty of
20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21    GNU General Public License for more details.
22
23    You should have received a copy of the GNU General Public License
24    along with drbd; see the file COPYING.  If not, write to
25    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27  */
28
29 #include <linux/module.h>
30 #include <linux/drbd.h>
31 #include <asm/uaccess.h>
32 #include <asm/types.h>
33 #include <net/sock.h>
34 #include <linux/ctype.h>
35 #include <linux/smp_lock.h>
36 #include <linux/fs.h>
37 #include <linux/file.h>
38 #include <linux/proc_fs.h>
39 #include <linux/init.h>
40 #include <linux/mm.h>
41 #include <linux/memcontrol.h>
42 #include <linux/mm_inline.h>
43 #include <linux/slab.h>
44 #include <linux/random.h>
45 #include <linux/reboot.h>
46 #include <linux/notifier.h>
47 #include <linux/kthread.h>
48
49 #define __KERNEL_SYSCALLS__
50 #include <linux/unistd.h>
51 #include <linux/vmalloc.h>
52
53 #include <linux/drbd_limits.h>
54 #include "drbd_int.h"
55 #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57 #include "drbd_vli.h"
58
/*
 * Work item that carries the parameters of one state change: the embedded
 * drbd_work plus old state, new state and change flags.  The field types
 * mirror the arguments of after_state_ch(), whose prototype follows below.
 * NOTE(review): presumably queued by _drbd_set_state() and consumed by
 * w_after_state_ch(); neither body is visible in this section -- confirm.
 */
struct after_state_chg_work {
        struct drbd_work w;             /* embedded work item */
        union drbd_state os;            /* old state */
        union drbd_state ns;            /* new state */
        enum chg_state_flags flags;     /* flags the state change was requested with */
        struct completion *done;        /* presumably signalled once the work ran; may be NULL -- confirm */
};
66
/* Functions taking a struct drbd_thread; presumably the main loops of the
 * receiver, worker and asender threads (defined elsewhere) -- confirm. */
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);

/* Forward declarations for functions referenced before their definition. */
int drbd_init(void);
/* block device open/release hooks, wired into drbd_ops below */
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
/* work callbacks share the (mdev, work, int) signature */
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
                           union drbd_state ns, enum chg_state_flags flags);
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
/* timer callback; the unsigned long argument is the timer's data cookie */
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
80
/* Module metadata as reported by modinfo. */
MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
              "Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);

#include <linux/moduleparam.h>
/* allow_oos is short for allow_open_on_secondary */
MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not-module),
 * this becomes the boot parameter drbd.minor_count */
/* The last argument is the sysfs permission mask of the parameter file;
 * 0 means the parameter is not exposed in sysfs at all. */
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
module_param(cn_idx, uint, 0444);
module_param(proc_details, int, 0644);
99
#ifdef CONFIG_DRBD_FAULT_INJECTION
/* Fault injection knobs; only compiled in with CONFIG_DRBD_FAULT_INJECTION.
 * fault_count is file-local, the other three have external linkage. */
int enable_faults;
int fault_rate;
static int fault_count;
int fault_devs;
/* bitmap of enabled faults */
module_param(enable_faults, int, 0664);
/* fault rate % value - applies to all enabled faults */
module_param(fault_rate, int, 0664);
/* count of faults inserted */
module_param(fault_count, int, 0664);
/* bitmap of devices to insert faults on */
/* NOTE(review): mode is 0644 here but 0664 for the three parameters above;
 * confirm whether the difference is intentional. */
module_param(fault_devs, int, 0644);
#endif
114
/* module parameter, defined */
/* Storage (and defaults) for the parameters registered with module_param()
 * earlier in this file.
 * NOTE(review): disable_sendpage and allow_oos are registered with type
 * "bool" but stored in an int; confirm this is accepted by the module_param
 * type checking of the targeted kernel version. */
unsigned int minor_count = 32;
int disable_sendpage;
int allow_oos;
unsigned int cn_idx = CN_IDX_DRBD;
int proc_details;       /* Detail level in proc drbd */

/* Module parameter for setting the user mode helper program
 * to run. Default is /sbin/drbdadm */
char usermode_helper[80] = "/sbin/drbdadm";

/* string parameter backed by the fixed-size buffer above; the size passed
 * here bounds what can be written into it */
module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
127
128 /* in 2.6.x, our device mapping and config info contains our virtual gendisks
129  * as member "struct gendisk *vdisk;"
130  */
/* Table of struct drbd_conf pointers.
 * NOTE(review): presumably indexed by device minor and sized by
 * minor_count -- confirm against the allocation site. */
struct drbd_conf **minor_table;

/* Slab caches and mempools shared by all devices of this module. */
struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache;       /* epoch entries */
struct kmem_cache *drbd_bm_ext_cache;   /* bitmap extents */
struct kmem_cache *drbd_al_ext_cache;   /* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;

/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a singly linked list, the next pointer is the private
         member of struct page.
 */
struct page *drbd_pp_pool;              /* head of the page list described above */
spinlock_t   drbd_pp_lock;              /* presumably protects drbd_pp_pool and drbd_pp_vacant -- confirm */
int          drbd_pp_vacant;            /* presumably the number of pages currently in the pool -- confirm */
wait_queue_head_t drbd_pp_wait;         /* presumably where allocators sleep until pages are returned -- confirm */

/* Ratelimit state with an interval of 5 * HZ jiffies and a burst of 5. */
DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
152
/* Block device operations of a DRBD device: only open and release are
 * provided here; every other hook is left unset. */
static const struct block_device_operations drbd_ops = {
        .owner =   THIS_MODULE,
        .open =    drbd_open,
        .release = drbd_release,
};
158
/* Number of elements of the array A.  Only valid for real arrays, not for
 * pointers.  The argument is parenthesized before it is subscripted so that
 * an expression argument cannot re-associate with the [0]. */
#define ARRY_SIZE(A) (sizeof(A)/sizeof((A)[0]))
160
161 #ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function, sparse works.
 */
165 int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
166 {
167         int io_allowed;
168
169         atomic_inc(&mdev->local_cnt);
170         io_allowed = (mdev->state.disk >= mins);
171         if (!io_allowed) {
172                 if (atomic_dec_and_test(&mdev->local_cnt))
173                         wake_up(&mdev->misc_wait);
174         }
175         return io_allowed;
176 }
177
178 #endif
179
180 /**
181  * DOC: The transfer log
182  *
183  * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
184  * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
185  * of the list. There is always at least one &struct drbd_tl_epoch object.
186  *
187  * Each &struct drbd_tl_epoch has a circular double linked list of requests
188  * attached.
189  */
190 static int tl_init(struct drbd_conf *mdev)
191 {
192         struct drbd_tl_epoch *b;
193
194         /* during device minor initialization, we may well use GFP_KERNEL */
195         b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
196         if (!b)
197                 return 0;
198         INIT_LIST_HEAD(&b->requests);
199         INIT_LIST_HEAD(&b->w.list);
200         b->next = NULL;
201         b->br_number = 4711;
202         b->n_writes = 0;
203         b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
204
205         mdev->oldest_tle = b;
206         mdev->newest_tle = b;
207         INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
208
209         mdev->tl_hash = NULL;
210         mdev->tl_hash_s = 0;
211
212         return 1;
213 }
214
215 static void tl_cleanup(struct drbd_conf *mdev)
216 {
217         D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
218         D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
219         kfree(mdev->oldest_tle);
220         mdev->oldest_tle = NULL;
221         kfree(mdev->unused_spare_tle);
222         mdev->unused_spare_tle = NULL;
223         kfree(mdev->tl_hash);
224         mdev->tl_hash = NULL;
225         mdev->tl_hash_s = 0;
226 }
227
228 /**
229  * _tl_add_barrier() - Adds a barrier to the transfer log
230  * @mdev:       DRBD device.
231  * @new:        Barrier to be added before the current head of the TL.
232  *
233  * The caller must hold the req_lock.
234  */
235 void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
236 {
237         struct drbd_tl_epoch *newest_before;
238
239         INIT_LIST_HEAD(&new->requests);
240         INIT_LIST_HEAD(&new->w.list);
241         new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
242         new->next = NULL;
243         new->n_writes = 0;
244
245         newest_before = mdev->newest_tle;
246         /* never send a barrier number == 0, because that is special-cased
247          * when using TCQ for our write ordering code */
248         new->br_number = (newest_before->br_number+1) ?: 1;
249         if (mdev->newest_tle != new) {
250                 mdev->newest_tle->next = new;
251                 mdev->newest_tle = new;
252         }
253 }
254
/**
 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
 * @mdev:       DRBD device.
 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
 * @set_size:   Expected number of requests before that barrier.
 *
 * In case the passed barrier_nr or set_size does not match the oldest
 * &struct drbd_tl_epoch objects this function will cause a termination
 * of the connection.
 *
 * Takes and releases mdev->req_lock itself.  On success it calls
 * dec_ap_pending() once after dropping the lock; on a mismatch it forces
 * the connection state to C_PROTOCOL_ERROR instead.
 */
void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
                       unsigned int set_size)
{
        struct drbd_tl_epoch *b, *nob; /* next old barrier */
        struct list_head *le, *tle;
        struct drbd_request *r;

        spin_lock_irq(&mdev->req_lock);

        b = mdev->oldest_tle;

        /* first some paranoia code */
        if (b == NULL) {
                dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
                        barrier_nr);
                goto bail;
        }
        if (b->br_number != barrier_nr) {
                dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
                        barrier_nr, b->br_number);
                goto bail;
        }
        if (b->n_writes != set_size) {
                dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
                        barrier_nr, set_size, b->n_writes);
                goto bail;
        }

        /* Clean up list of requests processed during current epoch */
        /* the _safe iterator is needed because _req_mod() may take the
         * request off this list (see the comment below) */
        list_for_each_safe(le, tle, &b->requests) {
                r = list_entry(le, struct drbd_request, tl_requests);
                _req_mod(r, barrier_acked);
        }
        /* There could be requests on the list waiting for completion
           of the write to the local disk. To avoid corruptions of
           slab's data structures we have to remove the lists head.

           Also there could have been a barrier ack out of sequence, overtaking
           the write acks - which would be a bug and violating write ordering.
           To not deadlock in case we lose connection while such requests are
           still pending, we need some way to find them for the
           _req_mod(, connection_lost_while_pending).

           These have been list_move'd to the out_of_sequence_requests list in
           _req_mod(, barrier_acked) above.
           */
        list_del_init(&b->requests);

        /* remember the successor before b is possibly re-linked as newest */
        nob = b->next;
        if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
                /* a new barrier is wanted: recycle b instead of freeing it */
                _tl_add_barrier(mdev, b);
                if (nob)
                        mdev->oldest_tle = nob;
                /* if nob == NULL b was the only barrier, and becomes the new
                   barrier. Therefore mdev->oldest_tle points already to b */
        } else {
                D_ASSERT(nob != NULL);
                mdev->oldest_tle = nob;
                kfree(b);
        }

        spin_unlock_irq(&mdev->req_lock);
        dec_ap_pending(mdev);

        return;

bail:
        /* drop the lock first, then force the connection state */
        spin_unlock_irq(&mdev->req_lock);
        drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
}
335
/**
 * _tl_restart() - Walks the transfer log, and applies an action to all requests
 * @mdev:       DRBD device.
 * @what:       The action/event to perform with all request objects
 *
 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
 * restart_frozen_disk_io.
 *
 * No locking is done here; the callers visible in this file (tl_clear() and
 * tl_restart()) hold mdev->req_lock around the call.
 *
 * Epochs are visited from oldest to newest.  An epoch that still has writes
 * after _req_mod() stays in the log (and, for @what == resend, gets its
 * barrier work queued again).  An epoch without writes is unlinked and
 * freed, or reinitialized in place if it is the newest one; requests still
 * on its ring are carried over via the on-stack list head carry_reads.
 */
static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
        struct drbd_tl_epoch *b, *tmp, **pn;
        struct list_head *le, *tle, carry_reads;
        struct drbd_request *req;
        int rv, n_writes, n_reads;

        b = mdev->oldest_tle;
        /* pn always points at the link that leads to the current epoch, so
         * that the epoch can be unlinked by storing through it */
        pn = &mdev->oldest_tle;
        while (b) {
                n_writes = 0;
                n_reads = 0;
                INIT_LIST_HEAD(&carry_reads);
                list_for_each_safe(le, tle, &b->requests) {
                        req = list_entry(le, struct drbd_request, tl_requests);
                        rv = _req_mod(req, what);

                        /* the return value flags whether this request still
                         * counts as a write and/or a read of the epoch */
                        n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
                        n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
                }
                tmp = b->next;

                if (n_writes) {
                        if (what == resend) {
                                b->n_writes = n_writes;
                                if (b->w.cb == NULL) {
                                        /* barrier was never queued for this
                                         * epoch: account for it now */
                                        b->w.cb = w_send_barrier;
                                        inc_ap_pending(mdev);
                                        set_bit(CREATE_BARRIER, &mdev->flags);
                                }

                                drbd_queue_work(&mdev->data.work, &b->w);
                        }
                        /* epoch stays in the log; advance the link pointer */
                        pn = &b->next;
                } else {
                        /* hook carry_reads into the ring before removing the
                         * epoch's own head, so the ring keeps a head */
                        if (n_reads)
                                list_add(&carry_reads, &b->requests);
                        /* there could still be requests on that ring list,
                         * in case local io is still pending */
                        list_del(&b->requests);

                        /* dec_ap_pending corresponding to queue_barrier.
                         * the newest barrier may not have been queued yet,
                         * in which case w.cb is still NULL. */
                        if (b->w.cb != NULL)
                                dec_ap_pending(mdev);

                        if (b == mdev->newest_tle) {
                                /* recycle, but reinit! */
                                D_ASSERT(tmp == NULL);
                                INIT_LIST_HEAD(&b->requests);
                                list_splice(&carry_reads, &b->requests);
                                INIT_LIST_HEAD(&b->w.list);
                                b->w.cb = NULL;
                                b->br_number = net_random();
                                b->n_writes = 0;

                                *pn = b;
                                break;
                        }
                        /* unlink the now empty epoch and free it */
                        *pn = tmp;
                        kfree(b);
                }
                b = tmp;
                /* NOTE(review): b may be NULL here when the last epoch kept
                 * its writes (tmp == NULL).  &b->requests is then formed from
                 * a NULL pointer; this looks harmless only if list_splice()
                 * does not touch its second argument for an empty source
                 * list, and carry_reads is empty on that path -- confirm. */
                list_splice(&carry_reads, &b->requests);
        }
}
411
412
413 /**
414  * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
415  * @mdev:       DRBD device.
416  *
417  * This is called after the connection to the peer was lost. The storage covered
418  * by the requests on the transfer gets marked as our of sync. Called from the
419  * receiver thread and the worker thread.
420  */
421 void tl_clear(struct drbd_conf *mdev)
422 {
423         struct list_head *le, *tle;
424         struct drbd_request *r;
425
426         spin_lock_irq(&mdev->req_lock);
427
428         _tl_restart(mdev, connection_lost_while_pending);
429
430         /* we expect this list to be empty. */
431         D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
432
433         /* but just in case, clean it up anyways! */
434         list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
435                 r = list_entry(le, struct drbd_request, tl_requests);
436                 /* It would be nice to complete outside of spinlock.
437                  * But this is easier for now. */
438                 _req_mod(r, connection_lost_while_pending);
439         }
440
441         /* ensure bit indicating barrier is required is clear */
442         clear_bit(CREATE_BARRIER, &mdev->flags);
443
444         memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
445
446         spin_unlock_irq(&mdev->req_lock);
447 }
448
/**
 * tl_restart() - Locked wrapper around _tl_restart()
 * @mdev:       DRBD device.
 * @what:       The action/event to apply to all requests in the transfer log.
 *
 * Holds mdev->req_lock for the duration of the walk.
 */
void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
        spin_lock_irq(&mdev->req_lock);
        _tl_restart(mdev, what);
        spin_unlock_irq(&mdev->req_lock);
}
455
456 /**
457  * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
458  * @mdev:       DRBD device.
459  * @os:         old (current) state.
460  * @ns:         new (wanted) state.
461  */
462 static int cl_wide_st_chg(struct drbd_conf *mdev,
463                           union drbd_state os, union drbd_state ns)
464 {
465         return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
466                  ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
467                   (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
468                   (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
469                   (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
470                 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
471                 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
472 }
473
474 int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
475                       union drbd_state mask, union drbd_state val)
476 {
477         unsigned long flags;
478         union drbd_state os, ns;
479         int rv;
480
481         spin_lock_irqsave(&mdev->req_lock, flags);
482         os = mdev->state;
483         ns.i = (os.i & ~mask.i) | val.i;
484         rv = _drbd_set_state(mdev, ns, f, NULL);
485         ns = mdev->state;
486         spin_unlock_irqrestore(&mdev->req_lock, flags);
487
488         return rv;
489 }
490
/**
 * drbd_force_state() - Impose a change which happens outside our control on our state
 * @mdev:       DRBD device.
 * @mask:       mask of state bits to change.
 * @val:        value of new state bits.
 *
 * Shorthand for drbd_change_state() with the CS_HARD flag; the result of
 * the state change is discarded.
 */
void drbd_force_state(struct drbd_conf *mdev,
        union drbd_state mask, union drbd_state val)
{
        drbd_change_state(mdev, CS_HARD, mask, val);
}
502
/* Forward declarations for the state validation helpers used below. */
static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
/* careful: the arguments are (mdev, ns, os) -- new state first -- whereas
 * sanitize_state() takes (mdev, os, ns, ...) */
static int is_valid_state_transition(struct drbd_conf *,
                                     union drbd_state, union drbd_state);
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
                                       union drbd_state ns, int *warn_sync_abort);
/* called below with (mdev, mask, val) */
int drbd_send_state_req(struct drbd_conf *,
                        union drbd_state, union drbd_state);
510
511 static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
512                                     union drbd_state mask, union drbd_state val)
513 {
514         union drbd_state os, ns;
515         unsigned long flags;
516         int rv;
517
518         if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
519                 return SS_CW_SUCCESS;
520
521         if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
522                 return SS_CW_FAILED_BY_PEER;
523
524         rv = 0;
525         spin_lock_irqsave(&mdev->req_lock, flags);
526         os = mdev->state;
527         ns.i = (os.i & ~mask.i) | val.i;
528         ns = sanitize_state(mdev, os, ns, NULL);
529
530         if (!cl_wide_st_chg(mdev, os, ns))
531                 rv = SS_CW_NO_NEED;
532         if (!rv) {
533                 rv = is_valid_state(mdev, ns);
534                 if (rv == SS_SUCCESS) {
535                         rv = is_valid_state_transition(mdev, ns, os);
536                         if (rv == SS_SUCCESS)
537                                 rv = 0; /* cont waiting, otherwise fail. */
538                 }
539         }
540         spin_unlock_irqrestore(&mdev->req_lock, flags);
541
542         return rv;
543 }
544
/**
 * drbd_req_state() - Perform an eventually cluster wide state change
 * @mdev:       DRBD device.
 * @mask:       mask of state bits to change.
 * @val:        value of new state bits.
 * @f:          flags
 *
 * Should not be called directly, use drbd_request_state() or
 * _drbd_request_state().
 *
 * With CS_SERIALIZE the whole operation runs under mdev->state_mutex.
 * A change that is not cluster wide is applied directly under the req_lock.
 * A cluster wide change is validated locally, sent to the peer, and applied
 * only after _req_st_cond() reports a non-zero, non-error result.
 * With CS_WAIT_COMPLETE, and if the change succeeded, the function waits
 * for the completion that was handed to _drbd_set_state().
 *
 * Returns the SS_ code of the state change.
 */
static int drbd_req_state(struct drbd_conf *mdev,
                          union drbd_state mask, union drbd_state val,
                          enum chg_state_flags f)
{
        struct completion done;
        unsigned long flags;
        union drbd_state os, ns;
        int rv;

        init_completion(&done);

        if (f & CS_SERIALIZE)
                mutex_lock(&mdev->state_mutex);

        spin_lock_irqsave(&mdev->req_lock, flags);
        os = mdev->state;
        ns.i = (os.i & ~mask.i) | val.i;
        ns = sanitize_state(mdev, os, ns, NULL);

        if (cl_wide_st_chg(mdev, os, ns)) {
                /* validate locally before bothering the peer */
                rv = is_valid_state(mdev, ns);
                if (rv == SS_SUCCESS)
                        rv = is_valid_state_transition(mdev, ns, os);
                /* the req_lock is dropped for sending and waiting */
                spin_unlock_irqrestore(&mdev->req_lock, flags);

                if (rv < SS_SUCCESS) {
                        if (f & CS_VERBOSE)
                                print_st_err(mdev, os, ns, rv);
                        goto abort;
                }

                drbd_state_lock(mdev);
                if (!drbd_send_state_req(mdev, mask, val)) {
                        drbd_state_unlock(mdev);
                        rv = SS_CW_FAILED_BY_PEER;
                        if (f & CS_VERBOSE)
                                print_st_err(mdev, os, ns, rv);
                        goto abort;
                }

                /* _req_st_cond() returns 0 as long as we have to keep waiting */
                wait_event(mdev->state_wait,
                        (rv = _req_st_cond(mdev, mask, val)));

                if (rv < SS_SUCCESS) {
                        drbd_state_unlock(mdev);
                        if (f & CS_VERBOSE)
                                print_st_err(mdev, os, ns, rv);
                        goto abort;
                }
                /* the state may have changed while we waited: rebuild the
                 * wanted state from the current one */
                spin_lock_irqsave(&mdev->req_lock, flags);
                os = mdev->state;
                ns.i = (os.i & ~mask.i) | val.i;
                rv = _drbd_set_state(mdev, ns, f, &done);
                drbd_state_unlock(mdev);
        } else {
                rv = _drbd_set_state(mdev, ns, f, &done);
        }

        /* both branches above leave with the req_lock held */
        spin_unlock_irqrestore(&mdev->req_lock, flags);

        if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
                /* the wait is asserted not to run on the worker thread */
                D_ASSERT(current != mdev->worker.task);
                wait_for_completion(&done);
        }

abort:
        if (f & CS_SERIALIZE)
                mutex_unlock(&mdev->state_mutex);

        return rv;
}
626
/**
 * _drbd_request_state() - Request a state change (with flags)
 * @mdev:       DRBD device.
 * @mask:       mask of state bits to change.
 * @val:        value of new state bits.
 * @f:          flags
 *
 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
 * flag, or when logging of failed state change requests is not desired.
 *
 * Retries drbd_req_state() for as long as it returns
 * SS_IN_TRANSIENT_STATE, sleeping on mdev->state_wait in between.
 * Returns the first other result.
 */
int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
                        union drbd_state val,   enum chg_state_flags f)
{
        int rv;

        /* the state change attempt itself is the wait condition */
        wait_event(mdev->state_wait,
                   (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);

        return rv;
}
647
648 static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
649 {
650         dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
651             name,
652             drbd_conn_str(ns.conn),
653             drbd_role_str(ns.role),
654             drbd_role_str(ns.peer),
655             drbd_disk_str(ns.disk),
656             drbd_disk_str(ns.pdsk),
657             ns.susp ? 's' : 'r',
658             ns.aftr_isp ? 'a' : '-',
659             ns.peer_isp ? 'p' : '-',
660             ns.user_isp ? 'u' : '-'
661             );
662 }
663
664 void print_st_err(struct drbd_conf *mdev,
665         union drbd_state os, union drbd_state ns, int err)
666 {
667         if (err == SS_IN_TRANSIENT_STATE)
668                 return;
669         dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
670         print_st(mdev, " state", os);
671         print_st(mdev, "wanted", ns);
672 }
673
674
/* Aliases and helpers so that PSC() can build "drbd_<field>_str" for every
 * field of union drbd_state it is used with. */
#define drbd_peer_str drbd_role_str
#define drbd_pdsk_str drbd_disk_str

/* single-bit fields are printed as "1" or "0" */
#define drbd_susp_str(A)     ((A) ? "1" : "0")
#define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
#define drbd_peer_isp_str(A) ((A) ? "1" : "0")
#define drbd_user_isp_str(A) ((A) ? "1" : "0")

/* PSC(field): if the field differs between os and ns, append
 * "field( old -> new ) " to the buffer at pbp and advance pbp.
 * Relies on variables named os, ns and pbp at the expansion site.
 * Note that sprintf() is unbounded; the buffer must be large enough for
 * all fields that are printed. */
#define PSC(A) \
        ({ if (ns.A != os.A) { \
                pbp += sprintf(pbp, #A "( %s -> %s ) ", \
                              drbd_##A##_str(os.A), \
                              drbd_##A##_str(ns.A)); \
        } })
689
690 /**
691  * is_valid_state() - Returns an SS_ error code if ns is not valid
692  * @mdev:       DRBD device.
693  * @ns:         State to consider.
694  */
695 static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
696 {
697         /* See drbd_state_sw_errors in drbd_strings.c */
698
699         enum drbd_fencing_p fp;
700         int rv = SS_SUCCESS;
701
702         fp = FP_DONT_CARE;
703         if (get_ldev(mdev)) {
704                 fp = mdev->ldev->dc.fencing;
705                 put_ldev(mdev);
706         }
707
708         if (get_net_conf(mdev)) {
709                 if (!mdev->net_conf->two_primaries &&
710                     ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
711                         rv = SS_TWO_PRIMARIES;
712                 put_net_conf(mdev);
713         }
714
715         if (rv <= 0)
716                 /* already found a reason to abort */;
717         else if (ns.role == R_SECONDARY && mdev->open_cnt)
718                 rv = SS_DEVICE_IN_USE;
719
720         else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
721                 rv = SS_NO_UP_TO_DATE_DISK;
722
723         else if (fp >= FP_RESOURCE &&
724                  ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
725                 rv = SS_PRIMARY_NOP;
726
727         else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
728                 rv = SS_NO_UP_TO_DATE_DISK;
729
730         else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
731                 rv = SS_NO_LOCAL_DISK;
732
733         else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
734                 rv = SS_NO_REMOTE_DISK;
735
736         else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
737                 rv = SS_NO_UP_TO_DATE_DISK;
738
739         else if ((ns.conn == C_CONNECTED ||
740                   ns.conn == C_WF_BITMAP_S ||
741                   ns.conn == C_SYNC_SOURCE ||
742                   ns.conn == C_PAUSED_SYNC_S) &&
743                   ns.disk == D_OUTDATED)
744                 rv = SS_CONNECTED_OUTDATES;
745
746         else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
747                  (mdev->sync_conf.verify_alg[0] == 0))
748                 rv = SS_NO_VERIFY_ALG;
749
750         else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
751                   mdev->agreed_pro_version < 88)
752                 rv = SS_NOT_SUPPORTED;
753
754         return rv;
755 }
756
757 /**
758  * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
759  * @mdev:       DRBD device.
760  * @ns:         new state.
761  * @os:         old state.
762  */
763 static int is_valid_state_transition(struct drbd_conf *mdev,
764                                      union drbd_state ns, union drbd_state os)
765 {
766         int rv = SS_SUCCESS;
767
768         if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
769             os.conn > C_CONNECTED)
770                 rv = SS_RESYNC_RUNNING;
771
772         if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
773                 rv = SS_ALREADY_STANDALONE;
774
775         if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
776                 rv = SS_IS_DISKLESS;
777
778         if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
779                 rv = SS_NO_NET_CONFIG;
780
781         if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
782                 rv = SS_LOWER_THAN_OUTDATED;
783
784         if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
785                 rv = SS_IN_TRANSIENT_STATE;
786
787         if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
788                 rv = SS_IN_TRANSIENT_STATE;
789
790         if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
791                 rv = SS_NEED_CONNECTION;
792
793         if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
794             ns.conn != os.conn && os.conn > C_CONNECTED)
795                 rv = SS_RESYNC_RUNNING;
796
797         if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
798             os.conn < C_CONNECTED)
799                 rv = SS_NEED_CONNECTION;
800
801         return rv;
802 }
803
804 /**
805  * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
806  * @mdev:       DRBD device.
807  * @os:         old state.
808  * @ns:         new state.
809  * @warn_sync_abort:
810  *
 * When we lose connection, we have to set the state of the peer's disk (pdsk)
 * to D_UNKNOWN. This rule and many more along those lines are in this function.
813  */
814 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
815                                        union drbd_state ns, int *warn_sync_abort)
816 {
817         enum drbd_fencing_p fp;
818
819         fp = FP_DONT_CARE;
820         if (get_ldev(mdev)) {
821                 fp = mdev->ldev->dc.fencing;
822                 put_ldev(mdev);
823         }
824
825         /* Disallow Network errors to configure a device's network part */
826         if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
827             os.conn <= C_DISCONNECTING)
828                 ns.conn = os.conn;
829
830         /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */
831         if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
832             ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING)
833                 ns.conn = os.conn;
834
835         /* After C_DISCONNECTING only C_STANDALONE may follow */
836         if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
837                 ns.conn = os.conn;
838
839         if (ns.conn < C_CONNECTED) {
840                 ns.peer_isp = 0;
841                 ns.peer = R_UNKNOWN;
842                 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
843                         ns.pdsk = D_UNKNOWN;
844         }
845
846         /* Clear the aftr_isp when becoming unconfigured */
847         if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
848                 ns.aftr_isp = 0;
849
850         /* Abort resync if a disk fails/detaches */
851         if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
852             (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
853                 if (warn_sync_abort)
854                         *warn_sync_abort = 1;
855                 ns.conn = C_CONNECTED;
856         }
857
858         if (ns.conn >= C_CONNECTED &&
859             ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) ||
860              (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) {
861                 switch (ns.conn) {
862                 case C_WF_BITMAP_T:
863                 case C_PAUSED_SYNC_T:
864                         ns.disk = D_OUTDATED;
865                         break;
866                 case C_CONNECTED:
867                 case C_WF_BITMAP_S:
868                 case C_SYNC_SOURCE:
869                 case C_PAUSED_SYNC_S:
870                         ns.disk = D_UP_TO_DATE;
871                         break;
872                 case C_SYNC_TARGET:
873                         ns.disk = D_INCONSISTENT;
874                         dev_warn(DEV, "Implicitly set disk state Inconsistent!\n");
875                         break;
876                 }
877                 if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE)
878                         dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n");
879         }
880
881         if (ns.conn >= C_CONNECTED &&
882             (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) {
883                 switch (ns.conn) {
884                 case C_CONNECTED:
885                 case C_WF_BITMAP_T:
886                 case C_PAUSED_SYNC_T:
887                 case C_SYNC_TARGET:
888                         ns.pdsk = D_UP_TO_DATE;
889                         break;
890                 case C_WF_BITMAP_S:
891                 case C_PAUSED_SYNC_S:
892                         /* remap any consistent state to D_OUTDATED,
893                          * but disallow "upgrade" of not even consistent states.
894                          */
895                         ns.pdsk =
896                                 (D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED)
897                                 ? os.pdsk : D_OUTDATED;
898                         break;
899                 case C_SYNC_SOURCE:
900                         ns.pdsk = D_INCONSISTENT;
901                         dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n");
902                         break;
903                 }
904                 if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE)
905                         dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n");
906         }
907
908         /* Connection breaks down before we finished "Negotiating" */
909         if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
910             get_ldev_if_state(mdev, D_NEGOTIATING)) {
911                 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
912                         ns.disk = mdev->new_state_tmp.disk;
913                         ns.pdsk = mdev->new_state_tmp.pdsk;
914                 } else {
915                         dev_alert(DEV, "Connection lost while negotiating, no data!\n");
916                         ns.disk = D_DISKLESS;
917                         ns.pdsk = D_UNKNOWN;
918                 }
919                 put_ldev(mdev);
920         }
921
922         if (fp == FP_STONITH &&
923             (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
924             !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
925                 ns.susp = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
926
927         if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
928             (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
929             !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
930                 ns.susp = 1; /* Suspend IO while no data available (no accessible data available) */
931
932         if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
933                 if (ns.conn == C_SYNC_SOURCE)
934                         ns.conn = C_PAUSED_SYNC_S;
935                 if (ns.conn == C_SYNC_TARGET)
936                         ns.conn = C_PAUSED_SYNC_T;
937         } else {
938                 if (ns.conn == C_PAUSED_SYNC_S)
939                         ns.conn = C_SYNC_SOURCE;
940                 if (ns.conn == C_PAUSED_SYNC_T)
941                         ns.conn = C_SYNC_TARGET;
942         }
943
944         return ns;
945 }
946
947 /* helper for __drbd_set_state */
948 static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
949 {
950         if (cs == C_VERIFY_T) {
951                 /* starting online verify from an arbitrary position
952                  * does not fit well into the existing protocol.
953                  * on C_VERIFY_T, we initialize ov_left and friends
954                  * implicitly in receive_DataRequest once the
955                  * first P_OV_REQUEST is received */
956                 mdev->ov_start_sector = ~(sector_t)0;
957         } else {
958                 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
959                 if (bit >= mdev->rs_total)
960                         mdev->ov_start_sector =
961                                 BM_BIT_TO_SECT(mdev->rs_total - 1);
962                 mdev->ov_position = mdev->ov_start_sector;
963         }
964 }
965
966 /**
967  * __drbd_set_state() - Set a new DRBD state
968  * @mdev:       DRBD device.
969  * @ns:         new state.
970  * @flags:      Flags; CS_HARD skips the validity checks, CS_VERBOSE logs a rejected change.
971  * @done:       Optional completion, that will get completed after the after_state_ch() finished
972  *              (w_after_state_ch() only completes it when CS_WAIT_COMPLETE is set in @flags).
973  * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
974  * Returns SS_NOTHING_TO_DO if nothing changes, a value below SS_SUCCESS if rejected, else the check result. */
975 int __drbd_set_state(struct drbd_conf *mdev,
976                     union drbd_state ns, enum chg_state_flags flags,
977                     struct completion *done)
978 {
979         union drbd_state os;
980         int rv = SS_SUCCESS;
981         int warn_sync_abort = 0;
982         struct after_state_chg_work *ascw;
983
984         os = mdev->state;
985
986         ns = sanitize_state(mdev, os, ns, &warn_sync_abort); /* apply the implicit follow-up changes first */
987
988         if (ns.i == os.i)
989                 return SS_NOTHING_TO_DO;
990
991         if (!(flags & CS_HARD)) {
992                 /*  pre-state-change checks ; only look at ns  */
993                 /* See drbd_state_sw_errors in drbd_strings.c */
994
995                 rv = is_valid_state(mdev, ns);
996                 if (rv < SS_SUCCESS) {
997                         /* If the old state was illegal as well (for the same
998                            reason), then let this happen...*/
999
1000                         if (is_valid_state(mdev, os) == rv)
1001                                 rv = is_valid_state_transition(mdev, ns, os);
1002                 } else
1003                         rv = is_valid_state_transition(mdev, ns, os);
1004         }
1005
1006         if (rv < SS_SUCCESS) { /* rejected: mdev->state is left untouched */
1007                 if (flags & CS_VERBOSE)
1008                         print_st_err(mdev, os, ns, rv);
1009                 return rv;
1010         }
1011
1012         if (warn_sync_abort)
1013                 dev_warn(DEV, "Resync aborted.\n");
1014
1015         { /* log the change; PSC() is defined elsewhere, presumably it appends each changed field to pb via pbp - confirm */
1016                 char *pbp, pb[300];
1017                 pbp = pb;
1018                 *pbp = 0;
1019                 PSC(role);
1020                 PSC(peer);
1021                 PSC(conn);
1022                 PSC(disk);
1023                 PSC(pdsk);
1024                 PSC(susp);
1025                 PSC(aftr_isp);
1026                 PSC(peer_isp);
1027                 PSC(user_isp);
1028                 dev_info(DEV, "%s\n", pb);
1029         }
1030
1031         /* solve the race between becoming unconfigured,
1032          * worker doing the cleanup, and
1033          * admin reconfiguring us:
1034          * on (re)configure, first set CONFIG_PENDING,
1035          * then wait for a potentially exiting worker,
1036          * start the worker, and schedule one no_op.
1037          * then proceed with configuration.
1038          */
1039         if (ns.disk == D_DISKLESS &&
1040             ns.conn == C_STANDALONE &&
1041             ns.role == R_SECONDARY &&
1042             !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1043                 set_bit(DEVICE_DYING, &mdev->flags);
1044
1045         mdev->state.i = ns.i; /* the new state becomes visible here; waiters are woken right after */
1046         wake_up(&mdev->misc_wait);
1047         wake_up(&mdev->state_wait);
1048
1049         /*   post-state-change actions   */
1050         if (os.conn >= C_SYNC_SOURCE   && ns.conn <= C_CONNECTED) {
1051                 set_bit(STOP_SYNC_TIMER, &mdev->flags);
1052                 mod_timer(&mdev->resync_timer, jiffies);
1053         }
1054
1055         /* aborted verify run. log the last position */
1056         if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1057             ns.conn < C_CONNECTED) {
1058                 mdev->ov_start_sector =
1059                         BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left);
1060                 dev_info(DEV, "Online Verify reached sector %llu\n",
1061                         (unsigned long long)mdev->ov_start_sector);
1062         }
1063
1064         if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1065             (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
1066                 dev_info(DEV, "Syncer continues.\n");
1067                 mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time; /* rs_mark_time was set when the resync got suspended */
1068                 if (ns.conn == C_SYNC_TARGET) {
1069                         if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))
1070                                 mod_timer(&mdev->resync_timer, jiffies);
1071                         /* This if (!test_bit) is only needed for the case
1072                            that a device that has ceased to use its timer,
1073                            i.e. it is already in drbd_resync_finished() gets
1074                            paused and resumed. */
1075                 }
1076         }
1077
1078         if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
1079             (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1080                 dev_info(DEV, "Resync suspended\n");
1081                 mdev->rs_mark_time = jiffies;
1082                 if (ns.conn == C_PAUSED_SYNC_T)
1083                         set_bit(STOP_SYNC_TIMER, &mdev->flags);
1084         }
1085
1086         if (os.conn == C_CONNECTED && /* an online verify run starts: reset its bookkeeping */
1087             (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1088                 mdev->ov_position = 0;
1089                 mdev->rs_total =
1090                 mdev->rs_mark_left = drbd_bm_bits(mdev);
1091                 if (mdev->agreed_pro_version >= 90)
1092                         set_ov_position(mdev, ns.conn);
1093                 else
1094                         mdev->ov_start_sector = 0;
1095                 mdev->ov_left = mdev->rs_total
1096                               - BM_SECT_TO_BIT(mdev->ov_position);
1097                 mdev->rs_start     =
1098                 mdev->rs_mark_time = jiffies;
1099                 mdev->ov_last_oos_size = 0;
1100                 mdev->ov_last_oos_start = 0;
1101
1102                 if (ns.conn == C_VERIFY_S) {
1103                         dev_info(DEV, "Starting Online Verify from sector %llu\n",
1104                                         (unsigned long long)mdev->ov_position);
1105                         mod_timer(&mdev->resync_timer, jiffies);
1106                 }
1107         }
1108
1109         if (get_ldev(mdev)) { /* recompute the meta data flags from the state that was just set */
1110                 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1111                                                  MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1112                                                  MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1113
1114                 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1115                         mdf |= MDF_CRASHED_PRIMARY;
1116                 if (mdev->state.role == R_PRIMARY ||
1117                     (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1118                         mdf |= MDF_PRIMARY_IND;
1119                 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1120                         mdf |= MDF_CONNECTED_IND;
1121                 if (mdev->state.disk > D_INCONSISTENT)
1122                         mdf |= MDF_CONSISTENT;
1123                 if (mdev->state.disk > D_OUTDATED)
1124                         mdf |= MDF_WAS_UP_TO_DATE;
1125                 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1126                         mdf |= MDF_PEER_OUT_DATED;
1127                 if (mdf != mdev->ldev->md.flags) { /* only mark the meta data dirty on an actual change */
1128                         mdev->ldev->md.flags = mdf;
1129                         drbd_md_mark_dirty(mdev);
1130                 }
1131                 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1132                         drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1133                 put_ldev(mdev);
1134         }
1135
1136         /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
1137         if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1138             os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1139                 set_bit(CONSIDER_RESYNC, &mdev->flags);
1140
1141         /* Receiver should clean up itself */
1142         if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1143                 drbd_thread_stop_nowait(&mdev->receiver);
1144
1145         /* Now the receiver finished cleaning up itself, it should die */
1146         if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1147                 drbd_thread_stop_nowait(&mdev->receiver);
1148
1149         /* Upon network failure, we need to restart the receiver. */
1150         if (os.conn > C_TEAR_DOWN &&
1151             ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1152                 drbd_thread_restart_nowait(&mdev->receiver);
1153
1154         ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); /* GFP_ATOMIC: the caller holds req_lock (see above) */
1155         if (ascw) { /* queue the work that runs after_state_ch(); w_after_state_ch() frees ascw */
1156                 ascw->os = os;
1157                 ascw->ns = ns;
1158                 ascw->flags = flags;
1159                 ascw->w.cb = w_after_state_ch;
1160                 ascw->done = done;
1161                 drbd_queue_work(&mdev->data.work, &ascw->w);
1162         } else { /* NOTE(review): after_state_ch() is skipped and @done never completed here - confirm no caller can wait forever */
1163                 dev_warn(DEV, "Could not kmalloc an ascw\n");
1164         }
1165
1166         return rv;
1167 }
1168
1169 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1170 {
1171         struct after_state_chg_work *ascw =
1172                 container_of(w, struct after_state_chg_work, w);
1173         after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1174         if (ascw->flags & CS_WAIT_COMPLETE) {
1175                 D_ASSERT(ascw->done != NULL);
1176                 complete(ascw->done);
1177         }
1178         kfree(ascw);
1179
1180         return 1;
1181 }
1182
1183 static void abw_start_sync(struct drbd_conf *mdev, int rv)
1184 {
1185         if (rv) {
1186                 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1187                 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1188                 return;
1189         }
1190
1191         switch (mdev->state.conn) {
1192         case C_STARTING_SYNC_T:
1193                 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1194                 break;
1195         case C_STARTING_SYNC_S:
1196                 drbd_start_resync(mdev, C_SYNC_SOURCE);
1197                 break;
1198         }
1199 }
1200
1201 /**
1202  * after_state_ch() - Perform after state change actions that may sleep
1203  * @mdev:       DRBD device.
1204  * @os:         old state.
1205  * @ns:         new state.
1206  * @flags:      Flags
1207  */
1208 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1209                            union drbd_state ns, enum chg_state_flags flags)
1210 {
1211         enum drbd_fencing_p fp;
1212         enum drbd_req_event what = nothing;
1213
1214         if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1215                 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1216                 if (mdev->p_uuid)
1217                         mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1218         }
1219
1220         fp = FP_DONT_CARE;
1221         if (get_ldev(mdev)) {
1222                 fp = mdev->ldev->dc.fencing;
1223                 put_ldev(mdev);
1224         }
1225
1226         /* Inform userspace about the change... */
1227         drbd_bcast_state(mdev, ns);
1228
1229         if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1230             (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1231                 drbd_khelper(mdev, "pri-on-incon-degr");
1232
1233         /* Here we have the actions that are performed after a
1234            state change. This function might sleep */
1235
1236         if (os.susp && ns.susp && mdev->sync_conf.on_no_data == OND_SUSPEND_IO) {
1237                 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1238                         if (ns.conn == C_CONNECTED)
1239                                 what = resend;
1240                         else /* ns.conn > C_CONNECTED */
1241                                 dev_err(DEV, "Unexpected Resynd going on!\n");
1242                 }
1243
1244                 if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
1245                         what = restart_frozen_disk_io;
1246         }
1247
1248         if (fp == FP_STONITH && ns.susp) {
1249                 /* case1: The outdate peer handler is successful: */
1250                 if (os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) {
1251                         tl_clear(mdev);
1252                         if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1253                                 drbd_uuid_new_current(mdev);
1254                                 clear_bit(NEW_CUR_UUID, &mdev->flags);
1255                                 drbd_md_sync(mdev);
1256                         }
1257                         spin_lock_irq(&mdev->req_lock);
1258                         _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
1259                         spin_unlock_irq(&mdev->req_lock);
1260                 }
1261                 /* case2: The connection was established again: */
1262                 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1263                         clear_bit(NEW_CUR_UUID, &mdev->flags);
1264                         what = resend;
1265                 }
1266         }
1267
1268         if (what != nothing) {
1269                 spin_lock_irq(&mdev->req_lock);
1270                 _tl_restart(mdev, what);
1271                 _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
1272                 spin_unlock_irq(&mdev->req_lock);
1273         }
1274
1275         /* Do not change the order of the if above and the two below... */
1276         if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
1277                 drbd_send_uuids(mdev);
1278                 drbd_send_state(mdev);
1279         }
1280         if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
1281                 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
1282
1283         /* Lost contact to peer's copy of the data */
1284         if ((os.pdsk >= D_INCONSISTENT &&
1285              os.pdsk != D_UNKNOWN &&
1286              os.pdsk != D_OUTDATED)
1287         &&  (ns.pdsk < D_INCONSISTENT ||
1288              ns.pdsk == D_UNKNOWN ||
1289              ns.pdsk == D_OUTDATED)) {
1290                 if (get_ldev(mdev)) {
1291                         if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1292                             mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1293                                 if (mdev->state.susp) {
1294                                         set_bit(NEW_CUR_UUID, &mdev->flags);
1295                                 } else {
1296                                         drbd_uuid_new_current(mdev);
1297                                         drbd_send_uuids(mdev);
1298                                 }
1299                         }
1300                         put_ldev(mdev);
1301                 }
1302         }
1303
1304         if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1305                 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
1306                         drbd_uuid_new_current(mdev);
1307                         drbd_send_uuids(mdev);
1308                 }
1309
1310                 /* D_DISKLESS Peer becomes secondary */
1311                 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1312                         drbd_al_to_on_disk_bm(mdev);
1313                 put_ldev(mdev);
1314         }
1315
1316         /* Last part of the attaching process ... */
1317         if (ns.conn >= C_CONNECTED &&
1318             os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1319                 drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
1320                 drbd_send_uuids(mdev);
1321                 drbd_send_state(mdev);
1322         }
1323
1324         /* We want to pause/continue resync, tell peer. */
1325         if (ns.conn >= C_CONNECTED &&
1326              ((os.aftr_isp != ns.aftr_isp) ||
1327               (os.user_isp != ns.user_isp)))
1328                 drbd_send_state(mdev);
1329
1330         /* In case one of the isp bits got set, suspend other devices. */
1331         if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1332             (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1333                 suspend_other_sg(mdev);
1334
1335         /* Make sure the peer gets informed about eventual state
1336            changes (ISP bits) while we were in WFReportParams. */
1337         if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1338                 drbd_send_state(mdev);
1339
1340         /* We are in the progress to start a full sync... */
1341         if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1342             (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1343                 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
1344
1345         /* We are invalidating our self... */
1346         if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1347             os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1348                 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1349
1350         if (os.disk > D_FAILED && ns.disk == D_FAILED) {
1351                 enum drbd_io_error_p eh;
1352
1353                 eh = EP_PASS_ON;
1354                 if (get_ldev_if_state(mdev, D_FAILED)) {
1355                         eh = mdev->ldev->dc.on_io_error;
1356                         put_ldev(mdev);
1357                 }
1358
1359                 drbd_rs_cancel_all(mdev);
1360                 /* since get_ldev() only works as long as disk>=D_INCONSISTENT,
1361                    and it is D_DISKLESS here, local_cnt can only go down, it can
1362                    not increase... It will reach zero */
1363                 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1364                 mdev->rs_total = 0;
1365                 mdev->rs_failed = 0;
1366                 atomic_set(&mdev->rs_pending_cnt, 0);
1367
1368                 spin_lock_irq(&mdev->req_lock);
1369                 _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL);
1370                 spin_unlock_irq(&mdev->req_lock);
1371
1372                 if (eh == EP_CALL_HELPER)
1373                         drbd_khelper(mdev, "local-io-error");
1374         }
1375
1376         if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
1377
1378                 if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ {
1379                         if (drbd_send_state(mdev))
1380                                 dev_warn(DEV, "Notified peer that my disk is broken.\n");
1381                         else
1382                                 dev_err(DEV, "Sending state in drbd_io_error() failed\n");
1383                 }
1384
1385                 wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
1386                 lc_destroy(mdev->resync);
1387                 mdev->resync = NULL;
1388                 lc_destroy(mdev->act_log);
1389                 mdev->act_log = NULL;
1390                 __no_warn(local,
1391                         drbd_free_bc(mdev->ldev);
1392                         mdev->ldev = NULL;);
1393
1394                 if (mdev->md_io_tmpp)
1395                         __free_page(mdev->md_io_tmpp);
1396         }
1397
1398         /* Disks got bigger while they were detached */
1399         if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1400             test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1401                 if (ns.conn == C_CONNECTED)
1402                         resync_after_online_grow(mdev);
1403         }
1404
1405         /* A resync finished or aborted, wake paused devices... */
1406         if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1407             (os.peer_isp && !ns.peer_isp) ||
1408             (os.user_isp && !ns.user_isp))
1409                 resume_next_sg(mdev);
1410
1411         /* free tl_hash if we Got thawed and are C_STANDALONE */
1412         if (ns.conn == C_STANDALONE && ns.susp == 0 && mdev->tl_hash)
1413                 drbd_free_tl_hash(mdev);
1414
1415         /* Upon network connection, we need to start the receiver */
1416         if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1417                 drbd_thread_start(&mdev->receiver);
1418
1419         /* Terminate worker thread if we are unconfigured - it will be
1420            restarted as needed... */
1421         if (ns.disk == D_DISKLESS &&
1422             ns.conn == C_STANDALONE &&
1423             ns.role == R_SECONDARY) {
1424                 if (os.aftr_isp != ns.aftr_isp)
1425                         resume_next_sg(mdev);
1426                 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1427                 if (test_bit(DEVICE_DYING, &mdev->flags))
1428                         drbd_thread_stop_nowait(&mdev->worker);
1429         }
1430
1431         drbd_md_sync(mdev);
1432 }
1433
1434
1435 static int drbd_thread_setup(void *arg)
1436 {
1437         struct drbd_thread *thi = (struct drbd_thread *) arg;
1438         struct drbd_conf *mdev = thi->mdev;
1439         unsigned long flags;
1440         int retval;
1441
1442 restart:
1443         retval = thi->function(thi);
1444
1445         spin_lock_irqsave(&thi->t_lock, flags);
1446
1447         /* if the receiver has been "Exiting", the last thing it did
1448          * was set the conn state to "StandAlone",
1449          * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1450          * and receiver thread will be "started".
1451          * drbd_thread_start needs to set "Restarting" in that case.
1452          * t_state check and assignment needs to be within the same spinlock,
1453          * so either thread_start sees Exiting, and can remap to Restarting,
1454          * or thread_start see None, and can proceed as normal.
1455          */
1456
1457         if (thi->t_state == Restarting) {
1458                 dev_info(DEV, "Restarting %s\n", current->comm);
1459                 thi->t_state = Running;
1460                 spin_unlock_irqrestore(&thi->t_lock, flags);
1461                 goto restart;
1462         }
1463
1464         thi->task = NULL;
1465         thi->t_state = None;
1466         smp_mb();
1467         complete(&thi->stop);
1468         spin_unlock_irqrestore(&thi->t_lock, flags);
1469
1470         dev_info(DEV, "Terminating %s\n", current->comm);
1471
1472         /* Release mod reference taken when thread was started */
1473         module_put(THIS_MODULE);
1474         return retval;
1475 }
1476
1477 static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1478                       int (*func) (struct drbd_thread *))
1479 {
1480         spin_lock_init(&thi->t_lock);
1481         thi->task    = NULL;
1482         thi->t_state = None;
1483         thi->function = func;
1484         thi->mdev = mdev;
1485 }
1486
1487 int drbd_thread_start(struct drbd_thread *thi)
1488 {
1489         struct drbd_conf *mdev = thi->mdev;
1490         struct task_struct *nt;
1491         unsigned long flags;
1492
1493         const char *me =
1494                 thi == &mdev->receiver ? "receiver" :
1495                 thi == &mdev->asender  ? "asender"  :
1496                 thi == &mdev->worker   ? "worker"   : "NONSENSE";
1497
1498         /* is used from state engine doing drbd_thread_stop_nowait,
1499          * while holding the req lock irqsave */
1500         spin_lock_irqsave(&thi->t_lock, flags);
1501
1502         switch (thi->t_state) {
1503         case None:
1504                 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1505                                 me, current->comm, current->pid);
1506
1507                 /* Get ref on module for thread - this is released when thread exits */
1508                 if (!try_module_get(THIS_MODULE)) {
1509                         dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1510                         spin_unlock_irqrestore(&thi->t_lock, flags);
1511                         return FALSE;
1512                 }
1513
1514                 init_completion(&thi->stop);
1515                 D_ASSERT(thi->task == NULL);
1516                 thi->reset_cpu_mask = 1;
1517                 thi->t_state = Running;
1518                 spin_unlock_irqrestore(&thi->t_lock, flags);
1519                 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1520
1521                 nt = kthread_create(drbd_thread_setup, (void *) thi,
1522                                     "drbd%d_%s", mdev_to_minor(mdev), me);
1523
1524                 if (IS_ERR(nt)) {
1525                         dev_err(DEV, "Couldn't start thread\n");
1526
1527                         module_put(THIS_MODULE);
1528                         return FALSE;
1529                 }
1530                 spin_lock_irqsave(&thi->t_lock, flags);
1531                 thi->task = nt;
1532                 thi->t_state = Running;
1533                 spin_unlock_irqrestore(&thi->t_lock, flags);
1534                 wake_up_process(nt);
1535                 break;
1536         case Exiting:
1537                 thi->t_state = Restarting;
1538                 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1539                                 me, current->comm, current->pid);
1540                 /* fall through */
1541         case Running:
1542         case Restarting:
1543         default:
1544                 spin_unlock_irqrestore(&thi->t_lock, flags);
1545                 break;
1546         }
1547
1548         return TRUE;
1549 }
1550
1551
1552 void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1553 {
1554         unsigned long flags;
1555
1556         enum drbd_thread_state ns = restart ? Restarting : Exiting;
1557
1558         /* may be called from state engine, holding the req lock irqsave */
1559         spin_lock_irqsave(&thi->t_lock, flags);
1560
1561         if (thi->t_state == None) {
1562                 spin_unlock_irqrestore(&thi->t_lock, flags);
1563                 if (restart)
1564                         drbd_thread_start(thi);
1565                 return;
1566         }
1567
1568         if (thi->t_state != ns) {
1569                 if (thi->task == NULL) {
1570                         spin_unlock_irqrestore(&thi->t_lock, flags);
1571                         return;
1572                 }
1573
1574                 thi->t_state = ns;
1575                 smp_mb();
1576                 init_completion(&thi->stop);
1577                 if (thi->task != current)
1578                         force_sig(DRBD_SIGKILL, thi->task);
1579
1580         }
1581
1582         spin_unlock_irqrestore(&thi->t_lock, flags);
1583
1584         if (wait)
1585                 wait_for_completion(&thi->stop);
1586 }
1587
1588 #ifdef CONFIG_SMP
1589 /**
1590  * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1591  * @mdev:       DRBD device.
1592  *
1593  * Forces all threads of a device onto the same CPU. This is beneficial for
1594  * DRBD's performance. May be overwritten by user's configuration.
1595  */
1596 void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1597 {
1598         int ord, cpu;
1599
1600         /* user override. */
1601         if (cpumask_weight(mdev->cpu_mask))
1602                 return;
1603
1604         ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1605         for_each_online_cpu(cpu) {
1606                 if (ord-- == 0) {
1607                         cpumask_set_cpu(cpu, mdev->cpu_mask);
1608                         return;
1609                 }
1610         }
1611         /* should not be reached */
1612         cpumask_setall(mdev->cpu_mask);
1613 }
1614
1615 /**
1616  * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1617  * @mdev:       DRBD device.
1618  *
1619  * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1620  * prematurely.
1621  */
1622 void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1623 {
1624         struct task_struct *p = current;
1625         struct drbd_thread *thi =
1626                 p == mdev->asender.task  ? &mdev->asender  :
1627                 p == mdev->receiver.task ? &mdev->receiver :
1628                 p == mdev->worker.task   ? &mdev->worker   :
1629                 NULL;
1630         ERR_IF(thi == NULL)
1631                 return;
1632         if (!thi->reset_cpu_mask)
1633                 return;
1634         thi->reset_cpu_mask = 0;
1635         set_cpus_allowed_ptr(p, mdev->cpu_mask);
1636 }
1637 #endif
1638
1639 /* the appropriate socket mutex must be held already */
1640 int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1641                           enum drbd_packets cmd, struct p_header *h,
1642                           size_t size, unsigned msg_flags)
1643 {
1644         int sent, ok;
1645
1646         ERR_IF(!h) return FALSE;
1647         ERR_IF(!size) return FALSE;
1648
1649         h->magic   = BE_DRBD_MAGIC;
1650         h->command = cpu_to_be16(cmd);
1651         h->length  = cpu_to_be16(size-sizeof(struct p_header));
1652
1653         sent = drbd_send(mdev, sock, h, size, msg_flags);
1654
1655         ok = (sent == size);
1656         if (!ok)
1657                 dev_err(DEV, "short sent %s size=%d sent=%d\n",
1658                     cmdname(cmd), (int)size, sent);
1659         return ok;
1660 }
1661
1662 /* don't pass the socket. we may only look at it
1663  * when we hold the appropriate socket mutex.
1664  */
1665 int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1666                   enum drbd_packets cmd, struct p_header *h, size_t size)
1667 {
1668         int ok = 0;
1669         struct socket *sock;
1670
1671         if (use_data_socket) {
1672                 mutex_lock(&mdev->data.mutex);
1673                 sock = mdev->data.socket;
1674         } else {
1675                 mutex_lock(&mdev->meta.mutex);
1676                 sock = mdev->meta.socket;
1677         }
1678
1679         /* drbd_disconnect() could have called drbd_free_sock()
1680          * while we were waiting in down()... */
1681         if (likely(sock != NULL))
1682                 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1683
1684         if (use_data_socket)
1685                 mutex_unlock(&mdev->data.mutex);
1686         else
1687                 mutex_unlock(&mdev->meta.mutex);
1688         return ok;
1689 }
1690
1691 int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1692                    size_t size)
1693 {
1694         struct p_header h;
1695         int ok;
1696
1697         h.magic   = BE_DRBD_MAGIC;
1698         h.command = cpu_to_be16(cmd);
1699         h.length  = cpu_to_be16(size);
1700
1701         if (!drbd_get_data_sock(mdev))
1702                 return 0;
1703
1704         ok = (sizeof(h) ==
1705                 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1706         ok = ok && (size ==
1707                 drbd_send(mdev, mdev->data.socket, data, size, 0));
1708
1709         drbd_put_data_sock(mdev);
1710
1711         return ok;
1712 }
1713
1714 int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1715 {
1716         struct p_rs_param_95 *p;
1717         struct socket *sock;
1718         int size, rv;
1719         const int apv = mdev->agreed_pro_version;
1720
1721         size = apv <= 87 ? sizeof(struct p_rs_param)
1722                 : apv == 88 ? sizeof(struct p_rs_param)
1723                         + strlen(mdev->sync_conf.verify_alg) + 1
1724                 : apv <= 94 ? sizeof(struct p_rs_param_89)
1725                 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
1726
1727         /* used from admin command context and receiver/worker context.
1728          * to avoid kmalloc, grab the socket right here,
1729          * then use the pre-allocated sbuf there */
1730         mutex_lock(&mdev->data.mutex);
1731         sock = mdev->data.socket;
1732
1733         if (likely(sock != NULL)) {
1734                 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1735
1736                 p = &mdev->data.sbuf.rs_param_95;
1737
1738                 /* initialize verify_alg and csums_alg */
1739                 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1740
1741                 p->rate = cpu_to_be32(sc->rate);
1742                 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1743                 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1744                 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1745                 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
1746
1747                 if (apv >= 88)
1748                         strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1749                 if (apv >= 89)
1750                         strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1751
1752                 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1753         } else
1754                 rv = 0; /* not ok */
1755
1756         mutex_unlock(&mdev->data.mutex);
1757
1758         return rv;
1759 }
1760
1761 int drbd_send_protocol(struct drbd_conf *mdev)
1762 {
1763         struct p_protocol *p;
1764         int size, cf, rv;
1765
1766         size = sizeof(struct p_protocol);
1767
1768         if (mdev->agreed_pro_version >= 87)
1769                 size += strlen(mdev->net_conf->integrity_alg) + 1;
1770
1771         /* we must not recurse into our own queue,
1772          * as that is blocked during handshake */
1773         p = kmalloc(size, GFP_NOIO);
1774         if (p == NULL)
1775                 return 0;
1776
1777         p->protocol      = cpu_to_be32(mdev->net_conf->wire_protocol);
1778         p->after_sb_0p   = cpu_to_be32(mdev->net_conf->after_sb_0p);
1779         p->after_sb_1p   = cpu_to_be32(mdev->net_conf->after_sb_1p);
1780         p->after_sb_2p   = cpu_to_be32(mdev->net_conf->after_sb_2p);
1781         p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1782
1783         cf = 0;
1784         if (mdev->net_conf->want_lose)
1785                 cf |= CF_WANT_LOSE;
1786         if (mdev->net_conf->dry_run) {
1787                 if (mdev->agreed_pro_version >= 92)
1788                         cf |= CF_DRY_RUN;
1789                 else {
1790                         dev_err(DEV, "--dry-run is not supported by peer");
1791                         kfree(p);
1792                         return 0;
1793                 }
1794         }
1795         p->conn_flags    = cpu_to_be32(cf);
1796
1797         if (mdev->agreed_pro_version >= 87)
1798                 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1799
1800         rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
1801                            (struct p_header *)p, size);
1802         kfree(p);
1803         return rv;
1804 }
1805
1806 int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1807 {
1808         struct p_uuids p;
1809         int i;
1810
1811         if (!get_ldev_if_state(mdev, D_NEGOTIATING))
1812                 return 1;
1813
1814         for (i = UI_CURRENT; i < UI_SIZE; i++)
1815                 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
1816
1817         mdev->comm_bm_set = drbd_bm_total_weight(mdev);
1818         p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
1819         uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
1820         uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
1821         uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
1822         p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
1823
1824         put_ldev(mdev);
1825
1826         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
1827                              (struct p_header *)&p, sizeof(p));
1828 }
1829
1830 int drbd_send_uuids(struct drbd_conf *mdev)
1831 {
1832         return _drbd_send_uuids(mdev, 0);
1833 }
1834
1835 int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1836 {
1837         return _drbd_send_uuids(mdev, 8);
1838 }
1839
1840
1841 int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1842 {
1843         struct p_rs_uuid p;
1844
1845         p.uuid = cpu_to_be64(val);
1846
1847         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
1848                              (struct p_header *)&p, sizeof(p));
1849 }
1850
1851 int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
1852 {
1853         struct p_sizes p;
1854         sector_t d_size, u_size;
1855         int q_order_type;
1856         int ok;
1857
1858         if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1859                 D_ASSERT(mdev->ldev->backing_bdev);
1860                 d_size = drbd_get_max_capacity(mdev->ldev);
1861                 u_size = mdev->ldev->dc.disk_size;
1862                 q_order_type = drbd_queue_order_type(mdev);
1863                 put_ldev(mdev);
1864         } else {
1865                 d_size = 0;
1866                 u_size = 0;
1867                 q_order_type = QUEUE_ORDERED_NONE;
1868         }
1869
1870         p.d_size = cpu_to_be64(d_size);
1871         p.u_size = cpu_to_be64(u_size);
1872         p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1873         p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
1874         p.queue_order_type = cpu_to_be16(q_order_type);
1875         p.dds_flags = cpu_to_be16(flags);
1876
1877         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
1878                            (struct p_header *)&p, sizeof(p));
1879         return ok;
1880 }
1881
1882 /**
1883  * drbd_send_state() - Sends the drbd state to the peer
1884  * @mdev:       DRBD device.
1885  */
1886 int drbd_send_state(struct drbd_conf *mdev)
1887 {
1888         struct socket *sock;
1889         struct p_state p;
1890         int ok = 0;
1891
1892         /* Grab state lock so we wont send state if we're in the middle
1893          * of a cluster wide state change on another thread */
1894         drbd_state_lock(mdev);
1895
1896         mutex_lock(&mdev->data.mutex);
1897
1898         p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
1899         sock = mdev->data.socket;
1900
1901         if (likely(sock != NULL)) {
1902                 ok = _drbd_send_cmd(mdev, sock, P_STATE,
1903                                     (struct p_header *)&p, sizeof(p), 0);
1904         }
1905
1906         mutex_unlock(&mdev->data.mutex);
1907
1908         drbd_state_unlock(mdev);
1909         return ok;
1910 }
1911
1912 int drbd_send_state_req(struct drbd_conf *mdev,
1913         union drbd_state mask, union drbd_state val)
1914 {
1915         struct p_req_state p;
1916
1917         p.mask    = cpu_to_be32(mask.i);
1918         p.val     = cpu_to_be32(val.i);
1919
1920         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
1921                              (struct p_header *)&p, sizeof(p));
1922 }
1923
1924 int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
1925 {
1926         struct p_req_state_reply p;
1927
1928         p.retcode    = cpu_to_be32(retcode);
1929
1930         return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
1931                              (struct p_header *)&p, sizeof(p));
1932 }
1933
1934 int fill_bitmap_rle_bits(struct drbd_conf *mdev,
1935         struct p_compressed_bm *p,
1936         struct bm_xfer_ctx *c)
1937 {
1938         struct bitstream bs;
1939         unsigned long plain_bits;
1940         unsigned long tmp;
1941         unsigned long rl;
1942         unsigned len;
1943         unsigned toggle;
1944         int bits;
1945
1946         /* may we use this feature? */
1947         if ((mdev->sync_conf.use_rle == 0) ||
1948                 (mdev->agreed_pro_version < 90))
1949                         return 0;
1950
1951         if (c->bit_offset >= c->bm_bits)
1952                 return 0; /* nothing to do. */
1953
1954         /* use at most thus many bytes */
1955         bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
1956         memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
1957         /* plain bits covered in this code string */
1958         plain_bits = 0;
1959
1960         /* p->encoding & 0x80 stores whether the first run length is set.
1961          * bit offset is implicit.
1962          * start with toggle == 2 to be able to tell the first iteration */
1963         toggle = 2;
1964
1965         /* see how much plain bits we can stuff into one packet
1966          * using RLE and VLI. */
1967         do {
1968                 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
1969                                     : _drbd_bm_find_next(mdev, c->bit_offset);
1970                 if (tmp == -1UL)
1971                         tmp = c->bm_bits;
1972                 rl = tmp - c->bit_offset;
1973
1974                 if (toggle == 2) { /* first iteration */
1975                         if (rl == 0) {
1976                                 /* the first checked bit was set,
1977                                  * store start value, */
1978                                 DCBP_set_start(p, 1);
1979                                 /* but skip encoding of zero run length */
1980                                 toggle = !toggle;
1981                                 continue;
1982                         }
1983                         DCBP_set_start(p, 0);
1984                 }
1985
1986                 /* paranoia: catch zero runlength.
1987                  * can only happen if bitmap is modified while we scan it. */
1988                 if (rl == 0) {
1989                         dev_err(DEV, "unexpected zero runlength while encoding bitmap "
1990                             "t:%u bo:%lu\n", toggle, c->bit_offset);
1991                         return -1;
1992                 }
1993
1994                 bits = vli_encode_bits(&bs, rl);
1995                 if (bits == -ENOBUFS) /* buffer full */
1996                         break;
1997                 if (bits <= 0) {
1998                         dev_err(DEV, "error while encoding bitmap: %d\n", bits);
1999                         return 0;
2000                 }
2001
2002                 toggle = !toggle;
2003                 plain_bits += rl;
2004                 c->bit_offset = tmp;
2005         } while (c->bit_offset < c->bm_bits);
2006
2007         len = bs.cur.b - p->code + !!bs.cur.bit;
2008
2009         if (plain_bits < (len << 3)) {
2010                 /* incompressible with this method.
2011                  * we need to rewind both word and bit position. */
2012                 c->bit_offset -= plain_bits;
2013                 bm_xfer_ctx_bit_to_word_offset(c);
2014                 c->bit_offset = c->word_offset * BITS_PER_LONG;
2015                 return 0;
2016         }
2017
2018         /* RLE + VLI was able to compress it just fine.
2019          * update c->word_offset. */
2020         bm_xfer_ctx_bit_to_word_offset(c);
2021
2022         /* store pad_bits */
2023         DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2024
2025         return len;
2026 }
2027
2028 enum { OK, FAILED, DONE }
2029 send_bitmap_rle_or_plain(struct drbd_conf *mdev,
2030         struct p_header *h, struct bm_xfer_ctx *c)
2031 {
2032         struct p_compressed_bm *p = (void*)h;
2033         unsigned long num_words;
2034         int len;
2035         int ok;
2036
2037         len = fill_bitmap_rle_bits(mdev, p, c);
2038
2039         if (len < 0)
2040                 return FAILED;
2041
2042         if (len) {
2043                 DCBP_set_code(p, RLE_VLI_Bits);
2044                 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2045                         sizeof(*p) + len, 0);
2046
2047                 c->packets[0]++;
2048                 c->bytes[0] += sizeof(*p) + len;
2049
2050                 if (c->bit_offset >= c->bm_bits)
2051                         len = 0; /* DONE */
2052         } else {
2053                 /* was not compressible.
2054                  * send a buffer full of plain text bits instead. */
2055                 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2056                 len = num_words * sizeof(long);
2057                 if (len)
2058                         drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2059                 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
2060                                    h, sizeof(struct p_header) + len, 0);
2061                 c->word_offset += num_words;
2062                 c->bit_offset = c->word_offset * BITS_PER_LONG;
2063
2064                 c->packets[1]++;
2065                 c->bytes[1] += sizeof(struct p_header) + len;
2066
2067                 if (c->bit_offset > c->bm_bits)
2068                         c->bit_offset = c->bm_bits;
2069         }
2070         ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
2071
2072         if (ok == DONE)
2073                 INFO_bm_xfer_stats(mdev, "send", c);
2074         return ok;
2075 }
2076
2077 /* See the comment at receive_bitmap() */
2078 int _drbd_send_bitmap(struct drbd_conf *mdev)
2079 {
2080         struct bm_xfer_ctx c;
2081         struct p_header *p;
2082         int ret;
2083
2084         ERR_IF(!mdev->bitmap) return FALSE;
2085
2086         /* maybe we should use some per thread scratch page,
2087          * and allocate that during initial device creation? */
2088         p = (struct p_header *) __get_free_page(GFP_NOIO);
2089         if (!p) {
2090                 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2091                 return FALSE;
2092         }
2093
2094         if (get_ldev(mdev)) {
2095                 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2096                         dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2097                         drbd_bm_set_all(mdev);
2098                         if (drbd_bm_write(mdev)) {
2099                                 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2100                                  * but otherwise process as per normal - need to tell other
2101                                  * side that a full resync is required! */
2102                                 dev_err(DEV, "Failed to write bitmap to disk!\n");
2103                         } else {
2104                                 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2105                                 drbd_md_sync(mdev);
2106                         }
2107                 }
2108                 put_ldev(mdev);
2109         }
2110
2111         c = (struct bm_xfer_ctx) {
2112                 .bm_bits = drbd_bm_bits(mdev),
2113                 .bm_words = drbd_bm_words(mdev),
2114         };
2115
2116         do {
2117                 ret = send_bitmap_rle_or_plain(mdev, p, &c);
2118         } while (ret == OK);
2119
2120         free_page((unsigned long) p);
2121         return (ret == DONE);
2122 }
2123
/**
 * drbd_send_bitmap() - Send the bitmap between get/put of the data socket
 * @mdev:	DRBD device.
 *
 * Returns -1 if drbd_get_data_sock() fails, 0 if the bitmap was sent,
 * and 1 if sending it failed.
 */
int drbd_send_bitmap(struct drbd_conf *mdev)
{
	int failed;

	if (!drbd_get_data_sock(mdev))
		return -1;

	failed = !_drbd_send_bitmap(mdev);
	drbd_put_data_sock(mdev);

	return failed;
}
2134
2135 int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2136 {
2137         int ok;
2138         struct p_barrier_ack p;
2139
2140         p.barrier  = barrier_nr;
2141         p.set_size = cpu_to_be32(set_size);
2142
2143         if (mdev->state.conn < C_CONNECTED)
2144                 return FALSE;
2145         ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2146                         (struct p_header *)&p, sizeof(p));
2147         return ok;
2148 }
2149
2150 /**
2151  * _drbd_send_ack() - Sends an ack packet
2152  * @mdev:       DRBD device.
2153  * @cmd:        Packet command code.
2154  * @sector:     sector, needs to be in big endian byte order
2155  * @blksize:    size in byte, needs to be in big endian byte order
2156  * @block_id:   Id, big endian byte order
2157  */
2158 static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2159                           u64 sector,
2160                           u32 blksize,
2161                           u64 block_id)
2162 {
2163         int ok;
2164         struct p_block_ack p;
2165
2166         p.sector   = sector;
2167         p.block_id = block_id;
2168         p.blksize  = blksize;
2169         p.seq_num  = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2170
2171         if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2172                 return FALSE;
2173         ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2174                                 (struct p_header *)&p, sizeof(p));
2175         return ok;
2176 }
2177
2178 int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2179                      struct p_data *dp)
2180 {
2181         const int header_size = sizeof(struct p_data)
2182                               - sizeof(struct p_header);
2183         int data_size  = ((struct p_header *)dp)->length - header_size;
2184
2185         return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2186                               dp->block_id);
2187 }
2188
2189 int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2190                      struct p_block_req *rp)
2191 {
2192         return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2193 }
2194
2195 /**
2196  * drbd_send_ack() - Sends an ack packet
2197  * @mdev:       DRBD device.
2198  * @cmd:        Packet command code.
2199  * @e:          Epoch entry.
2200  */
2201 int drbd_send_ack(struct drbd_conf *mdev,
2202         enum drbd_packets cmd, struct drbd_epoch_entry *e)
2203 {
2204         return _drbd_send_ack(mdev, cmd,
2205                               cpu_to_be64(e->sector),
2206                               cpu_to_be32(e->size),
2207                               e->block_id);
2208 }
2209
2210 /* This function misuses the block_id field to signal if the blocks
2211  * are is sync or not. */
2212 int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2213                      sector_t sector, int blksize, u64 block_id)
2214 {
2215         return _drbd_send_ack(mdev, cmd,
2216                               cpu_to_be64(sector),
2217                               cpu_to_be32(blksize),
2218                               cpu_to_be64(block_id));
2219 }
2220
2221 int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2222                        sector_t sector, int size, u64 block_id)
2223 {
2224         int ok;
2225         struct p_block_req p;
2226
2227         p.sector   = cpu_to_be64(sector);
2228         p.block_id = block_id;
2229         p.blksize  = cpu_to_be32(size);
2230
2231         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2232                                 (struct p_header *)&p, sizeof(p));
2233         return ok;
2234 }
2235
2236 int drbd_send_drequest_csum(struct drbd_conf *mdev,
2237                             sector_t sector, int size,
2238                             void *digest, int digest_size,
2239                             enum drbd_packets cmd)
2240 {
2241         int ok;
2242         struct p_block_req p;
2243
2244         p.sector   = cpu_to_be64(sector);
2245         p.block_id = BE_DRBD_MAGIC + 0xbeef;
2246         p.blksize  = cpu_to_be32(size);
2247
2248         p.head.magic   = BE_DRBD_MAGIC;
2249         p.head.command = cpu_to_be16(cmd);
2250         p.head.length  = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size);
2251
2252         mutex_lock(&mdev->data.mutex);
2253
2254         ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2255         ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2256
2257         mutex_unlock(&mdev->data.mutex);
2258
2259         return ok;
2260 }
2261
2262 int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2263 {
2264         int ok;
2265         struct p_block_req p;
2266
2267         p.sector   = cpu_to_be64(sector);
2268         p.block_id = BE_DRBD_MAGIC + 0xbabe;
2269         p.blksize  = cpu_to_be32(size);
2270
2271         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2272                            (struct p_header *)&p, sizeof(p));
2273         return ok;
2274 }
2275
2276 /* called on sndtimeo
2277  * returns FALSE if we should retry,
2278  * TRUE if we think connection is dead
2279  */
2280 static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2281 {
2282         int drop_it;
2283         /* long elapsed = (long)(jiffies - mdev->last_received); */
2284
2285         drop_it =   mdev->meta.socket == sock
2286                 || !mdev->asender.task
2287                 || get_t_state(&mdev->asender) != Running
2288                 || mdev->state.conn < C_CONNECTED;
2289
2290         if (drop_it)
2291                 return TRUE;
2292
2293         drop_it = !--mdev->ko_count;
2294         if (!drop_it) {
2295                 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2296                        current->comm, current->pid, mdev->ko_count);
2297                 request_ping(mdev);
2298         }
2299
2300         return drop_it; /* && (mdev->state == R_PRIMARY) */;
2301 }
2302
2303 /* The idea of sendpage seems to be to put some kind of reference
2304  * to the page into the skb, and to hand it over to the NIC. In
2305  * this process get_page() gets called.
2306  *
2307  * As soon as the page was really sent over the network put_page()
2308  * gets called by some part of the network layer. [ NIC driver? ]
2309  *
2310  * [ get_page() / put_page() increment/decrement the count. If count
2311  *   reaches 0 the page will be freed. ]
2312  *
2313  * This works nicely with pages from FSs.
2314  * But this means that in protocol A we might signal IO completion too early!
2315  *
 * In order not to corrupt data during a resync we must make sure
 * that we do not reuse our own buffer pages (EEs) too early, therefore
 * we have the net_ee list.
2319  *
2320  * XFS seems to have problems, still, it submits pages with page_count == 0!
2321  * As a workaround, we disable sendpage on pages
2322  * with page_count == 0 or PageSlab.
2323  */
2324 static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2325                    int offset, size_t size, unsigned msg_flags)
2326 {
2327         int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
2328         kunmap(page);
2329         if (sent == size)
2330                 mdev->send_cnt += size>>9;
2331         return sent == size;
2332 }
2333
2334 static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2335                     int offset, size_t size, unsigned msg_flags)
2336 {
2337         mm_segment_t oldfs = get_fs();
2338         int sent, ok;
2339         int len = size;
2340
2341         /* e.g. XFS meta- & log-data is in slab pages, which have a
2342          * page_count of 0 and/or have PageSlab() set.
2343          * we cannot use send_page for those, as that does get_page();
2344          * put_page(); and would cause either a VM_BUG directly, or
2345          * __page_cache_release a page that would actually still be referenced
2346          * by someone, leading to some obscure delayed Oops somewhere else. */
2347         if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2348                 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
2349
2350         msg_flags |= MSG_NOSIGNAL;
2351         drbd_update_congested(mdev);
2352         set_fs(KERNEL_DS);
2353         do {
2354                 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2355                                                         offset, len,
2356                                                         msg_flags);
2357                 if (sent == -EAGAIN) {
2358                         if (we_should_drop_the_connection(mdev,
2359                                                           mdev->data.socket))
2360                                 break;
2361                         else
2362                                 continue;
2363                 }
2364                 if (sent <= 0) {
2365                         dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2366                              __func__, (int)size, len, sent);
2367                         break;
2368                 }
2369                 len    -= sent;
2370                 offset += sent;
2371         } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2372         set_fs(oldfs);
2373         clear_bit(NET_CONGESTED, &mdev->flags);
2374
2375         ok = (len == 0);
2376         if (likely(ok))
2377                 mdev->send_cnt += size>>9;
2378         return ok;
2379 }
2380
2381 static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2382 {
2383         struct bio_vec *bvec;
2384         int i;
2385         /* hint all but last page with MSG_MORE */
2386         __bio_for_each_segment(bvec, bio, i, 0) {
2387                 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2388                                      bvec->bv_offset, bvec->bv_len,
2389                                      i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2390                         return 0;
2391         }
2392         return 1;
2393 }
2394
2395 static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2396 {
2397         struct bio_vec *bvec;
2398         int i;
2399         /* hint all but last page with MSG_MORE */
2400         __bio_for_each_segment(bvec, bio, i, 0) {
2401                 if (!_drbd_send_page(mdev, bvec->bv_page,
2402                                      bvec->bv_offset, bvec->bv_len,
2403                                      i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2404                         return 0;
2405         }
2406         return 1;
2407 }
2408
2409 static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2410 {
2411         struct page *page = e->pages;
2412         unsigned len = e->size;
2413         /* hint all but last page with MSG_MORE */
2414         page_chain_for_each(page) {
2415                 unsigned l = min_t(unsigned, len, PAGE_SIZE);
2416                 if (!_drbd_send_page(mdev, page, 0, l,
2417                                 page_chain_next(page) ? MSG_MORE : 0))
2418                         return 0;
2419                 len -= l;
2420         }
2421         return 1;
2422 }
2423
2424 /* Used to send write requests
2425  * R_PRIMARY -> Peer    (P_DATA)
2426  */
2427 int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2428 {
2429         int ok = 1;
2430         struct p_data p;
2431         unsigned int dp_flags = 0;
2432         void *dgb;
2433         int dgs;
2434
2435         if (!drbd_get_data_sock(mdev))
2436                 return 0;
2437
2438         dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2439                 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2440
2441         p.head.magic   = BE_DRBD_MAGIC;
2442         p.head.command = cpu_to_be16(P_DATA);
2443         p.head.length  =
2444                 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size);
2445
2446         p.sector   = cpu_to_be64(req->sector);
2447         p.block_id = (unsigned long)req;
2448         p.seq_num  = cpu_to_be32(req->seq_num =
2449                                  atomic_add_return(1, &mdev->packet_seq));
2450         dp_flags = 0;
2451
2452         /* NOTE: no need to check if barriers supported here as we would
2453          *       not pass the test in make_request_common in that case
2454          */
2455         if (req->master_bio->bi_rw & REQ_HARDBARRIER) {
2456                 dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n");
2457                 /* dp_flags |= DP_HARDBARRIER; */
2458         }
2459         if (req->master_bio->bi_rw & REQ_SYNC)
2460                 dp_flags |= DP_RW_SYNC;
2461         /* for now handle SYNCIO and UNPLUG
2462          * as if they still were one and the same flag */
2463         if (req->master_bio->bi_rw & REQ_UNPLUG)
2464                 dp_flags |= DP_RW_SYNC;
2465         if (mdev->state.conn >= C_SYNC_SOURCE &&
2466             mdev->state.conn <= C_PAUSED_SYNC_T)
2467                 dp_flags |= DP_MAY_SET_IN_SYNC;
2468
2469         p.dp_flags = cpu_to_be32(dp_flags);
2470         set_bit(UNPLUG_REMOTE, &mdev->flags);
2471         ok = (sizeof(p) ==
2472                 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
2473         if (ok && dgs) {
2474                 dgb = mdev->int_dig_out;
2475                 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2476                 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2477         }
2478         if (ok) {
2479                 if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
2480                         ok = _drbd_send_bio(mdev, req->master_bio);
2481                 else
2482                         ok = _drbd_send_zc_bio(mdev, req->master_bio);
2483         }
2484
2485         drbd_put_data_sock(mdev);
2486
2487         return ok;
2488 }
2489
2490 /* answer packet, used to send data back for read requests:
2491  *  Peer       -> (diskless) R_PRIMARY   (P_DATA_REPLY)
2492  *  C_SYNC_SOURCE -> C_SYNC_TARGET         (P_RS_DATA_REPLY)
2493  */
2494 int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2495                     struct drbd_epoch_entry *e)
2496 {
2497         int ok;
2498         struct p_data p;
2499         void *dgb;
2500         int dgs;
2501
2502         dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2503                 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2504
2505         p.head.magic   = BE_DRBD_MAGIC;
2506         p.head.command = cpu_to_be16(cmd);
2507         p.head.length  =
2508                 cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size);
2509
2510         p.sector   = cpu_to_be64(e->sector);
2511         p.block_id = e->block_id;
2512         /* p.seq_num  = 0;    No sequence numbers here.. */
2513
2514         /* Only called by our kernel thread.
2515          * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2516          * in response to admin command or module unload.
2517          */
2518         if (!drbd_get_data_sock(mdev))
2519                 return 0;
2520
2521         ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p,
2522                                         sizeof(p), dgs ? MSG_MORE : 0);
2523         if (ok && dgs) {
2524                 dgb = mdev->int_dig_out;
2525                 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
2526                 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2527         }
2528         if (ok)
2529                 ok = _drbd_send_zc_ee(mdev, e);
2530
2531         drbd_put_data_sock(mdev);
2532
2533         return ok;
2534 }
2535
2536 /*
2537   drbd_send distinguishes two cases:
2538
2539   Packets sent via the data socket "sock"
2540   and packets sent via the meta data socket "msock"
2541
2542                     sock                      msock
2543   -----------------+-------------------------+------------------------------
2544   timeout           conf.timeout / 2          conf.timeout / 2
2545   timeout action    send a ping via msock     Abort communication
2546                                               and close all sockets
2547 */
2548
2549 /*
2550  * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2551  */
2552 int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2553               void *buf, size_t size, unsigned msg_flags)
2554 {
2555         struct kvec iov;
2556         struct msghdr msg;
2557         int rv, sent = 0;
2558
2559         if (!sock)
2560                 return -1000;
2561
2562         /* THINK  if (signal_pending) return ... ? */
2563
2564         iov.iov_base = buf;
2565         iov.iov_len  = size;
2566
2567         msg.msg_name       = NULL;
2568         msg.msg_namelen    = 0;
2569         msg.msg_control    = NULL;
2570         msg.msg_controllen = 0;
2571         msg.msg_flags      = msg_flags | MSG_NOSIGNAL;
2572
2573         if (sock == mdev->data.socket) {
2574                 mdev->ko_count = mdev->net_conf->ko_count;
2575                 drbd_update_congested(mdev);
2576         }
2577         do {
2578                 /* STRANGE
2579                  * tcp_sendmsg does _not_ use its size parameter at all ?
2580                  *
2581                  * -EAGAIN on timeout, -EINTR on signal.
2582                  */
2583 /* THINK
2584  * do we need to block DRBD_SIG if sock == &meta.socket ??
2585  * otherwise wake_asender() might interrupt some send_*Ack !
2586  */
2587                 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2588                 if (rv == -EAGAIN) {
2589                         if (we_should_drop_the_connection(mdev, sock))
2590                                 break;
2591                         else
2592                                 continue;
2593                 }
2594                 D_ASSERT(rv != 0);
2595                 if (rv == -EINTR) {
2596                         flush_signals(current);
2597                         rv = 0;
2598                 }
2599                 if (rv < 0)
2600                         break;
2601                 sent += rv;
2602                 iov.iov_base += rv;
2603                 iov.iov_len  -= rv;
2604         } while (sent < size);
2605
2606         if (sock == mdev->data.socket)
2607                 clear_bit(NET_CONGESTED, &mdev->flags);
2608
2609         if (rv <= 0) {
2610                 if (rv != -EAGAIN) {
2611                         dev_err(DEV, "%s_sendmsg returned %d\n",
2612                             sock == mdev->meta.socket ? "msock" : "sock",
2613                             rv);
2614                         drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2615                 } else
2616                         drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2617         }
2618
2619         return sent;
2620 }
2621
2622 static int drbd_open(struct block_device *bdev, fmode_t mode)
2623 {
2624         struct drbd_conf *mdev = bdev->bd_disk->private_data;
2625         unsigned long flags;
2626         int rv = 0;
2627
2628         lock_kernel();
2629         spin_lock_irqsave(&mdev->req_lock, flags);
2630         /* to have a stable mdev->state.role
2631          * and no race with updating open_cnt */
2632
2633         if (mdev->state.role != R_PRIMARY) {
2634                 if (mode & FMODE_WRITE)
2635                         rv = -EROFS;
2636                 else if (!allow_oos)
2637                         rv = -EMEDIUMTYPE;
2638         }
2639
2640         if (!rv)
2641                 mdev->open_cnt++;
2642         spin_unlock_irqrestore(&mdev->req_lock, flags);
2643         unlock_kernel();
2644
2645         return rv;
2646 }
2647
2648 static int drbd_release(struct gendisk *gd, fmode_t mode)
2649 {
2650         struct drbd_conf *mdev = gd->private_data;
2651         lock_kernel();
2652         mdev->open_cnt--;
2653         unlock_kernel();
2654         return 0;
2655 }
2656
2657 static void drbd_unplug_fn(struct request_queue *q)
2658 {
2659         struct drbd_conf *mdev = q->queuedata;
2660
2661         /* unplug FIRST */
2662         spin_lock_irq(q->queue_lock);
2663         blk_remove_plug(q);
2664         spin_unlock_irq(q->queue_lock);
2665
2666         /* only if connected */
2667         spin_lock_irq(&mdev->req_lock);
2668         if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) {
2669                 D_ASSERT(mdev->state.role == R_PRIMARY);
2670                 if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) {
2671                         /* add to the data.work queue,
2672                          * unless already queued.
2673                          * XXX this might be a good addition to drbd_queue_work
2674                          * anyways, to detect "double queuing" ... */
2675                         if (list_empty(&mdev->unplug_work.list))
2676                                 drbd_queue_work(&mdev->data.work,
2677                                                 &mdev->unplug_work);
2678                 }
2679         }
2680         spin_unlock_irq(&mdev->req_lock);
2681
2682         if (mdev->state.disk >= D_INCONSISTENT)
2683                 drbd_kick_lo(mdev);
2684 }
2685
2686 static void drbd_set_defaults(struct drbd_conf *mdev)
2687 {
2688         /* This way we get a compile error when sync_conf grows,
2689            and we forgot to initialize it here */
2690         mdev->sync_conf = (struct syncer_conf) {
2691                 /* .rate = */           DRBD_RATE_DEF,
2692                 /* .after = */          DRBD_AFTER_DEF,
2693                 /* .al_extents = */     DRBD_AL_EXTENTS_DEF,
2694                 /* .verify_alg = */     {}, 0,
2695                 /* .cpu_mask = */       {}, 0,
2696                 /* .csums_alg = */      {}, 0,
2697                 /* .use_rle = */        0,
2698                 /* .on_no_data = */     DRBD_ON_NO_DATA_DEF,
2699                 /* .c_plan_ahead = */   DRBD_C_PLAN_AHEAD_DEF,
2700                 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2701                 /* .c_fill_target = */  DRBD_C_FILL_TARGET_DEF,
2702                 /* .c_max_rate = */     DRBD_C_MAX_RATE_DEF
2703         };
2704
2705         /* Have to use that way, because the layout differs between
2706            big endian and little endian */
2707         mdev->state = (union drbd_state) {
2708                 { .role = R_SECONDARY,
2709                   .peer = R_UNKNOWN,
2710                   .conn = C_STANDALONE,
2711                   .disk = D_DISKLESS,
2712                   .pdsk = D_UNKNOWN,
2713                   .susp = 0
2714                 } };
2715 }
2716
2717 void drbd_init_set_defaults(struct drbd_conf *mdev)
2718 {
2719         /* the memset(,0,) did most of this.
2720          * note: only assignments, no allocation in here */
2721
2722         drbd_set_defaults(mdev);
2723
2724         /* for now, we do NOT yet support it,
2725          * even though we start some framework
2726          * to eventually support barriers */
2727         set_bit(NO_BARRIER_SUPP, &mdev->flags);
2728
2729         atomic_set(&mdev->ap_bio_cnt, 0);
2730         atomic_set(&mdev->ap_pending_cnt, 0);
2731         atomic_set(&mdev->rs_pending_cnt, 0);
2732         atomic_set(&mdev->unacked_cnt, 0);
2733         atomic_set(&mdev->local_cnt, 0);
2734         atomic_set(&mdev->net_cnt, 0);
2735         atomic_set(&mdev->packet_seq, 0);
2736         atomic_set(&mdev->pp_in_use, 0);
2737         atomic_set(&mdev->rs_sect_in, 0);
2738
2739         mutex_init(&mdev->md_io_mutex);
2740         mutex_init(&mdev->data.mutex);
2741         mutex_init(&mdev->meta.mutex);
2742         sema_init(&mdev->data.work.s, 0);
2743         sema_init(&mdev->meta.work.s, 0);
2744         mutex_init(&mdev->state_mutex);
2745
2746         spin_lock_init(&mdev->data.work.q_lock);
2747         spin_lock_init(&mdev->meta.work.q_lock);
2748
2749         spin_lock_init(&mdev->al_lock);
2750         spin_lock_init(&mdev->req_lock);
2751         spin_lock_init(&mdev->peer_seq_lock);
2752         spin_lock_init(&mdev->epoch_lock);
2753
2754         INIT_LIST_HEAD(&mdev->active_ee);
2755         INIT_LIST_HEAD(&mdev->sync_ee);
2756         INIT_LIST_HEAD(&mdev->done_ee);
2757         INIT_LIST_HEAD(&mdev->read_ee);
2758         INIT_LIST_HEAD(&mdev->net_ee);
2759         INIT_LIST_HEAD(&mdev->resync_reads);
2760         INIT_LIST_HEAD(&mdev->data.work.q);
2761         INIT_LIST_HEAD(&mdev->meta.work.q);
2762         INIT_LIST_HEAD(&mdev->resync_work.list);
2763         INIT_LIST_HEAD(&mdev->unplug_work.list);
2764         INIT_LIST_HEAD(&mdev->md_sync_work.list);
2765         INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
2766
2767         mdev->resync_work.cb  = w_resync_inactive;
2768         mdev->unplug_work.cb  = w_send_write_hint;
2769         mdev->md_sync_work.cb = w_md_sync;
2770         mdev->bm_io_work.w.cb = w_bitmap_io;
2771         init_timer(&mdev->resync_timer);
2772         init_timer(&mdev->md_sync_timer);
2773         mdev->resync_timer.function = resync_timer_fn;
2774         mdev->resync_timer.data = (unsigned long) mdev;
2775         mdev->md_sync_timer.function = md_sync_timer_fn;
2776         mdev->md_sync_timer.data = (unsigned long) mdev;
2777
2778         init_waitqueue_head(&mdev->misc_wait);
2779         init_waitqueue_head(&mdev->state_wait);
2780         init_waitqueue_head(&mdev->net_cnt_wait);
2781         init_waitqueue_head(&mdev->ee_wait);
2782         init_waitqueue_head(&mdev->al_wait);
2783         init_waitqueue_head(&mdev->seq_wait);
2784
2785         drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2786         drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2787         drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2788
2789         mdev->agreed_pro_version = PRO_VERSION_MAX;
2790         mdev->write_ordering = WO_bio_barrier;
2791         mdev->resync_wenr = LC_FREE;
2792 }
2793
2794 void drbd_mdev_cleanup(struct drbd_conf *mdev)
2795 {
2796         if (mdev->receiver.t_state != None)
2797                 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2798                                 mdev->receiver.t_state);
2799
2800         /* no need to lock it, I'm the only thread alive */
2801         if (atomic_read(&mdev->current_epoch->epoch_size) !=  0)
2802                 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
2803         mdev->al_writ_cnt  =
2804         mdev->bm_writ_cnt  =
2805         mdev->read_cnt     =
2806         mdev->recv_cnt     =
2807         mdev->send_cnt     =
2808         mdev->writ_cnt     =
2809         mdev->p_size       =
2810         mdev->rs_start     =
2811         mdev->rs_total     =
2812         mdev->rs_failed    =
2813         mdev->rs_mark_left =
2814         mdev->rs_mark_time = 0;
2815         D_ASSERT(mdev->net_conf == NULL);
2816
2817         drbd_set_my_capacity(mdev, 0);
2818         if (mdev->bitmap) {
2819                 /* maybe never allocated. */
2820                 drbd_bm_resize(mdev, 0, 1);
2821                 drbd_bm_cleanup(mdev);
2822         }
2823
2824         drbd_free_resources(mdev);
2825
2826         /*
2827          * currently we drbd_init_ee only on module load, so
2828          * we may do drbd_release_ee only on module unload!
2829          */
2830         D_ASSERT(list_empty(&mdev->active_ee));
2831         D_ASSERT(list_empty(&mdev->sync_ee));
2832         D_ASSERT(list_empty(&mdev->done_ee));
2833         D_ASSERT(list_empty(&mdev->read_ee));
2834         D_ASSERT(list_empty(&mdev->net_ee));
2835         D_ASSERT(list_empty(&mdev->resync_reads));
2836         D_ASSERT(list_empty(&mdev->data.work.q));
2837         D_ASSERT(list_empty(&mdev->meta.work.q));
2838         D_ASSERT(list_empty(&mdev->resync_work.list));
2839         D_ASSERT(list_empty(&mdev->unplug_work.list));
2840
2841 }
2842
2843
2844 static void drbd_destroy_mempools(void)
2845 {
2846         struct page *page;
2847
2848         while (drbd_pp_pool) {
2849                 page = drbd_pp_pool;
2850                 drbd_pp_pool = (struct page *)page_private(page);
2851                 __free_page(page);
2852                 drbd_pp_vacant--;
2853         }
2854
2855         /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2856
2857         if (drbd_ee_mempool)
2858                 mempool_destroy(drbd_ee_mempool);
2859         if (drbd_request_mempool)
2860                 mempool_destroy(drbd_request_mempool);
2861         if (drbd_ee_cache)
2862                 kmem_cache_destroy(drbd_ee_cache);
2863         if (drbd_request_cache)
2864                 kmem_cache_destroy(drbd_request_cache);
2865         if (drbd_bm_ext_cache)
2866                 kmem_cache_destroy(drbd_bm_ext_cache);
2867         if (drbd_al_ext_cache)
2868                 kmem_cache_destroy(drbd_al_ext_cache);
2869
2870         drbd_ee_mempool      = NULL;
2871         drbd_request_mempool = NULL;
2872         drbd_ee_cache        = NULL;
2873         drbd_request_cache   = NULL;
2874         drbd_bm_ext_cache    = NULL;
2875         drbd_al_ext_cache    = NULL;
2876
2877         return;
2878 }
2879
2880 static int drbd_create_mempools(void)
2881 {
2882         struct page *page;
2883         const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count;
2884         int i;
2885
2886         /* prepare our caches and mempools */
2887         drbd_request_mempool = NULL;
2888         drbd_ee_cache        = NULL;
2889         drbd_request_cache   = NULL;
2890         drbd_bm_ext_cache    = NULL;
2891         drbd_al_ext_cache    = NULL;
2892         drbd_pp_pool         = NULL;
2893
2894         /* caches */
2895         drbd_request_cache = kmem_cache_create(
2896                 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
2897         if (drbd_request_cache == NULL)
2898                 goto Enomem;
2899
2900         drbd_ee_cache = kmem_cache_create(
2901                 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
2902         if (drbd_ee_cache == NULL)
2903                 goto Enomem;
2904
2905         drbd_bm_ext_cache = kmem_cache_create(
2906                 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
2907         if (drbd_bm_ext_cache == NULL)
2908                 goto Enomem;
2909
2910         drbd_al_ext_cache = kmem_cache_create(
2911                 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
2912         if (drbd_al_ext_cache == NULL)
2913                 goto Enomem;
2914
2915         /* mempools */
2916         drbd_request_mempool = mempool_create(number,
2917                 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
2918         if (drbd_request_mempool == NULL)
2919                 goto Enomem;
2920
2921         drbd_ee_mempool = mempool_create(number,
2922                 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
2923         if (drbd_request_mempool == NULL)
2924                 goto Enomem;
2925
2926         /* drbd's page pool */
2927         spin_lock_init(&drbd_pp_lock);
2928
2929         for (i = 0; i < number; i++) {
2930                 page = alloc_page(GFP_HIGHUSER);
2931                 if (!page)
2932                         goto Enomem;
2933                 set_page_private(page, (unsigned long)drbd_pp_pool);
2934                 drbd_pp_pool = page;
2935         }
2936         drbd_pp_vacant = number;
2937
2938         return 0;
2939
2940 Enomem:
2941         drbd_destroy_mempools(); /* in case we allocated some */
2942         return -ENOMEM;
2943 }
2944
/*
 * Notifier callback installed through drbd_notifier.
 * Currently a placeholder: none of the arguments is looked at and
 * NOTIFY_DONE is returned unconditionally.
 */
2945 static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
2946         void *unused)
2947 {
2948         /* just so we have it.  you never know what interesting things we
2949          * might want to do here some day...
2950          */
2951
2952         return NOTIFY_DONE;
2953 }
2954
/* Notifier block passed to register_reboot_notifier() and
 * unregister_reboot_notifier(); its callback is drbd_notify_sys(). */
2955 static struct notifier_block drbd_notifier = {
2956         .notifier_call = drbd_notify_sys,
2957 };
2958
2959 static void drbd_release_ee_lists(struct drbd_conf *mdev)
2960 {
2961         int rr;
2962
2963         rr = drbd_release_ee(mdev, &mdev->active_ee);
2964         if (rr)
2965                 dev_err(DEV, "%d EEs in active list found!\n", rr);
2966
2967         rr = drbd_release_ee(mdev, &mdev->sync_ee);
2968         if (rr)
2969                 dev_err(DEV, "%d EEs in sync list found!\n", rr);
2970
2971         rr = drbd_release_ee(mdev, &mdev->read_ee);
2972         if (rr)
2973                 dev_err(DEV, "%d EEs in read list found!\n", rr);
2974
2975         rr = drbd_release_ee(mdev, &mdev->done_ee);
2976         if (rr)
2977                 dev_err(DEV, "%d EEs in done list found!\n", rr);
2978
2979         rr = drbd_release_ee(mdev, &mdev->net_ee);
2980         if (rr)
2981                 dev_err(DEV, "%d EEs in net list found!\n", rr);
2982 }
2983
2984 /* caution. no locking.
2985  * currently only used from module cleanup code. */
2986 static void drbd_delete_device(unsigned int minor)
2987 {
2988         struct drbd_conf *mdev = minor_to_mdev(minor);
2989
2990         if (!mdev)
2991                 return;
2992
2993         /* paranoia asserts */
2994         if (mdev->open_cnt != 0)
2995                 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
2996                                 __FILE__ , __LINE__);
2997
2998         ERR_IF (!list_empty(&mdev->data.work.q)) {
2999                 struct list_head *lp;
3000                 list_for_each(lp, &mdev->data.work.q) {
3001                         dev_err(DEV, "lp = %p\n", lp);
3002                 }
3003         };
3004         /* end paranoia asserts */
3005
3006         del_gendisk(mdev->vdisk);
3007
3008         /* cleanup stuff that may have been allocated during
3009          * device (re-)configuration or state changes */
3010
3011         if (mdev->this_bdev)
3012                 bdput(mdev->this_bdev);
3013
3014         drbd_free_resources(mdev);
3015
3016         drbd_release_ee_lists(mdev);
3017
3018         /* should be free'd on disconnect? */
3019         kfree(mdev->ee_hash);
3020         /*
3021         mdev->ee_hash_s = 0;
3022         mdev->ee_hash = NULL;
3023         */
3024
3025         lc_destroy(mdev->act_log);
3026         lc_destroy(mdev->resync);
3027
3028         kfree(mdev->p_uuid);
3029         /* mdev->p_uuid = NULL; */
3030
3031         kfree(mdev->int_dig_out);
3032         kfree(mdev->int_dig_in);
3033         kfree(mdev->int_dig_vv);
3034
3035         /* cleanup the rest that has been
3036          * allocated from drbd_new_device
3037          * and actually free the mdev itself */
3038         drbd_free_mdev(mdev);
3039 }
3040
3041 static void drbd_cleanup(void)
3042 {
3043         unsigned int i;
3044
3045         unregister_reboot_notifier(&drbd_notifier);
3046
3047         drbd_nl_cleanup();
3048
3049         if (minor_table) {
3050                 if (drbd_proc)
3051                         remove_proc_entry("drbd", NULL);
3052                 i = minor_count;
3053                 while (i--)
3054                         drbd_delete_device(i);
3055                 drbd_destroy_mempools();
3056         }
3057
3058         kfree(minor_table);
3059
3060         unregister_blkdev(DRBD_MAJOR, "drbd");
3061
3062         printk(KERN_INFO "drbd: module cleanup done.\n");
3063 }
3064
3065 /**
3066  * drbd_congested() - Callback for pdflush
3067  * @congested_data:     User data
3068  * @bdi_bits:           Bits pdflush is currently interested in
3069  *
3070  * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3071  */
3072 static int drbd_congested(void *congested_data, int bdi_bits)
3073 {
3074         struct drbd_conf *mdev = congested_data;
3075         struct request_queue *q;
3076         char reason = '-';
3077         int r = 0;
3078
3079         if (!__inc_ap_bio_cond(mdev)) {
3080                 /* DRBD has frozen IO */
3081                 r = bdi_bits;
3082                 reason = 'd';
3083                 goto out;
3084         }
3085
3086         if (get_ldev(mdev)) {
3087                 q = bdev_get_queue(mdev->ldev->backing_bdev);
3088                 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3089                 put_ldev(mdev);
3090                 if (r)
3091                         reason = 'b';
3092         }
3093
3094         if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3095                 r |= (1 << BDI_async_congested);
3096                 reason = reason == 'b' ? 'a' : 'n';
3097         }
3098
3099 out:
3100         mdev->congestion_reason = reason;
3101         return r;
3102 }
3103
3104 struct drbd_conf *drbd_new_device(unsigned int minor)
3105 {
3106         struct drbd_conf *mdev;
3107         struct gendisk *disk;
3108         struct request_queue *q;
3109
3110         /* GFP_KERNEL, we are outside of all write-out paths */
3111         mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3112         if (!mdev)
3113                 return NULL;
3114         if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3115                 goto out_no_cpumask;
3116
3117         mdev->minor = minor;
3118
3119         drbd_init_set_defaults(mdev);
3120
3121         q = blk_alloc_queue(GFP_KERNEL);
3122         if (!q)
3123                 goto out_no_q;
3124         mdev->rq_queue = q;
3125         q->queuedata   = mdev;
3126
3127         disk = alloc_disk(1);
3128         if (!disk)
3129                 goto out_no_disk;
3130         mdev->vdisk = disk;
3131
3132         set_disk_ro(disk, TRUE);
3133
3134         disk->queue = q;
3135         disk->major = DRBD_MAJOR;
3136         disk->first_minor = minor;
3137         disk->fops = &drbd_ops;
3138         sprintf(disk->disk_name, "drbd%d", minor);
3139         disk->private_data = mdev;
3140
3141         mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3142         /* we have no partitions. we contain only ourselves. */
3143         mdev->this_bdev->bd_contains = mdev->this_bdev;
3144
3145         q->backing_dev_info.congested_fn = drbd_congested;
3146         q->backing_dev_info.congested_data = mdev;
3147
3148         blk_queue_make_request(q, drbd_make_request_26);
3149         blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE);
3150         blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3151         blk_queue_merge_bvec(q, drbd_merge_bvec);
3152         q->queue_lock = &mdev->req_lock; /* needed since we use */
3153                 /* plugging on a queue, that actually has no requests! */
3154         q->unplug_fn = drbd_unplug_fn;
3155
3156         mdev->md_io_page = alloc_page(GFP_KERNEL);
3157         if (!mdev->md_io_page)
3158                 goto out_no_io_page;
3159
3160         if (drbd_bm_init(mdev))
3161                 goto out_no_bitmap;
3162         /* no need to lock access, we are still initializing this minor device. */
3163         if (!tl_init(mdev))
3164                 goto out_no_tl;
3165
3166         mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3167         if (!mdev->app_reads_hash)
3168                 goto out_no_app_reads;
3169
3170         mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3171         if (!mdev->current_epoch)
3172                 goto out_no_epoch;
3173
3174         INIT_LIST_HEAD(&mdev->current_epoch->list);
3175         mdev->epochs = 1;
3176
3177         return mdev;
3178
3179 /* out_whatever_else:
3180         kfree(mdev->current_epoch); */
3181 out_no_epoch:
3182         kfree(mdev->app_reads_hash);
3183 out_no_app_reads:
3184         tl_cleanup(mdev);
3185 out_no_tl:
3186         drbd_bm_cleanup(mdev);
3187 out_no_bitmap:
3188         __free_page(mdev->md_io_page);
3189 out_no_io_page:
3190         put_disk(disk);
3191 out_no_disk:
3192         blk_cleanup_queue(q);
3193 out_no_q:
3194         free_cpumask_var(mdev->cpu_mask);
3195 out_no_cpumask:
3196         kfree(mdev);
3197         return NULL;
3198 }
3199
3200 /* counterpart of drbd_new_device.
3201  * last part of drbd_delete_device. */
/* Releases what drbd_new_device() allocated, in reverse order of
 * allocation, and finally the mdev itself.  The reference on this_bdev
 * is not dropped here; drbd_delete_device() does that before calling
 * this function. */
3202 void drbd_free_mdev(struct drbd_conf *mdev)
3203 {
3204         kfree(mdev->current_epoch);
3205         kfree(mdev->app_reads_hash);
3206         tl_cleanup(mdev);
3207         if (mdev->bitmap) /* should no longer be there. */
3208                 drbd_bm_cleanup(mdev);
3209         __free_page(mdev->md_io_page);
3210         put_disk(mdev->vdisk);
3211         blk_cleanup_queue(mdev->rq_queue);
3212         free_cpumask_var(mdev->cpu_mask);
3213         kfree(mdev);
3214 }
3215
3216
3217 int __init drbd_init(void)
3218 {
3219         int err;
3220
3221         if (sizeof(struct p_handshake) != 80) {
3222                 printk(KERN_ERR
3223                        "drbd: never change the size or layout "
3224                        "of the HandShake packet.\n");
3225                 return -EINVAL;
3226         }
3227
3228         if (1 > minor_count || minor_count > 255) {
3229                 printk(KERN_ERR
3230                         "drbd: invalid minor_count (%d)\n", minor_count);
3231 #ifdef MODULE
3232                 return -EINVAL;
3233 #else
3234                 minor_count = 8;
3235 #endif
3236         }
3237
3238         err = drbd_nl_init();
3239         if (err)
3240                 return err;
3241
3242         err = register_blkdev(DRBD_MAJOR, "drbd");
3243         if (err) {
3244                 printk(KERN_ERR
3245                        "drbd: unable to register block device major %d\n",
3246                        DRBD_MAJOR);
3247                 return err;
3248         }
3249
3250         register_reboot_notifier(&drbd_notifier);
3251
3252         /*
3253          * allocate all necessary structs
3254          */
3255         err = -ENOMEM;
3256
3257         init_waitqueue_head(&drbd_pp_wait);
3258
3259         drbd_proc = NULL; /* play safe for drbd_cleanup */
3260         minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3261                                 GFP_KERNEL);
3262         if (!minor_table)
3263                 goto Enomem;
3264
3265         err = drbd_create_mempools();
3266         if (err)
3267                 goto Enomem;
3268
3269         drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
3270         if (!drbd_proc) {
3271                 printk(KERN_ERR "drbd: unable to register proc file\n");
3272                 goto Enomem;
3273         }
3274
3275         rwlock_init(&global_state_lock);
3276
3277         printk(KERN_INFO "drbd: initialized. "
3278                "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3279                API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3280         printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3281         printk(KERN_INFO "drbd: registered as block device major %d\n",
3282                 DRBD_MAJOR);
3283         printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3284
3285         return 0; /* Success! */
3286
3287 Enomem:
3288         drbd_cleanup();
3289         if (err == -ENOMEM)
3290                 /* currently always the case */
3291                 printk(KERN_ERR "drbd: ran out of memory\n");
3292         else
3293                 printk(KERN_ERR "drbd: initialization failure\n");
3294         return err;
3295 }
3296
3297 void drbd_free_bc(struct drbd_backing_dev *ldev)
3298 {
3299         if (ldev == NULL)
3300                 return;
3301
3302         bd_release(ldev->backing_bdev);
3303         bd_release(ldev->md_bdev);
3304
3305         fput(ldev->lo_file);
3306         fput(ldev->md_file);
3307
3308         kfree(ldev);
3309 }
3310
3311 void drbd_free_sock(struct drbd_conf *mdev)
3312 {
3313         if (mdev->data.socket) {
3314                 mutex_lock(&mdev->data.mutex);
3315                 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3316                 sock_release(mdev->data.socket);
3317                 mdev->data.socket = NULL;
3318                 mutex_unlock(&mdev->data.mutex);
3319         }
3320         if (mdev->meta.socket) {
3321                 mutex_lock(&mdev->meta.mutex);
3322                 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3323                 sock_release(mdev->meta.socket);
3324                 mdev->meta.socket = NULL;
3325                 mutex_unlock(&mdev->meta.mutex);
3326         }
3327 }
3328
3329
3330 void drbd_free_resources(struct drbd_conf *mdev)
3331 {
3332         crypto_free_hash(mdev->csums_tfm);
3333         mdev->csums_tfm = NULL;
3334         crypto_free_hash(mdev->verify_tfm);
3335         mdev->verify_tfm = NULL;
3336         crypto_free_hash(mdev->cram_hmac_tfm);
3337         mdev->cram_hmac_tfm = NULL;
3338         crypto_free_hash(mdev->integrity_w_tfm);
3339         mdev->integrity_w_tfm = NULL;
3340         crypto_free_hash(mdev->integrity_r_tfm);
3341         mdev->integrity_r_tfm = NULL;
3342
3343         drbd_free_sock(mdev);
3344
3345         __no_warn(local,
3346                   drbd_free_bc(mdev->ldev);
3347                   mdev->ldev = NULL;);
3348 }
3349
3350 /* meta data management */
3351
3352 struct meta_data_on_disk {
3353         u64 la_size;           /* last agreed size. */
3354         u64 uuid[UI_SIZE];   /* UUIDs. */
3355         u64 device_uuid;
3356         u64 reserved_u64_1;
3357         u32 flags;             /* MDF */
3358         u32 magic;
3359         u32 md_size_sect;
3360         u32 al_offset;         /* offset to this block */
3361         u32 al_nr_extents;     /* important for restoring the AL */
3362               /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3363         u32 bm_offset;         /* offset to the bitmap, from here */
3364         u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
3365         u32 reserved_u32[4];
3366
3367 } __packed;
3368
3369 /**
3370  * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3371  * @mdev:       DRBD device.
3372  */
3373 void drbd_md_sync(struct drbd_conf *mdev)
3374 {
3375         struct meta_data_on_disk *buffer;
3376         sector_t sector;
3377         int i;
3378
3379         if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3380                 return;
3381         del_timer(&mdev->md_sync_timer);
3382
3383         /* We use here D_FAILED and not D_ATTACHING because we try to write
3384          * metadata even if we detach due to a disk failure! */
3385         if (!get_ldev_if_state(mdev, D_FAILED))
3386                 return;
3387
3388         mutex_lock(&mdev->md_io_mutex);
3389         buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3390         memset(buffer, 0, 512);
3391
3392         buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3393         for (i = UI_CURRENT; i < UI_SIZE; i++)
3394                 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3395         buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3396         buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3397
3398         buffer->md_size_sect  = cpu_to_be32(mdev->ldev->md.md_size_sect);
3399         buffer->al_offset     = cpu_to_be32(mdev->ldev->md.al_offset);
3400         buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3401         buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3402         buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3403
3404         buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3405
3406         D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3407         sector = mdev->ldev->md.md_offset;
3408
3409         if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3410                 clear_bit(MD_DIRTY, &mdev->flags);
3411         } else {
3412                 /* this was a try anyways ... */
3413                 dev_err(DEV, "meta data update failed!\n");
3414
3415                 drbd_chk_io_error(mdev, 1, TRUE);
3416         }
3417
3418         /* Update mdev->ldev->md.la_size_sect,
3419          * since we updated it on metadata. */
3420         mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3421
3422         mutex_unlock(&mdev->md_io_mutex);
3423         put_ldev(mdev);
3424 }
3425
3426 /**
3427  * drbd_md_read() - Reads in the meta data super block
3428  * @mdev:       DRBD device.
3429  * @bdev:       Device from which the meta data should be read in.
3430  *
3431  * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
3432  * something goes wrong.  Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3433  */
3434 int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3435 {
3436         struct meta_data_on_disk *buffer;
3437         int i, rv = NO_ERROR;
3438
3439         if (!get_ldev_if_state(mdev, D_ATTACHING))
3440                 return ERR_IO_MD_DISK;
3441
3442         mutex_lock(&mdev->md_io_mutex);
3443         buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3444
3445         if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3446                 /* NOTE: cant do normal error processing here as this is
3447                    called BEFORE disk is attached */
3448                 dev_err(DEV, "Error while reading metadata.\n");
3449                 rv = ERR_IO_MD_DISK;
3450                 goto err;
3451         }
3452
3453         if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3454                 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3455                 rv = ERR_MD_INVALID;
3456                 goto err;
3457         }
3458         if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3459                 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3460                     be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3461                 rv = ERR_MD_INVALID;
3462                 goto err;
3463         }
3464         if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3465                 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3466                     be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3467                 rv = ERR_MD_INVALID;
3468                 goto err;
3469         }
3470         if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3471                 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3472                     be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3473                 rv = ERR_MD_INVALID;
3474                 goto err;
3475         }
3476
3477         if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3478                 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3479                     be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3480                 rv = ERR_MD_INVALID;
3481                 goto err;
3482         }
3483
3484         bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3485         for (i = UI_CURRENT; i < UI_SIZE; i++)
3486                 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3487         bdev->md.flags = be32_to_cpu(buffer->flags);
3488         mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3489         bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3490
3491         if (mdev->sync_conf.al_extents < 7)
3492                 mdev->sync_conf.al_extents = 127;
3493
3494  err:
3495         mutex_unlock(&mdev->md_io_mutex);
3496         put_ldev(mdev);
3497
3498         return rv;
3499 }
3500
3501 /**
3502  * drbd_md_mark_dirty() - Mark meta data super block as dirty
3503  * @mdev:       DRBD device.
3504  *
3505  * Call this function if you change anything that should be written to
3506  * the meta-data super block. This function sets MD_DIRTY, and starts a
3507  * timer that ensures that within five seconds you have to call drbd_md_sync().
3508  */
3509 void drbd_md_mark_dirty(struct drbd_conf *mdev)
3510 {
3511         set_bit(MD_DIRTY, &mdev->flags);
3512         mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3513 }
3514
3515
3516 static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3517 {
3518         int i;
3519
3520         for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
3521                 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3522 }
3523
3524 void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3525 {
3526         if (idx == UI_CURRENT) {
3527                 if (mdev->state.role == R_PRIMARY)
3528                         val |= 1;
3529                 else
3530                         val &= ~((u64)1);
3531
3532                 drbd_set_ed_uuid(mdev, val);
3533         }
3534
3535         mdev->ldev->md.uuid[idx] = val;
3536         drbd_md_mark_dirty(mdev);
3537 }
3538
3539
3540 void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3541 {
3542         if (mdev->ldev->md.uuid[idx]) {
3543                 drbd_uuid_move_history(mdev);
3544                 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
3545         }
3546         _drbd_uuid_set(mdev, idx, val);
3547 }
3548
3549 /**
3550  * drbd_uuid_new_current() - Creates a new current UUID
3551  * @mdev:       DRBD device.
3552  *
3553  * Creates a new current UUID, and rotates the old current UUID into
3554  * the bitmap slot. Causes an incremental resync upon next connect.
3555  */
3556 void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3557 {
3558         u64 val;
3559
3560         dev_info(DEV, "Creating new current UUID\n");
3561         D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3562         mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
3563
3564         get_random_bytes(&val, sizeof(u64));
3565         _drbd_uuid_set(mdev, UI_CURRENT, val);
3566 }
3567
3568 void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3569 {
3570         if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3571                 return;
3572
3573         if (val == 0) {
3574                 drbd_uuid_move_history(mdev);
3575                 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3576                 mdev->ldev->md.uuid[UI_BITMAP] = 0;
3577         } else {
3578                 if (mdev->ldev->md.uuid[UI_BITMAP])
3579                         dev_warn(DEV, "bm UUID already set");
3580
3581                 mdev->ldev->md.uuid[UI_BITMAP] = val;
3582                 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3583
3584         }
3585         drbd_md_mark_dirty(mdev);
3586 }
3587
3588 /**
3589  * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3590  * @mdev:       DRBD device.
3591  *
3592  * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3593  */
3594 int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3595 {
3596         int rv = -EIO;
3597
3598         if (get_ldev_if_state(mdev, D_ATTACHING)) {
3599                 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3600                 drbd_md_sync(mdev);
3601                 drbd_bm_set_all(mdev);
3602
3603                 rv = drbd_bm_write(mdev);
3604
3605                 if (!rv) {
3606                         drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3607                         drbd_md_sync(mdev);
3608                 }
3609
3610                 put_ldev(mdev);
3611         }
3612
3613         return rv;
3614 }
3615
3616 /**
3617  * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3618  * @mdev:       DRBD device.
3619  *
3620  * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3621  */
3622 int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3623 {
3624         int rv = -EIO;
3625
3626         if (get_ldev_if_state(mdev, D_ATTACHING)) {
3627                 drbd_bm_clear_all(mdev);
3628                 rv = drbd_bm_write(mdev);
3629                 put_ldev(mdev);
3630         }
3631
3632         return rv;
3633 }
3634
3635 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3636 {
3637         struct bm_io_work *work = container_of(w, struct bm_io_work, w);
3638         int rv;
3639
3640         D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3641
3642         drbd_bm_lock(mdev, work->why);
3643         rv = work->io_fn(mdev);
3644         drbd_bm_unlock(mdev);
3645
3646         clear_bit(BITMAP_IO, &mdev->flags);
3647         wake_up(&mdev->misc_wait);
3648
3649         if (work->done)
3650                 work->done(mdev, rv);
3651
3652         clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3653         work->why = NULL;
3654
3655         return 1;
3656 }
3657
3658 /**
3659  * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3660  * @mdev:       DRBD device.
3661  * @io_fn:      IO callback to be called when bitmap IO is possible
3662  * @done:       callback to be called after the bitmap IO was performed
3663  * @why:        Descriptive text of the reason for doing the IO
3664  *
3665  * While IO on the bitmap happens we freeze application IO thus we ensure
3666  * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
3667  * called from worker context. It MUST NOT be used while a previous such
3668  * work is still pending!
3669  */
3670 void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3671                           int (*io_fn)(struct drbd_conf *),
3672                           void (*done)(struct drbd_conf *, int),
3673                           char *why)
3674 {
3675         D_ASSERT(current == mdev->worker.task);
3676
3677         D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3678         D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3679         D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3680         if (mdev->bm_io_work.why)
3681                 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3682                         why, mdev->bm_io_work.why);
3683
3684         mdev->bm_io_work.io_fn = io_fn;
3685         mdev->bm_io_work.done = done;
3686         mdev->bm_io_work.why = why;
3687
3688         set_bit(BITMAP_IO, &mdev->flags);
3689         if (atomic_read(&mdev->ap_bio_cnt) == 0) {
3690                 if (list_empty(&mdev->bm_io_work.w.list)) {
3691                         set_bit(BITMAP_IO_QUEUED, &mdev->flags);
3692                         drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
3693                 } else
3694                         dev_err(DEV, "FIXME avoided double queuing bm_io_work\n");
3695         }
3696 }
3697
3698 /**
3699  * drbd_bitmap_io() -  Does an IO operation on the whole bitmap
3700  * @mdev:       DRBD device.
3701  * @io_fn:      IO callback to be called when bitmap IO is possible
3702  * @why:        Descriptive text of the reason for doing the IO
3703  *
3704  * freezes application IO while that the actual IO operations runs. This
3705  * functions MAY NOT be called from worker context.
3706  */
3707 int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
3708 {
3709         int rv;
3710
3711         D_ASSERT(current != mdev->worker.task);
3712
3713         drbd_suspend_io(mdev);
3714
3715         drbd_bm_lock(mdev, why);
3716         rv = io_fn(mdev);
3717         drbd_bm_unlock(mdev);
3718
3719         drbd_resume_io(mdev);
3720
3721         return rv;
3722 }
3723
3724 void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3725 {
3726         if ((mdev->ldev->md.flags & flag) != flag) {
3727                 drbd_md_mark_dirty(mdev);
3728                 mdev->ldev->md.flags |= flag;
3729         }
3730 }
3731
3732 void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3733 {
3734         if ((mdev->ldev->md.flags & flag) != 0) {
3735                 drbd_md_mark_dirty(mdev);
3736                 mdev->ldev->md.flags &= ~flag;
3737         }
3738 }
3739 int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3740 {
3741         return (bdev->md.flags & flag) != 0;
3742 }
3743
3744 static void md_sync_timer_fn(unsigned long data)
3745 {
3746         struct drbd_conf *mdev = (struct drbd_conf *) data;
3747
3748         drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
3749 }
3750
3751 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3752 {
3753         dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
3754         drbd_md_sync(mdev);
3755
3756         return 1;
3757 }
3758
3759 #ifdef CONFIG_DRBD_FAULT_INJECTION
3760 /* Fault insertion support including random number generator shamelessly
3761  * stolen from kernel/rcutorture.c */
/* State of the pseudo random number generator used for fault insertion. */
struct fault_random_state {
	unsigned long state;	/* current value of the generator */
	unsigned long count;	/* draws left before fresh random bytes
				 * are mixed into state */
};
3766
3767 #define FAULT_RANDOM_MULT 39916801  /* prime */
3768 #define FAULT_RANDOM_ADD        479001701 /* prime */
3769 #define FAULT_RANDOM_REFRESH 10000
3770
3771 /*
3772  * Crude but fast random-number generator.  Uses a linear congruential
3773  * generator, with occasional help from get_random_bytes().
3774  */
3775 static unsigned long
3776 _drbd_fault_random(struct fault_random_state *rsp)
3777 {
3778         long refresh;
3779
3780         if (!rsp->count--) {
3781                 get_random_bytes(&refresh, sizeof(refresh));
3782                 rsp->state += refresh;
3783                 rsp->count = FAULT_RANDOM_REFRESH;
3784         }
3785         rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
3786         return swahw32(rsp->state);
3787 }
3788
3789 static char *
3790 _drbd_fault_str(unsigned int type) {
3791         static char *_faults[] = {
3792                 [DRBD_FAULT_MD_WR] = "Meta-data write",
3793                 [DRBD_FAULT_MD_RD] = "Meta-data read",
3794                 [DRBD_FAULT_RS_WR] = "Resync write",
3795                 [DRBD_FAULT_RS_RD] = "Resync read",
3796                 [DRBD_FAULT_DT_WR] = "Data write",
3797                 [DRBD_FAULT_DT_RD] = "Data read",
3798                 [DRBD_FAULT_DT_RA] = "Data read ahead",
3799                 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
3800                 [DRBD_FAULT_AL_EE] = "EE allocation",
3801                 [DRBD_FAULT_RECEIVE] = "receive data corruption",
3802         };
3803
3804         return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
3805 }
3806
3807 unsigned int
3808 _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
3809 {
3810         static struct fault_random_state rrs = {0, 0};
3811
3812         unsigned int ret = (
3813                 (fault_devs == 0 ||
3814                         ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
3815                 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
3816
3817         if (ret) {
3818                 fault_count++;
3819
3820                 if (__ratelimit(&drbd_ratelimit_state))
3821                         dev_warn(DEV, "***Simulating %s failure\n",
3822                                 _drbd_fault_str(type));
3823         }
3824
3825         return ret;
3826 }
3827 #endif
3828
/*
 * Returns a static string identifying this build.
 *
 * The buffer starts out with a NUL as first character, which marks it as
 * not yet initialized.  On the first call it is either filled with
 * "srcversion: <srcversion of this module>" or, when there is no module
 * object, turned into "built-in" by patching the first character.
 */
const char *drbd_buildtag(void)
{
	/* DRBD built from external sources has here a reference to the
	   git hash of the source code. */

	static char buildtag[38] = "\0uilt-in";

	if (buildtag[0] == 0) {
#ifdef CONFIG_MODULES
		if (THIS_MODULE != NULL)
			/* bounded: never write past the fixed size buffer,
			 * whatever the length of srcversion is */
			snprintf(buildtag, sizeof(buildtag),
				 "srcversion: %-24s", THIS_MODULE->srcversion);
		else
#endif
			buildtag[0] = 'b';
	}

	return buildtag;
}
3847
3848 module_init(drbd_init)
3849 module_exit(drbd_cleanup)
3850
3851 EXPORT_SYMBOL(drbd_conn_str);
3852 EXPORT_SYMBOL(drbd_role_str);
3853 EXPORT_SYMBOL(drbd_disk_str);
3854 EXPORT_SYMBOL(drbd_set_st_err_str);