drbd: The new, smarter resync speed controller
[linux-2.6.git] / drivers / block / drbd / drbd_receiver.c
1 /*
2    drbd_receiver.c
3
4    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10    drbd is free software; you can redistribute it and/or modify
11    it under the terms of the GNU General Public License as published by
12    the Free Software Foundation; either version 2, or (at your option)
13    any later version.
14
15    drbd is distributed in the hope that it will be useful,
16    but WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18    GNU General Public License for more details.
19
20    You should have received a copy of the GNU General Public License
21    along with drbd; see the file COPYING.  If not, write to
22    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23  */
24
25
26 #include <linux/module.h>
27
28 #include <asm/uaccess.h>
29 #include <net/sock.h>
30
31 #include <linux/drbd.h>
32 #include <linux/fs.h>
33 #include <linux/file.h>
34 #include <linux/in.h>
35 #include <linux/mm.h>
36 #include <linux/memcontrol.h>
37 #include <linux/mm_inline.h>
38 #include <linux/slab.h>
39 #include <linux/smp_lock.h>
40 #include <linux/pkt_sched.h>
41 #define __KERNEL_SYSCALLS__
42 #include <linux/unistd.h>
43 #include <linux/vmalloc.h>
44 #include <linux/random.h>
45 #include <linux/string.h>
46 #include <linux/scatterlist.h>
47 #include "drbd_int.h"
48 #include "drbd_req.h"
49
50 #include "drbd_vli.h"
51
52 struct flush_work {
53         struct drbd_work w;
54         struct drbd_epoch *epoch;
55 };
56
/* Result codes returned by drbd_may_finish_epoch().
 * NOTE(review): the meaning of each value is defined by that function,
 * which is not part of this view; the names suggest "epoch still in use",
 * "epoch freed" and "epoch object reused" -- confirm there. */
enum finish_epoch {
	FE_STILL_LIVE = 0,
	FE_DESTROYED  = 1,
	FE_RECYCLED   = 2,
};
62
/* Forward declarations of helpers that are defined further down but
 * referenced earlier in this file. */
static int drbd_do_handshake(struct drbd_conf *mdev);
static int drbd_do_auth(struct drbd_conf *mdev);

static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
					       struct drbd_epoch *epoch,
					       enum epoch_event ev);
static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int unused);
68
69 static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
70 {
71         struct drbd_epoch *prev;
72         spin_lock(&mdev->epoch_lock);
73         prev = list_entry(epoch->list.prev, struct drbd_epoch, list);
74         if (prev == epoch || prev == mdev->current_epoch)
75                 prev = NULL;
76         spin_unlock(&mdev->epoch_lock);
77         return prev;
78 }
79
/* Flags for opportunistic page allocation: highmem pages are acceptable,
 * and a failed allocation must not be logged by the allocator. */
#define GFP_TRY (__GFP_NOWARN | __GFP_HIGHMEM)
81
82 /*
83  * some helper functions to deal with single linked page lists,
84  * page->private being our "next" pointer.
85  */
86
87 /* If at least n pages are linked at head, get n pages off.
88  * Otherwise, don't modify head, and return NULL.
89  * Locking is the responsibility of the caller.
90  */
91 static struct page *page_chain_del(struct page **head, int n)
92 {
93         struct page *page;
94         struct page *tmp;
95
96         BUG_ON(!n);
97         BUG_ON(!head);
98
99         page = *head;
100
101         if (!page)
102                 return NULL;
103
104         while (page) {
105                 tmp = page_chain_next(page);
106                 if (--n == 0)
107                         break; /* found sufficient pages */
108                 if (tmp == NULL)
109                         /* insufficient pages, don't use any of them. */
110                         return NULL;
111                 page = tmp;
112         }
113
114         /* add end of list marker for the returned list */
115         set_page_private(page, 0);
116         /* actual return value, and adjustment of head */
117         page = *head;
118         *head = tmp;
119         return page;
120 }
121
122 /* may be used outside of locks to find the tail of a (usually short)
123  * "private" page chain, before adding it back to a global chain head
124  * with page_chain_add() under a spinlock. */
125 static struct page *page_chain_tail(struct page *page, int *len)
126 {
127         struct page *tmp;
128         int i = 1;
129         while ((tmp = page_chain_next(page)))
130                 ++i, page = tmp;
131         if (len)
132                 *len = i;
133         return page;
134 }
135
136 static int page_chain_free(struct page *page)
137 {
138         struct page *tmp;
139         int i = 0;
140         page_chain_for_each_safe(page, tmp) {
141                 put_page(page);
142                 ++i;
143         }
144         return i;
145 }
146
147 static void page_chain_add(struct page **head,
148                 struct page *chain_first, struct page *chain_last)
149 {
150 #if 1
151         struct page *tmp;
152         tmp = page_chain_tail(chain_first, NULL);
153         BUG_ON(tmp != chain_last);
154 #endif
155
156         /* add chain to head */
157         set_page_private(chain_last, (unsigned long)*head);
158         *head = chain_first;
159 }
160
161 static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int number)
162 {
163         struct page *page = NULL;
164         struct page *tmp = NULL;
165         int i = 0;
166
167         /* Yes, testing drbd_pp_vacant outside the lock is racy.
168          * So what. It saves a spin_lock. */
169         if (drbd_pp_vacant >= number) {
170                 spin_lock(&drbd_pp_lock);
171                 page = page_chain_del(&drbd_pp_pool, number);
172                 if (page)
173                         drbd_pp_vacant -= number;
174                 spin_unlock(&drbd_pp_lock);
175                 if (page)
176                         return page;
177         }
178
179         /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
180          * "criss-cross" setup, that might cause write-out on some other DRBD,
181          * which in turn might block on the other node at this very place.  */
182         for (i = 0; i < number; i++) {
183                 tmp = alloc_page(GFP_TRY);
184                 if (!tmp)
185                         break;
186                 set_page_private(tmp, (unsigned long)page);
187                 page = tmp;
188         }
189
190         if (i == number)
191                 return page;
192
193         /* Not enough pages immediately available this time.
194          * No need to jump around here, drbd_pp_alloc will retry this
195          * function "soon". */
196         if (page) {
197                 tmp = page_chain_tail(page, NULL);
198                 spin_lock(&drbd_pp_lock);
199                 page_chain_add(&drbd_pp_pool, page, tmp);
200                 drbd_pp_vacant += i;
201                 spin_unlock(&drbd_pp_lock);
202         }
203         return NULL;
204 }
205
206 /* kick lower level device, if we have more than (arbitrary number)
207  * reference counts on it, which typically are locally submitted io
208  * requests.  don't use unacked_cnt, so we speed up proto A and B, too. */
209 static void maybe_kick_lo(struct drbd_conf *mdev)
210 {
211         if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark)
212                 drbd_kick_lo(mdev);
213 }
214
215 static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed)
216 {
217         struct drbd_epoch_entry *e;
218         struct list_head *le, *tle;
219
220         /* The EEs are always appended to the end of the list. Since
221            they are sent in order over the wire, they have to finish
222            in order. As soon as we see the first not finished we can
223            stop to examine the list... */
224
225         list_for_each_safe(le, tle, &mdev->net_ee) {
226                 e = list_entry(le, struct drbd_epoch_entry, w.list);
227                 if (drbd_ee_has_active_page(e))
228                         break;
229                 list_move(le, to_be_freed);
230         }
231 }
232
233 static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
234 {
235         LIST_HEAD(reclaimed);
236         struct drbd_epoch_entry *e, *t;
237
238         maybe_kick_lo(mdev);
239         spin_lock_irq(&mdev->req_lock);
240         reclaim_net_ee(mdev, &reclaimed);
241         spin_unlock_irq(&mdev->req_lock);
242
243         list_for_each_entry_safe(e, t, &reclaimed, w.list)
244                 drbd_free_ee(mdev, e);
245 }
246
247 /**
248  * drbd_pp_alloc() - Returns @number pages, retries forever (or until signalled)
249  * @mdev:       DRBD device.
250  * @number:     number of pages requested
251  * @retry:      whether to retry, if not enough pages are available right now
252  *
253  * Tries to allocate number pages, first from our own page pool, then from
254  * the kernel, unless this allocation would exceed the max_buffers setting.
255  * Possibly retry until DRBD frees sufficient pages somewhere else.
256  *
257  * Returns a page chain linked via page->private.
258  */
259 static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool retry)
260 {
261         struct page *page = NULL;
262         DEFINE_WAIT(wait);
263
264         /* Yes, we may run up to @number over max_buffers. If we
265          * follow it strictly, the admin will get it wrong anyways. */
266         if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers)
267                 page = drbd_pp_first_pages_or_try_alloc(mdev, number);
268
269         while (page == NULL) {
270                 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
271
272                 drbd_kick_lo_and_reclaim_net(mdev);
273
274                 if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
275                         page = drbd_pp_first_pages_or_try_alloc(mdev, number);
276                         if (page)
277                                 break;
278                 }
279
280                 if (!retry)
281                         break;
282
283                 if (signal_pending(current)) {
284                         dev_warn(DEV, "drbd_pp_alloc interrupted!\n");
285                         break;
286                 }
287
288                 schedule();
289         }
290         finish_wait(&drbd_pp_wait, &wait);
291
292         if (page)
293                 atomic_add(number, &mdev->pp_in_use);
294         return page;
295 }
296
297 /* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
298  * Is also used from inside an other spin_lock_irq(&mdev->req_lock);
299  * Either links the page chain back to the global pool,
300  * or returns all pages to the system. */
301 static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
302 {
303         int i;
304         if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count)
305                 i = page_chain_free(page);
306         else {
307                 struct page *tmp;
308                 tmp = page_chain_tail(page, &i);
309                 spin_lock(&drbd_pp_lock);
310                 page_chain_add(&drbd_pp_pool, page, tmp);
311                 drbd_pp_vacant += i;
312                 spin_unlock(&drbd_pp_lock);
313         }
314         atomic_sub(i, &mdev->pp_in_use);
315         i = atomic_read(&mdev->pp_in_use);
316         if (i < 0)
317                 dev_warn(DEV, "ASSERTION FAILED: pp_in_use: %d < 0\n", i);
318         wake_up(&drbd_pp_wait);
319 }
320
321 /*
322 You need to hold the req_lock:
323  _drbd_wait_ee_list_empty()
324
325 You must not have the req_lock:
326  drbd_free_ee()
327  drbd_alloc_ee()
328  drbd_init_ee()
329  drbd_release_ee()
330  drbd_ee_fix_bhs()
331  drbd_process_done_ee()
332  drbd_clear_done_ee()
333  drbd_wait_ee_list_empty()
334 */
335
336 struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
337                                      u64 id,
338                                      sector_t sector,
339                                      unsigned int data_size,
340                                      gfp_t gfp_mask) __must_hold(local)
341 {
342         struct drbd_epoch_entry *e;
343         struct page *page;
344         unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
345
346         if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE))
347                 return NULL;
348
349         e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
350         if (!e) {
351                 if (!(gfp_mask & __GFP_NOWARN))
352                         dev_err(DEV, "alloc_ee: Allocation of an EE failed\n");
353                 return NULL;
354         }
355
356         page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
357         if (!page)
358                 goto fail;
359
360         INIT_HLIST_NODE(&e->colision);
361         e->epoch = NULL;
362         e->mdev = mdev;
363         e->pages = page;
364         atomic_set(&e->pending_bios, 0);
365         e->size = data_size;
366         e->flags = 0;
367         e->sector = sector;
368         e->sector = sector;
369         e->block_id = id;
370
371         return e;
372
373  fail:
374         mempool_free(e, drbd_ee_mempool);
375         return NULL;
376 }
377
378 void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
379 {
380         drbd_pp_free(mdev, e->pages);
381         D_ASSERT(atomic_read(&e->pending_bios) == 0);
382         D_ASSERT(hlist_unhashed(&e->colision));
383         mempool_free(e, drbd_ee_mempool);
384 }
385
386 int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
387 {
388         LIST_HEAD(work_list);
389         struct drbd_epoch_entry *e, *t;
390         int count = 0;
391
392         spin_lock_irq(&mdev->req_lock);
393         list_splice_init(list, &work_list);
394         spin_unlock_irq(&mdev->req_lock);
395
396         list_for_each_entry_safe(e, t, &work_list, w.list) {
397                 drbd_free_ee(mdev, e);
398                 count++;
399         }
400         return count;
401 }
402
403
404 /*
405  * This function is called from _asender only_
406  * but see also comments in _req_mod(,barrier_acked)
407  * and receive_Barrier.
408  *
409  * Move entries from net_ee to done_ee, if ready.
410  * Grab done_ee, call all callbacks, free the entries.
411  * The callbacks typically send out ACKs.
412  */
413 static int drbd_process_done_ee(struct drbd_conf *mdev)
414 {
415         LIST_HEAD(work_list);
416         LIST_HEAD(reclaimed);
417         struct drbd_epoch_entry *e, *t;
418         int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS);
419
420         spin_lock_irq(&mdev->req_lock);
421         reclaim_net_ee(mdev, &reclaimed);
422         list_splice_init(&mdev->done_ee, &work_list);
423         spin_unlock_irq(&mdev->req_lock);
424
425         list_for_each_entry_safe(e, t, &reclaimed, w.list)
426                 drbd_free_ee(mdev, e);
427
428         /* possible callbacks here:
429          * e_end_block, and e_end_resync_block, e_send_discard_ack.
430          * all ignore the last argument.
431          */
432         list_for_each_entry_safe(e, t, &work_list, w.list) {
433                 /* list_del not necessary, next/prev members not touched */
434                 ok = e->w.cb(mdev, &e->w, !ok) && ok;
435                 drbd_free_ee(mdev, e);
436         }
437         wake_up(&mdev->ee_wait);
438
439         return ok;
440 }
441
442 void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
443 {
444         DEFINE_WAIT(wait);
445
446         /* avoids spin_lock/unlock
447          * and calling prepare_to_wait in the fast path */
448         while (!list_empty(head)) {
449                 prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
450                 spin_unlock_irq(&mdev->req_lock);
451                 drbd_kick_lo(mdev);
452                 schedule();
453                 finish_wait(&mdev->ee_wait, &wait);
454                 spin_lock_irq(&mdev->req_lock);
455         }
456 }
457
458 void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
459 {
460         spin_lock_irq(&mdev->req_lock);
461         _drbd_wait_ee_list_empty(mdev, head);
462         spin_unlock_irq(&mdev->req_lock);
463 }
464
465 /* see also kernel_accept; which is only present since 2.6.18.
466  * also we want to log which part of it failed, exactly */
467 static int drbd_accept(struct drbd_conf *mdev, const char **what,
468                 struct socket *sock, struct socket **newsock)
469 {
470         struct sock *sk = sock->sk;
471         int err = 0;
472
473         *what = "listen";
474         err = sock->ops->listen(sock, 5);
475         if (err < 0)
476                 goto out;
477
478         *what = "sock_create_lite";
479         err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
480                                newsock);
481         if (err < 0)
482                 goto out;
483
484         *what = "accept";
485         err = sock->ops->accept(sock, *newsock, 0);
486         if (err < 0) {
487                 sock_release(*newsock);
488                 *newsock = NULL;
489                 goto out;
490         }
491         (*newsock)->ops  = sock->ops;
492
493 out:
494         return err;
495 }
496
497 static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock,
498                     void *buf, size_t size, int flags)
499 {
500         mm_segment_t oldfs;
501         struct kvec iov = {
502                 .iov_base = buf,
503                 .iov_len = size,
504         };
505         struct msghdr msg = {
506                 .msg_iovlen = 1,
507                 .msg_iov = (struct iovec *)&iov,
508                 .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
509         };
510         int rv;
511
512         oldfs = get_fs();
513         set_fs(KERNEL_DS);
514         rv = sock_recvmsg(sock, &msg, size, msg.msg_flags);
515         set_fs(oldfs);
516
517         return rv;
518 }
519
520 static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size)
521 {
522         mm_segment_t oldfs;
523         struct kvec iov = {
524                 .iov_base = buf,
525                 .iov_len = size,
526         };
527         struct msghdr msg = {
528                 .msg_iovlen = 1,
529                 .msg_iov = (struct iovec *)&iov,
530                 .msg_flags = MSG_WAITALL | MSG_NOSIGNAL
531         };
532         int rv;
533
534         oldfs = get_fs();
535         set_fs(KERNEL_DS);
536
537         for (;;) {
538                 rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags);
539                 if (rv == size)
540                         break;
541
542                 /* Note:
543                  * ECONNRESET   other side closed the connection
544                  * ERESTARTSYS  (on  sock) we got a signal
545                  */
546
547                 if (rv < 0) {
548                         if (rv == -ECONNRESET)
549                                 dev_info(DEV, "sock was reset by peer\n");
550                         else if (rv != -ERESTARTSYS)
551                                 dev_err(DEV, "sock_recvmsg returned %d\n", rv);
552                         break;
553                 } else if (rv == 0) {
554                         dev_info(DEV, "sock was shut down by peer\n");
555                         break;
556                 } else  {
557                         /* signal came in, or peer/link went down,
558                          * after we read a partial message
559                          */
560                         /* D_ASSERT(signal_pending(current)); */
561                         break;
562                 }
563         };
564
565         set_fs(oldfs);
566
567         if (rv != size)
568                 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
569
570         return rv;
571 }
572
573 /* quoting tcp(7):
574  *   On individual connections, the socket buffer size must be set prior to the
575  *   listen(2) or connect(2) calls in order to have it take effect.
576  * This is our wrapper to do so.
577  */
578 static void drbd_setbufsize(struct socket *sock, unsigned int snd,
579                 unsigned int rcv)
580 {
581         /* open coded SO_SNDBUF, SO_RCVBUF */
582         if (snd) {
583                 sock->sk->sk_sndbuf = snd;
584                 sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
585         }
586         if (rcv) {
587                 sock->sk->sk_rcvbuf = rcv;
588                 sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
589         }
590 }
591
/* Make one attempt to actively connect to the peer.
 *
 * Creates a TCP socket, binds it to the configured local address (with
 * the port set to 0) and connects to the configured peer address.  Both
 * socket timeouts are set to try_connect_int seconds.
 *
 * Returns the connected socket, or NULL on failure (also when no net
 * config could be obtained).  Failures that merely mean "not yet" --
 * timeouts, pending signals, peer or network unreachable -- are silent.
 * Any other error is logged, and if it happened before the connect step
 * the connection state is forced to C_DISCONNECTING.
 */
static struct socket *drbd_try_connect(struct drbd_conf *mdev)
{
	const char *what;
	struct socket *sock;
	struct sockaddr_in6 src_in6;
	struct sockaddr *my_addr;
	int err;
	int disconnect_on_error = 1;

	if (!get_net_conf(mdev))
		return NULL;

	my_addr = (struct sockaddr *)mdev->net_conf->my_addr;

	what = "sock_create_kern";
	err = sock_create_kern(my_addr->sa_family,
		SOCK_STREAM, IPPROTO_TCP, &sock);
	if (err < 0) {
		sock = NULL;
		goto out;
	}

	sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ;
	sock->sk->sk_rcvtimeo = sock->sk->sk_sndtimeo;
	drbd_setbufsize(sock, mdev->net_conf->sndbuf_size,
			mdev->net_conf->rcvbuf_size);

	/* explicitly bind to the configured IP as source IP
	 *  for the outgoing connections.
	 *  This is needed for multihomed hosts and to be
	 *  able to use lo: interfaces for drbd.
	 * Make sure to use 0 as port number, so linux selects
	 *  a free one dynamically.
	 */
	memcpy(&src_in6, mdev->net_conf->my_addr,
	       min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6)));
	if (my_addr->sa_family == AF_INET6)
		src_in6.sin6_port = 0;
	else
		((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */

	what = "bind before connect";
	err = sock->ops->bind(sock,
			      (struct sockaddr *) &src_in6,
			      mdev->net_conf->my_addr_len);
	if (err < 0)
		goto out;

	/* connect may fail, peer not yet available.
	 * stay C_WF_CONNECTION, don't go Disconnecting! */
	disconnect_on_error = 0;
	what = "connect";
	err = sock->ops->connect(sock,
				 (struct sockaddr *)mdev->net_conf->peer_addr,
				 mdev->net_conf->peer_addr_len, 0);

out:
	if (err < 0) {
		if (sock) {
			sock_release(sock);
			sock = NULL;
		}

		switch (-err) {
		/* timeout, busy, signal pending */
		case ETIMEDOUT:
		case EAGAIN:
		case EINPROGRESS:
		case EINTR:
		case ERESTARTSYS:
		/* peer not (yet) available, network problem */
		case ECONNREFUSED:
		case ENETUNREACH:
		case EHOSTDOWN:
		case EHOSTUNREACH:
			disconnect_on_error = 0;
			break;
		default:
			dev_err(DEV, "%s failed, err = %d\n", what, err);
			break;
		}

		if (disconnect_on_error)
			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
	}

	put_net_conf(mdev);
	return sock;
}
669
670 static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
671 {
672         int timeo, err;
673         struct socket *s_estab = NULL, *s_listen;
674         const char *what;
675
676         if (!get_net_conf(mdev))
677                 return NULL;
678
679         what = "sock_create_kern";
680         err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
681                 SOCK_STREAM, IPPROTO_TCP, &s_listen);
682         if (err) {
683                 s_listen = NULL;
684                 goto out;
685         }
686
687         timeo = mdev->net_conf->try_connect_int * HZ;
688         timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
689
690         s_listen->sk->sk_reuse    = 1; /* SO_REUSEADDR */
691         s_listen->sk->sk_rcvtimeo = timeo;
692         s_listen->sk->sk_sndtimeo = timeo;
693         drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size,
694                         mdev->net_conf->rcvbuf_size);
695
696         what = "bind before listen";
697         err = s_listen->ops->bind(s_listen,
698                               (struct sockaddr *) mdev->net_conf->my_addr,
699                               mdev->net_conf->my_addr_len);
700         if (err < 0)
701                 goto out;
702
703         err = drbd_accept(mdev, &what, s_listen, &s_estab);
704
705 out:
706         if (s_listen)
707                 sock_release(s_listen);
708         if (err < 0) {
709                 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
710                         dev_err(DEV, "%s failed, err = %d\n", what, err);
711                         drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
712                 }
713         }
714         put_net_conf(mdev);
715
716         return s_estab;
717 }
718
719 static int drbd_send_fp(struct drbd_conf *mdev,
720         struct socket *sock, enum drbd_packets cmd)
721 {
722         struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
723
724         return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
725 }
726
727 static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
728 {
729         struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
730         int rr;
731
732         rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
733
734         if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC)
735                 return be16_to_cpu(h->command);
736
737         return 0xffff;
738 }
739
740 /**
741  * drbd_socket_okay() - Free the socket if its connection is not okay
742  * @mdev:       DRBD device.
743  * @sock:       pointer to the pointer to the socket.
744  */
745 static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
746 {
747         int rr;
748         char tb[4];
749
750         if (!*sock)
751                 return FALSE;
752
753         rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
754
755         if (rr > 0 || rr == -EAGAIN) {
756                 return TRUE;
757         } else {
758                 sock_release(*sock);
759                 *sock = NULL;
760                 return FALSE;
761         }
762 }
763
764 /*
765  * return values:
766  *   1 yes, we have a valid connection
767  *   0 oops, did not work out, please try again
768  *  -1 peer talks different language,
769  *     no point in trying again, please go standalone.
770  *  -2 We do not have a network config...
771  */
772 static int drbd_connect(struct drbd_conf *mdev)
773 {
774         struct socket *s, *sock, *msock;
775         int try, h, ok;
776
777         D_ASSERT(!mdev->data.socket);
778
779         if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
780                 return -2;
781
782         clear_bit(DISCARD_CONCURRENT, &mdev->flags);
783
784         sock  = NULL;
785         msock = NULL;
786
787         do {
788                 for (try = 0;;) {
789                         /* 3 tries, this should take less than a second! */
790                         s = drbd_try_connect(mdev);
791                         if (s || ++try >= 3)
792                                 break;
793                         /* give the other side time to call bind() & listen() */
794                         __set_current_state(TASK_INTERRUPTIBLE);
795                         schedule_timeout(HZ / 10);
796                 }
797
798                 if (s) {
799                         if (!sock) {
800                                 drbd_send_fp(mdev, s, P_HAND_SHAKE_S);
801                                 sock = s;
802                                 s = NULL;
803                         } else if (!msock) {
804                                 drbd_send_fp(mdev, s, P_HAND_SHAKE_M);
805                                 msock = s;
806                                 s = NULL;
807                         } else {
808                                 dev_err(DEV, "Logic error in drbd_connect()\n");
809                                 goto out_release_sockets;
810                         }
811                 }
812
813                 if (sock && msock) {
814                         __set_current_state(TASK_INTERRUPTIBLE);
815                         schedule_timeout(HZ / 10);
816                         ok = drbd_socket_okay(mdev, &sock);
817                         ok = drbd_socket_okay(mdev, &msock) && ok;
818                         if (ok)
819                                 break;
820                 }
821
822 retry:
823                 s = drbd_wait_for_connect(mdev);
824                 if (s) {
825                         try = drbd_recv_fp(mdev, s);
826                         drbd_socket_okay(mdev, &sock);
827                         drbd_socket_okay(mdev, &msock);
828                         switch (try) {
829                         case P_HAND_SHAKE_S:
830                                 if (sock) {
831                                         dev_warn(DEV, "initial packet S crossed\n");
832                                         sock_release(sock);
833                                 }
834                                 sock = s;
835                                 break;
836                         case P_HAND_SHAKE_M:
837                                 if (msock) {
838                                         dev_warn(DEV, "initial packet M crossed\n");
839                                         sock_release(msock);
840                                 }
841                                 msock = s;
842                                 set_bit(DISCARD_CONCURRENT, &mdev->flags);
843                                 break;
844                         default:
845                                 dev_warn(DEV, "Error receiving initial packet\n");
846                                 sock_release(s);
847                                 if (random32() & 1)
848                                         goto retry;
849                         }
850                 }
851
852                 if (mdev->state.conn <= C_DISCONNECTING)
853                         goto out_release_sockets;
854                 if (signal_pending(current)) {
855                         flush_signals(current);
856                         smp_rmb();
857                         if (get_t_state(&mdev->receiver) == Exiting)
858                                 goto out_release_sockets;
859                 }
860
861                 if (sock && msock) {
862                         ok = drbd_socket_okay(mdev, &sock);
863                         ok = drbd_socket_okay(mdev, &msock) && ok;
864                         if (ok)
865                                 break;
866                 }
867         } while (1);
868
869         msock->sk->sk_reuse = 1; /* SO_REUSEADDR */
870         sock->sk->sk_reuse = 1; /* SO_REUSEADDR */
871
872         sock->sk->sk_allocation = GFP_NOIO;
873         msock->sk->sk_allocation = GFP_NOIO;
874
875         sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
876         msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
877
878         /* NOT YET ...
879          * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
880          * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
881          * first set it to the P_HAND_SHAKE timeout,
882          * which we set to 4x the configured ping_timeout. */
883         sock->sk->sk_sndtimeo =
884         sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10;
885
886         msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
887         msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
888
889         /* we don't want delays.
890          * we use TCP_CORK where apropriate, though */
891         drbd_tcp_nodelay(sock);
892         drbd_tcp_nodelay(msock);
893
894         mdev->data.socket = sock;
895         mdev->meta.socket = msock;
896         mdev->last_received = jiffies;
897
898         D_ASSERT(mdev->asender.task == NULL);
899
900         h = drbd_do_handshake(mdev);
901         if (h <= 0)
902                 return h;
903
904         if (mdev->cram_hmac_tfm) {
905                 /* drbd_request_state(mdev, NS(conn, WFAuth)); */
906                 switch (drbd_do_auth(mdev)) {
907                 case -1:
908                         dev_err(DEV, "Authentication of peer failed\n");
909                         return -1;
910                 case 0:
911                         dev_err(DEV, "Authentication of peer failed, trying again.\n");
912                         return 0;
913                 }
914         }
915
916         if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS)
917                 return 0;
918
919         sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
920         sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
921
922         atomic_set(&mdev->packet_seq, 0);
923         mdev->peer_seq = 0;
924
925         drbd_thread_start(&mdev->asender);
926
927         if (!drbd_send_protocol(mdev))
928                 return -1;
929         drbd_send_sync_param(mdev, &mdev->sync_conf);
930         drbd_send_sizes(mdev, 0, 0);
931         drbd_send_uuids(mdev);
932         drbd_send_state(mdev);
933         clear_bit(USE_DEGR_WFC_T, &mdev->flags);
934         clear_bit(RESIZE_PENDING, &mdev->flags);
935
936         return 1;
937
938 out_release_sockets:
939         if (sock)
940                 sock_release(sock);
941         if (msock)
942                 sock_release(msock);
943         return -1;
944 }
945
946 static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h)
947 {
948         int r;
949
950         r = drbd_recv(mdev, h, sizeof(*h));
951
952         if (unlikely(r != sizeof(*h))) {
953                 dev_err(DEV, "short read expecting header on sock: r=%d\n", r);
954                 return FALSE;
955         };
956         h->command = be16_to_cpu(h->command);
957         h->length  = be16_to_cpu(h->length);
958         if (unlikely(h->magic != BE_DRBD_MAGIC)) {
959                 dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n",
960                     (long)be32_to_cpu(h->magic),
961                     h->command, h->length);
962                 return FALSE;
963         }
964         mdev->last_received = jiffies;
965
966         return TRUE;
967 }
968
969 static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch)
970 {
971         int rv;
972
973         if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
974                 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
975                                         NULL, BLKDEV_IFL_WAIT);
976                 if (rv) {
977                         dev_err(DEV, "local disk flush failed with status %d\n", rv);
978                         /* would rather check on EOPNOTSUPP, but that is not reliable.
979                          * don't try again for ANY return value != 0
980                          * if (rv == -EOPNOTSUPP) */
981                         drbd_bump_write_ordering(mdev, WO_drain_io);
982                 }
983                 put_ldev(mdev);
984         }
985
986         return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
987 }
988
989 static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
990 {
991         struct flush_work *fw = (struct flush_work *)w;
992         struct drbd_epoch *epoch = fw->epoch;
993
994         kfree(w);
995
996         if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags))
997                 drbd_flush_after_epoch(mdev, epoch);
998
999         drbd_may_finish_epoch(mdev, epoch, EV_PUT |
1000                               (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0));
1001
1002         return 1;
1003 }
1004
1005 /**
1006  * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
1007  * @mdev:       DRBD device.
1008  * @epoch:      Epoch object.
1009  * @ev:         Epoch event.
1010  */
1011 static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
1012                                                struct drbd_epoch *epoch,
1013                                                enum epoch_event ev)
1014 {
1015         int finish, epoch_size;
1016         struct drbd_epoch *next_epoch;
1017         int schedule_flush = 0;
1018         enum finish_epoch rv = FE_STILL_LIVE;
1019
1020         spin_lock(&mdev->epoch_lock);
1021         do {
1022                 next_epoch = NULL;
1023                 finish = 0;
1024
1025                 epoch_size = atomic_read(&epoch->epoch_size);
1026
1027                 switch (ev & ~EV_CLEANUP) {
1028                 case EV_PUT:
1029                         atomic_dec(&epoch->active);
1030                         break;
1031                 case EV_GOT_BARRIER_NR:
1032                         set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
1033
1034                         /* Special case: If we just switched from WO_bio_barrier to
1035                            WO_bdev_flush we should not finish the current epoch */
1036                         if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 &&
1037                             mdev->write_ordering != WO_bio_barrier &&
1038                             epoch == mdev->current_epoch)
1039                                 clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags);
1040                         break;
1041                 case EV_BARRIER_DONE:
1042                         set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags);
1043                         break;
1044                 case EV_BECAME_LAST:
1045                         /* nothing to do*/
1046                         break;
1047                 }
1048
1049                 if (epoch_size != 0 &&
1050                     atomic_read(&epoch->active) == 0 &&
1051                     test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) &&
1052                     epoch->list.prev == &mdev->current_epoch->list &&
1053                     !test_bit(DE_IS_FINISHING, &epoch->flags)) {
1054                         /* Nearly all conditions are met to finish that epoch... */
1055                         if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ||
1056                             mdev->write_ordering == WO_none ||
1057                             (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) ||
1058                             ev & EV_CLEANUP) {
1059                                 finish = 1;
1060                                 set_bit(DE_IS_FINISHING, &epoch->flags);
1061                         } else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) &&
1062                                  mdev->write_ordering == WO_bio_barrier) {
1063                                 atomic_inc(&epoch->active);
1064                                 schedule_flush = 1;
1065                         }
1066                 }
1067                 if (finish) {
1068                         if (!(ev & EV_CLEANUP)) {
1069                                 spin_unlock(&mdev->epoch_lock);
1070                                 drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
1071                                 spin_lock(&mdev->epoch_lock);
1072                         }
1073                         dec_unacked(mdev);
1074
1075                         if (mdev->current_epoch != epoch) {
1076                                 next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
1077                                 list_del(&epoch->list);
1078                                 ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
1079                                 mdev->epochs--;
1080                                 kfree(epoch);
1081
1082                                 if (rv == FE_STILL_LIVE)
1083                                         rv = FE_DESTROYED;
1084                         } else {
1085                                 epoch->flags = 0;
1086                                 atomic_set(&epoch->epoch_size, 0);
1087                                 /* atomic_set(&epoch->active, 0); is already zero */
1088                                 if (rv == FE_STILL_LIVE)
1089                                         rv = FE_RECYCLED;
1090                         }
1091                 }
1092
1093                 if (!next_epoch)
1094                         break;
1095
1096                 epoch = next_epoch;
1097         } while (1);
1098
1099         spin_unlock(&mdev->epoch_lock);
1100
1101         if (schedule_flush) {
1102                 struct flush_work *fw;
1103                 fw = kmalloc(sizeof(*fw), GFP_ATOMIC);
1104                 if (fw) {
1105                         fw->w.cb = w_flush;
1106                         fw->epoch = epoch;
1107                         drbd_queue_work(&mdev->data.work, &fw->w);
1108                 } else {
1109                         dev_warn(DEV, "Could not kmalloc a flush_work obj\n");
1110                         set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1111                         /* That is not a recursion, only one level */
1112                         drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE);
1113                         drbd_may_finish_epoch(mdev, epoch, EV_PUT);
1114                 }
1115         }
1116
1117         return rv;
1118 }
1119
1120 /**
1121  * drbd_bump_write_ordering() - Fall back to an other write ordering method
1122  * @mdev:       DRBD device.
1123  * @wo:         Write ordering method to try.
1124  */
1125 void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local)
1126 {
1127         enum write_ordering_e pwo;
1128         static char *write_ordering_str[] = {
1129                 [WO_none] = "none",
1130                 [WO_drain_io] = "drain",
1131                 [WO_bdev_flush] = "flush",
1132                 [WO_bio_barrier] = "barrier",
1133         };
1134
1135         pwo = mdev->write_ordering;
1136         wo = min(pwo, wo);
1137         if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier)
1138                 wo = WO_bdev_flush;
1139         if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
1140                 wo = WO_drain_io;
1141         if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
1142                 wo = WO_none;
1143         mdev->write_ordering = wo;
1144         if (pwo != mdev->write_ordering || wo == WO_bio_barrier)
1145                 dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
1146 }
1147
1148 /**
1149  * drbd_submit_ee()
1150  * @mdev:       DRBD device.
1151  * @e:          epoch entry
1152  * @rw:         flag field, see bio->bi_rw
1153  */
1154 /* TODO allocate from our own bio_set. */
1155 int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
1156                 const unsigned rw, const int fault_type)
1157 {
1158         struct bio *bios = NULL;
1159         struct bio *bio;
1160         struct page *page = e->pages;
1161         sector_t sector = e->sector;
1162         unsigned ds = e->size;
1163         unsigned n_bios = 0;
1164         unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
1165
1166         /* In most cases, we will only need one bio.  But in case the lower
1167          * level restrictions happen to be different at this offset on this
1168          * side than those of the sending peer, we may need to submit the
1169          * request in more than one bio. */
1170 next_bio:
1171         bio = bio_alloc(GFP_NOIO, nr_pages);
1172         if (!bio) {
1173                 dev_err(DEV, "submit_ee: Allocation of a bio failed\n");
1174                 goto fail;
1175         }
1176         /* > e->sector, unless this is the first bio */
1177         bio->bi_sector = sector;
1178         bio->bi_bdev = mdev->ldev->backing_bdev;
1179         /* we special case some flags in the multi-bio case, see below
1180          * (REQ_UNPLUG, REQ_HARDBARRIER) */
1181         bio->bi_rw = rw;
1182         bio->bi_private = e;
1183         bio->bi_end_io = drbd_endio_sec;
1184
1185         bio->bi_next = bios;
1186         bios = bio;
1187         ++n_bios;
1188
1189         page_chain_for_each(page) {
1190                 unsigned len = min_t(unsigned, ds, PAGE_SIZE);
1191                 if (!bio_add_page(bio, page, len, 0)) {
1192                         /* a single page must always be possible! */
1193                         BUG_ON(bio->bi_vcnt == 0);
1194                         goto next_bio;
1195                 }
1196                 ds -= len;
1197                 sector += len >> 9;
1198                 --nr_pages;
1199         }
1200         D_ASSERT(page == NULL);
1201         D_ASSERT(ds == 0);
1202
1203         atomic_set(&e->pending_bios, n_bios);
1204         do {
1205                 bio = bios;
1206                 bios = bios->bi_next;
1207                 bio->bi_next = NULL;
1208
1209                 /* strip off REQ_UNPLUG unless it is the last bio */
1210                 if (bios)
1211                         bio->bi_rw &= ~REQ_UNPLUG;
1212
1213                 drbd_generic_make_request(mdev, fault_type, bio);
1214
1215                 /* strip off REQ_HARDBARRIER,
1216                  * unless it is the first or last bio */
1217                 if (bios && bios->bi_next)
1218                         bios->bi_rw &= ~REQ_HARDBARRIER;
1219         } while (bios);
1220         maybe_kick_lo(mdev);
1221         return 0;
1222
1223 fail:
1224         while (bios) {
1225                 bio = bios;
1226                 bios = bios->bi_next;
1227                 bio_put(bio);
1228         }
1229         return -ENOMEM;
1230 }
1231
1232 /**
1233  * w_e_reissue() - Worker callback; Resubmit a bio, without REQ_HARDBARRIER set
1234  * @mdev:       DRBD device.
1235  * @w:          work object.
1236  * @cancel:     The connection will be closed anyways (unused in this callback)
1237  */
1238 int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
1239 {
1240         struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1241         /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
1242            (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
1243            so that we can finish that epoch in drbd_may_finish_epoch().
1244            That is necessary if we already have a long chain of Epochs, before
1245            we realize that REQ_HARDBARRIER is actually not supported */
1246
1247         /* As long as the -ENOTSUPP on the barrier is reported immediately
1248            that will never trigger. If it is reported late, we will just
1249            print that warning and continue correctly for all future requests
1250            with WO_bdev_flush */
1251         if (previous_epoch(mdev, e->epoch))
1252                 dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
1253
1254         /* we still have a local reference,
1255          * get_ldev was done in receive_Data. */
1256
1257         e->w.cb = e_end_block;
1258         if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_DT_WR) != 0) {
1259                 /* drbd_submit_ee fails for one reason only:
1260                  * if was not able to allocate sufficient bios.
1261                  * requeue, try again later. */
1262                 e->w.cb = w_e_reissue;
1263                 drbd_queue_work(&mdev->data.work, &e->w);
1264         }
1265         return 1;
1266 }
1267
1268 static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h)
1269 {
1270         int rv, issue_flush;
1271         struct p_barrier *p = (struct p_barrier *)h;
1272         struct drbd_epoch *epoch;
1273
1274         ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
1275
1276         rv = drbd_recv(mdev, h->payload, h->length);
1277         ERR_IF(rv != h->length) return FALSE;
1278
1279         inc_unacked(mdev);
1280
1281         if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
1282                 drbd_kick_lo(mdev);
1283
1284         mdev->current_epoch->barrier_nr = p->barrier;
1285         rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR);
1286
1287         /* P_BARRIER_ACK may imply that the corresponding extent is dropped from
1288          * the activity log, which means it would not be resynced in case the
1289          * R_PRIMARY crashes now.
1290          * Therefore we must send the barrier_ack after the barrier request was
1291          * completed. */
1292         switch (mdev->write_ordering) {
1293         case WO_bio_barrier:
1294         case WO_none:
1295                 if (rv == FE_RECYCLED)
1296                         return TRUE;
1297                 break;
1298
1299         case WO_bdev_flush:
1300         case WO_drain_io:
1301                 if (rv == FE_STILL_LIVE) {
1302                         set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
1303                         drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1304                         rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
1305                 }
1306                 if (rv == FE_RECYCLED)
1307                         return TRUE;
1308
1309                 /* The asender will send all the ACKs and barrier ACKs out, since
1310                    all EEs moved from the active_ee to the done_ee. We need to
1311                    provide a new epoch object for the EEs that come in soon */
1312                 break;
1313         }
1314
1315         /* receiver context, in the writeout path of the other node.
1316          * avoid potential distributed deadlock */
1317         epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1318         if (!epoch) {
1319                 dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
1320                 issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags);
1321                 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
1322                 if (issue_flush) {
1323                         rv = drbd_flush_after_epoch(mdev, mdev->current_epoch);
1324                         if (rv == FE_RECYCLED)
1325                                 return TRUE;
1326                 }
1327
1328                 drbd_wait_ee_list_empty(mdev, &mdev->done_ee);
1329
1330                 return TRUE;
1331         }
1332
1333         epoch->flags = 0;
1334         atomic_set(&epoch->epoch_size, 0);
1335         atomic_set(&epoch->active, 0);
1336
1337         spin_lock(&mdev->epoch_lock);
1338         if (atomic_read(&mdev->current_epoch->epoch_size)) {
1339                 list_add(&epoch->list, &mdev->current_epoch->list);
1340                 mdev->current_epoch = epoch;
1341                 mdev->epochs++;
1342         } else {
1343                 /* The current_epoch got recycled while we allocated this one... */
1344                 kfree(epoch);
1345         }
1346         spin_unlock(&mdev->epoch_lock);
1347
1348         return TRUE;
1349 }
1350
1351 /* used from receive_RSDataReply (recv_resync_read)
1352  * and from receive_Data */
1353 static struct drbd_epoch_entry *
1354 read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
1355 {
1356         const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
1357         struct drbd_epoch_entry *e;
1358         struct page *page;
1359         int dgs, ds, rr;
1360         void *dig_in = mdev->int_dig_in;
1361         void *dig_vv = mdev->int_dig_vv;
1362         unsigned long *data;
1363
1364         dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
1365                 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
1366
1367         if (dgs) {
1368                 rr = drbd_recv(mdev, dig_in, dgs);
1369                 if (rr != dgs) {
1370                         dev_warn(DEV, "short read receiving data digest: read %d expected %d\n",
1371                              rr, dgs);
1372                         return NULL;
1373                 }
1374         }
1375
1376         data_size -= dgs;
1377
1378         ERR_IF(data_size &  0x1ff) return NULL;
1379         ERR_IF(data_size >  DRBD_MAX_SEGMENT_SIZE) return NULL;
1380
1381         /* even though we trust out peer,
1382          * we sometimes have to double check. */
1383         if (sector + (data_size>>9) > capacity) {
1384                 dev_err(DEV, "capacity: %llus < sector: %llus + size: %u\n",
1385                         (unsigned long long)capacity,
1386                         (unsigned long long)sector, data_size);
1387                 return NULL;
1388         }
1389
1390         /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1391          * "criss-cross" setup, that might cause write-out on some other DRBD,
1392          * which in turn might block on the other node at this very place.  */
1393         e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
1394         if (!e)
1395                 return NULL;
1396
1397         ds = data_size;
1398         page = e->pages;
1399         page_chain_for_each(page) {
1400                 unsigned len = min_t(int, ds, PAGE_SIZE);
1401                 data = kmap(page);
1402                 rr = drbd_recv(mdev, data, len);
1403                 if (FAULT_ACTIVE(mdev, DRBD_FAULT_RECEIVE)) {
1404                         dev_err(DEV, "Fault injection: Corrupting data on receive\n");
1405                         data[0] = data[0] ^ (unsigned long)-1;
1406                 }
1407                 kunmap(page);
1408                 if (rr != len) {
1409                         drbd_free_ee(mdev, e);
1410                         dev_warn(DEV, "short read receiving data: read %d expected %d\n",
1411                              rr, len);
1412                         return NULL;
1413                 }
1414                 ds -= rr;
1415         }
1416
1417         if (dgs) {
1418                 drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv);
1419                 if (memcmp(dig_in, dig_vv, dgs)) {
1420                         dev_err(DEV, "Digest integrity check FAILED.\n");
1421                         drbd_bcast_ee(mdev, "digest failed",
1422                                         dgs, dig_in, dig_vv, e);
1423                         drbd_free_ee(mdev, e);
1424                         return NULL;
1425                 }
1426         }
1427         mdev->recv_cnt += data_size>>9;
1428         return e;
1429 }
1430
1431 /* drbd_drain_block() just takes a data block
1432  * out of the socket input buffer, and discards it.
1433  */
1434 static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
1435 {
1436         struct page *page;
1437         int rr, rv = 1;
1438         void *data;
1439
1440         if (!data_size)
1441                 return TRUE;
1442
1443         page = drbd_pp_alloc(mdev, 1, 1);
1444
1445         data = kmap(page);
1446         while (data_size) {
1447                 rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE));
1448                 if (rr != min_t(int, data_size, PAGE_SIZE)) {
1449                         rv = 0;
1450                         dev_warn(DEV, "short read receiving data: read %d expected %d\n",
1451                              rr, min_t(int, data_size, PAGE_SIZE));
1452                         break;
1453                 }
1454                 data_size -= rr;
1455         }
1456         kunmap(page);
1457         drbd_pp_free(mdev, page);
1458         return rv;
1459 }
1460
1461 static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
1462                            sector_t sector, int data_size)
1463 {
1464         struct bio_vec *bvec;
1465         struct bio *bio;
1466         int dgs, rr, i, expect;
1467         void *dig_in = mdev->int_dig_in;
1468         void *dig_vv = mdev->int_dig_vv;
1469
1470         dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
1471                 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
1472
1473         if (dgs) {
1474                 rr = drbd_recv(mdev, dig_in, dgs);
1475                 if (rr != dgs) {
1476                         dev_warn(DEV, "short read receiving data reply digest: read %d expected %d\n",
1477                              rr, dgs);
1478                         return 0;
1479                 }
1480         }
1481
1482         data_size -= dgs;
1483
1484         /* optimistically update recv_cnt.  if receiving fails below,
1485          * we disconnect anyways, and counters will be reset. */
1486         mdev->recv_cnt += data_size>>9;
1487
1488         bio = req->master_bio;
1489         D_ASSERT(sector == bio->bi_sector);
1490
1491         bio_for_each_segment(bvec, bio, i) {
1492                 expect = min_t(int, data_size, bvec->bv_len);
1493                 rr = drbd_recv(mdev,
1494                              kmap(bvec->bv_page)+bvec->bv_offset,
1495                              expect);
1496                 kunmap(bvec->bv_page);
1497                 if (rr != expect) {
1498                         dev_warn(DEV, "short read receiving data reply: "
1499                              "read %d expected %d\n",
1500                              rr, expect);
1501                         return 0;
1502                 }
1503                 data_size -= rr;
1504         }
1505
1506         if (dgs) {
1507                 drbd_csum_bio(mdev, mdev->integrity_r_tfm, bio, dig_vv);
1508                 if (memcmp(dig_in, dig_vv, dgs)) {
1509                         dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
1510                         return 0;
1511                 }
1512         }
1513
1514         D_ASSERT(data_size == 0);
1515         return 1;
1516 }
1517
1518 /* e_end_resync_block() is called via
1519  * drbd_process_done_ee() by asender only */
1520 static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1521 {
1522         struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1523         sector_t sector = e->sector;
1524         int ok;
1525
1526         D_ASSERT(hlist_unhashed(&e->colision));
1527
1528         if (likely((e->flags & EE_WAS_ERROR) == 0)) {
1529                 drbd_set_in_sync(mdev, sector, e->size);
1530                 ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e);
1531         } else {
1532                 /* Record failure to sync */
1533                 drbd_rs_failed_io(mdev, sector, e->size);
1534
1535                 ok  = drbd_send_ack(mdev, P_NEG_ACK, e);
1536         }
1537         dec_unacked(mdev);
1538
1539         return ok;
1540 }
1541
1542 static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local)
1543 {
1544         struct drbd_epoch_entry *e;
1545
1546         e = read_in_block(mdev, ID_SYNCER, sector, data_size);
1547         if (!e)
1548                 goto fail;
1549
1550         dec_rs_pending(mdev);
1551
1552         inc_unacked(mdev);
1553         /* corresponding dec_unacked() in e_end_resync_block()
1554          * respective _drbd_clear_done_ee */
1555
1556         e->w.cb = e_end_resync_block;
1557
1558         spin_lock_irq(&mdev->req_lock);
1559         list_add(&e->w.list, &mdev->sync_ee);
1560         spin_unlock_irq(&mdev->req_lock);
1561
1562         if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
1563                 return TRUE;
1564
1565         drbd_free_ee(mdev, e);
1566 fail:
1567         put_ldev(mdev);
1568         return FALSE;
1569 }
1570
1571 static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
1572 {
1573         struct drbd_request *req;
1574         sector_t sector;
1575         unsigned int header_size, data_size;
1576         int ok;
1577         struct p_data *p = (struct p_data *)h;
1578
1579         header_size = sizeof(*p) - sizeof(*h);
1580         data_size   = h->length  - header_size;
1581
1582         ERR_IF(data_size == 0) return FALSE;
1583
1584         if (drbd_recv(mdev, h->payload, header_size) != header_size)
1585                 return FALSE;
1586
1587         sector = be64_to_cpu(p->sector);
1588
1589         spin_lock_irq(&mdev->req_lock);
1590         req = _ar_id_to_req(mdev, p->block_id, sector);
1591         spin_unlock_irq(&mdev->req_lock);
1592         if (unlikely(!req)) {
1593                 dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n");
1594                 return FALSE;
1595         }
1596
1597         /* hlist_del(&req->colision) is done in _req_may_be_done, to avoid
1598          * special casing it there for the various failure cases.
1599          * still no race with drbd_fail_pending_reads */
1600         ok = recv_dless_read(mdev, req, sector, data_size);
1601
1602         if (ok)
1603                 req_mod(req, data_received);
1604         /* else: nothing. handled from drbd_disconnect...
1605          * I don't think we may complete this just yet
1606          * in case we are "on-disconnect: freeze" */
1607
1608         return ok;
1609 }
1610
1611 static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h)
1612 {
1613         sector_t sector;
1614         unsigned int header_size, data_size;
1615         int ok;
1616         struct p_data *p = (struct p_data *)h;
1617
1618         header_size = sizeof(*p) - sizeof(*h);
1619         data_size   = h->length  - header_size;
1620
1621         ERR_IF(data_size == 0) return FALSE;
1622
1623         if (drbd_recv(mdev, h->payload, header_size) != header_size)
1624                 return FALSE;
1625
1626         sector = be64_to_cpu(p->sector);
1627         D_ASSERT(p->block_id == ID_SYNCER);
1628
1629         if (get_ldev(mdev)) {
1630                 /* data is submitted to disk within recv_resync_read.
1631                  * corresponding put_ldev done below on error,
1632                  * or in drbd_endio_write_sec. */
1633                 ok = recv_resync_read(mdev, sector, data_size);
1634         } else {
1635                 if (__ratelimit(&drbd_ratelimit_state))
1636                         dev_err(DEV, "Can not write resync data to local disk.\n");
1637
1638                 ok = drbd_drain_block(mdev, data_size);
1639
1640                 drbd_send_ack_dp(mdev, P_NEG_ACK, p);
1641         }
1642
1643         atomic_add(data_size >> 9, &mdev->rs_sect_in);
1644
1645         return ok;
1646 }
1647
1648 /* e_end_block() is called via drbd_process_done_ee().
1649  * this means this function only runs in the asender thread
1650  */
1651 static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1652 {
1653         struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1654         sector_t sector = e->sector;
1655         struct drbd_epoch *epoch;
1656         int ok = 1, pcmd;
1657
1658         if (e->flags & EE_IS_BARRIER) {
1659                 epoch = previous_epoch(mdev, e->epoch);
1660                 if (epoch)
1661                         drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0));
1662         }
1663
1664         if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
1665                 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
1666                         pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
1667                                 mdev->state.conn <= C_PAUSED_SYNC_T &&
1668                                 e->flags & EE_MAY_SET_IN_SYNC) ?
1669                                 P_RS_WRITE_ACK : P_WRITE_ACK;
1670                         ok &= drbd_send_ack(mdev, pcmd, e);
1671                         if (pcmd == P_RS_WRITE_ACK)
1672                                 drbd_set_in_sync(mdev, sector, e->size);
1673                 } else {
1674                         ok  = drbd_send_ack(mdev, P_NEG_ACK, e);
1675                         /* we expect it to be marked out of sync anyways...
1676                          * maybe assert this?  */
1677                 }
1678                 dec_unacked(mdev);
1679         }
1680         /* we delete from the conflict detection hash _after_ we sent out the
1681          * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right.  */
1682         if (mdev->net_conf->two_primaries) {
1683                 spin_lock_irq(&mdev->req_lock);
1684                 D_ASSERT(!hlist_unhashed(&e->colision));
1685                 hlist_del_init(&e->colision);
1686                 spin_unlock_irq(&mdev->req_lock);
1687         } else {
1688                 D_ASSERT(hlist_unhashed(&e->colision));
1689         }
1690
1691         drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
1692
1693         return ok;
1694 }
1695
1696 static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1697 {
1698         struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1699         int ok = 1;
1700
1701         D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
1702         ok = drbd_send_ack(mdev, P_DISCARD_ACK, e);
1703
1704         spin_lock_irq(&mdev->req_lock);
1705         D_ASSERT(!hlist_unhashed(&e->colision));
1706         hlist_del_init(&e->colision);
1707         spin_unlock_irq(&mdev->req_lock);
1708
1709         dec_unacked(mdev);
1710
1711         return ok;
1712 }
1713
1714 /* Called from receive_Data.
1715  * Synchronize packets on sock with packets on msock.
1716  *
1717  * This is here so even when a P_DATA packet traveling via sock overtook an Ack
1718  * packet traveling on msock, they are still processed in the order they have
1719  * been sent.
1720  *
1721  * Note: we don't care for Ack packets overtaking P_DATA packets.
1722  *
1723  * In case packet_seq is larger than mdev->peer_seq number, there are
1724  * outstanding packets on the msock. We wait for them to arrive.
1725  * In case we are the logically next packet, we update mdev->peer_seq
1726  * ourselves. Correctly handles 32bit wrap around.
1727  *
1728  * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
1729  * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
1730  * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
1731  * 1<<9 == 512 seconds aka ages for the 32bit wrap around...
1732  *
1733  * returns 0 if we may process the packet,
1734  * -ERESTARTSYS if we were interrupted (by disconnect signal). */
1735 static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
1736 {
1737         DEFINE_WAIT(wait);
1738         unsigned int p_seq;
1739         long timeout;
1740         int ret = 0;
1741         spin_lock(&mdev->peer_seq_lock);
1742         for (;;) {
1743                 prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
1744                 if (seq_le(packet_seq, mdev->peer_seq+1))
1745                         break;
1746                 if (signal_pending(current)) {
1747                         ret = -ERESTARTSYS;
1748                         break;
1749                 }
1750                 p_seq = mdev->peer_seq;
1751                 spin_unlock(&mdev->peer_seq_lock);
1752                 timeout = schedule_timeout(30*HZ);
1753                 spin_lock(&mdev->peer_seq_lock);
1754                 if (timeout == 0 && p_seq == mdev->peer_seq) {
1755                         ret = -ETIMEDOUT;
1756                         dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n");
1757                         break;
1758                 }
1759         }
1760         finish_wait(&mdev->seq_wait, &wait);
1761         if (mdev->peer_seq+1 == packet_seq)
1762                 mdev->peer_seq++;
1763         spin_unlock(&mdev->peer_seq_lock);
1764         return ret;
1765 }
1766
1767 /* mirrored write */
1768 static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
1769 {
1770         sector_t sector;
1771         struct drbd_epoch_entry *e;
1772         struct p_data *p = (struct p_data *)h;
1773         int header_size, data_size;
1774         int rw = WRITE;
1775         u32 dp_flags;
1776
1777         header_size = sizeof(*p) - sizeof(*h);
1778         data_size   = h->length  - header_size;
1779
1780         ERR_IF(data_size == 0) return FALSE;
1781
1782         if (drbd_recv(mdev, h->payload, header_size) != header_size)
1783                 return FALSE;
1784
1785         if (!get_ldev(mdev)) {
1786                 if (__ratelimit(&drbd_ratelimit_state))
1787                         dev_err(DEV, "Can not write mirrored data block "
1788                             "to local disk.\n");
1789                 spin_lock(&mdev->peer_seq_lock);
1790                 if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num))
1791                         mdev->peer_seq++;
1792                 spin_unlock(&mdev->peer_seq_lock);
1793
1794                 drbd_send_ack_dp(mdev, P_NEG_ACK, p);
1795                 atomic_inc(&mdev->current_epoch->epoch_size);
1796                 return drbd_drain_block(mdev, data_size);
1797         }
1798
1799         /* get_ldev(mdev) successful.
1800          * Corresponding put_ldev done either below (on various errors),
1801          * or in drbd_endio_write_sec, if we successfully submit the data at
1802          * the end of this function. */
1803
1804         sector = be64_to_cpu(p->sector);
1805         e = read_in_block(mdev, p->block_id, sector, data_size);
1806         if (!e) {
1807                 put_ldev(mdev);
1808                 return FALSE;
1809         }
1810
1811         e->w.cb = e_end_block;
1812
1813         spin_lock(&mdev->epoch_lock);
1814         e->epoch = mdev->current_epoch;
1815         atomic_inc(&e->epoch->epoch_size);
1816         atomic_inc(&e->epoch->active);
1817
1818         if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) {
1819                 struct drbd_epoch *epoch;
1820                 /* Issue a barrier if we start a new epoch, and the previous epoch
1821                    was not a epoch containing a single request which already was
1822                    a Barrier. */
1823                 epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list);
1824                 if (epoch == e->epoch) {
1825                         set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
1826                         rw |= REQ_HARDBARRIER;
1827                         e->flags |= EE_IS_BARRIER;
1828                 } else {
1829                         if (atomic_read(&epoch->epoch_size) > 1 ||
1830                             !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) {
1831                                 set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
1832                                 set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags);
1833                                 rw |= REQ_HARDBARRIER;
1834                                 e->flags |= EE_IS_BARRIER;
1835                         }
1836                 }
1837         }
1838         spin_unlock(&mdev->epoch_lock);
1839
1840         dp_flags = be32_to_cpu(p->dp_flags);
1841         if (dp_flags & DP_HARDBARRIER) {
1842                 dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n");
1843                 /* rw |= REQ_HARDBARRIER; */
1844         }
1845         if (dp_flags & DP_RW_SYNC)
1846                 rw |= REQ_SYNC | REQ_UNPLUG;
1847         if (dp_flags & DP_MAY_SET_IN_SYNC)
1848                 e->flags |= EE_MAY_SET_IN_SYNC;
1849
1850         /* I'm the receiver, I do hold a net_cnt reference. */
1851         if (!mdev->net_conf->two_primaries) {
1852                 spin_lock_irq(&mdev->req_lock);
1853         } else {
1854                 /* don't get the req_lock yet,
1855                  * we may sleep in drbd_wait_peer_seq */
1856                 const int size = e->size;
1857                 const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags);
1858                 DEFINE_WAIT(wait);
1859                 struct drbd_request *i;
1860                 struct hlist_node *n;
1861                 struct hlist_head *slot;
1862                 int first;
1863
1864                 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
1865                 BUG_ON(mdev->ee_hash == NULL);
1866                 BUG_ON(mdev->tl_hash == NULL);
1867
1868                 /* conflict detection and handling:
1869                  * 1. wait on the sequence number,
1870                  *    in case this data packet overtook ACK packets.
1871                  * 2. check our hash tables for conflicting requests.
1872                  *    we only need to walk the tl_hash, since an ee can not
1873                  *    have a conflict with an other ee: on the submitting
1874                  *    node, the corresponding req had already been conflicting,
1875                  *    and a conflicting req is never sent.
1876                  *
1877                  * Note: for two_primaries, we are protocol C,
1878                  * so there cannot be any request that is DONE
1879                  * but still on the transfer log.
1880                  *
1881                  * unconditionally add to the ee_hash.
1882                  *
1883                  * if no conflicting request is found:
1884                  *    submit.
1885                  *
1886                  * if any conflicting request is found
1887                  * that has not yet been acked,
1888                  * AND I have the "discard concurrent writes" flag:
1889                  *       queue (via done_ee) the P_DISCARD_ACK; OUT.
1890                  *
1891                  * if any conflicting request is found:
1892                  *       block the receiver, waiting on misc_wait
1893                  *       until no more conflicting requests are there,
1894                  *       or we get interrupted (disconnect).
1895                  *
1896                  *       we do not just write after local io completion of those
1897                  *       requests, but only after req is done completely, i.e.
1898                  *       we wait for the P_DISCARD_ACK to arrive!
1899                  *
1900                  *       then proceed normally, i.e. submit.
1901                  */
1902                 if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num)))
1903                         goto out_interrupted;
1904
1905                 spin_lock_irq(&mdev->req_lock);
1906
1907                 hlist_add_head(&e->colision, ee_hash_slot(mdev, sector));
1908
1909 #define OVERLAPS overlaps(i->sector, i->size, sector, size)
1910                 slot = tl_hash_slot(mdev, sector);
1911                 first = 1;
1912                 for (;;) {
1913                         int have_unacked = 0;
1914                         int have_conflict = 0;
1915                         prepare_to_wait(&mdev->misc_wait, &wait,
1916                                 TASK_INTERRUPTIBLE);
1917                         hlist_for_each_entry(i, n, slot, colision) {
1918                                 if (OVERLAPS) {
1919                                         /* only ALERT on first iteration,
1920                                          * we may be woken up early... */
1921                                         if (first)
1922                                                 dev_alert(DEV, "%s[%u] Concurrent local write detected!"
1923                                                       " new: %llus +%u; pending: %llus +%u\n",
1924                                                       current->comm, current->pid,
1925                                                       (unsigned long long)sector, size,
1926                                                       (unsigned long long)i->sector, i->size);
1927                                         if (i->rq_state & RQ_NET_PENDING)
1928                                                 ++have_unacked;
1929                                         ++have_conflict;
1930                                 }
1931                         }
1932 #undef OVERLAPS
1933                         if (!have_conflict)
1934                                 break;
1935
1936                         /* Discard Ack only for the _first_ iteration */
1937                         if (first && discard && have_unacked) {
1938                                 dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n",
1939                                      (unsigned long long)sector);
1940                                 inc_unacked(mdev);
1941                                 e->w.cb = e_send_discard_ack;
1942                                 list_add_tail(&e->w.list, &mdev->done_ee);
1943
1944                                 spin_unlock_irq(&mdev->req_lock);
1945
1946                                 /* we could probably send that P_DISCARD_ACK ourselves,
1947                                  * but I don't like the receiver using the msock */
1948
1949                                 put_ldev(mdev);
1950                                 wake_asender(mdev);
1951                                 finish_wait(&mdev->misc_wait, &wait);
1952                                 return TRUE;
1953                         }
1954
1955                         if (signal_pending(current)) {
1956                                 hlist_del_init(&e->colision);
1957
1958                                 spin_unlock_irq(&mdev->req_lock);
1959
1960                                 finish_wait(&mdev->misc_wait, &wait);
1961                                 goto out_interrupted;
1962                         }
1963
1964                         spin_unlock_irq(&mdev->req_lock);
1965                         if (first) {
1966                                 first = 0;
1967                                 dev_alert(DEV, "Concurrent write! [W AFTERWARDS] "
1968                                      "sec=%llus\n", (unsigned long long)sector);
1969                         } else if (discard) {
1970                                 /* we had none on the first iteration.
1971                                  * there must be none now. */
1972                                 D_ASSERT(have_unacked == 0);
1973                         }
1974                         schedule();
1975                         spin_lock_irq(&mdev->req_lock);
1976                 }
1977                 finish_wait(&mdev->misc_wait, &wait);
1978         }
1979
1980         list_add(&e->w.list, &mdev->active_ee);
1981         spin_unlock_irq(&mdev->req_lock);
1982
1983         switch (mdev->net_conf->wire_protocol) {
1984         case DRBD_PROT_C:
1985                 inc_unacked(mdev);
1986                 /* corresponding dec_unacked() in e_end_block()
1987                  * respective _drbd_clear_done_ee */
1988                 break;
1989         case DRBD_PROT_B:
1990                 /* I really don't like it that the receiver thread
1991                  * sends on the msock, but anyways */
1992                 drbd_send_ack(mdev, P_RECV_ACK, e);
1993                 break;
1994         case DRBD_PROT_A:
1995                 /* nothing to do */
1996                 break;
1997         }
1998
1999         if (mdev->state.pdsk == D_DISKLESS) {
2000                 /* In case we have the only disk of the cluster, */
2001                 drbd_set_out_of_sync(mdev, e->sector, e->size);
2002                 e->flags |= EE_CALL_AL_COMPLETE_IO;
2003                 drbd_al_begin_io(mdev, e->sector);
2004         }
2005
2006         if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
2007                 return TRUE;
2008
2009 out_interrupted:
2010         /* yes, the epoch_size now is imbalanced.
2011          * but we drop the connection anyways, so we don't have a chance to
2012          * receive a barrier... atomic_inc(&mdev->epoch_size); */
2013         put_ldev(mdev);
2014         drbd_free_ee(mdev, e);
2015         return FALSE;
2016 }
2017
2018 static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
2019 {
2020         sector_t sector;
2021         const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
2022         struct drbd_epoch_entry *e;
2023         struct digest_info *di = NULL;
2024         int size, digest_size;
2025         unsigned int fault_type;
2026         struct p_block_req *p =
2027                 (struct p_block_req *)h;
2028         const int brps = sizeof(*p)-sizeof(*h);
2029
2030         if (drbd_recv(mdev, h->payload, brps) != brps)
2031                 return FALSE;
2032
2033         sector = be64_to_cpu(p->sector);
2034         size   = be32_to_cpu(p->blksize);
2035
2036         if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
2037                 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2038                                 (unsigned long long)sector, size);
2039                 return FALSE;
2040         }
2041         if (sector + (size>>9) > capacity) {
2042                 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2043                                 (unsigned long long)sector, size);
2044                 return FALSE;
2045         }
2046
2047         if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
2048                 if (__ratelimit(&drbd_ratelimit_state))
2049                         dev_err(DEV, "Can not satisfy peer's read request, "
2050                             "no local data.\n");
2051                 drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY :
2052                                  P_NEG_RS_DREPLY , p);
2053                 return drbd_drain_block(mdev, h->length - brps);
2054         }
2055
2056         /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
2057          * "criss-cross" setup, that might cause write-out on some other DRBD,
2058          * which in turn might block on the other node at this very place.  */
2059         e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO);
2060         if (!e) {
2061                 put_ldev(mdev);
2062                 return FALSE;
2063         }
2064
2065         switch (h->command) {
2066         case P_DATA_REQUEST:
2067                 e->w.cb = w_e_end_data_req;
2068                 fault_type = DRBD_FAULT_DT_RD;
2069                 break;
2070         case P_RS_DATA_REQUEST:
2071                 e->w.cb = w_e_end_rsdata_req;
2072                 fault_type = DRBD_FAULT_RS_RD;
2073                 /* Eventually this should become asynchronously. Currently it
2074                  * blocks the whole receiver just to delay the reading of a
2075                  * resync data block.
2076                  * the drbd_work_queue mechanism is made for this...
2077                  */
2078                 if (!drbd_rs_begin_io(mdev, sector)) {
2079                         /* we have been interrupted,
2080                          * probably connection lost! */
2081                         D_ASSERT(signal_pending(current));
2082                         goto out_free_e;
2083                 }
2084                 break;
2085
2086         case P_OV_REPLY:
2087         case P_CSUM_RS_REQUEST:
2088                 fault_type = DRBD_FAULT_RS_RD;
2089                 digest_size = h->length - brps ;
2090                 di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
2091                 if (!di)
2092                         goto out_free_e;
2093
2094                 di->digest_size = digest_size;
2095                 di->digest = (((char *)di)+sizeof(struct digest_info));
2096
2097                 if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
2098                         goto out_free_e;
2099
2100                 e->block_id = (u64)(unsigned long)di;
2101                 if (h->command == P_CSUM_RS_REQUEST) {
2102                         D_ASSERT(mdev->agreed_pro_version >= 89);
2103                         e->w.cb = w_e_end_csum_rs_req;
2104                 } else if (h->command == P_OV_REPLY) {
2105                         e->w.cb = w_e_end_ov_reply;
2106                         dec_rs_pending(mdev);
2107                         break;
2108                 }
2109
2110                 if (!drbd_rs_begin_io(mdev, sector)) {
2111                         /* we have been interrupted, probably connection lost! */
2112                         D_ASSERT(signal_pending(current));
2113                         goto out_free_e;
2114                 }
2115                 break;
2116
2117         case P_OV_REQUEST:
2118                 if (mdev->state.conn >= C_CONNECTED &&
2119                     mdev->state.conn != C_VERIFY_T)
2120                         dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n",
2121                                 drbd_conn_str(mdev->state.conn));
2122                 if (mdev->ov_start_sector == ~(sector_t)0 &&
2123                     mdev->agreed_pro_version >= 90) {
2124                         mdev->ov_start_sector = sector;
2125                         mdev->ov_position = sector;
2126                         mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(sector);
2127                         dev_info(DEV, "Online Verify start sector: %llu\n",
2128                                         (unsigned long long)sector);
2129                 }
2130                 e->w.cb = w_e_end_ov_req;
2131                 fault_type = DRBD_FAULT_RS_RD;
2132                 /* Eventually this should become asynchronous. Currently it
2133                  * blocks the whole receiver just to delay the reading of a
2134                  * resync data block.
2135                  * the drbd_work_queue mechanism is made for this...
2136                  */
2137                 if (!drbd_rs_begin_io(mdev, sector)) {
2138                         /* we have been interrupted,
2139                          * probably connection lost! */
2140                         D_ASSERT(signal_pending(current));
2141                         goto out_free_e;
2142                 }
2143                 break;
2144
2145
2146         default:
2147                 dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
2148                     cmdname(h->command));
2149                 fault_type = DRBD_FAULT_MAX;
2150         }
2151
2152         spin_lock_irq(&mdev->req_lock);
2153         list_add(&e->w.list, &mdev->read_ee);
2154         spin_unlock_irq(&mdev->req_lock);
2155
2156         inc_unacked(mdev);
2157
2158         if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
2159                 return TRUE;
2160
2161 out_free_e:
2162         kfree(di);
2163         put_ldev(mdev);
2164         drbd_free_ee(mdev, e);
2165         return FALSE;
2166 }
2167
2168 static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
2169 {
2170         int self, peer, rv = -100;
2171         unsigned long ch_self, ch_peer;
2172
2173         self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2174         peer = mdev->p_uuid[UI_BITMAP] & 1;
2175
2176         ch_peer = mdev->p_uuid[UI_SIZE];
2177         ch_self = mdev->comm_bm_set;
2178
2179         switch (mdev->net_conf->after_sb_0p) {
2180         case ASB_CONSENSUS:
2181         case ASB_DISCARD_SECONDARY:
2182         case ASB_CALL_HELPER:
2183                 dev_err(DEV, "Configuration error.\n");
2184                 break;
2185         case ASB_DISCONNECT:
2186                 break;
2187         case ASB_DISCARD_YOUNGER_PRI:
2188                 if (self == 0 && peer == 1) {
2189                         rv = -1;
2190                         break;
2191                 }
2192                 if (self == 1 && peer == 0) {
2193                         rv =  1;
2194                         break;
2195                 }
2196                 /* Else fall through to one of the other strategies... */
2197         case ASB_DISCARD_OLDER_PRI:
2198                 if (self == 0 && peer == 1) {
2199                         rv = 1;
2200                         break;
2201                 }
2202                 if (self == 1 && peer == 0) {
2203                         rv = -1;
2204                         break;
2205                 }
2206                 /* Else fall through to one of the other strategies... */
2207                 dev_warn(DEV, "Discard younger/older primary did not find a decision\n"
2208                      "Using discard-least-changes instead\n");
2209         case ASB_DISCARD_ZERO_CHG:
2210                 if (ch_peer == 0 && ch_self == 0) {
2211                         rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
2212                                 ? -1 : 1;
2213                         break;
2214                 } else {
2215                         if (ch_peer == 0) { rv =  1; break; }
2216                         if (ch_self == 0) { rv = -1; break; }
2217                 }
2218                 if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG)
2219                         break;
2220         case ASB_DISCARD_LEAST_CHG:
2221                 if      (ch_self < ch_peer)
2222                         rv = -1;
2223                 else if (ch_self > ch_peer)
2224                         rv =  1;
2225                 else /* ( ch_self == ch_peer ) */
2226                      /* Well, then use something else. */
2227                         rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
2228                                 ? -1 : 1;
2229                 break;
2230         case ASB_DISCARD_LOCAL:
2231                 rv = -1;
2232                 break;
2233         case ASB_DISCARD_REMOTE:
2234                 rv =  1;
2235         }
2236
2237         return rv;
2238 }
2239
2240 static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
2241 {
2242         int self, peer, hg, rv = -100;
2243
2244         self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2245         peer = mdev->p_uuid[UI_BITMAP] & 1;
2246
2247         switch (mdev->net_conf->after_sb_1p) {
2248         case ASB_DISCARD_YOUNGER_PRI:
2249         case ASB_DISCARD_OLDER_PRI:
2250         case ASB_DISCARD_LEAST_CHG:
2251         case ASB_DISCARD_LOCAL:
2252         case ASB_DISCARD_REMOTE:
2253                 dev_err(DEV, "Configuration error.\n");
2254                 break;
2255         case ASB_DISCONNECT:
2256                 break;
2257         case ASB_CONSENSUS:
2258                 hg = drbd_asb_recover_0p(mdev);
2259                 if (hg == -1 && mdev->state.role == R_SECONDARY)
2260                         rv = hg;
2261                 if (hg == 1  && mdev->state.role == R_PRIMARY)
2262                         rv = hg;
2263                 break;
2264         case ASB_VIOLENTLY:
2265                 rv = drbd_asb_recover_0p(mdev);
2266                 break;
2267         case ASB_DISCARD_SECONDARY:
2268                 return mdev->state.role == R_PRIMARY ? 1 : -1;
2269         case ASB_CALL_HELPER:
2270                 hg = drbd_asb_recover_0p(mdev);
2271                 if (hg == -1 && mdev->state.role == R_PRIMARY) {
2272                         self = drbd_set_role(mdev, R_SECONDARY, 0);
2273                          /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2274                           * we might be here in C_WF_REPORT_PARAMS which is transient.
2275                           * we do not need to wait for the after state change work either. */
2276                         self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2277                         if (self != SS_SUCCESS) {
2278                                 drbd_khelper(mdev, "pri-lost-after-sb");
2279                         } else {
2280                                 dev_warn(DEV, "Successfully gave up primary role.\n");
2281                                 rv = hg;
2282                         }
2283                 } else
2284                         rv = hg;
2285         }
2286
2287         return rv;
2288 }
2289
2290 static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
2291 {
2292         int self, peer, hg, rv = -100;
2293
2294         self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2295         peer = mdev->p_uuid[UI_BITMAP] & 1;
2296
2297         switch (mdev->net_conf->after_sb_2p) {
2298         case ASB_DISCARD_YOUNGER_PRI:
2299         case ASB_DISCARD_OLDER_PRI:
2300         case ASB_DISCARD_LEAST_CHG:
2301         case ASB_DISCARD_LOCAL:
2302         case ASB_DISCARD_REMOTE:
2303         case ASB_CONSENSUS:
2304         case ASB_DISCARD_SECONDARY:
2305                 dev_err(DEV, "Configuration error.\n");
2306                 break;
2307         case ASB_VIOLENTLY:
2308                 rv = drbd_asb_recover_0p(mdev);
2309                 break;
2310         case ASB_DISCONNECT:
2311                 break;
2312         case ASB_CALL_HELPER:
2313                 hg = drbd_asb_recover_0p(mdev);
2314                 if (hg == -1) {
2315                          /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2316                           * we might be here in C_WF_REPORT_PARAMS which is transient.
2317                           * we do not need to wait for the after state change work either. */
2318                         self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2319                         if (self != SS_SUCCESS) {
2320                                 drbd_khelper(mdev, "pri-lost-after-sb");
2321                         } else {
2322                                 dev_warn(DEV, "Successfully gave up primary role.\n");
2323                                 rv = hg;
2324                         }
2325                 } else
2326                         rv = hg;
2327         }
2328
2329         return rv;
2330 }
2331
2332 static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid,
2333                            u64 bits, u64 flags)
2334 {
2335         if (!uuid) {
2336                 dev_info(DEV, "%s uuid info vanished while I was looking!\n", text);
2337                 return;
2338         }
2339         dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
2340              text,
2341              (unsigned long long)uuid[UI_CURRENT],
2342              (unsigned long long)uuid[UI_BITMAP],
2343              (unsigned long long)uuid[UI_HISTORY_START],
2344              (unsigned long long)uuid[UI_HISTORY_END],
2345              (unsigned long long)bits,
2346              (unsigned long long)flags);
2347 }
2348
2349 /*
2350   100   after split brain try auto recover
2351     2   C_SYNC_SOURCE set BitMap
2352     1   C_SYNC_SOURCE use BitMap
2353     0   no Sync
2354    -1   C_SYNC_TARGET use BitMap
2355    -2   C_SYNC_TARGET set BitMap
2356  -100   after split brain, disconnect
2357 -1000   unrelated data
2358  */
2359 static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local)
2360 {
2361         u64 self, peer;
2362         int i, j;
2363
2364         self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2365         peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2366
2367         *rule_nr = 10;
2368         if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
2369                 return 0;
2370
2371         *rule_nr = 20;
2372         if ((self == UUID_JUST_CREATED || self == (u64)0) &&
2373              peer != UUID_JUST_CREATED)
2374                 return -2;
2375
2376         *rule_nr = 30;
2377         if (self != UUID_JUST_CREATED &&
2378             (peer == UUID_JUST_CREATED || peer == (u64)0))
2379                 return 2;
2380
2381         if (self == peer) {
2382                 int rct, dc; /* roles at crash time */
2383
2384                 if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
2385
2386                         if (mdev->agreed_pro_version < 91)
2387                                 return -1001;
2388
2389                         if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
2390                             (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
2391                                 dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n");
2392                                 drbd_uuid_set_bm(mdev, 0UL);
2393
2394                                 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2395                                                mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2396                                 *rule_nr = 34;
2397                         } else {
2398                                 dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n");
2399                                 *rule_nr = 36;
2400                         }
2401
2402                         return 1;
2403                 }
2404
2405                 if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
2406
2407                         if (mdev->agreed_pro_version < 91)
2408                                 return -1001;
2409
2410                         if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
2411                             (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
2412                                 dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
2413
2414                                 mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START];
2415                                 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP];
2416                                 mdev->p_uuid[UI_BITMAP] = 0UL;
2417
2418                                 drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2419                                 *rule_nr = 35;
2420                         } else {
2421                                 dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n");
2422                                 *rule_nr = 37;
2423                         }
2424
2425                         return -1;
2426                 }
2427
2428                 /* Common power [off|failure] */
2429                 rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
2430                         (mdev->p_uuid[UI_FLAGS] & 2);
2431                 /* lowest bit is set when we were primary,
2432                  * next bit (weight 2) is set when peer was primary */
2433                 *rule_nr = 40;
2434
2435                 switch (rct) {
2436                 case 0: /* !self_pri && !peer_pri */ return 0;
2437                 case 1: /*  self_pri && !peer_pri */ return 1;
2438                 case 2: /* !self_pri &&  peer_pri */ return -1;
2439                 case 3: /*  self_pri &&  peer_pri */
2440                         dc = test_bit(DISCARD_CONCURRENT, &mdev->flags);
2441                         return dc ? -1 : 1;
2442                 }
2443         }
2444
2445         *rule_nr = 50;
2446         peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2447         if (self == peer)
2448                 return -1;
2449
2450         *rule_nr = 51;
2451         peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
2452         if (self == peer) {
2453                 self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2454                 peer = mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1);
2455                 if (self == peer) {
2456                         /* The last P_SYNC_UUID did not get though. Undo the last start of
2457                            resync as sync source modifications of the peer's UUIDs. */
2458
2459                         if (mdev->agreed_pro_version < 91)
2460                                 return -1001;
2461
2462                         mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
2463                         mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
2464                         return -1;
2465                 }
2466         }
2467
2468         *rule_nr = 60;
2469         self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2470         for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2471                 peer = mdev->p_uuid[i] & ~((u64)1);
2472                 if (self == peer)
2473                         return -2;
2474         }
2475
2476         *rule_nr = 70;
2477         self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2478         peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2479         if (self == peer)
2480                 return 1;
2481
2482         *rule_nr = 71;
2483         self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2484         if (self == peer) {
2485                 self = mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1);
2486                 peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
2487                 if (self == peer) {
2488                         /* The last P_SYNC_UUID did not get though. Undo the last start of
2489                            resync as sync source modifications of our UUIDs. */
2490
2491                         if (mdev->agreed_pro_version < 91)
2492                                 return -1001;
2493
2494                         _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
2495                         _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]);
2496
2497                         dev_info(DEV, "Undid last start of resync:\n");
2498
2499                         drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2500                                        mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2501
2502                         return 1;
2503                 }
2504         }
2505
2506
2507         *rule_nr = 80;
2508         peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2509         for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2510                 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2511                 if (self == peer)
2512                         return 2;
2513         }
2514
2515         *rule_nr = 90;
2516         self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2517         peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2518         if (self == peer && self != ((u64)0))
2519                 return 100;
2520
2521         *rule_nr = 100;
2522         for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2523                 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2524                 for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
2525                         peer = mdev->p_uuid[j] & ~((u64)1);
2526                         if (self == peer)
2527                                 return -100;
2528                 }
2529         }
2530
2531         return -1000;
2532 }
2533
2534 /* drbd_sync_handshake() returns the new conn state on success, or
2535    CONN_MASK (-1) on failure.
2536  */
2537 static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role,
2538                                            enum drbd_disk_state peer_disk) __must_hold(local)
2539 {
2540         int hg, rule_nr;
2541         enum drbd_conns rv = C_MASK;
2542         enum drbd_disk_state mydisk;
2543
2544         mydisk = mdev->state.disk;
2545         if (mydisk == D_NEGOTIATING)
2546                 mydisk = mdev->new_state_tmp.disk;
2547
2548         dev_info(DEV, "drbd_sync_handshake:\n");
2549         drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0);
2550         drbd_uuid_dump(mdev, "peer", mdev->p_uuid,
2551                        mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2552
2553         hg = drbd_uuid_compare(mdev, &rule_nr);
2554
2555         dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
2556
2557         if (hg == -1000) {
2558                 dev_alert(DEV, "Unrelated data, aborting!\n");
2559                 return C_MASK;
2560         }
2561         if (hg == -1001) {
2562                 dev_alert(DEV, "To resolve this both sides have to support at least protocol\n");
2563                 return C_MASK;
2564         }
2565
2566         if    ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
2567             (peer_disk == D_INCONSISTENT && mydisk    > D_INCONSISTENT)) {
2568                 int f = (hg == -100) || abs(hg) == 2;
2569                 hg = mydisk > D_INCONSISTENT ? 1 : -1;
2570                 if (f)
2571                         hg = hg*2;
2572                 dev_info(DEV, "Becoming sync %s due to disk states.\n",
2573                      hg > 0 ? "source" : "target");
2574         }
2575
2576         if (abs(hg) == 100)
2577                 drbd_khelper(mdev, "initial-split-brain");
2578
2579         if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
2580                 int pcount = (mdev->state.role == R_PRIMARY)
2581                            + (peer_role == R_PRIMARY);
2582                 int forced = (hg == -100);
2583
2584                 switch (pcount) {
2585                 case 0:
2586                         hg = drbd_asb_recover_0p(mdev);
2587                         break;
2588                 case 1:
2589                         hg = drbd_asb_recover_1p(mdev);
2590                         break;
2591                 case 2:
2592                         hg = drbd_asb_recover_2p(mdev);
2593                         break;
2594                 }
2595                 if (abs(hg) < 100) {
2596                         dev_warn(DEV, "Split-Brain detected, %d primaries, "
2597                              "automatically solved. Sync from %s node\n",
2598                              pcount, (hg < 0) ? "peer" : "this");
2599                         if (forced) {
2600                                 dev_warn(DEV, "Doing a full sync, since"
2601                                      " UUIDs where ambiguous.\n");
2602                                 hg = hg*2;
2603                         }
2604                 }
2605         }
2606
2607         if (hg == -100) {
2608                 if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1))
2609                         hg = -1;
2610                 if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1))
2611                         hg = 1;
2612
2613                 if (abs(hg) < 100)
2614                         dev_warn(DEV, "Split-Brain detected, manually solved. "
2615                              "Sync from %s node\n",
2616                              (hg < 0) ? "peer" : "this");
2617         }
2618
2619         if (hg == -100) {
2620                 /* FIXME this log message is not correct if we end up here
2621                  * after an attempted attach on a diskless node.
2622                  * We just refuse to attach -- well, we drop the "connection"
2623                  * to that disk, in a way... */
2624                 dev_alert(DEV, "Split-Brain detected but unresolved, dropping connection!\n");
2625                 drbd_khelper(mdev, "split-brain");
2626                 return C_MASK;
2627         }
2628
2629         if (hg > 0 && mydisk <= D_INCONSISTENT) {
2630                 dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n");
2631                 return C_MASK;
2632         }
2633
2634         if (hg < 0 && /* by intention we do not use mydisk here. */
2635             mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) {
2636                 switch (mdev->net_conf->rr_conflict) {
2637                 case ASB_CALL_HELPER:
2638                         drbd_khelper(mdev, "pri-lost");
2639                         /* fall through */
2640                 case ASB_DISCONNECT:
2641                         dev_err(DEV, "I shall become SyncTarget, but I am primary!\n");
2642                         return C_MASK;
2643                 case ASB_VIOLENTLY:
2644                         dev_warn(DEV, "Becoming SyncTarget, violating the stable-data"
2645                              "assumption\n");
2646                 }
2647         }
2648
2649         if (mdev->net_conf->dry_run || test_bit(CONN_DRY_RUN, &mdev->flags)) {
2650                 if (hg == 0)
2651                         dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n");
2652                 else
2653                         dev_info(DEV, "dry-run connect: Would become %s, doing a %s resync.",
2654                                  drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
2655                                  abs(hg) >= 2 ? "full" : "bit-map based");
2656                 return C_MASK;
2657         }
2658
2659         if (abs(hg) >= 2) {
2660                 dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
2661                 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake"))
2662                         return C_MASK;
2663         }
2664
2665         if (hg > 0) { /* become sync source. */
2666                 rv = C_WF_BITMAP_S;
2667         } else if (hg < 0) { /* become sync target */
2668                 rv = C_WF_BITMAP_T;
2669         } else {
2670                 rv = C_CONNECTED;
2671                 if (drbd_bm_total_weight(mdev)) {
2672                         dev_info(DEV, "No resync, but %lu bits in bitmap!\n",
2673                              drbd_bm_total_weight(mdev));
2674                 }
2675         }
2676
2677         return rv;
2678 }
2679
2680 /* returns 1 if invalid */
2681 static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
2682 {
2683         /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
2684         if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) ||
2685             (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL))
2686                 return 0;
2687
2688         /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
2689         if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL ||
2690             self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL)
2691                 return 1;
2692
2693         /* everything else is valid if they are equal on both sides. */
2694         if (peer == self)
2695                 return 0;
2696
2697         /* everything es is invalid. */
2698         return 1;
2699 }
2700
2701 static int receive_protocol(struct drbd_conf *mdev, struct p_header *h)
2702 {
2703         struct p_protocol *p = (struct p_protocol *)h;
2704         int header_size, data_size;
2705         int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
2706         int p_want_lose, p_two_primaries, cf;
2707         char p_integrity_alg[SHARED_SECRET_MAX] = "";
2708
2709         header_size = sizeof(*p) - sizeof(*h);
2710         data_size   = h->length  - header_size;
2711
2712         if (drbd_recv(mdev, h->payload, header_size) != header_size)
2713                 return FALSE;
2714
2715         p_proto         = be32_to_cpu(p->protocol);
2716         p_after_sb_0p   = be32_to_cpu(p->after_sb_0p);
2717         p_after_sb_1p   = be32_to_cpu(p->after_sb_1p);
2718         p_after_sb_2p   = be32_to_cpu(p->after_sb_2p);
2719         p_two_primaries = be32_to_cpu(p->two_primaries);
2720         cf              = be32_to_cpu(p->conn_flags);
2721         p_want_lose = cf & CF_WANT_LOSE;
2722
2723         clear_bit(CONN_DRY_RUN, &mdev->flags);
2724
2725         if (cf & CF_DRY_RUN)
2726                 set_bit(CONN_DRY_RUN, &mdev->flags);
2727
2728         if (p_proto != mdev->net_conf->wire_protocol) {
2729                 dev_err(DEV, "incompatible communication protocols\n");
2730                 goto disconnect;
2731         }
2732
2733         if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) {
2734                 dev_err(DEV, "incompatible after-sb-0pri settings\n");
2735                 goto disconnect;
2736         }
2737
2738         if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) {
2739                 dev_err(DEV, "incompatible after-sb-1pri settings\n");
2740                 goto disconnect;
2741         }
2742
2743         if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) {
2744                 dev_err(DEV, "incompatible after-sb-2pri settings\n");
2745                 goto disconnect;
2746         }
2747
2748         if (p_want_lose && mdev->net_conf->want_lose) {
2749                 dev_err(DEV, "both sides have the 'want_lose' flag set\n");
2750                 goto disconnect;
2751         }
2752
2753         if (p_two_primaries != mdev->net_conf->two_primaries) {
2754                 dev_err(DEV, "incompatible setting of the two-primaries options\n");
2755                 goto disconnect;
2756         }
2757
2758         if (mdev->agreed_pro_version >= 87) {
2759                 unsigned char *my_alg = mdev->net_conf->integrity_alg;
2760
2761                 if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size)
2762                         return FALSE;
2763
2764                 p_integrity_alg[SHARED_SECRET_MAX-1] = 0;
2765                 if (strcmp(p_integrity_alg, my_alg)) {
2766                         dev_err(DEV, "incompatible setting of the data-integrity-alg\n");
2767                         goto disconnect;
2768                 }
2769                 dev_info(DEV, "data-integrity-alg: %s\n",
2770                      my_alg[0] ? my_alg : (unsigned char *)"<not-used>");
2771         }
2772
2773         return TRUE;
2774
2775 disconnect:
2776         drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2777         return FALSE;
2778 }
2779
2780 /* helper function
2781  * input: alg name, feature name
2782  * return: NULL (alg name was "")
2783  *         ERR_PTR(error) if something goes wrong
2784  *         or the crypto hash ptr, if it worked out ok. */
2785 struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
2786                 const char *alg, const char *name)
2787 {
2788         struct crypto_hash *tfm;
2789
2790         if (!alg[0])
2791                 return NULL;
2792
2793         tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
2794         if (IS_ERR(tfm)) {
2795                 dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n",
2796                         alg, name, PTR_ERR(tfm));
2797                 return tfm;
2798         }
2799         if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
2800                 crypto_free_hash(tfm);
2801                 dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name);
2802                 return ERR_PTR(-EINVAL);
2803         }
2804         return tfm;
2805 }
2806
2807 static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
2808 {
2809         int ok = TRUE;
2810         struct p_rs_param_95 *p = (struct p_rs_param_95 *)h;
2811         unsigned int header_size, data_size, exp_max_sz;
2812         struct crypto_hash *verify_tfm = NULL;
2813         struct crypto_hash *csums_tfm = NULL;
2814         const int apv = mdev->agreed_pro_version;
2815         int *rs_plan_s = NULL;
2816         int fifo_size = 0;
2817
2818         exp_max_sz  = apv <= 87 ? sizeof(struct p_rs_param)
2819                     : apv == 88 ? sizeof(struct p_rs_param)
2820                                         + SHARED_SECRET_MAX
2821                     : apv <= 94 ? sizeof(struct p_rs_param_89)
2822                     : /* apv >= 95 */ sizeof(struct p_rs_param_95);
2823
2824         if (h->length > exp_max_sz) {
2825                 dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
2826                     h->length, exp_max_sz);
2827                 return FALSE;
2828         }
2829
2830         if (apv <= 88) {
2831                 header_size = sizeof(struct p_rs_param) - sizeof(*h);
2832                 data_size   = h->length  - header_size;
2833         } else if (apv <= 94) {
2834                 header_size = sizeof(struct p_rs_param_89) - sizeof(*h);
2835                 data_size   = h->length  - header_size;
2836                 D_ASSERT(data_size == 0);
2837         } else {
2838                 header_size = sizeof(struct p_rs_param_95) - sizeof(*h);
2839                 data_size   = h->length  - header_size;
2840                 D_ASSERT(data_size == 0);
2841         }
2842
2843         /* initialize verify_alg and csums_alg */
2844         memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
2845
2846         if (drbd_recv(mdev, h->payload, header_size) != header_size)
2847                 return FALSE;
2848
2849         mdev->sync_conf.rate      = be32_to_cpu(p->rate);
2850
2851         if (apv >= 88) {
2852                 if (apv == 88) {
2853                         if (data_size > SHARED_SECRET_MAX) {
2854                                 dev_err(DEV, "verify-alg too long, "
2855                                     "peer wants %u, accepting only %u byte\n",
2856                                                 data_size, SHARED_SECRET_MAX);
2857                                 return FALSE;
2858                         }
2859
2860                         if (drbd_recv(mdev, p->verify_alg, data_size) != data_size)
2861                                 return FALSE;
2862
2863                         /* we expect NUL terminated string */
2864                         /* but just in case someone tries to be evil */
2865                         D_ASSERT(p->verify_alg[data_size-1] == 0);
2866                         p->verify_alg[data_size-1] = 0;
2867
2868                 } else /* apv >= 89 */ {
2869                         /* we still expect NUL terminated strings */
2870                         /* but just in case someone tries to be evil */
2871                         D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0);
2872                         D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0);
2873                         p->verify_alg[SHARED_SECRET_MAX-1] = 0;
2874                         p->csums_alg[SHARED_SECRET_MAX-1] = 0;
2875                 }
2876
2877                 if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) {
2878                         if (mdev->state.conn == C_WF_REPORT_PARAMS) {
2879                                 dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
2880                                     mdev->sync_conf.verify_alg, p->verify_alg);
2881                                 goto disconnect;
2882                         }
2883                         verify_tfm = drbd_crypto_alloc_digest_safe(mdev,
2884                                         p->verify_alg, "verify-alg");
2885                         if (IS_ERR(verify_tfm)) {
2886                                 verify_tfm = NULL;
2887                                 goto disconnect;
2888                         }
2889                 }
2890
2891                 if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) {
2892                         if (mdev->state.conn == C_WF_REPORT_PARAMS) {
2893                                 dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
2894                                     mdev->sync_conf.csums_alg, p->csums_alg);
2895                                 goto disconnect;
2896                         }
2897                         csums_tfm = drbd_crypto_alloc_digest_safe(mdev,
2898                                         p->csums_alg, "csums-alg");
2899                         if (IS_ERR(csums_tfm)) {
2900                                 csums_tfm = NULL;
2901                                 goto disconnect;
2902                         }
2903                 }
2904
2905                 if (apv > 94) {
2906                         mdev->sync_conf.rate      = be32_to_cpu(p->rate);
2907                         mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
2908                         mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target);
2909                         mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target);
2910                         mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate);
2911
2912                         fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
2913                         if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
2914                                 rs_plan_s   = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
2915                                 if (!rs_plan_s) {
2916                                         dev_err(DEV, "kmalloc of fifo_buffer failed");
2917                                         goto disconnect;
2918                                 }
2919                         }
2920                 }
2921
2922                 spin_lock(&mdev->peer_seq_lock);
2923                 /* lock against drbd_nl_syncer_conf() */
2924                 if (verify_tfm) {
2925                         strcpy(mdev->sync_conf.verify_alg, p->verify_alg);
2926                         mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1;
2927                         crypto_free_hash(mdev->verify_tfm);
2928                         mdev->verify_tfm = verify_tfm;
2929                         dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
2930                 }
2931                 if (csums_tfm) {
2932                         strcpy(mdev->sync_conf.csums_alg, p->csums_alg);
2933                         mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1;
2934                         crypto_free_hash(mdev->csums_tfm);
2935                         mdev->csums_tfm = csums_tfm;
2936                         dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
2937                 }
2938                 if (fifo_size != mdev->rs_plan_s.size) {
2939                         kfree(mdev->rs_plan_s.values);
2940                         mdev->rs_plan_s.values = rs_plan_s;
2941                         mdev->rs_plan_s.size   = fifo_size;
2942                         mdev->rs_planed = 0;
2943                 }
2944                 spin_unlock(&mdev->peer_seq_lock);
2945         }
2946
2947         return ok;
2948 disconnect:
2949         /* just for completeness: actually not needed,
2950          * as this is not reached if csums_tfm was ok. */
2951         crypto_free_hash(csums_tfm);
2952         /* but free the verify_tfm again, if csums_tfm did not work out */
2953         crypto_free_hash(verify_tfm);
2954         drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2955         return FALSE;
2956 }
2957
static void drbd_setup_order_type(struct drbd_conf *mdev, int peer)
{
	/* Intentionally empty: there is currently no working
	 * implementation of distributed TCQ, so @peer is ignored. */
}
2963
2964 /* warn if the arguments differ by more than 12.5% */
2965 static void warn_if_differ_considerably(struct drbd_conf *mdev,
2966         const char *s, sector_t a, sector_t b)
2967 {
2968         sector_t d;
2969         if (a == 0 || b == 0)
2970                 return;
2971         d = (a > b) ? (a - b) : (b - a);
2972         if (d > (a>>3) || d > (b>>3))
2973                 dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s,
2974                      (unsigned long long)a, (unsigned long long)b);
2975 }
2976
2977 static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
2978 {
2979         struct p_sizes *p = (struct p_sizes *)h;
2980         enum determine_dev_size dd = unchanged;
2981         unsigned int max_seg_s;
2982         sector_t p_size, p_usize, my_usize;
2983         int ldsc = 0; /* local disk size changed */
2984         enum dds_flags ddsf;
2985
2986         ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
2987         if (drbd_recv(mdev, h->payload, h->length) != h->length)
2988                 return FALSE;
2989
2990         p_size = be64_to_cpu(p->d_size);
2991         p_usize = be64_to_cpu(p->u_size);
2992
2993         if (p_size == 0 && mdev->state.disk == D_DISKLESS) {
2994                 dev_err(DEV, "some backing storage is needed\n");
2995                 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2996                 return FALSE;
2997         }
2998
2999         /* just store the peer's disk size for now.
3000          * we still need to figure out whether we accept that. */
3001         mdev->p_size = p_size;
3002
3003 #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
3004         if (get_ldev(mdev)) {
3005                 warn_if_differ_considerably(mdev, "lower level device sizes",
3006                            p_size, drbd_get_max_capacity(mdev->ldev));
3007                 warn_if_differ_considerably(mdev, "user requested size",
3008                                             p_usize, mdev->ldev->dc.disk_size);
3009
3010                 /* if this is the first connect, or an otherwise expected
3011                  * param exchange, choose the minimum */
3012                 if (mdev->state.conn == C_WF_REPORT_PARAMS)
3013                         p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size,
3014                                              p_usize);
3015
3016                 my_usize = mdev->ldev->dc.disk_size;
3017
3018                 if (mdev->ldev->dc.disk_size != p_usize) {
3019                         mdev->ldev->dc.disk_size = p_usize;
3020                         dev_info(DEV, "Peer sets u_size to %lu sectors\n",
3021                              (unsigned long)mdev->ldev->dc.disk_size);
3022                 }
3023
3024                 /* Never shrink a device with usable data during connect.
3025                    But allow online shrinking if we are connected. */
3026                 if (drbd_new_dev_size(mdev, mdev->ldev, 0) <
3027                    drbd_get_capacity(mdev->this_bdev) &&
3028                    mdev->state.disk >= D_OUTDATED &&
3029                    mdev->state.conn < C_CONNECTED) {
3030                         dev_err(DEV, "The peer's disk size is too small!\n");
3031                         drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3032                         mdev->ldev->dc.disk_size = my_usize;
3033                         put_ldev(mdev);
3034                         return FALSE;
3035                 }
3036                 put_ldev(mdev);
3037         }
3038 #undef min_not_zero
3039
3040         ddsf = be16_to_cpu(p->dds_flags);
3041         if (get_ldev(mdev)) {
3042                 dd = drbd_determin_dev_size(mdev, ddsf);
3043                 put_ldev(mdev);
3044                 if (dd == dev_size_error)
3045                         return FALSE;
3046                 drbd_md_sync(mdev);
3047         } else {
3048                 /* I am diskless, need to accept the peer's size. */
3049                 drbd_set_my_capacity(mdev, p_size);
3050         }
3051
3052         if (get_ldev(mdev)) {
3053                 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
3054                         mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
3055                         ldsc = 1;
3056                 }
3057
3058                 if (mdev->agreed_pro_version < 94)
3059                         max_seg_s = be32_to_cpu(p->max_segment_size);
3060                 else /* drbd 8.3.8 onwards */
3061                         max_seg_s = DRBD_MAX_SEGMENT_SIZE;
3062
3063                 if (max_seg_s != queue_max_segment_size(mdev->rq_queue))
3064                         drbd_setup_queue_param(mdev, max_seg_s);
3065
3066                 drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type));
3067                 put_ldev(mdev);
3068         }
3069
3070         if (mdev->state.conn > C_WF_REPORT_PARAMS) {
3071                 if (be64_to_cpu(p->c_size) !=
3072                     drbd_get_capacity(mdev->this_bdev) || ldsc) {
3073                         /* we have different sizes, probably peer
3074                          * needs to know my new size... */
3075                         drbd_send_sizes(mdev, 0, ddsf);
3076                 }
3077                 if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
3078                     (dd == grew && mdev->state.conn == C_CONNECTED)) {
3079                         if (mdev->state.pdsk >= D_INCONSISTENT &&
3080                             mdev->state.disk >= D_INCONSISTENT) {
3081                                 if (ddsf & DDSF_NO_RESYNC)
3082                                         dev_info(DEV, "Resync of new storage suppressed with --assume-clean\n");
3083                                 else
3084                                         resync_after_online_grow(mdev);
3085                         } else
3086                                 set_bit(RESYNC_AFTER_NEG, &mdev->flags);
3087                 }
3088         }
3089
3090         return TRUE;
3091 }
3092
3093 static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
3094 {
3095         struct p_uuids *p = (struct p_uuids *)h;
3096         u64 *p_uuid;
3097         int i;
3098
3099         ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
3100         if (drbd_recv(mdev, h->payload, h->length) != h->length)
3101                 return FALSE;
3102
3103         p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
3104
3105         for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
3106                 p_uuid[i] = be64_to_cpu(p->uuid[i]);
3107
3108         kfree(mdev->p_uuid);
3109         mdev->p_uuid = p_uuid;
3110
3111         if (mdev->state.conn < C_CONNECTED &&
3112             mdev->state.disk < D_INCONSISTENT &&
3113             mdev->state.role == R_PRIMARY &&
3114             (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
3115                 dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
3116                     (unsigned long long)mdev->ed_uuid);
3117                 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3118                 return FALSE;
3119         }
3120
3121         if (get_ldev(mdev)) {
3122                 int skip_initial_sync =
3123                         mdev->state.conn == C_CONNECTED &&
3124                         mdev->agreed_pro_version >= 90 &&
3125                         mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
3126                         (p_uuid[UI_FLAGS] & 8);
3127                 if (skip_initial_sync) {
3128                         dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n");
3129                         drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
3130                                         "clear_n_write from receive_uuids");
3131                         _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]);
3132                         _drbd_uuid_set(mdev, UI_BITMAP, 0);
3133                         _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
3134                                         CS_VERBOSE, NULL);
3135                         drbd_md_sync(mdev);
3136                 }
3137                 put_ldev(mdev);
3138         } else if (mdev->state.disk < D_INCONSISTENT &&
3139                    mdev->state.role == R_PRIMARY) {
3140                 /* I am a diskless primary, the peer just created a new current UUID
3141                    for me. */
3142                 drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
3143         }
3144
3145         /* Before we test for the disk state, we should wait until an eventually
3146            ongoing cluster wide state change is finished. That is important if
3147            we are primary and are detaching from our disk. We need to see the
3148            new disk state... */
3149         wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags));
3150         if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
3151                 drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
3152
3153         return TRUE;
3154 }
3155
3156 /**
3157  * convert_state() - Converts the peer's view of the cluster state to our point of view
3158  * @ps:         The state as seen by the peer.
3159  */
3160 static union drbd_state convert_state(union drbd_state ps)
3161 {
3162         union drbd_state ms;
3163
3164         static enum drbd_conns c_tab[] = {
3165                 [C_CONNECTED] = C_CONNECTED,
3166
3167                 [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
3168                 [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
3169                 [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
3170                 [C_VERIFY_S]       = C_VERIFY_T,
3171                 [C_MASK]   = C_MASK,
3172         };
3173
3174         ms.i = ps.i;
3175
3176         ms.conn = c_tab[ps.conn];
3177         ms.peer = ps.role;
3178         ms.role = ps.peer;
3179         ms.pdsk = ps.disk;
3180         ms.disk = ps.pdsk;
3181         ms.peer_isp = (ps.aftr_isp | ps.user_isp);
3182
3183         return ms;
3184 }
3185
3186 static int receive_req_state(struct drbd_conf *mdev, struct p_header *h)
3187 {
3188         struct p_req_state *p = (struct p_req_state *)h;
3189         union drbd_state mask, val;
3190         int rv;
3191
3192         ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
3193         if (drbd_recv(mdev, h->payload, h->length) != h->length)
3194                 return FALSE;
3195
3196         mask.i = be32_to_cpu(p->mask);
3197         val.i = be32_to_cpu(p->val);
3198
3199         if (test_bit(DISCARD_CONCURRENT, &mdev->flags) &&
3200             test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) {
3201                 drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
3202                 return TRUE;
3203         }
3204
3205         mask = convert_state(mask);
3206         val = convert_state(val);
3207
3208         rv = drbd_change_state(mdev, CS_VERBOSE, mask, val);
3209
3210         drbd_send_sr_reply(mdev, rv);
3211         drbd_md_sync(mdev);
3212
3213         return TRUE;
3214 }
3215
/*
 * receive_state() - handle a P_STATE packet: fold the state reported by the
 * peer into our own state.
 *
 * Receives the payload and decodes the peer's state word.  If a peer UUID
 * set is known and both sides have a disk state of at least D_NEGOTIATING,
 * drbd_sync_handshake() may be run to choose the new connection state.
 * The resulting state is applied with _drbd_set_state() under req_lock.
 *
 * Returns TRUE on success.  Returns FALSE if the packet is malformed or
 * cannot be received, if the handshake result makes us give up the
 * connection, or if the state change is rejected.
 */
3216 static int receive_state(struct drbd_conf *mdev, struct p_header *h)
3217 {
3218         struct p_state *p = (struct p_state *)h;
3219         enum drbd_conns nconn, oconn;
3220         union drbd_state ns, peer_state;
3221         enum drbd_disk_state real_peer_disk;
3222         enum chg_state_flags cs_flags;
3223         int rv;
3224
3225         ERR_IF(h->length != (sizeof(*p)-sizeof(*h)))
3226                 return FALSE;
3227
3228         if (drbd_recv(mdev, h->payload, h->length) != h->length)
3229                 return FALSE;
3230
3231         peer_state.i = be32_to_cpu(p->state);
3232
             /* A peer reporting D_NEGOTIATING is mapped to D_INCONSISTENT or
              * D_CONSISTENT, depending on flag value 4 in the UUID flags it
              * sent earlier.
              * NOTE(review): mdev->p_uuid is dereferenced here without the
              * NULL check that guards the handshake below - confirm that
              * P_UUIDS is always received before such a P_STATE. */
3233         real_peer_disk = peer_state.disk;
3234         if (peer_state.disk == D_NEGOTIATING) {
3235                 real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
3236                 dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
3237         }
3238
             /* req_lock is held whenever control reaches "retry": on first
              * entry via the spin_lock_irq() just below, and on a restart via
              * the spin_lock_irq() that precedes the "goto retry".  The lock
              * is dropped again right after sampling the connection state. */
3239         spin_lock_irq(&mdev->req_lock);
3240  retry:
3241         oconn = nconn = mdev->state.conn;
3242         spin_unlock_irq(&mdev->req_lock);
3243
3244         if (nconn == C_WF_REPORT_PARAMS)
3245                 nconn = C_CONNECTED;
3246
3247         if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
3248             get_ldev_if_state(mdev, D_NEGOTIATING)) {
3249                 int cr; /* consider resync */
3250
3251                 /* if we established a new connection */
3252                 cr  = (oconn < C_CONNECTED);
3253                 /* if we had an established connection
3254                  * and one of the nodes newly attaches a disk */
3255                 cr |= (oconn == C_CONNECTED &&
3256                        (peer_state.disk == D_NEGOTIATING ||
3257                         mdev->state.disk == D_NEGOTIATING));
3258                 /* if we have both been inconsistent, and the peer has been
3259                  * forced to be UpToDate with --overwrite-data */
3260                 cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
3261                 /* if we had been plain connected, and the admin requested to
3262                  * start a sync by "invalidate" or "invalidate-remote" */
3263                 cr |= (oconn == C_CONNECTED &&
3264                                 (peer_state.conn >= C_STARTING_SYNC_S &&
3265                                  peer_state.conn <= C_WF_BITMAP_T));
3266
3267                 if (cr)
3268                         nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
3269
3270                 put_ldev(mdev);
                     /* C_MASK is handled here as "no usable result": depending
                      * on who was negotiating we either go diskless, treat the
                      * peer as diskless, or give up the connection. */
3271                 if (nconn == C_MASK) {
3272                         nconn = C_CONNECTED;
3273                         if (mdev->state.disk == D_NEGOTIATING) {
3274                                 drbd_force_state(mdev, NS(disk, D_DISKLESS));
3275                         } else if (peer_state.disk == D_NEGOTIATING) {
3276                                 dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
3277                                 peer_state.disk = D_DISKLESS;
3278                                 real_peer_disk = D_DISKLESS;
3279                         } else {
3280                                 if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags))
3281                                         return FALSE;
3282                                 D_ASSERT(oconn == C_WF_REPORT_PARAMS);
3283                                 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3284                                 return FALSE;
3285                         }
3286                 }
3287         }
3288
             /* Re-take the lock; if the connection state changed while we
              * were running unlocked, start over with the new value. */
3289         spin_lock_irq(&mdev->req_lock);
3290         if (mdev->state.conn != oconn)
3291                 goto retry;
3292         clear_bit(CONSIDER_RESYNC, &mdev->flags);
3293         ns.i = mdev->state.i;
3294         ns.conn = nconn;
3295         ns.peer = peer_state.role;
3296         ns.pdsk = real_peer_disk;
3297         ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
3298         if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
3299                 ns.disk = mdev->new_state_tmp.disk;
             /* CS_HARD is added unless this very transition is the one that
              * takes us from below C_CONNECTED to C_CONNECTED or above. */
3300         cs_flags = CS_VERBOSE + (oconn < C_CONNECTED && nconn >= C_CONNECTED ? 0 : CS_HARD);
3301         if (ns.pdsk == D_CONSISTENT && ns.susp && nconn == C_CONNECTED && oconn < C_CONNECTED &&
3302             test_bit(NEW_CUR_UUID, &mdev->flags)) {
3303                 /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this
3304                    for temporal network outages! */
3305                 spin_unlock_irq(&mdev->req_lock);
3306                 dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
3307                 tl_clear(mdev);
3308                 drbd_uuid_new_current(mdev);
3309                 clear_bit(NEW_CUR_UUID, &mdev->flags);
3310                 drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0));
3311                 return FALSE;
3312         }
3313         rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
3314         ns = mdev->state;
3315         spin_unlock_irq(&mdev->req_lock);
3316
3317         if (rv < SS_SUCCESS) {
3318                 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3319                 return FALSE;
3320         }
3321
3322         if (oconn > C_WF_REPORT_PARAMS) {
3323                 if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
3324                     peer_state.disk != D_NEGOTIATING ) {
3325                         /* we want resync, peer has not yet decided to sync... */
3326                         /* Nowadays only used when forcing a node into primary role and
3327                            setting its disk to UpToDate with that */
3328                         drbd_send_uuids(mdev);
3329                         drbd_send_state(mdev);
3330                 }
3331         }
3332
3333         mdev->net_conf->want_lose = 0;
3334
3335         drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
3336
3337         return TRUE;
3338 }
3339
3340 static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
3341 {
3342         struct p_rs_uuid *p = (struct p_rs_uuid *)h;
3343
3344         wait_event(mdev->misc_wait,
3345                    mdev->state.conn == C_WF_SYNC_UUID ||
3346                    mdev->state.conn < C_CONNECTED ||
3347                    mdev->state.disk < D_NEGOTIATING);
3348
3349         /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
3350
3351         ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
3352         if (drbd_recv(mdev, h->payload, h->length) != h->length)
3353                 return FALSE;
3354
3355         /* Here the _drbd_uuid_ functions are right, current should
3356            _not_ be rotated into the history */
3357         if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
3358                 _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid));
3359                 _drbd_uuid_set(mdev, UI_BITMAP, 0UL);
3360
3361                 drbd_start_resync(mdev, C_SYNC_TARGET);
3362
3363                 put_ldev(mdev);
3364         } else
3365                 dev_err(DEV, "Ignoring SyncUUID packet!\n");
3366
3367         return TRUE;
3368 }
3369
/* Result of processing a single bitmap packet. */
enum receive_bitmap_ret {
	OK,	/* packet processed, the receive loop continues */
	DONE,	/* the receive loop ends without error */
	FAILED,	/* receive or decode error */
};
3371
3372 static enum receive_bitmap_ret
3373 receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h,
3374         unsigned long *buffer, struct bm_xfer_ctx *c)
3375 {
3376         unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
3377         unsigned want = num_words * sizeof(long);
3378
3379         if (want != h->length) {
3380                 dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length);
3381                 return FAILED;
3382         }
3383         if (want == 0)
3384                 return DONE;
3385         if (drbd_recv(mdev, buffer, want) != want)
3386                 return FAILED;
3387
3388         drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer);
3389
3390         c->word_offset += num_words;
3391         c->bit_offset = c->word_offset * BITS_PER_LONG;
3392         if (c->bit_offset > c->bm_bits)
3393                 c->bit_offset = c->bm_bits;
3394
3395         return OK;
3396 }
3397
3398 static enum receive_bitmap_ret
3399 recv_bm_rle_bits(struct drbd_conf *mdev,
3400                 struct p_compressed_bm *p,
3401                 struct bm_xfer_ctx *c)
3402 {
3403         struct bitstream bs;
3404         u64 look_ahead;
3405         u64 rl;
3406         u64 tmp;
3407         unsigned long s = c->bit_offset;
3408         unsigned long e;
3409         int len = p->head.length - (sizeof(*p) - sizeof(p->head));
3410         int toggle = DCBP_get_start(p);
3411         int have;
3412         int bits;
3413
3414         bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p));
3415
3416         bits = bitstream_get_bits(&bs, &look_ahead, 64);
3417         if (bits < 0)
3418                 return FAILED;
3419
3420         for (have = bits; have > 0; s += rl, toggle = !toggle) {
3421                 bits = vli_decode_bits(&rl, look_ahead);
3422                 if (bits <= 0)
3423                         return FAILED;
3424
3425                 if (toggle) {
3426                         e = s + rl -1;
3427                         if (e >= c->bm_bits) {
3428                                 dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
3429                                 return FAILED;
3430                         }
3431                         _drbd_bm_set_bits(mdev, s, e);
3432                 }
3433
3434                 if (have < bits) {
3435                         dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
3436                                 have, bits, look_ahead,
3437                                 (unsigned int)(bs.cur.b - p->code),
3438                                 (unsigned int)bs.buf_len);
3439                         return FAILED;
3440                 }
3441                 look_ahead >>= bits;
3442                 have -= bits;
3443
3444                 bits = bitstream_get_bits(&bs, &tmp, 64 - have);
3445                 if (bits < 0)
3446                         return FAILED;
3447                 look_ahead |= tmp << have;
3448                 have += bits;
3449         }
3450
3451         c->bit_offset = s;
3452         bm_xfer_ctx_bit_to_word_offset(c);
3453
3454         return (s == c->bm_bits) ? DONE : OK;
3455 }
3456
3457 static enum receive_bitmap_ret
3458 decode_bitmap_c(struct drbd_conf *mdev,
3459                 struct p_compressed_bm *p,
3460                 struct bm_xfer_ctx *c)
3461 {
3462         if (DCBP_get_code(p) == RLE_VLI_Bits)
3463                 return recv_bm_rle_bits(mdev, p, c);
3464
3465         /* other variants had been implemented for evaluation,
3466          * but have been dropped as this one turned out to be "best"
3467          * during all our tests. */
3468
3469         dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
3470         drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3471         return FAILED;
3472 }
3473
3474 void INFO_bm_xfer_stats(struct drbd_conf *mdev,
3475                 const char *direction, struct bm_xfer_ctx *c)
3476 {
3477         /* what would it take to transfer it "plaintext" */
3478         unsigned plain = sizeof(struct p_header) *
3479                 ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
3480                 + c->bm_words * sizeof(long);
3481         unsigned total = c->bytes[0] + c->bytes[1];
3482         unsigned r;
3483
3484         /* total can not be zero. but just in case: */
3485         if (total == 0)
3486                 return;
3487
3488         /* don't report if not compressed */
3489         if (total >= plain)
3490                 return;
3491
3492         /* total < plain. check for overflow, still */
3493         r = (total > UINT_MAX/1000) ? (total / (plain/1000))
3494                                     : (1000 * total / plain);
3495
3496         if (r > 1000)
3497                 r = 1000;
3498
3499         r = 1000 - r;
3500         dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
3501              "total %u; compression: %u.%u%%\n",
3502                         direction,
3503                         c->bytes[1], c->packets[1],
3504                         c->bytes[0], c->packets[0],
3505                         total, r/10, r % 10);
3506 }
3507
3508 /* Since we are processing the bitfield from lower addresses to higher,
3509    it does not matter if the process it in 32 bit chunks or 64 bit
3510    chunks as long as it is little endian. (Understand it as byte stream,
3511    beginning with the lowest byte...) If we would use big endian
3512    we would need to process it from the highest address to the lowest,
3513    in order to be agnostic to the 32 vs 64 bits issue.
3514
3515    returns 0 on failure, 1 if we successfully received it. */
3516 static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
3517 {
3518         struct bm_xfer_ctx c;
3519         void *buffer;
3520         enum receive_bitmap_ret ret;
3521         int ok = FALSE;
3522
3523         wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
3524
3525         drbd_bm_lock(mdev, "receive bitmap");
3526
3527         /* maybe we should use some per thread scratch page,
3528          * and allocate that during initial device creation? */
3529         buffer   = (unsigned long *) __get_free_page(GFP_NOIO);
3530         if (!buffer) {
3531                 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
3532                 goto out;
3533         }
3534
3535         c = (struct bm_xfer_ctx) {
3536                 .bm_bits = drbd_bm_bits(mdev),
3537                 .bm_words = drbd_bm_words(mdev),
3538         };
3539
3540         do {
3541                 if (h->command == P_BITMAP) {
3542                         ret = receive_bitmap_plain(mdev, h, buffer, &c);
3543                 } else if (h->command == P_COMPRESSED_BITMAP) {
3544                         /* MAYBE: sanity check that we speak proto >= 90,
3545                          * and the feature is enabled! */
3546                         struct p_compressed_bm *p;
3547
3548                         if (h->length > BM_PACKET_PAYLOAD_BYTES) {
3549                                 dev_err(DEV, "ReportCBitmap packet too large\n");
3550                                 goto out;
3551                         }
3552                         /* use the page buff */
3553                         p = buffer;
3554                         memcpy(p, h, sizeof(*h));
3555                         if (drbd_recv(mdev, p->head.payload, h->length) != h->length)
3556                                 goto out;
3557                         if (p->head.length <= (sizeof(*p) - sizeof(p->head))) {
3558                                 dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length);
3559                                 return FAILED;
3560                         }
3561                         ret = decode_bitmap_c(mdev, p, &c);
3562                 } else {
3563                         dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command);
3564                         goto out;
3565                 }
3566
3567                 c.packets[h->command == P_BITMAP]++;
3568                 c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length;
3569
3570                 if (ret != OK)
3571                         break;
3572
3573                 if (!drbd_recv_header(mdev, h))
3574                         goto out;
3575         } while (ret == OK);
3576         if (ret == FAILED)
3577                 goto out;
3578
3579         INFO_bm_xfer_stats(mdev, "receive", &c);
3580
3581         if (mdev->state.conn == C_WF_BITMAP_T) {
3582                 ok = !drbd_send_bitmap(mdev);
3583                 if (!ok)
3584                         goto out;
3585                 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
3586                 ok = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
3587                 D_ASSERT(ok == SS_SUCCESS);
3588         } else if (mdev->state.conn != C_WF_BITMAP_S) {
3589                 /* admin may have requested C_DISCONNECTING,
3590                  * other threads may have noticed network errors */
3591                 dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n",
3592                     drbd_conn_str(mdev->state.conn));
3593         }
3594
3595         ok = TRUE;
3596  out:
3597         drbd_bm_unlock(mdev);
3598         if (ok && mdev->state.conn == C_WF_BITMAP_S)
3599                 drbd_start_resync(mdev, C_SYNC_SOURCE);
3600         free_page((unsigned long) buffer);
3601         return ok;
3602 }
3603
3604 static int receive_skip_(struct drbd_conf *mdev, struct p_header *h, int silent)
3605 {
3606         /* TODO zero copy sink :) */
3607         static char sink[128];
3608         int size, want, r;
3609
3610         if (!silent)
3611                 dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
3612                      h->command, h->length);
3613
3614         size = h->length;
3615         while (size > 0) {
3616                 want = min_t(int, size, sizeof(sink));
3617                 r = drbd_recv(mdev, sink, want);
3618                 ERR_IF(r <= 0) break;
3619                 size -= r;
3620         }
3621         return size == 0;
3622 }
3623
/* Packet handler: discard the payload and log a warning about the packet. */
static int receive_skip(struct drbd_conf *mdev, struct p_header *h)
{
	const int silent = 0;

	return receive_skip_(mdev, h, silent);
}
3628
/* Packet handler: discard the payload without logging anything. */
static int receive_skip_silent(struct drbd_conf *mdev, struct p_header *h)
{
	const int silent = 1;

	return receive_skip_(mdev, h, silent);
}
3633
3634 static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
3635 {
3636         if (mdev->state.disk >= D_INCONSISTENT)
3637                 drbd_kick_lo(mdev);
3638
3639         /* Make sure we've acked all the TCP data associated
3640          * with the data requests being unplugged */
3641         drbd_tcp_quickack(mdev->data.socket);
3642
3643         return TRUE;
3644 }
3645
3646 typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *);
3647
3648 static drbd_cmd_handler_f drbd_default_handler[] = {
3649         [P_DATA]            = receive_Data,
3650         [P_DATA_REPLY]      = receive_DataReply,
3651         [P_RS_DATA_REPLY]   = receive_RSDataReply,
3652         [P_BARRIER]         = receive_Barrier,
3653         [P_BITMAP]          = receive_bitmap,
3654         [P_COMPRESSED_BITMAP]    = receive_bitmap,
3655         [P_UNPLUG_REMOTE]   = receive_UnplugRemote,
3656         [P_DATA_REQUEST]    = receive_DataRequest,
3657         [P_RS_DATA_REQUEST] = receive_DataRequest,
3658         [P_SYNC_PARAM]      = receive_SyncParam,
3659         [P_SYNC_PARAM89]           = receive_SyncParam,
3660         [P_PROTOCOL]        = receive_protocol,
3661         [P_UUIDS]           = receive_uuids,
3662         [P_SIZES]           = receive_sizes,
3663         [P_STATE]           = receive_state,
3664         [P_STATE_CHG_REQ]   = receive_req_state,
3665         [P_SYNC_UUID]       = receive_sync_uuid,
3666         [P_OV_REQUEST]      = receive_DataRequest,
3667         [P_OV_REPLY]        = receive_DataRequest,
3668         [P_CSUM_RS_REQUEST]    = receive_DataRequest,
3669         [P_DELAY_PROBE]     = receive_skip_silent,
3670         /* anything missing from this table is in
3671          * the asender_tbl, see get_asender_cmd */
3672         [P_MAX_CMD]         = NULL,
3673 };
3674
3675 static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler;
3676 static drbd_cmd_handler_f *drbd_opt_cmd_handler;
3677
3678 static void drbdd(struct drbd_conf *mdev)
3679 {
3680         drbd_cmd_handler_f handler;
3681         struct p_header *header = &mdev->data.rbuf.header;
3682
3683         while (get_t_state(&mdev->receiver) == Running) {
3684                 drbd_thread_current_set_cpu(mdev);
3685                 if (!drbd_recv_header(mdev, header)) {
3686                         drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3687                         break;
3688                 }
3689
3690                 if (header->command < P_MAX_CMD)
3691                         handler = drbd_cmd_handler[header->command];
3692                 else if (P_MAY_IGNORE < header->command
3693                      && header->command < P_MAX_OPT_CMD)
3694                         handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE];
3695                 else if (header->command > P_MAX_OPT_CMD)
3696                         handler = receive_skip;
3697                 else
3698                         handler = NULL;
3699
3700                 if (unlikely(!handler)) {
3701                         dev_err(DEV, "unknown packet type %d, l: %d!\n",
3702                             header->command, header->length);
3703                         drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3704                         break;
3705                 }
3706                 if (unlikely(!handler(mdev, header))) {
3707                         dev_err(DEV, "error receiving %s, l: %d!\n",
3708                             cmdname(header->command), header->length);
3709                         drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
3710                         break;
3711                 }
3712         }
3713 }
3714
3715 void drbd_flush_workqueue(struct drbd_conf *mdev)
3716 {
3717         struct drbd_wq_barrier barr;
3718
3719         barr.w.cb = w_prev_work_done;
3720         init_completion(&barr.done);
3721         drbd_queue_work(&mdev->data.work, &barr.w);
3722         wait_for_completion(&barr.done);
3723 }
3724
3725 void drbd_free_tl_hash(struct drbd_conf *mdev)
3726 {
3727         struct hlist_head *h;
3728
3729         spin_lock_irq(&mdev->req_lock);
3730
3731         if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) {
3732                 spin_unlock_irq(&mdev->req_lock);
3733                 return;
3734         }
3735         /* paranoia code */
3736         for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
3737                 if (h->first)
3738                         dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
3739                                 (int)(h - mdev->ee_hash), h->first);
3740         kfree(mdev->ee_hash);
3741         mdev->ee_hash = NULL;
3742         mdev->ee_hash_s = 0;
3743
3744         /* paranoia code */
3745         for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
3746                 if (h->first)
3747                         dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
3748                                 (int)(h - mdev->tl_hash), h->first);
3749         kfree(mdev->tl_hash);
3750         mdev->tl_hash = NULL;
3751         mdev->tl_hash_s = 0;
3752         spin_unlock_irq(&mdev->req_lock);
3753 }
3754
3755 static void drbd_disconnect(struct drbd_conf *mdev)
3756 {
3757         enum drbd_fencing_p fp;
3758         union drbd_state os, ns;
3759         int rv = SS_UNKNOWN_ERROR;
3760         unsigned int i;
3761
3762         if (mdev->state.conn == C_STANDALONE)
3763                 return;
3764         if (mdev->state.conn >= C_WF_CONNECTION)
3765                 dev_err(DEV, "ASSERT FAILED cstate = %s, expected < WFConnection\n",
3766                                 drbd_conn_str(mdev->state.conn));
3767
3768         /* asender does not clean up anything. it must not interfere, either */
3769         drbd_thread_stop(&mdev->asender);
3770         drbd_free_sock(mdev);
3771
3772         spin_lock_irq(&mdev->req_lock);
3773         _drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
3774         _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
3775         _drbd_wait_ee_list_empty(mdev, &mdev->read_ee);
3776         spin_unlock_irq(&mdev->req_lock);
3777
3778         /* We do not have data structures that would allow us to
3779          * get the rs_pending_cnt down to 0 again.
3780          *  * On C_SYNC_TARGET we do not have any data structures describing
3781          *    the pending RSDataRequest's we have sent.
3782          *  * On C_SYNC_SOURCE there is no data structure that tracks
3783          *    the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
3784          *  And no, it is not the sum of the reference counts in the
3785          *  resync_LRU. The resync_LRU tracks the whole operation including
3786          *  the disk-IO, while the rs_pending_cnt only tracks the blocks
3787          *  on the fly. */
3788         drbd_rs_cancel_all(mdev);
3789         mdev->rs_total = 0;
3790         mdev->rs_failed = 0;
3791         atomic_set(&mdev->rs_pending_cnt, 0);
3792         wake_up(&mdev->misc_wait);
3793
3794         /* make sure syncer is stopped and w_resume_next_sg queued */
3795         del_timer_sync(&mdev->resync_timer);
3796         set_bit(STOP_SYNC_TIMER, &mdev->flags);
3797         resync_timer_fn((unsigned long)mdev);
3798
3799         /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
3800          * w_make_resync_request etc. which may still be on the worker queue
3801          * to be "canceled" */
3802         drbd_flush_workqueue(mdev);
3803
3804         /* This also does reclaim_net_ee().  If we do this too early, we might
3805          * miss some resync ee and pages.*/
3806         drbd_process_done_ee(mdev);
3807
3808         kfree(mdev->p_uuid);
3809         mdev->p_uuid = NULL;
3810
3811         if (!mdev->state.susp)
3812                 tl_clear(mdev);
3813
3814         dev_info(DEV, "Connection closed\n");
3815
3816         drbd_md_sync(mdev);
3817
3818         fp = FP_DONT_CARE;
3819         if (get_ldev(mdev)) {
3820                 fp = mdev->ldev->dc.fencing;
3821                 put_ldev(mdev);
3822         }
3823
3824         if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN)
3825                 drbd_try_outdate_peer_async(mdev);
3826
3827         spin_lock_irq(&mdev->req_lock);
3828         os = mdev->state;
3829         if (os.conn >= C_UNCONNECTED) {
3830                 /* Do not restart in case we are C_DISCONNECTING */
3831                 ns = os;
3832                 ns.conn = C_UNCONNECTED;
3833                 rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
3834         }
3835         spin_unlock_irq(&mdev->req_lock);
3836
3837         if (os.conn == C_DISCONNECTING) {
3838                 wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
3839
3840                 if (!mdev->state.susp) {
3841                         /* we must not free the tl_hash
3842                          * while application io is still on the fly */
3843                         wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
3844                         drbd_free_tl_hash(mdev);
3845                 }
3846
3847                 crypto_free_hash(mdev->cram_hmac_tfm);
3848                 mdev->cram_hmac_tfm = NULL;
3849
3850                 kfree(mdev->net_conf);
3851                 mdev->net_conf = NULL;
3852                 drbd_request_state(mdev, NS(conn, C_STANDALONE));
3853         }
3854
3855         /* tcp_close and release of sendpage pages can be deferred.  I don't
3856          * want to use SO_LINGER, because apparently it can be deferred for
3857          * more than 20 seconds (longest time I checked).
3858          *
3859          * Actually we don't care for exactly when the network stack does its
3860          * put_page(), but release our reference on these pages right here.
3861          */
3862         i = drbd_release_ee(mdev, &mdev->net_ee);
3863         if (i)
3864                 dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
3865         i = atomic_read(&mdev->pp_in_use);
3866         if (i)
3867                 dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
3868
3869         D_ASSERT(list_empty(&mdev->read_ee));
3870         D_ASSERT(list_empty(&mdev->active_ee));
3871         D_ASSERT(list_empty(&mdev->sync_ee));
3872         D_ASSERT(list_empty(&mdev->done_ee));
3873
3874         /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
3875         atomic_set(&mdev->current_epoch->epoch_size, 0);
3876         D_ASSERT(list_empty(&mdev->current_epoch->list));
3877 }
3878
3879 /*
3880  * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
3881  * we can agree on is stored in agreed_pro_version.
3882  *
3883  * feature flags and the reserved array should be enough room for future
3884  * enhancements of the handshake protocol, and possible plugins...
3885  *
3886  * for now, they are expected to be zero, but ignored.
3887  */
3888 static int drbd_send_handshake(struct drbd_conf *mdev)
3889 {
3890         /* ASSERT current == mdev->receiver ... */
3891         struct p_handshake *p = &mdev->data.sbuf.handshake;
3892         int ok;
3893
3894         if (mutex_lock_interruptible(&mdev->data.mutex)) {
3895                 dev_err(DEV, "interrupted during initial handshake\n");
3896                 return 0; /* interrupted. not ok. */
3897         }
3898
3899         if (mdev->data.socket == NULL) {
3900                 mutex_unlock(&mdev->data.mutex);
3901                 return 0;
3902         }
3903
3904         memset(p, 0, sizeof(*p));
3905         p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
3906         p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
3907         ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
3908                              (struct p_header *)p, sizeof(*p), 0 );
3909         mutex_unlock(&mdev->data.mutex);
3910         return ok;
3911 }
3912
3913 /*
3914  * return values:
3915  *   1 yes, we have a valid connection
3916  *   0 oops, did not work out, please try again
3917  *  -1 peer talks different language,
3918  *     no point in trying again, please go standalone.
3919  */
3920 static int drbd_do_handshake(struct drbd_conf *mdev)
3921 {
3922         /* ASSERT current == mdev->receiver ... */
3923         struct p_handshake *p = &mdev->data.rbuf.handshake;
3924         const int expect = sizeof(struct p_handshake)
3925                           -sizeof(struct p_header);
3926         int rv;
3927
3928         rv = drbd_send_handshake(mdev);
3929         if (!rv)
3930                 return 0;
3931
3932         rv = drbd_recv_header(mdev, &p->head);
3933         if (!rv)
3934                 return 0;
3935
3936         if (p->head.command != P_HAND_SHAKE) {
3937                 dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
3938                      cmdname(p->head.command), p->head.command);
3939                 return -1;
3940         }
3941
3942         if (p->head.length != expect) {
3943                 dev_err(DEV, "expected HandShake length: %u, received: %u\n",
3944                      expect, p->head.length);
3945                 return -1;
3946         }
3947
3948         rv = drbd_recv(mdev, &p->head.payload, expect);
3949
3950         if (rv != expect) {
3951                 dev_err(DEV, "short read receiving handshake packet: l=%u\n", rv);
3952                 return 0;
3953         }
3954
3955         p->protocol_min = be32_to_cpu(p->protocol_min);
3956         p->protocol_max = be32_to_cpu(p->protocol_max);
3957         if (p->protocol_max == 0)
3958                 p->protocol_max = p->protocol_min;
3959
3960         if (PRO_VERSION_MAX < p->protocol_min ||
3961             PRO_VERSION_MIN > p->protocol_max)
3962                 goto incompat;
3963
3964         mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
3965
3966         dev_info(DEV, "Handshake successful: "
3967              "Agreed network protocol version %d\n", mdev->agreed_pro_version);
3968
3969         return 1;
3970
3971  incompat:
3972         dev_err(DEV, "incompatible DRBD dialects: "
3973             "I support %d-%d, peer supports %d-%d\n",
3974             PRO_VERSION_MIN, PRO_VERSION_MAX,
3975             p->protocol_min, p->protocol_max);
3976         return -1;
3977 }
3978
3979 #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
3980 static int drbd_do_auth(struct drbd_conf *mdev)
3981 {
3982         dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
3983         dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
3984         return -1;
3985 }
3986 #else
3987 #define CHALLENGE_LEN 64
3988
3989 /* Return value:
3990         1 - auth succeeded,
3991         0 - failed, try again (network error),
3992         -1 - auth failed, don't try again.
3993 */
3994
3995 static int drbd_do_auth(struct drbd_conf *mdev)
3996 {
3997         char my_challenge[CHALLENGE_LEN];  /* 64 Bytes... */
3998         struct scatterlist sg;
3999         char *response = NULL;
4000         char *right_response = NULL;
4001         char *peers_ch = NULL;
4002         struct p_header p;
4003         unsigned int key_len = strlen(mdev->net_conf->shared_secret);
4004         unsigned int resp_size;
4005         struct hash_desc desc;
4006         int rv;
4007
4008         desc.tfm = mdev->cram_hmac_tfm;
4009         desc.flags = 0;
4010
4011         rv = crypto_hash_setkey(mdev->cram_hmac_tfm,
4012                                 (u8 *)mdev->net_conf->shared_secret, key_len);
4013         if (rv) {
4014                 dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv);
4015                 rv = -1;
4016                 goto fail;
4017         }
4018
4019         get_random_bytes(my_challenge, CHALLENGE_LEN);
4020
4021         rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN);
4022         if (!rv)
4023                 goto fail;
4024
4025         rv = drbd_recv_header(mdev, &p);
4026         if (!rv)
4027                 goto fail;
4028
4029         if (p.command != P_AUTH_CHALLENGE) {
4030                 dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
4031                     cmdname(p.command), p.command);
4032                 rv = 0;
4033                 goto fail;
4034         }
4035
4036         if (p.length > CHALLENGE_LEN*2) {
4037                 dev_err(DEV, "expected AuthChallenge payload too big.\n");
4038                 rv = -1;
4039                 goto fail;
4040         }
4041
4042         peers_ch = kmalloc(p.length, GFP_NOIO);
4043         if (peers_ch == NULL) {
4044                 dev_err(DEV, "kmalloc of peers_ch failed\n");
4045                 rv = -1;
4046                 goto fail;
4047         }
4048
4049         rv = drbd_recv(mdev, peers_ch, p.length);
4050
4051         if (rv != p.length) {
4052                 dev_err(DEV, "short read AuthChallenge: l=%u\n", rv);
4053                 rv = 0;
4054                 goto fail;
4055         }
4056
4057         resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm);
4058         response = kmalloc(resp_size, GFP_NOIO);
4059         if (response == NULL) {
4060                 dev_err(DEV, "kmalloc of response failed\n");
4061                 rv = -1;
4062                 goto fail;
4063         }
4064
4065         sg_init_table(&sg, 1);
4066         sg_set_buf(&sg, peers_ch, p.length);
4067
4068         rv = crypto_hash_digest(&desc, &sg, sg.length, response);
4069         if (rv) {
4070                 dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
4071                 rv = -1;
4072                 goto fail;
4073         }
4074
4075         rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size);
4076         if (!rv)
4077                 goto fail;
4078
4079         rv = drbd_recv_header(mdev, &p);
4080         if (!rv)
4081                 goto fail;
4082
4083         if (p.command != P_AUTH_RESPONSE) {
4084                 dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
4085                     cmdname(p.command), p.command);
4086                 rv = 0;
4087                 goto fail;
4088         }
4089
4090         if (p.length != resp_size) {
4091                 dev_err(DEV, "expected AuthResponse payload of wrong size\n");
4092                 rv = 0;
4093                 goto fail;
4094         }
4095
4096         rv = drbd_recv(mdev, response , resp_size);
4097
4098         if (rv != resp_size) {
4099                 dev_err(DEV, "short read receiving AuthResponse: l=%u\n", rv);
4100                 rv = 0;
4101                 goto fail;
4102         }
4103
4104         right_response = kmalloc(resp_size, GFP_NOIO);
4105         if (right_response == NULL) {
4106                 dev_err(DEV, "kmalloc of right_response failed\n");
4107                 rv = -1;
4108                 goto fail;
4109         }
4110
4111         sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);
4112
4113         rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
4114         if (rv) {
4115                 dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
4116                 rv = -1;
4117                 goto fail;
4118         }
4119
4120         rv = !memcmp(response, right_response, resp_size);
4121
4122         if (rv)
4123                 dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n",
4124                      resp_size, mdev->net_conf->cram_hmac_alg);
4125         else
4126                 rv = -1;
4127
4128  fail:
4129         kfree(peers_ch);
4130         kfree(response);
4131         kfree(right_response);
4132
4133         return rv;
4134 }
4135 #endif
4136
4137 int drbdd_init(struct drbd_thread *thi)
4138 {
4139         struct drbd_conf *mdev = thi->mdev;
4140         unsigned int minor = mdev_to_minor(mdev);
4141         int h;
4142
4143         sprintf(current->comm, "drbd%d_receiver", minor);
4144
4145         dev_info(DEV, "receiver (re)started\n");
4146
4147         do {
4148                 h = drbd_connect(mdev);
4149                 if (h == 0) {
4150                         drbd_disconnect(mdev);
4151                         __set_current_state(TASK_INTERRUPTIBLE);
4152                         schedule_timeout(HZ);
4153                 }
4154                 if (h == -1) {
4155                         dev_warn(DEV, "Discarding network configuration.\n");
4156                         drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
4157                 }
4158         } while (h == 0);
4159
4160         if (h > 0) {
4161                 if (get_net_conf(mdev)) {
4162                         drbdd(mdev);
4163                         put_net_conf(mdev);
4164                 }
4165         }
4166
4167         drbd_disconnect(mdev);
4168
4169         dev_info(DEV, "receiver terminated\n");
4170         return 0;
4171 }
4172
4173 /* ********* acknowledge sender ******** */
4174
4175 static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h)
4176 {
4177         struct p_req_state_reply *p = (struct p_req_state_reply *)h;
4178
4179         int retcode = be32_to_cpu(p->retcode);
4180
4181         if (retcode >= SS_SUCCESS) {
4182                 set_bit(CL_ST_CHG_SUCCESS, &mdev->flags);
4183         } else {
4184                 set_bit(CL_ST_CHG_FAIL, &mdev->flags);
4185                 dev_err(DEV, "Requested state change failed by peer: %s (%d)\n",
4186                     drbd_set_st_err_str(retcode), retcode);
4187         }
4188         wake_up(&mdev->state_wait);
4189
4190         return TRUE;
4191 }
4192
/* Handle P_PING: answer with a ping ack; the send result is our result. */
static int got_Ping(struct drbd_conf *mdev, struct p_header *h)
{
	int sent = drbd_send_ping_ack(mdev);

	return sent;
}
4198
4199 static int got_PingAck(struct drbd_conf *mdev, struct p_header *h)
4200 {
4201         /* restore idle timeout */
4202         mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
4203         if (!test_and_set_bit(GOT_PING_ACK, &mdev->flags))
4204                 wake_up(&mdev->misc_wait);
4205
4206         return TRUE;
4207 }
4208
4209 static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h)
4210 {
4211         struct p_block_ack *p = (struct p_block_ack *)h;
4212         sector_t sector = be64_to_cpu(p->sector);
4213         int blksize = be32_to_cpu(p->blksize);
4214
4215         D_ASSERT(mdev->agreed_pro_version >= 89);
4216
4217         update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4218
4219         drbd_rs_complete_io(mdev, sector);
4220         drbd_set_in_sync(mdev, sector, blksize);
4221         /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
4222         mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
4223         dec_rs_pending(mdev);
4224         atomic_add(blksize >> 9, &mdev->rs_sect_in);
4225
4226         return TRUE;
4227 }
4228
4229 /* when we receive the ACK for a write request,
4230  * verify that we actually know about it */
4231 static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev,
4232         u64 id, sector_t sector)
4233 {
4234         struct hlist_head *slot = tl_hash_slot(mdev, sector);
4235         struct hlist_node *n;
4236         struct drbd_request *req;
4237
4238         hlist_for_each_entry(req, n, slot, colision) {
4239                 if ((unsigned long)req == (unsigned long)id) {
4240                         if (req->sector != sector) {
4241                                 dev_err(DEV, "_ack_id_to_req: found req %p but it has "
4242                                     "wrong sector (%llus versus %llus)\n", req,
4243                                     (unsigned long long)req->sector,
4244                                     (unsigned long long)sector);
4245                                 break;
4246                         }
4247                         return req;
4248                 }
4249         }
4250         dev_err(DEV, "_ack_id_to_req: failed to find req %p, sector %llus in list\n",
4251                 (void *)(unsigned long)id, (unsigned long long)sector);
4252         return NULL;
4253 }
4254
4255 typedef struct drbd_request *(req_validator_fn)
4256         (struct drbd_conf *mdev, u64 id, sector_t sector);
4257
4258 static int validate_req_change_req_state(struct drbd_conf *mdev,
4259         u64 id, sector_t sector, req_validator_fn validator,
4260         const char *func, enum drbd_req_event what)
4261 {
4262         struct drbd_request *req;
4263         struct bio_and_error m;
4264
4265         spin_lock_irq(&mdev->req_lock);
4266         req = validator(mdev, id, sector);
4267         if (unlikely(!req)) {
4268                 spin_unlock_irq(&mdev->req_lock);
4269                 dev_err(DEV, "%s: got a corrupt block_id/sector pair\n", func);
4270                 return FALSE;
4271         }
4272         __req_mod(req, what, &m);
4273         spin_unlock_irq(&mdev->req_lock);
4274
4275         if (m.bio)
4276                 complete_master_bio(mdev, &m);
4277         return TRUE;
4278 }
4279
4280 static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h)
4281 {
4282         struct p_block_ack *p = (struct p_block_ack *)h;
4283         sector_t sector = be64_to_cpu(p->sector);
4284         int blksize = be32_to_cpu(p->blksize);
4285         enum drbd_req_event what;
4286
4287         update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4288
4289         if (is_syncer_block_id(p->block_id)) {
4290                 drbd_set_in_sync(mdev, sector, blksize);
4291                 dec_rs_pending(mdev);
4292                 return TRUE;
4293         }
4294         switch (be16_to_cpu(h->command)) {
4295         case P_RS_WRITE_ACK:
4296                 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4297                 what = write_acked_by_peer_and_sis;
4298                 break;
4299         case P_WRITE_ACK:
4300                 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4301                 what = write_acked_by_peer;
4302                 break;
4303         case P_RECV_ACK:
4304                 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B);
4305                 what = recv_acked_by_peer;
4306                 break;
4307         case P_DISCARD_ACK:
4308                 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4309                 what = conflict_discarded_by_peer;
4310                 break;
4311         default:
4312                 D_ASSERT(0);
4313                 return FALSE;
4314         }
4315
4316         return validate_req_change_req_state(mdev, p->block_id, sector,
4317                 _ack_id_to_req, __func__ , what);
4318 }
4319
4320 static int got_NegAck(struct drbd_conf *mdev, struct p_header *h)
4321 {
4322         struct p_block_ack *p = (struct p_block_ack *)h;
4323         sector_t sector = be64_to_cpu(p->sector);
4324
4325         if (__ratelimit(&drbd_ratelimit_state))
4326                 dev_warn(DEV, "Got NegAck packet. Peer is in troubles?\n");
4327
4328         update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4329
4330         if (is_syncer_block_id(p->block_id)) {
4331                 int size = be32_to_cpu(p->blksize);
4332                 dec_rs_pending(mdev);
4333                 drbd_rs_failed_io(mdev, sector, size);
4334                 return TRUE;
4335         }
4336         return validate_req_change_req_state(mdev, p->block_id, sector,
4337                 _ack_id_to_req, __func__ , neg_acked);
4338 }
4339
4340 static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h)
4341 {
4342         struct p_block_ack *p = (struct p_block_ack *)h;
4343         sector_t sector = be64_to_cpu(p->sector);
4344
4345         update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4346         dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n",
4347             (unsigned long long)sector, be32_to_cpu(p->blksize));
4348
4349         return validate_req_change_req_state(mdev, p->block_id, sector,
4350                 _ar_id_to_req, __func__ , neg_acked);
4351 }
4352
4353 static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
4354 {
4355         sector_t sector;
4356         int size;
4357         struct p_block_ack *p = (struct p_block_ack *)h;
4358
4359         sector = be64_to_cpu(p->sector);
4360         size = be32_to_cpu(p->blksize);
4361
4362         update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4363
4364         dec_rs_pending(mdev);
4365
4366         if (get_ldev_if_state(mdev, D_FAILED)) {
4367                 drbd_rs_complete_io(mdev, sector);
4368                 drbd_rs_failed_io(mdev, sector, size);
4369                 put_ldev(mdev);
4370         }
4371
4372         return TRUE;
4373 }
4374
4375 static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h)
4376 {
4377         struct p_barrier_ack *p = (struct p_barrier_ack *)h;
4378
4379         tl_release(mdev, p->barrier, be32_to_cpu(p->set_size));
4380
4381         return TRUE;
4382 }
4383
4384 static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
4385 {
4386         struct p_block_ack *p = (struct p_block_ack *)h;
4387         struct drbd_work *w;
4388         sector_t sector;
4389         int size;
4390
4391         sector = be64_to_cpu(p->sector);
4392         size = be32_to_cpu(p->blksize);
4393
4394         update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4395
4396         if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
4397                 drbd_ov_oos_found(mdev, sector, size);
4398         else
4399                 ov_oos_print(mdev);
4400
4401         drbd_rs_complete_io(mdev, sector);
4402         dec_rs_pending(mdev);
4403
4404         if (--mdev->ov_left == 0) {
4405                 w = kmalloc(sizeof(*w), GFP_NOIO);
4406                 if (w) {
4407                         w->cb = w_ov_finished;
4408                         drbd_queue_work_front(&mdev->data.work, w);
4409                 } else {
4410                         dev_err(DEV, "kmalloc(w) failed.");
4411                         ov_oos_print(mdev);
4412                         drbd_resync_finished(mdev);
4413                 }
4414         }
4415         return TRUE;
4416 }
4417
4418 static int got_something_to_ignore_m(struct drbd_conf *mdev, struct p_header *h)
4419 {
4420         /* IGNORE */
4421         return TRUE;
4422 }
4423
4424 struct asender_cmd {
4425         size_t pkt_size;
4426         int (*process)(struct drbd_conf *mdev, struct p_header *h);
4427 };
4428
4429 static struct asender_cmd *get_asender_cmd(int cmd)
4430 {
4431         static struct asender_cmd asender_tbl[] = {
4432                 /* anything missing from this table is in
4433                  * the drbd_cmd_handler (drbd_default_handler) table,
4434                  * see the beginning of drbdd() */
4435         [P_PING]            = { sizeof(struct p_header), got_Ping },
4436         [P_PING_ACK]        = { sizeof(struct p_header), got_PingAck },
4437         [P_RECV_ACK]        = { sizeof(struct p_block_ack), got_BlockAck },
4438         [P_WRITE_ACK]       = { sizeof(struct p_block_ack), got_BlockAck },
4439         [P_RS_WRITE_ACK]    = { sizeof(struct p_block_ack), got_BlockAck },
4440         [P_DISCARD_ACK]     = { sizeof(struct p_block_ack), got_BlockAck },
4441         [P_NEG_ACK]         = { sizeof(struct p_block_ack), got_NegAck },
4442         [P_NEG_DREPLY]      = { sizeof(struct p_block_ack), got_NegDReply },
4443         [P_NEG_RS_DREPLY]   = { sizeof(struct p_block_ack), got_NegRSDReply},
4444         [P_OV_RESULT]       = { sizeof(struct p_block_ack), got_OVResult },
4445         [P_BARRIER_ACK]     = { sizeof(struct p_barrier_ack), got_BarrierAck },
4446         [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
4447         [P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
4448         [P_DELAY_PROBE]     = { sizeof(struct p_delay_probe), got_something_to_ignore_m },
4449         [P_MAX_CMD]         = { 0, NULL },
4450         };
4451         if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
4452                 return NULL;
4453         return &asender_tbl[cmd];
4454 }
4455
4456 int drbd_asender(struct drbd_thread *thi)
4457 {
4458         struct drbd_conf *mdev = thi->mdev;
4459         struct p_header *h = &mdev->meta.rbuf.header;
4460         struct asender_cmd *cmd = NULL;
4461
4462         int rv, len;
4463         void *buf    = h;
4464         int received = 0;
4465         int expect   = sizeof(struct p_header);
4466         int empty;
4467
4468         sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
4469
4470         current->policy = SCHED_RR;  /* Make this a realtime task! */
4471         current->rt_priority = 2;    /* more important than all other tasks */
4472
4473         while (get_t_state(thi) == Running) {
4474                 drbd_thread_current_set_cpu(mdev);
4475                 if (test_and_clear_bit(SEND_PING, &mdev->flags)) {
4476                         ERR_IF(!drbd_send_ping(mdev)) goto reconnect;
4477                         mdev->meta.socket->sk->sk_rcvtimeo =
4478                                 mdev->net_conf->ping_timeo*HZ/10;
4479                 }
4480
4481                 /* conditionally cork;
4482                  * it may hurt latency if we cork without much to send */
4483                 if (!mdev->net_conf->no_cork &&
4484                         3 < atomic_read(&mdev->unacked_cnt))
4485                         drbd_tcp_cork(mdev->meta.socket);
4486                 while (1) {
4487                         clear_bit(SIGNAL_ASENDER, &mdev->flags);
4488                         flush_signals(current);
4489                         if (!drbd_process_done_ee(mdev)) {
4490                                 dev_err(DEV, "process_done_ee() = NOT_OK\n");
4491                                 goto reconnect;
4492                         }
4493                         /* to avoid race with newly queued ACKs */
4494                         set_bit(SIGNAL_ASENDER, &mdev->flags);
4495                         spin_lock_irq(&mdev->req_lock);
4496                         empty = list_empty(&mdev->done_ee);
4497                         spin_unlock_irq(&mdev->req_lock);
4498                         /* new ack may have been queued right here,
4499                          * but then there is also a signal pending,
4500                          * and we start over... */
4501                         if (empty)
4502                                 break;
4503                 }
4504                 /* but unconditionally uncork unless disabled */
4505                 if (!mdev->net_conf->no_cork)
4506                         drbd_tcp_uncork(mdev->meta.socket);
4507
4508                 /* short circuit, recv_msg would return EINTR anyways. */
4509                 if (signal_pending(current))
4510                         continue;
4511
4512                 rv = drbd_recv_short(mdev, mdev->meta.socket,
4513                                      buf, expect-received, 0);
4514                 clear_bit(SIGNAL_ASENDER, &mdev->flags);
4515
4516                 flush_signals(current);
4517
4518                 /* Note:
4519                  * -EINTR        (on meta) we got a signal
4520                  * -EAGAIN       (on meta) rcvtimeo expired
4521                  * -ECONNRESET   other side closed the connection
4522                  * -ERESTARTSYS  (on data) we got a signal
4523                  * rv <  0       other than above: unexpected error!
4524                  * rv == expected: full header or command
4525                  * rv <  expected: "woken" by signal during receive
4526                  * rv == 0       : "connection shut down by peer"
4527                  */
4528                 if (likely(rv > 0)) {
4529                         received += rv;
4530                         buf      += rv;
4531                 } else if (rv == 0) {
4532                         dev_err(DEV, "meta connection shut down by peer.\n");
4533                         goto reconnect;
4534                 } else if (rv == -EAGAIN) {
4535                         if (mdev->meta.socket->sk->sk_rcvtimeo ==
4536                             mdev->net_conf->ping_timeo*HZ/10) {
4537                                 dev_err(DEV, "PingAck did not arrive in time.\n");
4538                                 goto reconnect;
4539                         }
4540                         set_bit(SEND_PING, &mdev->flags);
4541                         continue;
4542                 } else if (rv == -EINTR) {
4543                         continue;
4544                 } else {
4545                         dev_err(DEV, "sock_recvmsg returned %d\n", rv);
4546                         goto reconnect;
4547                 }
4548
4549                 if (received == expect && cmd == NULL) {
4550                         if (unlikely(h->magic != BE_DRBD_MAGIC)) {
4551                                 dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n",
4552                                     (long)be32_to_cpu(h->magic),
4553                                     h->command, h->length);
4554                                 goto reconnect;
4555                         }
4556                         cmd = get_asender_cmd(be16_to_cpu(h->command));
4557                         len = be16_to_cpu(h->length);
4558                         if (unlikely(cmd == NULL)) {
4559                                 dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n",
4560                                     (long)be32_to_cpu(h->magic),
4561                                     h->command, h->length);
4562                                 goto disconnect;
4563                         }
4564                         expect = cmd->pkt_size;
4565                         ERR_IF(len != expect-sizeof(struct p_header))
4566                                 goto reconnect;
4567                 }
4568                 if (received == expect) {
4569                         D_ASSERT(cmd != NULL);
4570                         if (!cmd->process(mdev, h))
4571                                 goto reconnect;
4572
4573                         buf      = h;
4574                         received = 0;
4575                         expect   = sizeof(struct p_header);
4576                         cmd      = NULL;
4577                 }
4578         }
4579
4580         if (0) {
4581 reconnect:
4582                 drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
4583         }
4584         if (0) {
4585 disconnect:
4586                 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
4587         }
4588         clear_bit(SIGNAL_ASENDER, &mdev->flags);
4589
4590         D_ASSERT(mdev->state.conn < C_CONNECTED);
4591         dev_info(DEV, "asender terminated\n");
4592
4593         return 0;
4594 }