From 7e0a0e38fcfea47e74b0ff6da6266f00bcd2af43 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 1 May 2019 10:49:27 -0400 Subject: SUNRPC: Replace the queue timer with a delayed work function The queue timer function, which walks the RPC queue in order to locate candidates for waking up is one of the current constraints against removing the bh-safe queue spin locks. Replace it with a delayed work queue, so that we can do the actual rpc task wake ups from an ordinary process context. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index d0e451868f02..7d8db5dcac04 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -183,8 +183,9 @@ struct rpc_task_setup { #define RPC_NR_PRIORITY (1 + RPC_PRIORITY_PRIVILEGED - RPC_PRIORITY_LOW) struct rpc_timer { - struct timer_list timer; struct list_head list; + unsigned long expires; + struct delayed_work dwork; }; /* -- cgit v1.2.3 From 4f8943f8088348ec01456b075d44ad19dce3d698 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 1 May 2019 16:28:29 -0400 Subject: SUNRPC: Replace direct task wakeups from softirq context Replace the direct task wakeups from inside a softirq context with wakeups from a process context. 
Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprtsock.h | 5 +++ net/sunrpc/xprtsock.c | 78 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 77 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h index b81d0b3e0799..7638dbe7bc50 100644 --- a/include/linux/sunrpc/xprtsock.h +++ b/include/linux/sunrpc/xprtsock.h @@ -56,6 +56,7 @@ struct sock_xprt { */ unsigned long sock_state; struct delayed_work connect_worker; + struct work_struct error_worker; struct work_struct recv_worker; struct mutex recv_mutex; struct sockaddr_storage srcaddr; @@ -84,6 +85,10 @@ struct sock_xprt { #define XPRT_SOCK_CONNECTING 1U #define XPRT_SOCK_DATA_READY (2) #define XPRT_SOCK_UPD_TIMEOUT (3) +#define XPRT_SOCK_WAKE_ERROR (4) +#define XPRT_SOCK_WAKE_WRITE (5) +#define XPRT_SOCK_WAKE_PENDING (6) +#define XPRT_SOCK_WAKE_DISCONNECT (7) #endif /* __KERNEL__ */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 36652352a38c..92af57019b96 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1211,6 +1211,15 @@ static void xs_sock_reset_state_flags(struct rpc_xprt *xprt) struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); + clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state); + clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state); + clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state); +} + +static void xs_run_error_worker(struct sock_xprt *transport, unsigned int nr) +{ + set_bit(nr, &transport->sock_state); + queue_work(xprtiod_workqueue, &transport->error_worker); } static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) @@ -1231,6 +1240,7 @@ static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) */ static void xs_error_report(struct sock *sk) { + struct sock_xprt *transport; struct rpc_xprt *xprt; int err; @@ -1238,13 +1248,14 @@ static void 
xs_error_report(struct sock *sk) if (!(xprt = xprt_from_sock(sk))) goto out; + transport = container_of(xprt, struct sock_xprt, xprt); err = -sk->sk_err; if (err == 0) goto out; dprintk("RPC: xs_error_report client %p, error=%d...\n", xprt, -err); trace_rpc_socket_error(xprt, sk->sk_socket, err); - xprt_wake_pending_tasks(xprt, err); + xs_run_error_worker(transport, XPRT_SOCK_WAKE_ERROR); out: read_unlock_bh(&sk->sk_callback_lock); } @@ -1507,7 +1518,7 @@ static void xs_tcp_state_change(struct sock *sk) xprt->stat.connect_count++; xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; - xprt_wake_pending_tasks(xprt, -EAGAIN); + xs_run_error_worker(transport, XPRT_SOCK_WAKE_PENDING); } spin_unlock(&xprt->transport_lock); break; @@ -1525,7 +1536,7 @@ static void xs_tcp_state_change(struct sock *sk) /* The server initiated a shutdown of the socket */ xprt->connect_cookie++; clear_bit(XPRT_CONNECTED, &xprt->state); - xs_tcp_force_close(xprt); + xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT); /* fall through */ case TCP_CLOSING: /* @@ -1547,7 +1558,7 @@ static void xs_tcp_state_change(struct sock *sk) xprt_clear_connecting(xprt); clear_bit(XPRT_CLOSING, &xprt->state); /* Trigger the socket release */ - xs_tcp_force_close(xprt); + xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT); } out: read_unlock_bh(&sk->sk_callback_lock); @@ -1556,6 +1567,7 @@ static void xs_tcp_state_change(struct sock *sk) static void xs_write_space(struct sock *sk) { struct socket_wq *wq; + struct sock_xprt *transport; struct rpc_xprt *xprt; if (!sk->sk_socket) @@ -1564,13 +1576,14 @@ static void xs_write_space(struct sock *sk) if (unlikely(!(xprt = xprt_from_sock(sk)))) return; + transport = container_of(xprt, struct sock_xprt, xprt); rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0) goto out; - if (xprt_write_space(xprt)) - sk->sk_write_pending--; + xs_run_error_worker(transport, 
XPRT_SOCK_WAKE_WRITE); + sk->sk_write_pending--; out: rcu_read_unlock(); } @@ -2461,6 +2474,56 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) delay); } +static void xs_wake_disconnect(struct sock_xprt *transport) +{ + if (test_and_clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state)) + xs_tcp_force_close(&transport->xprt); +} + +static void xs_wake_write(struct sock_xprt *transport) +{ + if (test_and_clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state)) + xprt_write_space(&transport->xprt); +} + +static void xs_wake_error(struct sock_xprt *transport) +{ + int sockerr; + int sockerr_len = sizeof(sockerr); + + if (!test_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state)) + return; + mutex_lock(&transport->recv_mutex); + if (transport->sock == NULL) + goto out; + if (!test_and_clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state)) + goto out; + if (kernel_getsockopt(transport->sock, SOL_SOCKET, SO_ERROR, + (char *)&sockerr, &sockerr_len) != 0) + goto out; + if (sockerr < 0) + xprt_wake_pending_tasks(&transport->xprt, sockerr); +out: + mutex_unlock(&transport->recv_mutex); +} + +static void xs_wake_pending(struct sock_xprt *transport) +{ + if (test_and_clear_bit(XPRT_SOCK_WAKE_PENDING, &transport->sock_state)) + xprt_wake_pending_tasks(&transport->xprt, -EAGAIN); +} + +static void xs_error_handle(struct work_struct *work) +{ + struct sock_xprt *transport = container_of(work, + struct sock_xprt, error_worker); + + xs_wake_disconnect(transport); + xs_wake_write(transport); + xs_wake_error(transport); + xs_wake_pending(transport); +} + /** * xs_local_print_stats - display AF_LOCAL socket-specifc stats * @xprt: rpc_xprt struct containing statistics @@ -2873,6 +2936,7 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args) xprt->timeout = &xs_local_default_timeout; INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn); + INIT_WORK(&transport->error_worker, xs_error_handle); 
INIT_DELAYED_WORK(&transport->connect_worker, xs_dummy_setup_socket); switch (sun->sun_family) { @@ -2943,6 +3007,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args) xprt->timeout = &xs_udp_default_timeout; INIT_WORK(&transport->recv_worker, xs_udp_data_receive_workfn); + INIT_WORK(&transport->error_worker, xs_error_handle); INIT_DELAYED_WORK(&transport->connect_worker, xs_udp_setup_socket); switch (addr->sa_family) { @@ -3024,6 +3089,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) (xprt->timeout->to_retries + 1); INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn); + INIT_WORK(&transport->error_worker, xs_error_handle); INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket); switch (addr->sa_family) { -- cgit v1.2.3 From 21f0ffaff510b0530bfdf77da7133c0b99dee2fe Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 28 Apr 2017 10:52:42 -0400 Subject: SUNRPC: Add basic load balancing to the transport switch For now, just count the queue length. It is less accurate than counting number of bytes queued, but easier to implement. 
Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + include/linux/sunrpc/xprtmultipath.h | 2 ++ net/sunrpc/clnt.c | 40 +++++++++++++++++++++++++++++++++--- net/sunrpc/xprtmultipath.c | 20 +++++++++++++++++- 4 files changed, 59 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index a6d9fce7f20e..15322c1d9c8c 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -238,6 +238,7 @@ struct rpc_xprt { /* * Send stuff */ + atomic_long_t queuelen; spinlock_t transport_lock; /* lock transport info */ spinlock_t reserve_lock; /* lock slot table */ spinlock_t queue_lock; /* send/receive queue lock */ diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h index af1257c030d2..c6cce3fbf29d 100644 --- a/include/linux/sunrpc/xprtmultipath.h +++ b/include/linux/sunrpc/xprtmultipath.h @@ -15,6 +15,8 @@ struct rpc_xprt_switch { struct kref xps_kref; unsigned int xps_nxprts; + unsigned int xps_nactive; + atomic_long_t xps_queuelen; struct list_head xps_xprt_list; struct net * xps_net; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index b03bfa055c08..976eab68bb5d 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -968,13 +968,47 @@ out: } EXPORT_SYMBOL_GPL(rpc_bind_new_program); +static struct rpc_xprt * +rpc_task_get_xprt(struct rpc_clnt *clnt) +{ + struct rpc_xprt_switch *xps; + struct rpc_xprt *xprt= xprt_iter_get_next(&clnt->cl_xpi); + + if (!xprt) + return NULL; + rcu_read_lock(); + xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); + atomic_long_inc(&xps->xps_queuelen); + rcu_read_unlock(); + atomic_long_inc(&xprt->queuelen); + + return xprt; +} + +static void +rpc_task_release_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) +{ + struct rpc_xprt_switch *xps; + + atomic_long_dec(&xprt->queuelen); + rcu_read_lock(); + xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); + atomic_long_dec(&xps->xps_queuelen); + 
rcu_read_unlock(); + + xprt_put(xprt); +} + void rpc_task_release_transport(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; if (xprt) { task->tk_xprt = NULL; - xprt_put(xprt); + if (task->tk_client) + rpc_task_release_xprt(task->tk_client, xprt); + else + xprt_put(xprt); } } EXPORT_SYMBOL_GPL(rpc_task_release_transport); @@ -983,6 +1017,7 @@ void rpc_task_release_client(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; + rpc_task_release_transport(task); if (clnt != NULL) { /* Remove from client task list */ spin_lock(&clnt->cl_lock); @@ -992,14 +1027,13 @@ void rpc_task_release_client(struct rpc_task *task) rpc_release_client(clnt); } - rpc_task_release_transport(task); } static void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt) { if (!task->tk_xprt) - task->tk_xprt = xprt_iter_get_next(&clnt->cl_xpi); + task->tk_xprt = rpc_task_get_xprt(clnt); } static diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 8394124126f8..394e427533be 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -36,6 +36,7 @@ static void xprt_switch_add_xprt_locked(struct rpc_xprt_switch *xps, if (xps->xps_nxprts == 0) xps->xps_net = xprt->xprt_net; xps->xps_nxprts++; + xps->xps_nactive++; } /** @@ -62,6 +63,7 @@ static void xprt_switch_remove_xprt_locked(struct rpc_xprt_switch *xps, { if (unlikely(xprt == NULL)) return; + xps->xps_nactive--; xps->xps_nxprts--; if (xps->xps_nxprts == 0) xps->xps_net = NULL; @@ -317,8 +319,24 @@ struct rpc_xprt *xprt_switch_find_next_entry_roundrobin(struct list_head *head, static struct rpc_xprt *xprt_iter_next_entry_roundrobin(struct rpc_xprt_iter *xpi) { - return xprt_iter_next_entry_multiple(xpi, + struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch); + struct rpc_xprt *xprt; + unsigned long xprt_queuelen; + unsigned long xps_queuelen; + unsigned long xps_avglen; + + do { + xprt = xprt_iter_next_entry_multiple(xpi, 
xprt_switch_find_next_entry_roundrobin); + if (xprt == NULL) + break; + xprt_queuelen = atomic_long_read(&xprt->queuelen); + if (xprt_queuelen <= 2) + break; + xps_queuelen = atomic_long_read(&xps->xps_queuelen); + xps_avglen = DIV_ROUND_UP(xps_queuelen, xps->xps_nactive); + } while (xprt_queuelen > xps_avglen); + return xprt; } static -- cgit v1.2.3 From 1c341b777501613aad83f9c233a3fe5701cff083 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 22 May 2019 08:38:57 -0400 Subject: NFS: Add deferred cache invalidation for close-to-open consistency violations If the client detects that close-to-open cache consistency has been violated, and that the file or directory has been changed on the server, then do a cache invalidation when we're done working with the file. The reason we don't do an immediate cache invalidation is that we want to avoid performance problems due to false positives. Also, note that we cannot guarantee cache consistency in this situation even if we do invalidate the cache. 
Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 4 ++++ fs/nfs/inode.c | 15 +++++++++++---- include/linux/nfs_fs.h | 2 ++ 3 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 57b6a45576ad..bd1f9555447b 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -80,6 +80,10 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir ctx->dup_cookie = 0; ctx->cred = get_cred(cred); spin_lock(&dir->i_lock); + if (list_empty(&nfsi->open_files) && + (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) + nfsi->cache_validity |= NFS_INO_INVALID_DATA | + NFS_INO_REVAL_FORCED; list_add(&ctx->list, &nfsi->open_files); spin_unlock(&dir->i_lock); return ctx; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 53777813ca95..ea52c71534b5 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -208,7 +208,7 @@ static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) } if (inode->i_mapping->nrpages == 0) - flags &= ~NFS_INO_INVALID_DATA; + flags &= ~(NFS_INO_INVALID_DATA|NFS_INO_DATA_INVAL_DEFER); nfsi->cache_validity |= flags; if (flags & NFS_INO_INVALID_DATA) nfs_fscache_invalidate(inode); @@ -652,7 +652,8 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset) i_size_write(inode, offset); /* Optimisation */ if (offset == 0) - NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA; + NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_DATA | + NFS_INO_DATA_INVAL_DEFER); NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE; spin_unlock(&inode->i_lock); @@ -1032,6 +1033,10 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx) struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); + if (list_empty(&nfsi->open_files) && + (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) + nfsi->cache_validity |= NFS_INO_INVALID_DATA | + NFS_INO_REVAL_FORCED; list_add_tail_rcu(&ctx->list, &nfsi->open_files); spin_unlock(&inode->i_lock); } @@ -1313,7 +1318,8 @@ int 
nfs_revalidate_mapping(struct inode *inode, set_bit(NFS_INO_INVALIDATING, bitlock); smp_wmb(); - nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; + nfsi->cache_validity &= ~(NFS_INO_INVALID_DATA| + NFS_INO_DATA_INVAL_DEFER); spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); ret = nfs_invalidate_mapping(inode, mapping); @@ -1871,7 +1877,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); - } + } else if (!have_delegation) + nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER; inode_set_iversion_raw(inode, fattr->change_attr); attr_changed = true; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index d363d5765cdf..0a11712a80e3 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -223,6 +223,8 @@ struct nfs4_copy_state { #define NFS_INO_INVALID_MTIME BIT(10) /* cached mtime is invalid */ #define NFS_INO_INVALID_SIZE BIT(11) /* cached size is invalid */ #define NFS_INO_INVALID_OTHER BIT(12) /* other attrs are invalid */ +#define NFS_INO_DATA_INVAL_DEFER \ + BIT(13) /* Deferred cache invalidation */ #define NFS_INO_INVALID_ATTR (NFS_INO_INVALID_CHANGE \ | NFS_INO_INVALID_CTIME \ -- cgit v1.2.3 From 612b41f808a98a124b23d72229693c3181733291 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Apr 2017 08:50:51 -0400 Subject: SUNRPC: Allow creation of RPC clients with multiple connections Add an argument to struct rpc_create_args that allows the specification of how many transport connections you want to set up to the server. 
Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 1 + net/sunrpc/clnt.c | 17 ++++++++++++++++- net/sunrpc/xprtmultipath.c | 3 +-- 3 files changed, 18 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 6e8073140a5d..4619098affa3 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -124,6 +124,7 @@ struct rpc_create_args { u32 prognumber; /* overrides program->number */ u32 version; rpc_authflavor_t authflavor; + u32 nconnect; unsigned long flags; char *client_name; struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 976eab68bb5d..b6aca8cb5ae6 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -528,6 +528,8 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) .bc_xprt = args->bc_xprt, }; char servername[48]; + struct rpc_clnt *clnt; + int i; if (args->bc_xprt) { WARN_ON_ONCE(!(args->protocol & XPRT_TRANSPORT_BC)); @@ -590,7 +592,15 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT) xprt->resvport = 0; - return rpc_create_xprt(args, xprt); + clnt = rpc_create_xprt(args, xprt); + if (IS_ERR(clnt) || args->nconnect <= 1) + return clnt; + + for (i = 0; i < args->nconnect - 1; i++) { + if (rpc_clnt_add_xprt(clnt, &xprtargs, NULL, NULL) < 0) + break; + } + return clnt; } EXPORT_SYMBOL_GPL(rpc_create); @@ -2730,6 +2740,10 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, return -ENOMEM; data->xps = xprt_switch_get(xps); data->xprt = xprt_get(xprt); + if (rpc_xprt_switch_has_addr(data->xps, (struct sockaddr *)&xprt->addr)) { + rpc_cb_add_xprt_release(data); + goto success; + } task = rpc_call_null_helper(clnt, xprt, NULL, RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC|RPC_TASK_NULLCREDS, @@ -2737,6 +2751,7 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, if (IS_ERR(task)) return PTR_ERR(task); 
rpc_put_task(task); +success: return 1; } EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt); diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 394e427533be..9d66ce53355d 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -52,8 +52,7 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps, if (xprt == NULL) return; spin_lock(&xps->xps_lock); - if ((xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) && - !rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr)) + if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) xprt_switch_add_xprt_locked(xps, xprt); spin_unlock(&xps->xps_lock); } -- cgit v1.2.3 From 6619079d05404cb32be29af329b87ac3b0ad4f96 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Apr 2017 11:13:40 -0400 Subject: NFSv4: Allow multiple connections to NFSv4.x (x>0) servers If the user specifies the -onconn=<number> mount option, and the transport protocol is TCP, then set up <number> connections to the server. The connections will all go to the same IP address.
Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 2 ++ fs/nfs/internal.h | 1 + fs/nfs/nfs4client.c | 11 +++++++++-- include/linux/nfs_fs_sb.h | 1 + 4 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index d7e4f0848e28..fa6953e56a71 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -175,6 +175,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_rpcclient = ERR_PTR(-EINVAL); clp->cl_proto = cl_init->proto; + clp->cl_nconnect = cl_init->nconnect; clp->cl_net = get_net(cl_init->net); clp->cl_principal = "*"; @@ -493,6 +494,7 @@ int nfs_create_rpc_client(struct nfs_client *clp, struct rpc_create_args args = { .net = clp->cl_net, .protocol = clp->cl_proto, + .nconnect = clp->cl_nconnect, .address = (struct sockaddr *)&clp->cl_addr, .addrsize = clp->cl_addrlen, .timeout = cl_init->timeparms, diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index bba09dace5d6..4a49dc1495c5 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -82,6 +82,7 @@ struct nfs_client_initdata { struct nfs_subversion *nfs_mod; int proto; u32 minorversion; + unsigned int nconnect; struct net *net; const struct rpc_timeout *timeparms; const struct cred *cred; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 81b9b6d7927a..5c244c440658 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -859,7 +859,8 @@ static int nfs4_set_client(struct nfs_server *server, const size_t addrlen, const char *ip_addr, int proto, const struct rpc_timeout *timeparms, - u32 minorversion, struct net *net) + u32 minorversion, unsigned int nconnect, + struct net *net) { struct nfs_client_initdata cl_init = { .hostname = hostname, @@ -875,6 +876,8 @@ static int nfs4_set_client(struct nfs_server *server, }; struct nfs_client *clp; + if (minorversion > 0 && proto == XPRT_TRANSPORT_TCP) + cl_init.nconnect = nconnect; if (server->flags & NFS_MOUNT_NORESVPORT) set_bit(NFS_CS_NORESVPORT, 
&cl_init.init_flags); if (server->options & NFS_OPTION_MIGRATION) @@ -1074,6 +1077,7 @@ static int nfs4_init_server(struct nfs_server *server, data->nfs_server.protocol, &timeparms, data->minorversion, + data->nfs_server.nconnect, data->net); if (error < 0) return error; @@ -1163,6 +1167,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, XPRT_TRANSPORT_RDMA, parent_server->client->cl_timeout, parent_client->cl_mvops->minor_version, + parent_client->cl_nconnect, parent_client->cl_net); if (!error) goto init_server; @@ -1176,6 +1181,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, XPRT_TRANSPORT_TCP, parent_server->client->cl_timeout, parent_client->cl_mvops->minor_version, + parent_client->cl_nconnect, parent_client->cl_net); if (error < 0) goto error; @@ -1271,7 +1277,8 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, set_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status); error = nfs4_set_client(server, hostname, sap, salen, buf, clp->cl_proto, clnt->cl_timeout, - clp->cl_minorversion, net); + clp->cl_minorversion, + clp->cl_nconnect, net); clear_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status); if (error != 0) { nfs_server_insert_lists(server); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 1e78032a174b..a87fe854f008 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -58,6 +58,7 @@ struct nfs_client { struct nfs_subversion * cl_nfs_mod; /* pointer to nfs version module */ u32 cl_minorversion;/* NFSv4 minorversion */ + unsigned int cl_nconnect; /* Number of connections */ const char * cl_principal; /* used for machine cred */ #if IS_ENABLED(CONFIG_NFS_V4) -- cgit v1.2.3 From 5a0c257f8e0f4c4b3c33dff545317c21a921303e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 30 May 2019 10:41:28 +1000 Subject: NFS: send state management on a single connection. 
With NFSv4.1, different network connections need to be explicitly bound to a session. During session startup, this is not possible so only a single connection must be used for session startup. So add a task flag to disable the default round-robin choice of connections (when nconnect > 1) and force the use of a single connection. Then use that flag on all requests for session management - for consistency, include NFSv4.0 management (SETCLIENTID) and session destruction. Reported-by: Chuck Lever Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 22 +++++++++++++--------- include/linux/sunrpc/sched.h | 1 + net/sunrpc/clnt.c | 24 +++++++++++++++++++++++- 3 files changed, 37 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 26626ea1f197..d115d9973efc 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5992,7 +5992,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, .rpc_message = &msg, .callback_ops = &nfs4_setclientid_ops, .callback_data = &setclientid, - .flags = RPC_TASK_TIMEOUT, + .flags = RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN, }; int status; @@ -6058,7 +6058,8 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp, dprintk("NFS call setclientid_confirm auth=%s, (client ID %llx)\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, clp->cl_clientid); - status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + status = rpc_call_sync(clp->cl_rpcclient, &msg, + RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN); trace_nfs4_setclientid_confirm(clp, status); dprintk("NFS reply setclientid_confirm: %d\n", status); return status; @@ -7639,7 +7640,7 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg); status = nfs4_call_sync(clnt, NFS_SERVER(dir), &msg, &args.seq_args, - &res.seq_res, 0); + &res.seq_res, RPC_TASK_NO_ROUND_ROBIN); dprintk("NFS reply secinfo: %d\n", status);
put_cred(cred); @@ -7977,7 +7978,7 @@ nfs4_run_exchange_id(struct nfs_client *clp, const struct cred *cred, .rpc_client = clp->cl_rpcclient, .callback_ops = &nfs4_exchange_id_call_ops, .rpc_message = &msg, - .flags = RPC_TASK_TIMEOUT, + .flags = RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN, }; struct nfs41_exchange_id_data *calldata; int status; @@ -8202,7 +8203,8 @@ static int _nfs4_proc_destroy_clientid(struct nfs_client *clp, }; int status; - status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + status = rpc_call_sync(clp->cl_rpcclient, &msg, + RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN); trace_nfs4_destroy_clientid(clp, status); if (status) dprintk("NFS: Got error %d from the server %s on " @@ -8481,7 +8483,8 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, nfs4_init_channel_attrs(&args, clp->cl_rpcclient); args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN); - status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + status = rpc_call_sync(session->clp->cl_rpcclient, &msg, + RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN); trace_nfs4_create_session(clp, status); switch (status) { @@ -8557,7 +8560,8 @@ int nfs4_proc_destroy_session(struct nfs4_session *session, if (!test_and_clear_bit(NFS4_SESSION_ESTABLISHED, &session->session_state)) return 0; - status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); + status = rpc_call_sync(session->clp->cl_rpcclient, &msg, + RPC_TASK_TIMEOUT | RPC_TASK_NO_ROUND_ROBIN); trace_nfs4_destroy_session(session->clp, status); if (status) @@ -8811,7 +8815,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, .rpc_client = clp->cl_rpcclient, .rpc_message = &msg, .callback_ops = &nfs4_reclaim_complete_call_ops, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_NO_ROUND_ROBIN, }; int status = -ENOMEM; @@ -9330,7 +9334,7 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, dprintk("--> %s\n", __func__); status = 
nfs4_call_sync(clnt, server, &msg, &args.seq_args, - &res.seq_res, 0); + &res.seq_res, RPC_TASK_NO_ROUND_ROBIN); dprintk("<-- %s status=%d\n", __func__, status); put_cred(cred); diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index d0e451868f02..11424bdf09e6 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -126,6 +126,7 @@ struct rpc_task_setup { #define RPC_CALL_MAJORSEEN 0x0020 /* major timeout seen */ #define RPC_TASK_ROOTCREDS 0x0040 /* force root creds */ #define RPC_TASK_DYNAMIC 0x0080 /* task was kmalloc'ed */ +#define RPC_TASK_NO_ROUND_ROBIN 0x0100 /* send requests on "main" xprt */ #define RPC_TASK_SOFT 0x0200 /* Use soft timeouts */ #define RPC_TASK_SOFTCONN 0x0400 /* Fail if can't connect */ #define RPC_TASK_SENT 0x0800 /* message was sent */ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index b6aca8cb5ae6..d599fab8adcb 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -995,6 +995,24 @@ rpc_task_get_xprt(struct rpc_clnt *clnt) return xprt; } +static struct rpc_xprt * +rpc_task_get_first_xprt(struct rpc_clnt *clnt) +{ + struct rpc_xprt_switch *xps; + struct rpc_xprt *xprt; + + rcu_read_lock(); + xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); + if (xprt) { + atomic_long_inc(&xprt->queuelen); + xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); + atomic_long_inc(&xps->xps_queuelen); + } + rcu_read_unlock(); + + return xprt; +} + static void rpc_task_release_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) { @@ -1042,7 +1060,11 @@ void rpc_task_release_client(struct rpc_task *task) static void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt) { - if (!task->tk_xprt) + if (task->tk_xprt) + return; + if (task->tk_flags & RPC_TASK_NO_ROUND_ROBIN) + task->tk_xprt = rpc_task_get_first_xprt(clnt); + else task->tk_xprt = rpc_task_get_xprt(clnt); } -- cgit v1.2.3 From a332518fda4731c07394164b3edcbb6efaf4c4d7 Mon Sep 17 00:00:00 2001 From: Dave Wysochanski Date: Thu, 23 May 
2019 16:13:50 -0400 Subject: SUNRPC: Count ops completing with tk_status < 0 We often see various error conditions with NFS4.x that show up with a very high operation count all completing with tk_status < 0 in a short period of time. Add a count to rpc_iostats to record on a per-op basis the ops that complete in this manner, which will enable lower overhead diagnostics. Signed-off-by: Dave Wysochanski Signed-off-by: Trond Myklebust --- include/linux/sunrpc/metrics.h | 7 ++++++- net/sunrpc/stats.c | 8 ++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/metrics.h b/include/linux/sunrpc/metrics.h index 1b3751327575..0ee3f7052846 100644 --- a/include/linux/sunrpc/metrics.h +++ b/include/linux/sunrpc/metrics.h @@ -30,7 +30,7 @@ #include #include -#define RPC_IOSTATS_VERS "1.0" +#define RPC_IOSTATS_VERS "1.1" struct rpc_iostats { spinlock_t om_lock; @@ -66,6 +66,11 @@ struct rpc_iostats { ktime_t om_queue, /* queued for xmit */ om_rtt, /* RPC RTT */ om_execute; /* RPC execution */ + /* + * The count of operations that complete with tk_status < 0. + * These statuses usually indicate error conditions. 
+ */ + unsigned long om_error_status; } ____cacheline_aligned; struct rpc_task; diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index 2f7bde82450b..48ea776364f8 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -177,6 +177,8 @@ void rpc_count_iostats_metrics(const struct rpc_task *task, execute = ktime_sub(now, task->tk_start); op_metrics->om_execute = ktime_add(op_metrics->om_execute, execute); + if (task->tk_status < 0) + op_metrics->om_error_status++; spin_unlock(&op_metrics->om_lock); @@ -219,13 +221,14 @@ static void _add_rpc_iostats(struct rpc_iostats *a, struct rpc_iostats *b) a->om_queue = ktime_add(a->om_queue, b->om_queue); a->om_rtt = ktime_add(a->om_rtt, b->om_rtt); a->om_execute = ktime_add(a->om_execute, b->om_execute); + a->om_error_status += b->om_error_status; } static void _print_rpc_iostats(struct seq_file *seq, struct rpc_iostats *stats, int op, const struct rpc_procinfo *procs) { _print_name(seq, op, procs); - seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %llu\n", + seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %llu %lu\n", stats->om_ops, stats->om_ntrans, stats->om_timeouts, @@ -233,7 +236,8 @@ static void _print_rpc_iostats(struct seq_file *seq, struct rpc_iostats *stats, stats->om_bytes_recv, ktime_to_ms(stats->om_queue), ktime_to_ms(stats->om_rtt), - ktime_to_ms(stats->om_execute)); + ktime_to_ms(stats->om_execute), + stats->om_error_status); } void rpc_clnt_show_stats(struct seq_file *seq, struct rpc_clnt *clnt) -- cgit v1.2.3 From 675dd90ad0932f2c03912a5252458d792bd7033a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 19 Jun 2019 10:33:42 -0400 Subject: xprtrdma: Modernize ops->connect Adapt and apply changes that were made to the TCP socket connect code. 
See the following commits for details on the purpose of these changes: Commit 7196dbb02ea0 ("SUNRPC: Allow changing of the TCP timeout parameters on the fly") Commit 3851f1cdb2b8 ("SUNRPC: Limit the reconnect backoff timer to the max RPC message timeout") Commit 02910177aede ("SUNRPC: Fix reconnection timeouts") Some common transport code is moved to xprt.c to satisfy the code duplication police. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprt.h | 3 ++ include/trace/events/rpcrdma.h | 31 +++++++++++++++++++ net/sunrpc/sched.c | 1 + net/sunrpc/xprt.c | 32 ++++++++++++++++++++ net/sunrpc/xprtrdma/transport.c | 66 +++++++++++++++++++++++++++++++---------- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + net/sunrpc/xprtsock.c | 23 ++------------ 7 files changed, 121 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index a6d9fce7f20e..cc78fd38ea7d 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -334,6 +334,9 @@ struct xprt_class { */ struct rpc_xprt *xprt_create_transport(struct xprt_create *args); void xprt_connect(struct rpc_task *task); +unsigned long xprt_reconnect_delay(const struct rpc_xprt *xprt); +void xprt_reconnect_backoff(struct rpc_xprt *xprt, + unsigned long init_to); void xprt_reserve(struct rpc_task *task); void xprt_retry_reserve(struct rpc_task *task); int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task); diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 98023d91a72d..f6a4eaa85a3e 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -375,6 +375,37 @@ DEFINE_RXPRT_EVENT(xprtrdma_op_inject_dsc); DEFINE_RXPRT_EVENT(xprtrdma_op_close); DEFINE_RXPRT_EVENT(xprtrdma_op_connect); +TRACE_EVENT(xprtrdma_op_set_cto, + TP_PROTO( + const struct rpcrdma_xprt *r_xprt, + unsigned long connect, + unsigned long reconnect + ), + + TP_ARGS(r_xprt, connect, 
reconnect), + + TP_STRUCT__entry( + __field(const void *, r_xprt) + __field(unsigned long, connect) + __field(unsigned long, reconnect) + __string(addr, rpcrdma_addrstr(r_xprt)) + __string(port, rpcrdma_portstr(r_xprt)) + ), + + TP_fast_assign( + __entry->r_xprt = r_xprt; + __entry->connect = connect; + __entry->reconnect = reconnect; + __assign_str(addr, rpcrdma_addrstr(r_xprt)); + __assign_str(port, rpcrdma_portstr(r_xprt)); + ), + + TP_printk("peer=[%s]:%s r_xprt=%p: connect=%lu reconnect=%lu", + __get_str(addr), __get_str(port), __entry->r_xprt, + __entry->connect / HZ, __entry->reconnect / HZ + ) +); + TRACE_EVENT(xprtrdma_qp_event, TP_PROTO( const struct rpcrdma_xprt *r_xprt, diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index bb04ae52803a..5ad5dead7bfc 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -58,6 +58,7 @@ static struct rpc_wait_queue delay_queue; */ struct workqueue_struct *rpciod_workqueue __read_mostly; struct workqueue_struct *xprtiod_workqueue __read_mostly; +EXPORT_SYMBOL_GPL(xprtiod_workqueue); unsigned long rpc_task_timeout(const struct rpc_task *task) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index ad21880d5601..b1f54b7ccc0c 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -850,6 +850,38 @@ void xprt_connect(struct rpc_task *task) xprt_release_write(xprt, task); } +/** + * xprt_reconnect_delay - compute the wait before scheduling a connect + * @xprt: transport instance + * + */ +unsigned long xprt_reconnect_delay(const struct rpc_xprt *xprt) +{ + unsigned long start, now = jiffies; + + start = xprt->stat.connect_start + xprt->reestablish_timeout; + if (time_after(start, now)) + return start - now; + return 0; +} +EXPORT_SYMBOL_GPL(xprt_reconnect_delay); + +/** + * xprt_reconnect_backoff - compute the new re-establish timeout + * @xprt: transport instance + * @init_to: initial reestablish timeout + * + */ +void xprt_reconnect_backoff(struct rpc_xprt *xprt, unsigned long init_to) +{ + 
xprt->reestablish_timeout <<= 1; + if (xprt->reestablish_timeout > xprt->max_reconnect_timeout) + xprt->reestablish_timeout = xprt->max_reconnect_timeout; + if (xprt->reestablish_timeout < init_to) + xprt->reestablish_timeout = init_to; +} +EXPORT_SYMBOL_GPL(xprt_reconnect_backoff); + enum xprt_xid_rb_cmp { XID_RB_EQUAL, XID_RB_LEFT, diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 3688e0782587..4993aa49ecbe 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -298,6 +298,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) module_put(THIS_MODULE); } +/* 60 second timeout, no retries */ static const struct rpc_timeout xprt_rdma_default_timeout = { .to_initval = 60 * HZ, .to_maxval = 60 * HZ, @@ -323,8 +324,9 @@ xprt_setup_rdma(struct xprt_create *args) if (!xprt) return ERR_PTR(-ENOMEM); - /* 60 second timeout, no retries */ xprt->timeout = &xprt_rdma_default_timeout; + xprt->connect_timeout = xprt->timeout->to_initval; + xprt->max_reconnect_timeout = xprt->timeout->to_maxval; xprt->bind_timeout = RPCRDMA_BIND_TO; xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO; @@ -487,31 +489,64 @@ xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task) } /** - * xprt_rdma_connect - try to establish a transport connection + * xprt_rdma_tcp_set_connect_timeout - set timeouts for establishing a connection + * @xprt: controlling transport instance + * @connect_timeout: reconnect timeout after client disconnects + * @reconnect_timeout: reconnect timeout after server disconnects + * + */ +static void xprt_rdma_tcp_set_connect_timeout(struct rpc_xprt *xprt, + unsigned long connect_timeout, + unsigned long reconnect_timeout) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + + trace_xprtrdma_op_set_cto(r_xprt, connect_timeout, reconnect_timeout); + + spin_lock(&xprt->transport_lock); + + if (connect_timeout < xprt->connect_timeout) { + struct rpc_timeout to; + unsigned
long initval; + + to = *xprt->timeout; + initval = connect_timeout; + if (initval < RPCRDMA_INIT_REEST_TO << 1) + initval = RPCRDMA_INIT_REEST_TO << 1; + to.to_initval = initval; + to.to_maxval = initval; + r_xprt->rx_timeout = to; + xprt->timeout = &r_xprt->rx_timeout; + xprt->connect_timeout = connect_timeout; + } + + if (reconnect_timeout < xprt->max_reconnect_timeout) + xprt->max_reconnect_timeout = reconnect_timeout; + + spin_unlock(&xprt->transport_lock); +} + +/** + * xprt_rdma_connect - schedule an attempt to reconnect * @xprt: transport state - * @task: RPC scheduler context + * @task: RPC scheduler context (unused) * */ static void xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + unsigned long delay; trace_xprtrdma_op_connect(r_xprt); + + delay = 0; if (r_xprt->rx_ep.rep_connected != 0) { - /* Reconnect */ - schedule_delayed_work(&r_xprt->rx_connect_worker, - xprt->reestablish_timeout); - xprt->reestablish_timeout <<= 1; - if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO) - xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO; - else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO) - xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; - } else { - schedule_delayed_work(&r_xprt->rx_connect_worker, 0); - if (!RPC_IS_ASYNC(task)) - flush_delayed_work(&r_xprt->rx_connect_worker); + delay = xprt_reconnect_delay(xprt); + xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO); } + queue_delayed_work(xprtiod_workqueue, &r_xprt->rx_connect_worker, + delay); } /** @@ -769,6 +804,7 @@ static const struct rpc_xprt_ops xprt_rdma_procs = { .send_request = xprt_rdma_send_request, .close = xprt_rdma_close, .destroy = xprt_rdma_destroy, + .set_connect_timeout = xprt_rdma_tcp_set_connect_timeout, .print_stats = xprt_rdma_print_stats, .enable_swap = xprt_rdma_enable_swap, .disable_swap = xprt_rdma_disable_swap, diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 
117e32816e4f..8378f45d2da7 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -432,6 +432,7 @@ struct rpcrdma_xprt { struct rpcrdma_ep rx_ep; struct rpcrdma_buffer rx_buf; struct delayed_work rx_connect_worker; + struct rpc_timeout rx_timeout; struct rpcrdma_stats rx_stats; }; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c69951ed2ebc..b154600085d6 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2402,25 +2402,6 @@ out: xprt_wake_pending_tasks(xprt, status); } -static unsigned long xs_reconnect_delay(const struct rpc_xprt *xprt) -{ - unsigned long start, now = jiffies; - - start = xprt->stat.connect_start + xprt->reestablish_timeout; - if (time_after(start, now)) - return start - now; - return 0; -} - -static void xs_reconnect_backoff(struct rpc_xprt *xprt) -{ - xprt->reestablish_timeout <<= 1; - if (xprt->reestablish_timeout > xprt->max_reconnect_timeout) - xprt->reestablish_timeout = xprt->max_reconnect_timeout; - if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) - xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; -} - /** * xs_connect - connect a socket to a remote endpoint * @xprt: pointer to transport structure @@ -2450,8 +2431,8 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) /* Start by resetting any existing state */ xs_reset_transport(transport); - delay = xs_reconnect_delay(xprt); - xs_reconnect_backoff(xprt); + delay = xprt_reconnect_delay(xprt); + xprt_reconnect_backoff(xprt, XS_TCP_INIT_REEST_TO); } else dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); -- cgit v1.2.3 From a101b043c44dfcb63bed7f29a675e9fa0259005e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 11 Jul 2019 16:33:12 -0400 Subject: SUNRPC: Fix transport accounting when caller specifies an rpc_xprt Ensure that we do the required accounting for the round robin queue when the caller to rpc_init_task() has passed in a transport to be used. 
Reported-by: Olga Kornievskaia Reported-by: Neil Brown Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 2 ++ net/sunrpc/clnt.c | 42 ++++++++++++++++++++---------------------- net/sunrpc/sched.c | 3 ++- 3 files changed, 24 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 4619098affa3..4e070e00c143 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -164,6 +164,8 @@ void rpc_shutdown_client(struct rpc_clnt *); void rpc_release_client(struct rpc_clnt *); void rpc_task_release_transport(struct rpc_task *); void rpc_task_release_client(struct rpc_task *); +struct rpc_xprt *rpc_task_get_xprt(struct rpc_clnt *clnt, + struct rpc_xprt *xprt); int rpcb_create_local(struct net *); void rpcb_put_local(struct net *); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index d599fab8adcb..383555d2b522 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -978,11 +978,10 @@ out: } EXPORT_SYMBOL_GPL(rpc_bind_new_program); -static struct rpc_xprt * -rpc_task_get_xprt(struct rpc_clnt *clnt) +struct rpc_xprt * +rpc_task_get_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) { struct rpc_xprt_switch *xps; - struct rpc_xprt *xprt= xprt_iter_get_next(&clnt->cl_xpi); if (!xprt) return NULL; @@ -995,24 +994,6 @@ rpc_task_get_xprt(struct rpc_clnt *clnt) return xprt; } -static struct rpc_xprt * -rpc_task_get_first_xprt(struct rpc_clnt *clnt) -{ - struct rpc_xprt_switch *xps; - struct rpc_xprt *xprt; - - rcu_read_lock(); - xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); - if (xprt) { - atomic_long_inc(&xprt->queuelen); - xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); - atomic_long_inc(&xps->xps_queuelen); - } - rcu_read_unlock(); - - return xprt; -} - static void rpc_task_release_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) { @@ -1057,6 +1038,23 @@ void rpc_task_release_client(struct rpc_task *task) } } +static struct rpc_xprt * 
+rpc_task_get_first_xprt(struct rpc_clnt *clnt) +{ + struct rpc_xprt *xprt; + + rcu_read_lock(); + xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); + rcu_read_unlock(); + return rpc_task_get_xprt(clnt, xprt); +} + +static struct rpc_xprt * +rpc_task_get_next_xprt(struct rpc_clnt *clnt) +{ + return rpc_task_get_xprt(clnt, xprt_iter_get_next(&clnt->cl_xpi)); +} + static void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt) { @@ -1065,7 +1063,7 @@ void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt) if (task->tk_flags & RPC_TASK_NO_ROUND_ROBIN) task->tk_xprt = rpc_task_get_first_xprt(clnt); else - task->tk_xprt = rpc_task_get_xprt(clnt); + task->tk_xprt = rpc_task_get_next_xprt(clnt); } static diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 8a0779e963f9..1f275aba786f 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -1092,7 +1092,8 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta /* Initialize workqueue for async tasks */ task->tk_workqueue = task_setup_data->workqueue; - task->tk_xprt = xprt_get(task_setup_data->rpc_xprt); + task->tk_xprt = rpc_task_get_xprt(task_setup_data->rpc_client, + xprt_get(task_setup_data->rpc_xprt)); task->tk_op_cred = get_rpccred(task_setup_data->rpc_op_cred); -- cgit v1.2.3 From 7402a4fedc2bc448100c2d086406c708451b16dc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 16 Jul 2019 13:51:29 -0400 Subject: SUNRPC: Fix up backchannel slot table accounting Add a per-transport maximum limit in the socket case, and add helpers to allow the NFSv4 code to discover that limit. 
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 +++ include/linux/sunrpc/bc_xprt.h | 1 + include/linux/sunrpc/clnt.h | 1 + include/linux/sunrpc/xprt.h | 6 ++++-- net/sunrpc/backchannel_rqst.c | 40 +++++++++++++++++++++------------------ net/sunrpc/clnt.c | 13 +++++++++++++ net/sunrpc/svc.c | 2 +- net/sunrpc/xprtrdma/backchannel.c | 7 +++++++ net/sunrpc/xprtrdma/transport.c | 1 + net/sunrpc/xprtrdma/xprt_rdma.h | 1 + net/sunrpc/xprtsock.c | 1 + 11 files changed, 55 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 52de7245a2ee..39896afc6edf 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8380,6 +8380,7 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args, { unsigned int max_rqst_sz, max_resp_sz; unsigned int max_bc_payload = rpc_max_bc_payload(clnt); + unsigned int max_bc_slots = rpc_num_bc_slots(clnt); max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead; max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead; @@ -8402,6 +8403,8 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args, args->bc_attrs.max_resp_sz_cached = 0; args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS; args->bc_attrs.max_reqs = max_t(unsigned short, max_session_cb_slots, 1); + if (args->bc_attrs.max_reqs > max_bc_slots) + args->bc_attrs.max_reqs = max_bc_slots; dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u " "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n", diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h index d4229a78524a..87d27e13d885 100644 --- a/include/linux/sunrpc/bc_xprt.h +++ b/include/linux/sunrpc/bc_xprt.h @@ -43,6 +43,7 @@ void xprt_destroy_backchannel(struct rpc_xprt *, unsigned int max_reqs); int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs); void xprt_destroy_bc(struct rpc_xprt *xprt, unsigned int max_reqs); void xprt_free_bc_rqst(struct rpc_rqst *req); +unsigned int 
xprt_bc_max_slots(struct rpc_xprt *xprt); /* * Determine if a shared backchannel is in use diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 4e070e00c143..abc63bd1be2b 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -194,6 +194,7 @@ void rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int); struct net * rpc_net_ns(struct rpc_clnt *); size_t rpc_max_payload(struct rpc_clnt *); size_t rpc_max_bc_payload(struct rpc_clnt *); +unsigned int rpc_num_bc_slots(struct rpc_clnt *); void rpc_force_rebind(struct rpc_clnt *); size_t rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t); const char *rpc_peeraddr2str(struct rpc_clnt *, enum rpc_display_format_t); diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index ed76e5fb36c1..13e108bcc9eb 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -158,6 +158,7 @@ struct rpc_xprt_ops { int (*bc_setup)(struct rpc_xprt *xprt, unsigned int min_reqs); size_t (*bc_maxpayload)(struct rpc_xprt *xprt); + unsigned int (*bc_num_slots)(struct rpc_xprt *xprt); void (*bc_free_rqst)(struct rpc_rqst *rqst); void (*bc_destroy)(struct rpc_xprt *xprt, unsigned int max_reqs); @@ -251,8 +252,9 @@ struct rpc_xprt { #if defined(CONFIG_SUNRPC_BACKCHANNEL) struct svc_serv *bc_serv; /* The RPC service which will */ /* process the callback */ - int bc_alloc_count; /* Total number of preallocs */ - atomic_t bc_free_slots; + unsigned int bc_alloc_max; + unsigned int bc_alloc_count; /* Total number of preallocs */ + atomic_t bc_slot_count; /* Number of allocated slots */ spinlock_t bc_pa_lock; /* Protects the preallocated * items */ struct list_head bc_pa_list; /* List of preallocated diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index c47d82622fd1..339e8c077c2d 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -31,25 +31,20 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 
DAMAGE. #define RPCDBG_FACILITY RPCDBG_TRANS #endif +#define BC_MAX_SLOTS 64U + +unsigned int xprt_bc_max_slots(struct rpc_xprt *xprt) +{ + return BC_MAX_SLOTS; +} + /* * Helper routines that track the number of preallocation elements * on the transport. */ static inline int xprt_need_to_requeue(struct rpc_xprt *xprt) { - return xprt->bc_alloc_count < atomic_read(&xprt->bc_free_slots); -} - -static inline void xprt_inc_alloc_count(struct rpc_xprt *xprt, unsigned int n) -{ - atomic_add(n, &xprt->bc_free_slots); - xprt->bc_alloc_count += n; -} - -static inline int xprt_dec_alloc_count(struct rpc_xprt *xprt, unsigned int n) -{ - atomic_sub(n, &xprt->bc_free_slots); - return xprt->bc_alloc_count -= n; + return xprt->bc_alloc_count < xprt->bc_alloc_max; } /* @@ -145,6 +140,9 @@ int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs) dprintk("RPC: setup backchannel transport\n"); + if (min_reqs > BC_MAX_SLOTS) + min_reqs = BC_MAX_SLOTS; + /* * We use a temporary list to keep track of the preallocated * buffers. 
Once we're done building the list we splice it @@ -172,7 +170,9 @@ int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs) */ spin_lock(&xprt->bc_pa_lock); list_splice(&tmp_list, &xprt->bc_pa_list); - xprt_inc_alloc_count(xprt, min_reqs); + xprt->bc_alloc_count += min_reqs; + xprt->bc_alloc_max += min_reqs; + atomic_add(min_reqs, &xprt->bc_slot_count); spin_unlock(&xprt->bc_pa_lock); dprintk("RPC: setup backchannel transport done\n"); @@ -220,11 +220,13 @@ void xprt_destroy_bc(struct rpc_xprt *xprt, unsigned int max_reqs) goto out; spin_lock_bh(&xprt->bc_pa_lock); - xprt_dec_alloc_count(xprt, max_reqs); + xprt->bc_alloc_max -= max_reqs; list_for_each_entry_safe(req, tmp, &xprt->bc_pa_list, rq_bc_pa_list) { dprintk("RPC: req=%p\n", req); list_del(&req->rq_bc_pa_list); xprt_free_allocation(req); + xprt->bc_alloc_count--; + atomic_dec(&xprt->bc_slot_count); if (--max_reqs == 0) break; } @@ -241,13 +243,14 @@ static struct rpc_rqst *xprt_get_bc_request(struct rpc_xprt *xprt, __be32 xid, struct rpc_rqst *req = NULL; dprintk("RPC: allocate a backchannel request\n"); - if (atomic_read(&xprt->bc_free_slots) <= 0) - goto not_found; if (list_empty(&xprt->bc_pa_list)) { if (!new) goto not_found; + if (atomic_read(&xprt->bc_slot_count) >= BC_MAX_SLOTS) + goto not_found; list_add_tail(&new->rq_bc_pa_list, &xprt->bc_pa_list); xprt->bc_alloc_count++; + atomic_inc(&xprt->bc_slot_count); } req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst, rq_bc_pa_list); @@ -291,6 +294,7 @@ void xprt_free_bc_rqst(struct rpc_rqst *req) if (xprt_need_to_requeue(xprt)) { list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list); xprt->bc_alloc_count++; + atomic_inc(&xprt->bc_slot_count); req = NULL; } spin_unlock_bh(&xprt->bc_pa_lock); @@ -357,7 +361,7 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied) spin_lock(&xprt->bc_pa_lock); list_del(&req->rq_bc_pa_list); - xprt_dec_alloc_count(xprt, 1); + xprt->bc_alloc_count--; spin_unlock(&xprt->bc_pa_lock); 
req->rq_private_buf.len = copied; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 383555d2b522..79c849391cb9 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1526,6 +1526,19 @@ size_t rpc_max_bc_payload(struct rpc_clnt *clnt) } EXPORT_SYMBOL_GPL(rpc_max_bc_payload); +unsigned int rpc_num_bc_slots(struct rpc_clnt *clnt) +{ + struct rpc_xprt *xprt; + unsigned int ret; + + rcu_read_lock(); + xprt = rcu_dereference(clnt->cl_xprt); + ret = xprt->ops->bc_num_slots(xprt); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(rpc_num_bc_slots); + /** * rpc_force_rebind - force transport to check that remote port is unchanged * @clnt: client to rebind diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index e15cb704453e..220b79988000 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1595,7 +1595,7 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req, /* Parse and execute the bc call */ proc_error = svc_process_common(rqstp, argv, resv); - atomic_inc(&req->rq_xprt->bc_free_slots); + atomic_dec(&req->rq_xprt->bc_slot_count); if (!proc_error) { /* Processing error: drop the request */ xprt_free_bc_request(req); diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index ce986591f213..59e624b1d7a0 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -52,6 +52,13 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) return maxmsg - RPCRDMA_HDRLEN_MIN; } +unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *xprt) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + + return r_xprt->rx_buf.rb_bc_srv_max_requests; +} + static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 4993aa49ecbe..52abddac19e5 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -812,6 +812,7 @@ static const struct 
rpc_xprt_ops xprt_rdma_procs = { #if defined(CONFIG_SUNRPC_BACKCHANNEL) .bc_setup = xprt_rdma_bc_setup, .bc_maxpayload = xprt_rdma_bc_maxpayload, + .bc_num_slots = xprt_rdma_bc_max_slots, .bc_free_rqst = xprt_rdma_bc_free_rqst, .bc_destroy = xprt_rdma_bc_destroy, #endif diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 8378f45d2da7..92ce09fcea74 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -605,6 +605,7 @@ void xprt_rdma_cleanup(void); #if defined(CONFIG_SUNRPC_BACKCHANNEL) int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int); size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *); +unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *); int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int); void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *); int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 3c2cc96afcaa..6b1fca51028a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2788,6 +2788,7 @@ static const struct rpc_xprt_ops xs_tcp_ops = { #ifdef CONFIG_SUNRPC_BACKCHANNEL .bc_setup = xprt_setup_bc, .bc_maxpayload = xs_tcp_bc_maxpayload, + .bc_num_slots = xprt_bc_max_slots, .bc_free_rqst = xprt_free_bc_rqst, .bc_destroy = xprt_destroy_bc, #endif -- cgit v1.2.3 From d5b9216fd5114be4ed98ca9c1ecc5f164cd8cf5e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 18 Jul 2019 09:32:17 -0400 Subject: pnfs/flexfiles: Add tracepoints for detecting pnfs fallback to MDS Add tracepoints to allow debugging of the event chain leading to a pnfs fallback to doing I/O through the MDS. 
Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 26 ++++++++++++ fs/nfs/nfs4trace.c | 8 ++++ fs/nfs/nfs4trace.h | 76 +++++++++++++++++++++++++++++++++- fs/nfs/pnfs.c | 2 + include/linux/nfs4.h | 1 + 5 files changed, 112 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index bcff3bf5ae09..b04e20d28162 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -934,6 +934,10 @@ out_nolseg: if (pgio->pg_error < 0) return; out_mds: + trace_pnfs_mds_fallback_pg_init_read(pgio->pg_inode, + 0, NFS4_MAX_UINT64, IOMODE_READ, + NFS_I(pgio->pg_inode)->layout, + pgio->pg_lseg); pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; nfs_pageio_reset_read_mds(pgio); @@ -1000,6 +1004,10 @@ retry: return; out_mds: + trace_pnfs_mds_fallback_pg_init_write(pgio->pg_inode, + 0, NFS4_MAX_UINT64, IOMODE_RW, + NFS_I(pgio->pg_inode)->layout, + pgio->pg_lseg); pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; nfs_pageio_reset_write_mds(pgio); @@ -1026,6 +1034,10 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, if (pgio->pg_lseg) return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg); + trace_pnfs_mds_fallback_pg_get_mirror_count(pgio->pg_inode, + 0, NFS4_MAX_UINT64, IOMODE_RW, + NFS_I(pgio->pg_inode)->layout, + pgio->pg_lseg); /* no lseg means that pnfs is not in use, so no mirroring here */ nfs_pageio_reset_write_mds(pgio); out: @@ -1075,6 +1087,10 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) hdr->args.count, (unsigned long long)hdr->args.offset); + trace_pnfs_mds_fallback_write_done(hdr->inode, + hdr->args.offset, hdr->args.count, + IOMODE_RW, NFS_I(hdr->inode)->layout, + hdr->lseg); task->tk_status = pnfs_write_done_resend_to_mds(hdr); } } @@ -1094,6 +1110,10 @@ static void ff_layout_reset_read(struct nfs_pgio_header *hdr) hdr->args.count, (unsigned long 
long)hdr->args.offset); + trace_pnfs_mds_fallback_read_done(hdr->inode, + hdr->args.offset, hdr->args.count, + IOMODE_READ, NFS_I(hdr->inode)->layout, + hdr->lseg); task->tk_status = pnfs_read_done_resend_to_mds(hdr); } } @@ -1827,6 +1847,9 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) out_failed: if (ff_layout_avoid_mds_available_ds(lseg)) return PNFS_TRY_AGAIN; + trace_pnfs_mds_fallback_read_pagelist(hdr->inode, + hdr->args.offset, hdr->args.count, + IOMODE_READ, NFS_I(hdr->inode)->layout, lseg); return PNFS_NOT_ATTEMPTED; } @@ -1892,6 +1915,9 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) out_failed: if (ff_layout_avoid_mds_available_ds(lseg)) return PNFS_TRY_AGAIN; + trace_pnfs_mds_fallback_write_pagelist(hdr->inode, + hdr->args.offset, hdr->args.count, + IOMODE_RW, NFS_I(hdr->inode)->layout, lseg); return PNFS_NOT_ATTEMPTED; } diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c index e9fb3e50a999..1a8f376b3f73 100644 --- a/fs/nfs/nfs4trace.c +++ b/fs/nfs/nfs4trace.c @@ -16,4 +16,12 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_read); EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_write); EXPORT_TRACEPOINT_SYMBOL_GPL(nfs4_pnfs_commit_ds); + +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_init_read); +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_init_write); +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_pg_get_mirror_count); +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_read_done); +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_done); +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_read_pagelist); +EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_pagelist); #endif diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index d85f20945a2b..b2f395fa7350 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -1771,6 +1771,7 @@ TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_BLOCKED); TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_INVALID_OPEN); TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_RETRY); TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); 
+TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_EXIT); #define show_pnfs_update_layout_reason(reason) \ __print_symbolic(reason, \ @@ -1786,7 +1787,8 @@ TRACE_DEFINE_ENUM(PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \ { PNFS_UPDATE_LAYOUT_INVALID_OPEN, "invalid open" }, \ { PNFS_UPDATE_LAYOUT_RETRY, "retrying" }, \ - { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" }) + { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" }, \ + { PNFS_UPDATE_LAYOUT_EXIT, "exit" }) TRACE_EVENT(pnfs_update_layout, TP_PROTO(struct inode *inode, @@ -1845,6 +1847,78 @@ TRACE_EVENT(pnfs_update_layout, ) ); +DECLARE_EVENT_CLASS(pnfs_layout_event, + TP_PROTO(struct inode *inode, + loff_t pos, + u64 count, + enum pnfs_iomode iomode, + struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg + ), + TP_ARGS(inode, pos, count, iomode, lo, lseg), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, fileid) + __field(u32, fhandle) + __field(loff_t, pos) + __field(u64, count) + __field(enum pnfs_iomode, iomode) + __field(int, layoutstateid_seq) + __field(u32, layoutstateid_hash) + __field(long, lseg) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->pos = pos; + __entry->count = count; + __entry->iomode = iomode; + if (lo != NULL) { + __entry->layoutstateid_seq = + be32_to_cpu(lo->plh_stateid.seqid); + __entry->layoutstateid_hash = + nfs_stateid_hash(&lo->plh_stateid); + } else { + __entry->layoutstateid_seq = 0; + __entry->layoutstateid_hash = 0; + } + __entry->lseg = (long)lseg; + ), + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "iomode=%s pos=%llu count=%llu " + "layoutstateid=%d:0x%08x lseg=0x%lx", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + show_pnfs_iomode(__entry->iomode), + (unsigned long long)__entry->pos, + (unsigned long long)__entry->count, + 
__entry->layoutstateid_seq, __entry->layoutstateid_hash, + __entry->lseg + ) +); + +#define DEFINE_PNFS_LAYOUT_EVENT(name) \ + DEFINE_EVENT(pnfs_layout_event, name, \ + TP_PROTO(struct inode *inode, \ + loff_t pos, \ + u64 count, \ + enum pnfs_iomode iomode, \ + struct pnfs_layout_hdr *lo, \ + struct pnfs_layout_segment *lseg \ + ), \ + TP_ARGS(inode, pos, count, iomode, lo, lseg)) + +DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_init_read); +DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_init_write); +DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_pg_get_mirror_count); +DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_read_done); +DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_write_done); +DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_read_pagelist); +DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_write_pagelist); + #endif /* CONFIG_NFS_V4_1 */ #endif /* _TRACE_NFS4_H */ diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 758917463700..75bd5b552ba4 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2037,6 +2037,8 @@ lookup_again: out_put_layout_hdr: if (first) pnfs_clear_first_layoutget(lo); + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, + PNFS_UPDATE_LAYOUT_EXIT); pnfs_put_layout_hdr(lo); out: dprintk("%s: inode %s/%llu pNFS layout segment %s for " diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 22494d170619..fd59904a282c 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -660,6 +660,7 @@ enum pnfs_update_layout_reason { PNFS_UPDATE_LAYOUT_BLOCKED, PNFS_UPDATE_LAYOUT_INVALID_OPEN, PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, + PNFS_UPDATE_LAYOUT_EXIT, }; #define NFS4_OP_MAP_NUM_LONGS \ -- cgit v1.2.3