Browse code

Add 9p zero copy data path using crossfd

Add new RPC calls to the 9p protocol to allow zero-copy
read and write requests from the client to the server.
The client sends only the page addresses of the I/O
buffer where the data is to be transferred; the server
transfers the data directly to or from that buffer and
does not send or receive it over the transport.

These calls are allowed only over the vsock transport,
where both the client and the server are on the same host
and the server can transfer the data to client memory using
the crossfd feature of ESXi.

Change-Id: I43d35dd27c653e2a801e2e7b2d3a59af0fc06d11
Reviewed-on: http://photon-jenkins.eng.vmware.com:8082/5800
Tested-by: gerrit-photon <photon-checkins@vmware.com>
Reviewed-by: Alexey Makhalov <amakhalov@vmware.com>

Kamal Charan authored on 2018/09/26 19:34:39
Showing 3 changed files
... ...
@@ -2,7 +2,7 @@
2 2
 Summary:       Kernel
3 3
 Name:          linux-esx
4 4
 Version:       4.4.164
5
-Release:       1%{?dist}
5
+Release:       2%{?dist}
6 6
 License:       GPLv2
7 7
 URL:           http://www.kernel.org/
8 8
 Group:         System Environment/Kernel
... ...
@@ -62,6 +62,7 @@ Patch45:        0005-xfs-sanity-check-inode-di_mode.patch
62 62
 Patch46:        0006-xfs-verify-dinode-header-first.patch
63 63
 Patch47:        0007-xfs-move-inode-fork-verifiers-to-xfs_dinode_verify.patch
64 64
 Patch48:        0008-xfs-enhance-dinode-verifier.patch
65
+Patch49:        net-9p-vdfs-zerocopy.patch
65 66
 
66 67
 # For Spectre
67 68
 Patch67: 0169-x86-syscall-Clear-unused-extra-registers-on-syscall-.patch
... ...
@@ -145,6 +146,7 @@ The Linux package contains the Linux kernel doc files
145 145
 %patch46 -p1
146 146
 %patch47 -p1
147 147
 %patch48 -p1
148
+%patch49 -p1
148 149
 
149 150
 %patch67 -p1
150 151
 
... ...
@@ -235,6 +237,8 @@ ln -sf linux-%{uname_r}.cfg /boot/photon.cfg
235 235
 /usr/src/linux-headers-%{uname_r}
236 236
 
237 237
 %changelog
238
+*   Wed Dec 12 2018 Kamal Charan <kcharan@vmware.com> 4.4.164-2
239
+-   Add 9p zero copy data path using crossfd
238 240
 *   Mon Nov 26 2018 Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu> 4.4.164-1
239 241
 -   Update to version 4.4.164
240 242
 *   Wed Nov 14 2018 Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu> 4.4.163-1
... ...
@@ -2,7 +2,7 @@
2 2
 Summary:        Kernel
3 3
 Name:           linux
4 4
 Version:    	4.4.164
5
-Release:        1%{?kat_build:.%kat_build}%{?dist}
5
+Release:        2%{?kat_build:.%kat_build}%{?dist}
6 6
 License:    	GPLv2
7 7
 URL:        	http://www.kernel.org/
8 8
 Group:        	System Environment/Kernel
... ...
@@ -62,6 +62,7 @@ Patch38:        0005-xfs-sanity-check-inode-di_mode.patch
62 62
 Patch39:        0006-xfs-verify-dinode-header-first.patch
63 63
 Patch40:        0007-xfs-move-inode-fork-verifiers-to-xfs_dinode_verify.patch
64 64
 Patch41:        0008-xfs-enhance-dinode-verifier.patch
65
+Patch42:        net-9p-vdfs-zerocopy.patch
65 66
 
66 67
 # For Spectre
67 68
 Patch67: 0169-x86-syscall-Clear-unused-extra-registers-on-syscall-.patch
... ...
@@ -178,6 +179,7 @@ This package contains the 'perf' performance analysis tools for Linux kernel.
178 178
 %patch39 -p1
179 179
 %patch40 -p1
180 180
 %patch41 -p1
181
+%patch42 -p1
181 182
 
182 183
 %patch67 -p1
183 184
 
... ...
@@ -336,6 +338,8 @@ ln -sf %{name}-%{uname_r}.cfg /boot/photon.cfg
336 336
 /usr/share/perf-core
337 337
 
338 338
 %changelog
339
+*   Wed Dec 12 2018 Kamal Charan <kcharan@vmware.com> 4.4.164-2
340
+-   Add 9p zero copy data path using crossfd
339 341
 *   Mon Nov 26 2018 Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu> 4.4.164-1
340 342
 -   Update to version 4.4.164
341 343
 *   Wed Nov 14 2018 Srivatsa S. Bhat (VMware) <srivatsa@csail.mit.edu> 4.4.163-1
342 344
new file mode 100644
... ...
@@ -0,0 +1,740 @@
0
+From 606890c5871aec6f4974676404014dfd939141ec Mon Sep 17 00:00:00 2001
1
+From: Kamal Charan <kcharan@vmware.com>
2
+Date: Wed, 12 Dec 2018 02:43:51 -0800
3
+Subject: [PATCH] Add 9p zero copy data path using crossfd
4
+
5
+Add new rpc calls to 9p protocol to allow zero copy
6
+read and write requests from the client to the server.
7
+The client sends only the page addresses of the io
8
+buffer where the data is to be transferred, and the
9
+server directly does the data transfer to that buffer,
10
+and does not send or receive the data over the transport.
11
+
12
+These calls are allowed only over vsock transport,
13
+where both the client and the server are on the same host,
14
+and the server can transfer the data to client memory using
15
+the crossfd feature of ESXi.
16
+---
17
+ include/net/9p/9p.h     |   8 +
18
+ include/net/9p/client.h |   6 +
19
+ net/9p/client.c         | 469 ++++++++++++++++++++++++++++++++++++++----------
20
+ net/9p/protocol.c       |  20 ++-
21
+ 4 files changed, 405 insertions(+), 98 deletions(-)
22
+
23
+diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h
24
+index 27dfe85..48dd4c4 100644
25
+--- a/include/net/9p/9p.h
26
+@@ -109,6 +109,10 @@ void _p9_debug(enum p9_debug_flags level, const char *func,
27
+  * @P9_RREAD: response with data requested
28
+  * @P9_TWRITE: reuqest to transfer data to a file
29
+  * @P9_RWRITE: response with out much data was transferred to file
30
++ * @P9_TREADX: request to zero copy data from a file to user buffer
31
++ * @P9_RREADX: response with how much data was transferred from file
32
+ * @P9_TWRITEX: request to zero copy data to a file from user buffer
33
++ * @P9_RWRITEX: response with how much data was transferred to file
34
+  * @P9_TCLUNK: forget about a handle to an entity within the file system
35
+  * @P9_RCLUNK: response when server has forgotten about the handle
36
+  * @P9_TREMOVE: request to remove an entity from the hierarchy
37
+@@ -168,6 +172,10 @@ enum p9_msg_t {
38
+ 	P9_RRENAMEAT,
39
+ 	P9_TUNLINKAT = 76,
40
+ 	P9_RUNLINKAT,
41
++	P9_TREADX = 96,
42
++	P9_RREADX,
43
++	P9_TWRITEX = 98,
44
++	P9_RWRITEX,
45
+ 	P9_TVERSION = 100,
46
+ 	P9_RVERSION,
47
+ 	P9_TAUTH = 102,
48
+diff --git a/include/net/9p/client.h b/include/net/9p/client.h
49
+index c6b97e5..fd2c189 100644
50
+--- a/include/net/9p/client.h
51
+@@ -35,12 +35,14 @@
52
+  * @p9_proto_legacy: 9P Legacy mode, pre-9P2000.u
53
+  * @p9_proto_2000u: 9P2000.u extension
54
+  * @p9_proto_2000L: 9P2000.L extension
55
+ * @p9_proto_2000X: 9P2000.X extension (dotx zero copy)
56
+  */
57
+ 
58
+ enum p9_proto_versions{
59
+ 	p9_proto_legacy,
60
+ 	p9_proto_2000u,
61
+ 	p9_proto_2000L,
62
++	p9_proto_2000X,
63
+ };
64
+ 
65
+ 
66
+@@ -124,6 +126,8 @@ struct p9_req_t {
67
+  * struct p9_client - per client instance state
68
+  * @lock: protect @fidlist
69
+  * @msize: maximum data size negotiated by protocol
70
++ * @minzcpages: minimum number of pages for dotx zero copy
71
++ * @is_dotx_ok: whether zero copy io using dotx can be used
72
+  * @dotu: extension flags negotiated by protocol
73
+  * @proto_version: 9P protocol version to use
74
+  * @trans_mod: module API instantiated with this client
75
+@@ -152,6 +156,8 @@ struct p9_req_t {
76
+ struct p9_client {
77
+ 	spinlock_t lock; /* protect client structure */
78
+ 	unsigned int msize;
79
++	unsigned int minzcpages;
80
++	unsigned int is_dotx_ok;
81
+ 	unsigned char proto_version;
82
+ 	struct p9_trans_module *trans_mod;
83
+ 	enum p9_trans_status status;
84
+diff --git a/net/9p/client.c b/net/9p/client.c
85
+index ed8738c4..cb08602 100644
86
+--- a/net/9p/client.c
87
+@@ -25,6 +25,7 @@
88
+ 
89
+ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
90
+ 
91
++#include <linux/mm.h>
92
+ #include <linux/module.h>
93
+ #include <linux/errno.h>
94
+ #include <linux/fs.h>
95
+@@ -44,6 +45,9 @@
96
+ #define CREATE_TRACE_POINTS
97
+ #include <trace/events/9p.h>
98
+ 
99
++#define DOTX_ZC_MAXPAGES 1024
100
++#define DOTX_ZC_TRANSPORT "vsock"
101
++
102
+ /*
103
+   * Client Option Parsing (code inspired by NFS code)
104
+   *  - a little lazy - parse all client options
105
+@@ -54,6 +58,7 @@ enum {
106
+ 	Opt_trans,
107
+ 	Opt_legacy,
108
+ 	Opt_version,
109
++	Opt_minzcpages,
110
+ 	Opt_err,
111
+ };
112
+ 
113
+@@ -62,12 +67,14 @@ static const match_table_t tokens = {
114
+ 	{Opt_legacy, "noextend"},
115
+ 	{Opt_trans, "trans=%s"},
116
+ 	{Opt_version, "version=%s"},
117
++	{Opt_minzcpages, "minzcpages=%d"},
118
+ 	{Opt_err, NULL},
119
+ };
120
+ 
121
+ inline int p9_is_proto_dotl(struct p9_client *clnt)
122
+ {
123
+-	return clnt->proto_version == p9_proto_2000L;
124
++	return clnt->proto_version == p9_proto_2000L ||
125
++	       clnt->proto_version == p9_proto_2000X;
126
+ }
127
+ EXPORT_SYMBOL(p9_is_proto_dotl);
128
+ 
129
+@@ -77,6 +84,12 @@ inline int p9_is_proto_dotu(struct p9_client *clnt)
130
+ }
131
+ EXPORT_SYMBOL(p9_is_proto_dotu);
132
+ 
133
++inline int p9_is_proto_dotx(struct p9_client *clnt)
134
++{
135
++	return clnt->proto_version == p9_proto_2000X;
136
++}
137
++EXPORT_SYMBOL(p9_is_proto_dotx);
138
++
139
+ /*
140
+  * Some error codes are taken directly from the server replies,
141
+  * make sure they are valid.
142
+@@ -105,6 +118,9 @@ static int get_protocol_version(char *s)
143
+ 	} else if (!strcmp(s, "9p2000.L")) {
144
+ 		version = p9_proto_2000L;
145
+ 		p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.L\n");
146
++	} else if (!strcmp(s, "9p2000.X")) {
147
++		version = p9_proto_2000X;
148
++		p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.X\n");
149
+ 	} else
150
+ 		pr_info("Unknown protocol version %s\n", s);
151
+ 
152
+@@ -179,6 +195,16 @@ static int parse_opts(char *opts, struct p9_client *clnt)
153
+ 		case Opt_legacy:
154
+ 			clnt->proto_version = p9_proto_legacy;
155
+ 			break;
156
++		case Opt_minzcpages:
157
++			r = match_int(&args[0], &option);
158
++			if (r < 0 || option < 0) {
159
++				p9_debug(P9_DEBUG_ERROR,
160
++					 "integer field, but no/negative integer?\n");
161
++				ret = r;
162
++				continue;
163
++			}
164
++			clnt->minzcpages = option;
165
++			break;
166
+ 		case Opt_version:
167
+ 			s = match_strdup(&args[0]);
168
+ 			if (!s) {
169
+@@ -199,6 +225,16 @@ static int parse_opts(char *opts, struct p9_client *clnt)
170
+ 			continue;
171
+ 		}
172
+ 	}
173
++	if (p9_is_proto_dotx(clnt) &&
174
++	    (!clnt->trans_mod ||
175
++	     strcmp(clnt->trans_mod->name, DOTX_ZC_TRANSPORT))) {
176
++		p9_debug(P9_DEBUG_ERROR,
177
++			 "dotx version requires %s transport",
178
++			 DOTX_ZC_TRANSPORT);
179
++		ret = -EINVAL;
180
++		goto free_and_return;
181
++	}
182
++
183
+ 
184
+ free_and_return:
185
+ 	kfree(tmp_options);
186
+@@ -938,6 +974,10 @@ static int p9_client_version(struct p9_client *c)
187
+ 		 c->msize, c->proto_version);
188
+ 
189
+ 	switch (c->proto_version) {
190
++	case p9_proto_2000X:
191
++		req = p9_client_rpc(c, P9_TVERSION, "ds",
192
++					c->msize, "9P2000.X");
193
++		break;
194
+ 	case p9_proto_2000L:
195
+ 		req = p9_client_rpc(c, P9_TVERSION, "ds",
196
+ 					c->msize, "9P2000.L");
197
+@@ -965,7 +1005,9 @@ static int p9_client_version(struct p9_client *c)
198
+ 	}
199
+ 
200
+ 	p9_debug(P9_DEBUG_9P, "<<< RVERSION msize %d %s\n", msize, version);
201
+-	if (!strncmp(version, "9P2000.L", 8))
202
++	if (!strncmp(version, "9P2000.X", 8))
203
++		c->proto_version = p9_proto_2000X;
204
++	else if (!strncmp(version, "9P2000.L", 8))
205
+ 		c->proto_version = p9_proto_2000L;
206
+ 	else if (!strncmp(version, "9P2000.u", 8))
207
+ 		c->proto_version = p9_proto_2000u;
208
+@@ -999,6 +1041,8 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
209
+ 
210
+ 	clnt->trans_mod = NULL;
211
+ 	clnt->trans = NULL;
212
++	clnt->is_dotx_ok = 0;
213
++	clnt->minzcpages = 1;
214
+ 
215
+ 	client_id = utsname()->nodename;
216
+ 	memcpy(clnt->name, client_id, strlen(client_id) + 1);
217
+@@ -1534,136 +1578,368 @@ error:
218
+ }
219
+ EXPORT_SYMBOL(p9_client_unlinkat);
220
+ 
221
++static int
222
++dotx_can_zc(const struct iov_iter *iter, struct p9_client *clnt)
223
++{
224
++	return iter_is_iovec(iter) &&
225
++	       clnt->is_dotx_ok &&
226
++	       iov_iter_count(iter) >= clnt->minzcpages * PAGE_SIZE;
227
++}
228
++
229
++static int
230
++p9_client_read_nodotx(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err)
231
++{
232
++	struct p9_client *clnt = fid->clnt;
233
++	struct p9_req_t *req;
234
++	int total = 0;
235
++	int count = iov_iter_count(to);
236
++	int rsize, non_zc = 0;
237
++	char *dataptr;
238
++
239
++	*err = 0;
240
++
241
++	rsize = fid->iounit;
242
++	if (!rsize || rsize > clnt->msize-P9_IOHDRSZ)
243
++		rsize = clnt->msize - P9_IOHDRSZ;
244
++
245
++	if (count < rsize)
246
++		rsize = count;
247
++
248
++	/* Don't bother zerocopy for small IO (< 1024) */
249
++	if (clnt->trans_mod->zc_request && rsize > 1024) {
250
++		/*
251
++		 * response header len is 11
252
++		 * PDU Header(7) + IO Size (4)
253
++		 */
254
++		req = p9_client_zc_rpc(clnt, P9_TREAD, to, NULL, rsize,
255
++				       0, 11, "dqd", fid->fid, offset, rsize);
256
++	} else {
257
++		non_zc = 1;
258
++		req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset,
259
++				    rsize);
260
++	}
261
++
262
++	if (IS_ERR(req)) {
263
++		p9_debug(P9_DEBUG_9P, "          rpc error\n");
264
++		*err = PTR_ERR(req);
265
++		return 0;
266
++	}
267
++
268
++	*err = p9pdu_readf(req->rc, clnt->proto_version, "D", &count, &dataptr);
269
++	if (*err) {
270
++		p9_debug(P9_DEBUG_9P, "          ret err %d \n", *err);
271
++		trace_9p_protocol_dump(clnt, req->rc);
272
++		goto error;
273
++	}
274
++	if (rsize < count) {
275
++		pr_err("bogus RREAD count (%d > %d)\n", count, rsize);
276
++		count = rsize;
277
++	}
278
++
279
++	if (count == 0)
280
++		goto error;
281
++
282
++	if (non_zc) {
283
++		int n = copy_to_iter(dataptr, count, to);
284
++		if (n != count) {
285
++			*err = -EFAULT;
286
++			total = n;
287
++			goto error;
288
++		}
289
++	} else {
290
++		iov_iter_advance(to, count);
291
++	}
292
++	total = count;
293
++
294
++error:
295
++	p9_free_req(clnt, req);
296
++	return total;
297
++}
298
++
299
++static int
300
++p9_client_read_dotx(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err)
301
++{
302
++	struct p9_client *clnt = fid->clnt;
303
++	struct p9_req_t *req = NULL;
304
++	int total = 0;
305
++	int count = iov_iter_count(to);
306
++	int rsize;
307
++	size_t off;
308
++
309
++	int i, maxpages, npages = 0;
310
++	struct page **pages = NULL;
311
++	unsigned long *ppns = NULL;
312
++
313
++	*err = 0;
314
++
315
++	maxpages = DIV_ROUND_UP(count, PAGE_SIZE);
316
++	maxpages = min(DOTX_ZC_MAXPAGES, maxpages);
317
++
318
++	pages = kmalloc(sizeof(struct page *) * maxpages, GFP_KERNEL);
319
++	if (!pages) {
320
++		*err = -ENOMEM;
321
++		goto error;
322
++	}
323
++	ppns = kmalloc(sizeof(unsigned long) * maxpages, GFP_KERNEL);
324
++	if (!ppns) {
325
++		*err = -ENOMEM;
326
++		goto error;
327
++	}
328
++
329
++	rsize = iov_iter_get_pages(to, pages, count, maxpages, &off);
330
++	if (rsize < 0) {
331
++		*err = rsize;
332
++		goto error;
333
++	}
334
++
335
++	npages = DIV_ROUND_UP(off + rsize, PAGE_SIZE);
336
++	for (i = 0; i < npages; i++) {
337
++		ppns[i] = page_to_pfn(pages[i]);
338
++	}
339
++
340
++	req = p9_client_rpc(clnt, P9_TREADX, "dqddp", fid->fid, offset, rsize,
341
++			    (unsigned int) off, npages, ppns);
342
++	if (IS_ERR(req)) {
343
++		p9_debug(P9_DEBUG_9P, "          rpc error\n");
344
++		*err = PTR_ERR(req);
345
++		goto error;
346
++	}
347
++
348
++	*err = p9pdu_readf(req->rc, clnt->proto_version, "d", &count);
349
++	if (*err) {
350
++		p9_debug(P9_DEBUG_9P, "          ret err %d \n", *err);
351
++		trace_9p_protocol_dump(clnt, req->rc);
352
++		goto error;
353
++	}
354
++	if (rsize < count) {
355
++		pr_err("bogus RREADX count (%d > %d)\n", count, rsize);
356
++		goto error;
357
++	}
358
++
359
++	if (count == 0)
360
++		goto error;
361
++
362
++	iov_iter_advance(to, count);
363
++	total = count;
364
++
365
++error:
366
++	if (req && !IS_ERR(req))
367
++		p9_free_req(clnt, req);
368
++
369
++	for (i = 0; i < npages; i++) {
370
++		put_page(pages[i]);
371
++	}
372
++
373
++	kfree(pages);
374
++	kfree(ppns);
375
++
376
++	return total;
377
++}
378
++
379
+ int
380
+ p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err)
381
+ {
382
+-	struct p9_client *clnt = fid->clnt;
383
+-	struct p9_req_t *req;
384
+ 	int total = 0;
385
++	int is_dotx = p9_is_proto_dotx(fid->clnt);
386
++
387
+ 	*err = 0;
388
+ 
389
+-	p9_debug(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %d\n",
390
+-		   fid->fid, (unsigned long long) offset, (int)iov_iter_count(to));
391
++	p9_debug(P9_DEBUG_9P, ">>> TREAD  fid %d offset %llu count %zd\n",
392
++		 fid->fid, (unsigned long long) offset, iov_iter_count(to));
393
+ 
394
+ 	while (iov_iter_count(to)) {
395
+-		int count = iov_iter_count(to);
396
+-		int rsize, non_zc = 0;
397
+-		char *dataptr;
398
+-			
399
+-		rsize = fid->iounit;
400
+-		if (!rsize || rsize > clnt->msize-P9_IOHDRSZ)
401
+-			rsize = clnt->msize - P9_IOHDRSZ;
402
++		int count;
403
+ 
404
+-		if (count < rsize)
405
+-			rsize = count;
406
+-
407
+-		/* Don't bother zerocopy for small IO (< 1024) */
408
+-		if (clnt->trans_mod->zc_request && rsize > 1024) {
409
+-			/*
410
+-			 * response header len is 11
411
+-			 * PDU Header(7) + IO Size (4)
412
+-			 */
413
+-			req = p9_client_zc_rpc(clnt, P9_TREAD, to, NULL, rsize,
414
+-					       0, 11, "dqd", fid->fid,
415
+-					       offset, rsize);
416
++		if (is_dotx && dotx_can_zc(to, fid->clnt)) {
417
++			count = p9_client_read_dotx(fid, offset, to, err);
418
++			if (*err == -ENXIO) {
419
++				pr_warn("Disabling dotx: No zero copy device\n");
420
++				fid->clnt->is_dotx_ok = 0;
421
++				continue;
422
++			}
423
+ 		} else {
424
+-			non_zc = 1;
425
+-			req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset,
426
+-					    rsize);
427
+-		}
428
+-		if (IS_ERR(req)) {
429
+-			*err = PTR_ERR(req);
430
+-			break;
431
++			count = p9_client_read_nodotx(fid, offset, to, err);
432
++			if (is_dotx && *err == -ENXIO) {
433
++				pr_warn("Enabling dotx: Zero copy device available\n");
434
++				fid->clnt->is_dotx_ok = 1;
435
++				continue;
436
++			}
437
+ 		}
438
+ 
439
+-		*err = p9pdu_readf(req->rc, clnt->proto_version,
440
+-				   "D", &count, &dataptr);
441
+-		if (*err) {
442
+-			trace_9p_protocol_dump(clnt, req->rc);
443
+-			p9_free_req(clnt, req);
444
++		if (*err || count == 0)
445
+ 			break;
446
+-		}
447
+-		if (rsize < count) {
448
+-			pr_err("bogus RREAD count (%d > %d)\n", count, rsize);
449
+-			count = rsize;
450
+-		}
451
+ 
452
+-		p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count);
453
+-		if (!count) {
454
+-			p9_free_req(clnt, req);
455
+-			break;
456
+-		}
457
++		p9_debug(P9_DEBUG_9P, "<<< RREAD  count %d\n", count);
458
+ 
459
+-		if (non_zc) {
460
+-			int n = copy_to_iter(dataptr, count, to);
461
+-			total += n;
462
+-			offset += n;
463
+-			if (n != count) {
464
+-				*err = -EFAULT;
465
+-				p9_free_req(clnt, req);
466
+-				break;
467
+-			}
468
+-		} else {
469
+-			iov_iter_advance(to, count);
470
+-			total += count;
471
+-			offset += count;
472
+-		}
473
+-		p9_free_req(clnt, req);
474
++		total += count;
475
++		offset += count;
476
+ 	}
477
+ 	return total;
478
+ }
479
+ EXPORT_SYMBOL(p9_client_read);
480
+ 
481
++static int
482
++p9_client_write_nodotx(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err)
483
++{
484
++	struct p9_client *clnt = fid->clnt;
485
++	struct p9_req_t *req;
486
++	int total = 0;
487
++	int count = iov_iter_count(from);
488
++	int rsize;
489
++
490
++	*err = 0;
491
++
492
++	rsize = fid->iounit;
493
++	if (!rsize || rsize > clnt->msize-P9_IOHDRSZ)
494
++		rsize = clnt->msize - P9_IOHDRSZ;
495
++
496
++	if (count < rsize)
497
++		rsize = count;
498
++
499
++	/* Don't bother zerocopy for small IO (< 1024) */
500
++	if (clnt->trans_mod->zc_request && rsize > 1024) {
501
++		req = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, from, 0, rsize,
502
++				       P9_ZC_HDR_SZ, "dqd", fid->fid, offset,
503
++				       rsize);
504
++	} else {
505
++		req = p9_client_rpc(clnt, P9_TWRITE, "dqV", fid->fid, offset,
506
++				    rsize, from);
507
++	}
508
++	if (IS_ERR(req)) {
509
++		p9_debug(P9_DEBUG_9P, "          rpc error\n");
510
++		*err = PTR_ERR(req);
511
++		return 0;
512
++	}
513
++
514
++	*err = p9pdu_readf(req->rc, clnt->proto_version, "d", &count);
515
++	if (*err) {
516
++		p9_debug(P9_DEBUG_9P, "          ret err %d \n", *err);
517
++		trace_9p_protocol_dump(clnt, req->rc);
518
++		goto error;
519
++	}
520
++	if (rsize < count) {
521
++		pr_err("bogus RWRITE count (%d > %d)\n", count, rsize);
522
++		count = rsize;
523
++	}
524
++
525
++	iov_iter_advance(from, count);
526
++	total = count;
527
++error:
528
++	p9_free_req(clnt, req);
529
++
530
++	return total;
531
++}
532
++
533
++static int
534
++p9_client_write_dotx(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err)
535
++{
536
++	struct p9_client *clnt = fid->clnt;
537
++	struct p9_req_t *req = NULL;
538
++	int total = 0;
539
++	int count = iov_iter_count(from);
540
++	int rsize;
541
++	size_t off;
542
++
543
++	int i, maxpages, npages = 0;
544
++	struct page **pages = NULL;
545
++	unsigned long *ppns = NULL;
546
++
547
++	*err = 0;
548
++
549
++	maxpages = DIV_ROUND_UP(count, PAGE_SIZE);
550
++	maxpages = min(DOTX_ZC_MAXPAGES, maxpages);
551
++
552
++	pages = kmalloc(sizeof(struct page *) * maxpages, GFP_KERNEL);
553
++	if (!pages) {
554
++		*err = -ENOMEM;
555
++		goto error;
556
++	}
557
++	ppns = kmalloc(sizeof(unsigned long) * maxpages, GFP_KERNEL);
558
++	if (!ppns) {
559
++		*err = -ENOMEM;
560
++		goto error;
561
++	}
562
++	rsize = iov_iter_get_pages(from, pages, count, maxpages, &off);
563
++	if (rsize < 0) {
564
++		*err = rsize;
565
++		goto error;
566
++	}
567
++
568
++	npages = DIV_ROUND_UP(off + rsize, PAGE_SIZE);
569
++	for (i = 0; i < npages; i++) {
570
++		ppns[i] = page_to_pfn(pages[i]);
571
++	}
572
++
573
++	req = p9_client_rpc(clnt, P9_TWRITEX, "dqddp", fid->fid, offset, rsize,
574
++			    (unsigned int) off, npages, ppns);
575
++	if (IS_ERR(req)) {
576
++		p9_debug(P9_DEBUG_9P, "          rpc error\n");
577
++		*err = PTR_ERR(req);
578
++		goto error;
579
++	}
580
++
581
++	*err = p9pdu_readf(req->rc, clnt->proto_version, "d", &count);
582
++	if (*err) {
583
++		p9_debug(P9_DEBUG_9P, "          ret err %d \n", *err);
584
++		trace_9p_protocol_dump(clnt, req->rc);
585
++		goto error;
586
++	}
587
++	if (rsize < count) {
588
++		pr_err("bogus RWRITEX count (%d > %d)\n", count, rsize);
589
++		goto error;
590
++	}
591
++
592
++	iov_iter_advance(from, count);
593
++	total = count;
594
++error:
595
++	if (req && !IS_ERR(req))
596
++		p9_free_req(clnt, req);
597
++
598
++	for (i = 0; i < npages; i++) {
599
++		put_page(pages[i]);
600
++	}
601
++
602
++	kfree(pages);
603
++	kfree(ppns);
604
++
605
++	return total;
606
++}
607
++
608
+ int
609
+ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err)
610
+ {
611
+-	struct p9_client *clnt = fid->clnt;
612
+-	struct p9_req_t *req;
613
+ 	int total = 0;
614
++	int is_dotx = p9_is_proto_dotx(fid->clnt);
615
++
616
+ 	*err = 0;
617
+ 
618
+ 	p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %zd\n",
619
+-				fid->fid, (unsigned long long) offset,
620
+-				iov_iter_count(from));
621
++		 fid->fid, (unsigned long long) offset, iov_iter_count(from));
622
+ 
623
+ 	while (iov_iter_count(from)) {
624
+-		int count = iov_iter_count(from);
625
+-		int rsize = fid->iounit;
626
+-		if (!rsize || rsize > clnt->msize-P9_IOHDRSZ)
627
+-			rsize = clnt->msize - P9_IOHDRSZ;
628
++		int count;
629
+ 
630
+-		if (count < rsize)
631
+-			rsize = count;
632
+-
633
+-		/* Don't bother zerocopy for small IO (< 1024) */
634
+-		if (clnt->trans_mod->zc_request && rsize > 1024) {
635
+-			req = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, from, 0,
636
+-					       rsize, P9_ZC_HDR_SZ, "dqd",
637
+-					       fid->fid, offset, rsize);
638
++		if (is_dotx && dotx_can_zc(from, fid->clnt)) {
639
++			count = p9_client_write_dotx(fid, offset, from, err);
640
++			if (*err == -ENXIO) {
641
++				pr_warn("Disabling dotx: No zero copy device\n");
642
++				fid->clnt->is_dotx_ok = 0;
643
++				continue;
644
++			}
645
+ 		} else {
646
+-			req = p9_client_rpc(clnt, P9_TWRITE, "dqV", fid->fid,
647
+-						    offset, rsize, from);
648
+-		}
649
+-		if (IS_ERR(req)) {
650
+-			*err = PTR_ERR(req);
651
+-			break;
652
++			count = p9_client_write_nodotx(fid, offset, from, err);
653
++			if (is_dotx && *err == -ENXIO) {
654
++				pr_warn("Enabling dotx: Zero copy device available\n");
655
++				fid->clnt->is_dotx_ok = 1;
656
++				continue;
657
++			}
658
+ 		}
659
+ 
660
+-		*err = p9pdu_readf(req->rc, clnt->proto_version, "d", &count);
661
+-		if (*err) {
662
+-			trace_9p_protocol_dump(clnt, req->rc);
663
+-			p9_free_req(clnt, req);
664
++		if (*err || count == 0)
665
+ 			break;
666
+-		}
667
+-		if (rsize < count) {
668
+-			pr_err("bogus RWRITE count (%d > %d)\n", count, rsize);
669
+-			count = rsize;
670
+-		}
671
+ 
672
+ 		p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", count);
673
+ 
674
+-		p9_free_req(clnt, req);
675
+-		iov_iter_advance(from, count);
676
+ 		total += count;
677
+ 		offset += count;
678
+ 	}
679
+@@ -1803,7 +2079,8 @@ static int p9_client_statsize(struct p9_wstat *wst, int proto_version)
680
+ 		ret += strlen(wst->muid);
681
+ 
682
+ 	if ((proto_version == p9_proto_2000u) ||
683
+-		(proto_version == p9_proto_2000L)) {
684
++		(proto_version == p9_proto_2000L) ||
685
++		(proto_version == p9_proto_2000X)) {
686
+ 		ret += 2+4+4+4;	/* extension[s] n_uid[4] n_gid[4] n_muid[4] */
687
+ 		if (wst->extension)
688
+ 			ret += strlen(wst->extension);
689
+diff --git a/net/9p/protocol.c b/net/9p/protocol.c
690
+index 145f805..62b0cf9 100644
691
+--- a/net/9p/protocol.c
692
+@@ -346,7 +346,8 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt,
693
+ 			break;
694
+ 		case '?':
695
+ 			if ((proto_version != p9_proto_2000u) &&
696
+-				(proto_version != p9_proto_2000L))
697
++				(proto_version != p9_proto_2000L) &&
698
++				(proto_version != p9_proto_2000X))
699
+ 				return 0;
700
+ 			break;
701
+ 		default:
702
+@@ -454,6 +455,20 @@ p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt,
703
+ 					errcode = -EFAULT;
704
+ 			}
705
+ 			break;
706
++		case 'p':{
707
++				int j;
708
++				uint32_t npages = va_arg(ap, uint32_t);
709
++				uint64_t *ppns = va_arg(ap, uint64_t *);
710
++				errcode =
711
++				    p9pdu_writef(pdu, proto_version, "d",
712
++								 npages);
713
++				for (j = 0; !errcode && j < npages; j++) {
714
++					errcode =
715
++					    p9pdu_writef(pdu, proto_version,
716
++							 "q", ppns[j]);
717
++				}
718
++			}
719
++			break;
720
+ 		case 'T':{
721
+ 				uint16_t nwname = va_arg(ap, int);
722
+ 				const char **wnames = va_arg(ap, const char **);
723
+@@ -516,7 +531,8 @@ p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt,
724
+ 			break;
725
+ 		case '?':
726
+ 			if ((proto_version != p9_proto_2000u) &&
727
+-				(proto_version != p9_proto_2000L))
728
++				(proto_version != p9_proto_2000L) &&
729
++				(proto_version != p9_proto_2000X))
730
+ 				return 0;
731
+ 			break;
732
+ 		default:
733
+-- 
734
+2.6.2
735
+