From b7b54446d4170f77e66cc606e91bae2d013adb07 Mon Sep 17 00:00:00 2001
From: Kamal Charan <kcharan@vmware.com>
Date: Sat, 4 Apr 2020 08:22:29 -0700
Subject: [PATCH 03/10] Add 9p zero copy data path using crossfd

The patch combines 2 patches from photon3:
i.e. 0001-Add-9p-zero-copy-data-path-using-crossfd.patch
0001-Calculate-zerocopy-pages-with-considering-buffer-ali.patch

The 2nd patch only modifies client_dotx.[ch], so we
merge them to reduce the diff chunks.

Commit message for the 1st commit:
Add new rpc calls to 9p protocol to allow zero copy
read and write requests from the client to the server.
The client sends only the page addresses of the io
buffer where the data is to be transferred, and the
server directly does the data transfer to that buffer,
and does not send or receive the data over the transport.

These calls are allowed only over vsock transport,
where both the client and the server are on same host,
and server can transfer the data to client memory using
the crossfd feature of ESXi.

Commit message for the 2nd commit:
Author: Mounesh Badiger <badigerm@vmware.com>
Date:   Mon Mar 2 10:44:32 2020 -0800
Calculate zerocopy pages with considering buffer alignment

Max number of zerocopy pages are calculated manually without
considering input buffer alignment. this is causing 9p to send
multiple read/write request for io with non-page aligned buffers.

This change fixes issues with using iov_iter_npages() which consider
iovector considers bufer alignment and returns number of pages.

Changes made during porting from 4.19.y to 6.1.y:
Author: Albert Guo <aguo@vmware.com>
Date: Mon Oct 10 2023
1) Moved dotx specific functions to client_dotx.c to reduce the
number of diff chunks.
Also applied modifications on client.c to client_dotx.c manually.
Some function names in the commit message are also changed accordingly.
2) Renamed p9_client_write_dotx() to p9_client_write_once_dotx() so that
the naming pattern aligns with p9_client_read_once_dotx()
3) Duplicated code from p9_client_write_generic() to
p9_client_write_once_generic() so as to minimize change to original code.
4) Rename upstream implementation of p9_client_read to p9_client_read_generic
Rename upstream implementation of p9_client_write to p9_client_write_generic
5) Create a new function p9_client_read() and p9_client_write() which will be
redirected to either xxx_generic() or xxx_dotx() function.
6) Make p9_client_read_once() zero copy aware as it can be called from 9p
module directly.
7)Changed implementation of dotx_can_zc():
iter_is_iovec(iter) =>
(!iov_iter_is_kvec(iter) && !iov_iter_is_discard(iter))
8) iov_iter_get_pages2() will advance the iov, we're now need to use
iov_iter_revert() to fix the iov on error cases. p9_client_write() also use
the same mechanism.
9) Retry forever if the error is ENXIO in p9_client_read_once()
Ocationally the DOM owner may transfer for multiple times, as a result,
ENXIO may be returned for multiple times for the same IO. ENXIO is
an indication that zero copy IO is sent for remote object or
the first no zero copy IO for a local object is sent to VDFS proxy.
In this case, retrying twice won't be enough.
Now we retry for ever if the errno is ENXIO. The behavior is the same
as that in photon3.
---
 include/net/9p/9p.h     |   8 +
 include/net/9p/client.h |   6 +
 net/9p/client.c         |  66 +++++--
 net/9p/client_dotx.c    | 388 ++++++++++++++++++++++++++++++++++++++++
 net/9p/client_dotx.h    |  32 ++++
 net/9p/protocol.c       |  20 ++-
 6 files changed, 506 insertions(+), 14 deletions(-)
 create mode 100644 net/9p/client_dotx.c
 create mode 100644 net/9p/client_dotx.h

diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h
index 13abe013a..935216129 100644
--- a/include/net/9p/9p.h
+++ b/include/net/9p/9p.h
@@ -92,6 +92,10 @@ void _p9_debug(enum p9_debug_flags level, const char *func,
  * @P9_RREAD: response with data requested
  * @P9_TWRITE: reuqest to transfer data to a file
  * @P9_RWRITE: response with out much data was transferred to file
+ * @P9_TREADX: request to zero copy data from a file to user buffer
+ * @P9_RREADX: response with how much data was transferred from file
+ * @P9_TWRITEX: reuqest to zero copy data to a file from user buffer
+ * @P9_RWRITEX: response with how much data was transferred to file
  * @P9_TCLUNK: forget about a handle to an entity within the file system
  * @P9_RCLUNK: response when server has forgotten about the handle
  * @P9_TREMOVE: request to remove an entity from the hierarchy
@@ -151,6 +155,10 @@ enum p9_msg_t {
 	P9_RRENAMEAT,
 	P9_TUNLINKAT = 76,
 	P9_RUNLINKAT,
+	P9_TREADX = 96,
+	P9_RREADX,
+	P9_TWRITEX = 98,
+	P9_RWRITEX,
 	P9_TVERSION = 100,
 	P9_RVERSION,
 	P9_TAUTH = 102,
diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index 78ebcf782..e4beaac3d 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -20,12 +20,14 @@
  * @p9_proto_legacy: 9P Legacy mode, pre-9P2000.u
  * @p9_proto_2000u: 9P2000.u extension
  * @p9_proto_2000L: 9P2000.L extension
+ * @p9_proto_2000X: 9P2000.X extension      // dotx zero copy
  */
 
 enum p9_proto_versions {
 	p9_proto_legacy,
 	p9_proto_2000u,
 	p9_proto_2000L,
+	p9_proto_2000X,
 };
 
 
@@ -88,6 +90,8 @@ struct p9_req_t {
  * struct p9_client - per client instance state
  * @lock: protect @fids and @reqs
  * @msize: maximum data size negotiated by protocol
+ * @minzcpages: minimum number of pages for dotx zero copy
+ * @is_dotx_ok: whether zero copy io using dotx can be used
  * @proto_version: 9P protocol version to use
  * @trans_mod: module API instantiated with this client
  * @status: connection state
@@ -102,6 +106,8 @@ struct p9_req_t {
 struct p9_client {
 	spinlock_t lock;
 	unsigned int msize;
+	unsigned int minzcpages;
+	unsigned int is_dotx_ok;
 	unsigned char proto_version;
 	struct p9_trans_module *trans_mod;
 	enum p9_trans_status status;
diff --git a/net/9p/client.c b/net/9p/client.c
index af59c3f2e..ba8bc4dfc 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -24,6 +24,7 @@
 #include <net/9p/client.h>
 #include <net/9p/transport.h>
 #include "protocol.h"
+#include "client_dotx.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/9p.h>
@@ -39,6 +40,7 @@ enum {
 	Opt_trans,
 	Opt_legacy,
 	Opt_version,
+	Opt_minzcpages,
 	Opt_err,
 };
 
@@ -47,12 +49,14 @@ static const match_table_t tokens = {
 	{Opt_legacy, "noextend"},
 	{Opt_trans, "trans=%s"},
 	{Opt_version, "version=%s"},
+	{Opt_minzcpages, "minzcpages=%d"},
 	{Opt_err, NULL},
 };
 
 inline int p9_is_proto_dotl(struct p9_client *clnt)
 {
-	return clnt->proto_version == p9_proto_2000L;
+	return clnt->proto_version == p9_proto_2000L ||
+	       clnt->proto_version == p9_proto_2000X;
 }
 EXPORT_SYMBOL(p9_is_proto_dotl);
 
@@ -112,6 +116,9 @@ static int get_protocol_version(char *s)
 	} else if (!strcmp(s, "9p2000.L")) {
 		version = p9_proto_2000L;
 		p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.L\n");
+	} else if (!strcmp(s, "9p2000.X")) {
+		version = p9_proto_2000X;
+		p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.X\n");
 	} else {
 		pr_info("Unknown protocol version %s\n", s);
 	}
@@ -191,6 +198,16 @@ static int parse_opts(char *opts, struct p9_client *clnt)
 		case Opt_legacy:
 			clnt->proto_version = p9_proto_legacy;
 			break;
+		case Opt_minzcpages:
+			r = match_int(&args[0], &option);
+			if (r < 0 || option < 0) {
+				p9_debug(P9_DEBUG_ERROR,
+					 "integer field, but no/negative integer?\n");
+				ret = r;
+				continue;
+			}
+			clnt->minzcpages = option;
+			break;
 		case Opt_version:
 			s = match_strdup(&args[0]);
 			if (!s) {
@@ -210,6 +227,16 @@ static int parse_opts(char *opts, struct p9_client *clnt)
 			continue;
 		}
 	}
+	if (p9_is_proto_dotx(clnt) &&
+	    (!clnt->trans_mod ||
+	     strcmp(clnt->trans_mod->name, DOTX_ZC_TRANSPORT))) {
+		p9_debug(P9_DEBUG_ERROR,
+			 "dotx version requires %s transport",
+			 DOTX_ZC_TRANSPORT);
+		ret = -EINVAL;
+		goto free_and_return;
+	}
+
 
 free_and_return:
 	if (ret)
@@ -904,6 +931,10 @@ static int p9_client_version(struct p9_client *c)
 		 c->msize, c->proto_version);
 
 	switch (c->proto_version) {
+	case p9_proto_2000X:
+		req = p9_client_rpc(c, P9_TVERSION, "ds",
+					c->msize, "9P2000.X");
+		break;
 	case p9_proto_2000L:
 		req = p9_client_rpc(c, P9_TVERSION, "ds",
 				    c->msize, "9P2000.L");
@@ -931,7 +962,9 @@ static int p9_client_version(struct p9_client *c)
 	}
 
 	p9_debug(P9_DEBUG_9P, "<<< RVERSION msize %d %s\n", msize, version);
-	if (!strncmp(version, "9P2000.L", 8)) {
+	if (!strncmp(version, "9P2000.X", 8)) {
+		c->proto_version = p9_proto_2000X;
+	} else if (!strncmp(version, "9P2000.L", 8)) {
 		c->proto_version = p9_proto_2000L;
 	} else if (!strncmp(version, "9P2000.u", 8)) {
 		c->proto_version = p9_proto_2000u;
@@ -974,6 +1007,8 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
 	clnt->trans_mod = NULL;
 	clnt->trans = NULL;
 	clnt->fcall_cache = NULL;
+        clnt->is_dotx_ok = 0;
+        clnt->minzcpages = 1;
 
 	client_id = utsname()->nodename;
 	memcpy(clnt->name, client_id, strlen(client_id) + 1);
@@ -1505,8 +1540,11 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags)
 }
 EXPORT_SYMBOL(p9_client_unlinkat);
 
-int
-p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err)
+static int
+p9_client_read_once_generic(struct p9_fid *fid, u64 offset, struct iov_iter *to,
+			    int *err);
+static int
+p9_client_read_generic(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err)
 {
 	int total = 0;
 	*err = 0;
@@ -1514,7 +1552,7 @@ p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err)
 	while (iov_iter_count(to)) {
 		int count;
 
-		count = p9_client_read_once(fid, offset, to, err);
+		count = p9_client_read_once_generic(fid, offset, to, err);
 		if (!count || *err)
 			break;
 		offset += count;
@@ -1524,9 +1562,9 @@ p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err)
 }
 EXPORT_SYMBOL(p9_client_read);
 
-int
-p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to,
-		    int *err)
+static int
+p9_client_read_once_generic(struct p9_fid *fid, u64 offset, struct iov_iter *to,
+			    int *err)
 {
 	struct p9_client *clnt = fid->clnt;
 	struct p9_req_t *req;
@@ -1579,7 +1617,7 @@ p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to,
 		received = rsize;
 	}
 
-	p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count);
+	p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", received);
 
 	if (non_zc) {
 		int n = copy_to_iter(dataptr, received, to);
@@ -1597,8 +1635,8 @@ p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to,
 }
 EXPORT_SYMBOL(p9_client_read_once);
 
-int
-p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err)
+static int
+p9_client_write_generic(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err)
 {
 	struct p9_client *clnt = fid->clnt;
 	struct p9_req_t *req;
@@ -1791,7 +1829,8 @@ static int p9_client_statsize(struct p9_wstat *wst, int proto_version)
 		ret += strlen(wst->muid);
 
 	if (proto_version == p9_proto_2000u ||
-	    proto_version == p9_proto_2000L) {
+	    (proto_version == p9_proto_2000L) ||
+	    (proto_version == p9_proto_2000X)) {
 		/* extension[s] n_uid[4] n_gid[4] n_muid[4] */
 		ret += 2 + 4 + 4 + 4;
 		if (wst->extension)
@@ -2272,3 +2311,6 @@ void __exit p9_client_exit(void)
 {
 	kmem_cache_destroy(p9_req_cache);
 }
+
+#include "client_dotx.c"
+
diff --git a/net/9p/client_dotx.c b/net/9p/client_dotx.c
new file mode 100644
index 000000000..774f41d38
--- /dev/null
+++ b/net/9p/client_dotx.c
@@ -0,0 +1,388 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *  Zero copy support for VDFS (VMware Distributed File System)
+ *
+ *  Copyright (C) 2020 by Kamal Charan <kcharan@vmware.com>
+ *  Copyright (C) 2023 by Albert Guo <aguo@vmware.com>
+ *
+ */
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/uio.h>
+#define DOTX_ZC_MAXPAGES 1024
+
+
+static int
+p9_client_read_once_dotx(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err)
+{
+       struct p9_client *clnt = fid->clnt;
+       struct p9_req_t *req = NULL;
+       int received = 0;
+       int count = iov_iter_count(to);
+       int rsize;
+       size_t off;
+
+       int i, maxpages;
+       struct page **pages = NULL;
+       unsigned long *ppns = NULL;
+
+       *err = 0;
+
+       maxpages = iov_iter_npages(to, DOTX_ZC_MAXPAGES);
+
+       pages = kmalloc(sizeof(struct page *) * maxpages, GFP_KERNEL);
+       if (!pages) {
+               *err = -ENOMEM;
+               goto error;
+       }
+       ppns = kmalloc(sizeof(unsigned long) * maxpages, GFP_KERNEL);
+       if (!ppns) {
+               *err = -ENOMEM;
+               goto error;
+       }
+
+       rsize = iov_iter_get_pages2(to, pages, count, maxpages, &off);
+       if (rsize < 0) {
+               *err = rsize;
+               goto error;
+       }
+
+       for (i = 0; i < maxpages; i++) {
+               ppns[i] = page_to_pfn(pages[i]);
+       }
+
+       req = p9_client_rpc(clnt, P9_TREADX, "dqddp", fid->fid, offset, rsize,
+                           (unsigned int) off, maxpages, ppns);
+       if (IS_ERR(req)) {
+               p9_debug(P9_DEBUG_9P, "          rpc error\n");
+               *err = PTR_ERR(req);
+               goto put_pages;
+       }
+
+       *err = p9pdu_readf(&req->rc, clnt->proto_version, "d", &received);
+       if (*err) {
+	       received = 0;
+               p9_debug(P9_DEBUG_9P, "          ret err %d \n", *err);
+               trace_9p_protocol_dump(clnt, &req->rc);
+               goto put_pages;
+       }
+       if (rsize < received) {
+               pr_err("bogus RREADX count (%d > %d)\n", received, rsize);
+	       received = rsize;
+               goto put_pages;
+       }
+
+       p9_debug(P9_DEBUG_9P, "<<< DOTX RREAD  count %d\n", received);
+
+       if (received == 0) {
+               goto put_pages;
+       }
+
+put_pages:
+       if (req && !IS_ERR(req)) {
+               p9_req_put(clnt, req);
+       }
+
+       for (i = 0; i < maxpages; i++) {
+               put_page(pages[i]);
+       }
+
+error:
+       kfree(pages);
+       kfree(ppns);
+
+       /* In normal cases, this is a non-op as
+	* received == count - iov_iter_count(to)
+	* When some error like -ENXIO is returned, received is 0,
+	* but the iter of iov may have been changed by
+	* iov_iter_get_pages2(), we need to revert the iter in
+	* this case.
+	*/
+       iov_iter_revert(to, count - received - iov_iter_count(to));
+       return received;
+}
+
+/*
+ * In kernel 6.1.y, p9_client_read_once() is an exported symbol which
+ * is called from vfs_file.c for O_NONBLOCK mode, so we should make this
+ * function zero copy aware.
+ */
+
+int
+p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to,
+		    int *err)
+{
+	int count, retry = 0;
+	bool is_dotx = p9_is_proto_dotx(fid->clnt);
+	/*
+	 * The zero copy state (is_dotx_ok) may change for multiple
+	 * times, so we keep retrying if we receive ENXIO error.
+         */
+	while (++retry < 100) {
+		if (is_dotx && dotx_can_zc(to, fid->clnt)) {
+			count = p9_client_read_once_dotx(fid, offset, to, err);
+			if (*err == -ENXIO) {
+				pr_warn("%s: Disabling dotx: No zero copy "
+					"device\n", __func__);
+				fid->clnt->is_dotx_ok = 0;
+				continue;
+			} else {
+				break;
+			}
+		} else {
+			count = p9_client_read_once_generic(fid, offset, to, err);
+			if (is_dotx && *err == -ENXIO) {
+				pr_warn("%s: Enabling dotx: Zero copy device "
+					"available\n", __func__);
+				fid->clnt->is_dotx_ok = 1;
+				continue;
+			} else {
+				break;
+			}
+		}
+	}
+	return count;
+}
+
+/*
+ * This function is only called when protocol version is dotx.
+ */
+static int
+p9_client_read_dotx(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err)
+{
+	int total = 0;
+	*err = 0;
+
+	p9_debug(P9_DEBUG_9P, ">>> DOTX TREAD fid %d offset %llu count %zd\n",
+		 fid->fid, (unsigned long long) offset, iov_iter_count(to));
+
+	while (iov_iter_count(to)) {
+		int count;
+
+		if (dotx_can_zc(to, fid->clnt)) {
+			count = p9_client_read_once_dotx(fid, offset, to, err);
+			if (*err == -ENXIO) {
+				pr_warn("%s: Disabling dotx: No zero copy "
+					"device\n", __func__);
+				fid->clnt->is_dotx_ok = 0;
+				continue;
+			}
+		} else {
+			count = p9_client_read_once_generic(fid, offset, to, err);
+			if (*err == -ENXIO) {
+				pr_warn("%s: Enabling dotx: Zero copy device "
+					"available\n", __func__);
+				fid->clnt->is_dotx_ok = 1;
+				continue;
+			}
+		}
+
+		if (*err || count == 0) {
+			break;
+		}
+
+
+		total += count;
+		offset += count;
+	}
+	return total;
+}
+
+/*
+ * Note: The implementation p9_client_write_once is extracted (copied) from
+ * p9_client_write_generic(). This is to keep minmial change in the original
+ * code to avoid further merge conflict.
+ * Any change in the p9_client_write_generic() in the upstream should be applied
+ * to this function.
+ */
+
+static int
+p9_client_write_once_generic(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err)
+{
+	struct p9_client *clnt = fid->clnt;
+	struct p9_req_t *req;
+	int count = iov_iter_count(from);
+	int rsize = fid->iounit;
+	int written = 0;
+
+	if (!rsize || rsize > clnt->msize - P9_IOHDRSZ)
+		rsize = clnt->msize - P9_IOHDRSZ;
+
+	if (count < rsize)
+		rsize = count;
+
+	/* Don't bother zerocopy for small IO (< 1024) */
+	if (clnt->trans_mod->zc_request && rsize > 1024) {
+		req = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, from, 0,
+				       rsize, P9_ZC_HDR_SZ, "dqd",
+				       fid->fid, offset, rsize);
+	} else {
+		req = p9_client_rpc(clnt, P9_TWRITE, "dqV", fid->fid,
+				    offset, rsize, from);
+	}
+	if (IS_ERR(req)) {
+		iov_iter_revert(from, count - iov_iter_count(from));
+		*err = PTR_ERR(req);
+		return 0;
+	}
+
+	*err = p9pdu_readf(&req->rc, clnt->proto_version, "d", &written);
+	if (*err) {
+		iov_iter_revert(from, count - iov_iter_count(from));
+		trace_9p_protocol_dump(clnt, &req->rc);
+		p9_req_put(clnt, req);
+		return 0;
+	}
+	if (rsize < written) {
+		pr_err("bogus RWRITE count (%d > %d)\n", written, rsize);
+		written = rsize;
+	}
+
+	p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", count);
+
+	p9_req_put(clnt, req);
+	iov_iter_revert(from, count - written - iov_iter_count(from));
+	return written;
+}
+
+static int
+p9_client_write_once_dotx(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err)
+{
+       struct p9_client *clnt = fid->clnt;
+       struct p9_req_t *req = NULL;
+       int written = 0;
+       int count = iov_iter_count(from);
+       int rsize;
+       size_t off;
+
+       int i, maxpages;
+       struct page **pages = NULL;
+       unsigned long *ppns = NULL;
+
+       *err = 0;
+
+       maxpages = iov_iter_npages(from, DOTX_ZC_MAXPAGES);
+
+       pages = kmalloc(sizeof(struct page *) * maxpages, GFP_KERNEL);
+       if (!pages) {
+               *err = -ENOMEM;
+               goto error;
+       }
+       ppns = kmalloc(sizeof(unsigned long) * maxpages, GFP_KERNEL);
+       if (!ppns) {
+               *err = -ENOMEM;
+               goto error;
+       }
+       rsize = iov_iter_get_pages2(from, pages, count, maxpages, &off);
+       if (rsize < 0) {
+               *err = rsize;
+               goto error;
+       }
+
+       for (i = 0; i < maxpages; i++) {
+               ppns[i] = page_to_pfn(pages[i]);
+       }
+
+       req = p9_client_rpc(clnt, P9_TWRITEX, "dqddp", fid->fid, offset, rsize,
+                           (unsigned int) off, maxpages, ppns);
+       if (IS_ERR(req)) {
+               p9_debug(P9_DEBUG_9P, "          rpc error\n");
+               *err = PTR_ERR(req);
+               goto put_pages;
+       }
+       *err = p9pdu_readf(&req->rc, clnt->proto_version, "d", &written);
+       if (*err) {
+	       written = 0;
+               p9_debug(P9_DEBUG_9P, "          ret err %d \n", *err);
+               trace_9p_protocol_dump(clnt, &req->rc);
+               goto put_pages;
+       }
+       if (rsize < written) {
+               pr_err("bogus RWRITEX count (%d > %d)\n", written, rsize);
+	       written = rsize;
+               goto put_pages;
+       }
+
+       p9_debug(P9_DEBUG_9P, "<<< DOTX RWRITE count %d\n", written);
+put_pages:
+       if (req && !IS_ERR(req))
+               p9_req_put(clnt, req);
+
+       for (i = 0; i < maxpages; i++) {
+               put_page(pages[i]);
+       }
+error:
+       kfree(pages);
+       kfree(ppns);
+
+       /* In normal cases, this is a non-op as
+	* written == count - iov_iter_count(to)
+	* When some error like -ENXIO is returned, written is 0,
+	* but the iter of iov may have been changed by
+	* iov_iter_get_pages2(), we need to revert the iter in
+	* this case.
+	*/
+       iov_iter_revert(from, count - written - iov_iter_count(from));
+       return written;
+}
+
+/*
+ * This function is only called when protocol version is dotx
+ */
+int
+p9_client_write_dotx(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err)
+{
+	int total = 0;
+	*err = 0;
+
+	p9_debug(P9_DEBUG_9P, ">>> DOTX TWRITE fid %d offset %llu count %zd\n",
+		 fid->fid, offset, iov_iter_count(from));
+
+	while (iov_iter_count(from)) {
+		int count;
+		if (dotx_can_zc(from, fid->clnt)) {
+			count = p9_client_write_once_dotx(fid, offset, from, err);
+			if (*err == -ENXIO) {
+				pr_warn("%s: Disabling dotx: No zero copy "
+					"device\n", __func__);
+				fid->clnt->is_dotx_ok = 0;
+				continue;
+			}
+		} else {
+			count = p9_client_write_once_generic(fid, offset, from, err);
+			if (*err == -ENXIO) {
+				pr_warn("%s: Enabling dotx: Zero copy device "
+					"available\n", __func__);
+				fid->clnt->is_dotx_ok = 1;
+				continue;
+			}
+		}
+		if (*err || count == 0) {
+			break;
+		}
+
+		total += count;
+		offset += count;
+	}
+	return total;
+}
+
+int
+p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err)
+{
+	if (p9_is_proto_dotx(fid->clnt)) {
+		return p9_client_read_dotx(fid, offset, to, err);
+	} else {
+		return p9_client_read_generic(fid, offset, to, err);
+	}
+}
+
+int
+p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err)
+{
+	if (p9_is_proto_dotx(fid->clnt)) {
+		return p9_client_write_dotx(fid, offset, from, err);
+	} else {
+		return p9_client_write_generic(fid, offset, from, err);
+	}
+}
diff --git a/net/9p/client_dotx.h b/net/9p/client_dotx.h
new file mode 100644
index 000000000..727917cfe
--- /dev/null
+++ b/net/9p/client_dotx.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *  Zero copy enhancement for VDFS (VMware Distributed File System)
+ *
+ *  Copyright (C) 2020 by Kamal Charan <kcharan@vmware.com>
+ *  Copyright (C) 2023 by Albert Guo <aguo@vmware.com>
+ *
+ */
+
+#ifndef CLIENT_DOTX_H
+#define CLIENT_DOTX_H
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+
+#define DOTX_ZC_TRANSPORT "vsock"
+
+static inline int
+p9_is_proto_dotx(struct p9_client *clnt)
+{
+	return clnt->proto_version == p9_proto_2000X;
+}
+
+static inline int
+dotx_can_zc(const struct iov_iter *iter, struct p9_client *clnt)
+{
+       return (!iov_iter_is_kvec(iter) && !iov_iter_is_discard(iter)) &&
+	       clnt->is_dotx_ok &&
+	       iov_iter_count(iter) >= clnt->minzcpages * PAGE_SIZE;
+}
+
+#endif /* CLIENT_DOTX_H */
+
diff --git a/net/9p/protocol.c b/net/9p/protocol.c
index 4e3a2a1ff..a1702cef3 100644
--- a/net/9p/protocol.c
+++ b/net/9p/protocol.c
@@ -498,7 +498,8 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt,
 			break;
 		case '?':
 			if ((proto_version != p9_proto_2000u) &&
-				(proto_version != p9_proto_2000L))
+				(proto_version != p9_proto_2000L) &&
+				(proto_version != p9_proto_2000X))
 				return 0;
 			break;
 		default:
@@ -606,6 +607,20 @@ p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt,
 					errcode = -EFAULT;
 			}
 			break;
+		case 'p':{
+				int j;
+				uint32_t npages = va_arg(ap, uint32_t);
+				uint64_t *ppns = va_arg(ap, uint64_t *);
+				errcode =
+				    p9pdu_writef(pdu, proto_version, "d",
+								 npages);
+				for (j = 0; !errcode && j < npages; j++) {
+					errcode =
+					    p9pdu_writef(pdu, proto_version,
+							 "q", ppns[j]);
+				}
+			}
+			break;
 		case 'T':{
 				uint16_t nwname = va_arg(ap, int);
 				const char **wnames = va_arg(ap, const char **);
@@ -668,7 +683,8 @@ p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt,
 			break;
 		case '?':
 			if ((proto_version != p9_proto_2000u) &&
-				(proto_version != p9_proto_2000L))
+				(proto_version != p9_proto_2000L) &&
+				(proto_version != p9_proto_2000X))
 				return 0;
 			break;
 		default:
-- 
2.39.0