Browse code

linux kernel: Fix HyperV IRQ issue using effective_affinity

The effective_affinity_mask is always set when an interrupt is assigned in
__assign_irq_vector() -> apic->cpu_mask_to_apicid(), e.g. for struct apic
apic_physflat: -> default_cpu_mask_to_apicid() ->
irq_data_update_effective_affinity(), but it looks like d->common->affinity
remains all-1's before user space or the kernel changes it later.

In the early allocation/initialization phase of an IRQ, we should use the
effective_affinity_mask, otherwise Hyper-V may not deliver the interrupt to
the expected CPU.

Change-Id: If1d9f53846dcb281779764aee85d790588493bdb
Reviewed-on: http://photon-jenkins.eng.vmware.com:8082/6568
Tested-by: gerrit-photon <photon-checkins@vmware.com>
Reviewed-by: Srivatsa S. Bhat <srivatsab@vmware.com>

Ajay Kaher authored on 2019/01/24 06:19:06
Showing 4 changed files
1 1
deleted file mode 100644
... ...
@@ -1,47 +0,0 @@
1
-NVME driver allocates admin and IO vector using pci_alloc_irq_vectors(). 
2
-
3
-In v4.9, if IO vector allocated using PCI_IRQ_AFFINITY, 
4
-then assignment of IO vector in vector_irq mismatches with actual assignment. 
5
-Due to which some IO INT fails to handle.
6
-
7
-If IO vector allocated without using PCI_IRQ_AFFINITY, 
8
-then all IO INT scheduled on CPU 0 and no IO INT fails.
9
-
10
-This problem only observed with Azure HyperV NVME,
11
-so skipping PCI_IRQ_AFFINITY only in this case.
12
-
13
-diff -Nurp linux-4.9.140_modified/drivers/nvme/host/pci.c linux-4.9.140/drivers/nvme/host/pci.c
14
-+++ linux-4.9.140/drivers/nvme/host/pci.c	2019-01-17 01:27:30.002509017 +0530
15
-@@ -1450,8 +1450,18 @@ static int nvme_setup_io_queues(struct n
16
- 	 * setting up the full range we need.
17
- 	 */
18
- 	pci_free_irq_vectors(pdev);
19
--	nr_io_queues = pci_alloc_irq_vectors(pdev, 1, nr_io_queues,
20
--			PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY);
21
-+
22
-+	if (pdev->vendor == PCI_VENDOR_ID_MICROSOFT &&
23
-+			pdev->device == PCI_DEVICE_ID_MICROSOFT_NVME) {
24
-+		nr_io_queues = pci_alloc_irq_vectors(pdev, 1, nr_io_queues,
25
-+				PCI_IRQ_ALL_TYPES);
26
-+		dev_warn(dev->ctrl.device, "detected MicroSoft NVMe controller, "
27
-+			"skipping PCI_IRQ_AFFINITY from pci_alloc_irq_vectors\n");
28
-+	} else {
29
-+		nr_io_queues = pci_alloc_irq_vectors(pdev, 1, nr_io_queues,
30
-+				PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY);
31
-+	}
32
-+
33
- 	if (nr_io_queues <= 0)
34
- 		return -EIO;
35
- 	dev->max_qid = nr_io_queues;
36
-diff -Nurp linux-4.9.140_modified/include/linux/pci_ids.h linux-4.9.140/include/linux/pci_ids.h
37
-+++ linux-4.9.140/include/linux/pci_ids.h	2019-01-17 01:10:34.606536165 +0530
38
-@@ -3056,4 +3056,7 @@
39
- 
40
- #define PCI_VENDOR_ID_NCUBE		0x10ff
41
- 
42
-+#define PCI_VENDOR_ID_MICROSOFT		0x1414
43
-+#define PCI_DEVICE_ID_MICROSOFT_NVME		0xb111
44
-+
45
- #endif /* _LINUX_PCI_IDS_H */
46 1
new file mode 100644
... ...
@@ -0,0 +1,72 @@
0
+From ad8b5f060aef55a55156253d93c6ac0071c6019d Mon Sep 17 00:00:00 2001
1
+From: Dexuan Cui <decui@microsoft.com>
2
+Date: Wed, 1 Nov 2017 20:30:53 +0000
3
+Subject: [PATCH 137/156] PCI: hv: Use effective affinity mask
4
+
5
+The effective_affinity_mask is always set when an interrupt is assigned in
6
+__assign_irq_vector() -> apic->cpu_mask_to_apicid(), e.g. for struct apic
7
+apic_physflat: -> default_cpu_mask_to_apicid() ->
8
+irq_data_update_effective_affinity(), but it looks d->common->affinity
9
+remains all-1's before the user space or the kernel changes it later.
10
+
11
+In the early allocation/initialization phase of an IRQ, we should use the
12
+effective_affinity_mask, otherwise Hyper-V may not deliver the interrupt to
13
+the expected CPU.  Without the patch, if we assign 7 Mellanox ConnectX-3
14
+VFs to a 32-vCPU VM, one of the VFs may fail to receive interrupts.
15
+
16
+Signed-off-by: Dexuan Cui <decui@microsoft.com>
17
+Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
18
+Reviewed-by: Jake Oshins <jakeo@microsoft.com>
19
+Cc: stable@vger.kernel.org
20
+Cc: Jork Loeser <jloeser@microsoft.com>
21
+Cc: Stephen Hemminger <sthemmin@microsoft.com>
22
+Cc: K. Y. Srinivasan <kys@microsoft.com>
23
+Signed-off-by: Ajay Kaher <akaher@vmware.com>
24
+---
25
+ drivers/pci/host/pci-hyperv.c | 8 +++++---
26
+ 1 file changed, 5 insertions(+), 3 deletions(-)
27
+
28
+--- a/drivers/pci/host/pci-hyperv.c	2017-12-05 16:55:53.278586020 -0800
29
+@@ -878,7 +878,7 @@ static void hv_irq_unmask(struct irq_dat
30
+ 	int cpu;
31
+ 	u64 res;
32
+ 
33
+-	dest = irq_data_get_affinity_mask(data);
34
++	dest = irq_data_get_effective_affinity_mask(data);
35
+ 	pdev = msi_desc_to_pci_dev(msi_desc);
36
+ 	pbus = pdev->bus;
37
+ 	hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
38
+@@ -1041,6 +1041,7 @@ static void hv_compose_msi_msg(struct ir
39
+ 	struct hv_pci_dev *hpdev;
40
+ 	struct pci_bus *pbus;
41
+ 	struct pci_dev *pdev;
42
++	struct cpumask *dest;
43
+ 	struct compose_comp_ctxt comp;
44
+ 	struct tran_int_desc *int_desc;
45
+ 	struct {
46
+@@ -1055,6 +1056,7 @@ static void hv_compose_msi_msg(struct ir
47
+ 	int ret;
48
+ 
49
+ 	pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data));
50
++	dest = irq_data_get_effective_affinity_mask(data);
51
+ 	pbus = pdev->bus;
52
+ 	hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
53
+ 	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
54
+@@ -1080,14 +1082,14 @@ static void hv_compose_msi_msg(struct ir
55
+ 	switch (pci_protocol_version) {
56
+ 	case PCI_PROTOCOL_VERSION_1_1:
57
+ 		size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1,
58
+-					irq_data_get_affinity_mask(data),
59
++					dest,
60
+ 					hpdev->desc.win_slot.slot,
61
+ 					cfg->vector);
62
+ 		break;
63
+ 
64
+ 	case PCI_PROTOCOL_VERSION_1_2:
65
+ 		size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2,
66
+-					irq_data_get_affinity_mask(data),
67
++					dest,
68
+ 					hpdev->desc.win_slot.slot,
69
+ 					cfg->vector);
70
+ 		break;
0 71
new file mode 100644
... ...
@@ -0,0 +1,54 @@
0
+From eba61d21b1ad88f9d4f63b5844f17da1dc8d8930 Mon Sep 17 00:00:00 2001
1
+From: Dexuan Cui <decui@microsoft.com>
2
+Date: Mon, 29 Oct 2018 23:18:48 -0700
3
+Subject: [PATCH] x86/irq: implement irq_data_get_effective_affinity_mask() for
4
+ v4.12.14
5
+
6
+See __assign_irq_vector():
7
+	cpumask_copy(d->domain, vector_cpumask);
8
+
9
+The function is required by Hyper-V pci-hyperv driver. See
10
+79aa801e8994 ("PCI: hv: Use effective affinity mask")
11
+
12
+Signed-off-by: Dexuan Cui <decui@microsoft.com>
13
+Signed-off-by: Ajay Kaher <akaher@vmware.com>
14
+---
15
+ arch/x86/kernel/apic/vector.c | 8 ++++++++
16
+ include/linux/irq.h           | 2 ++
17
+ 2 files changed, 10 insertions(+)
18
+
19
+diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
20
+index 26c3769..627985a 100644
21
+--- a/arch/x86/kernel/apic/vector.c
22
+@@ -69,6 +69,14 @@ struct irq_cfg *irqd_cfg(struct irq_data *irq_data)
23
+ }
24
+ EXPORT_SYMBOL_GPL(irqd_cfg);
25
+ 
26
++struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d)
27
++{
28
++	struct apic_chip_data *data = apic_chip_data(d);
29
++
30
++	return data ? data->domain : NULL;
31
++}
32
++EXPORT_SYMBOL_GPL(irq_data_get_effective_affinity_mask);
33
++
34
+ struct irq_cfg *irq_cfg(unsigned int irq)
35
+ {
36
+ 	return irqd_cfg(irq_get_irq_data(irq));
37
+diff --git a/include/linux/irq.h b/include/linux/irq.h
38
+index 45b037e..a3c8c0d 100644
39
+--- a/include/linux/irq.h
40
+@@ -695,6 +695,8 @@ static inline struct cpumask *irq_data_get_affinity_mask(struct irq_data *d)
41
+ 	return d->common->affinity;
42
+ }
43
+ 
44
++struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d);
45
++
46
+ unsigned int arch_dynirq_lower_bound(unsigned int from);
47
+ 
48
+ int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
49
+-- 
50
+2.7.4
51
+
... ...
@@ -2,7 +2,7 @@
2 2
 Summary:        Kernel
3 3
 Name:           linux
4 4
 Version:        4.9.140
5
-Release:        5%{?kat_build:.%kat_build}%{?dist}
5
+Release:        6%{?kat_build:.%kat_build}%{?dist}
6 6
 License:    	GPLv2
7 7
 URL:        	http://www.kernel.org/
8 8
 Group:        	System Environment/Kernel
... ...
@@ -71,13 +71,13 @@ Patch46:        0001-xfs-move-inode-fork-verifiers-to-xfs-dinode-verify.patch
71 71
 Patch47:        0002-xfs-verify-dinode-header-first.patch
72 72
 Patch48:        0003-xfs-enhance-dinode-verifier.patch
73 73
 
74
-#HyperV PCI patches
74
+# HyperV PCI patches to Use vPCI_protocol_version_1.2
75 75
 Patch51:        0001_PCI_hv_Allocate_physically_contiguous_hypercall_params_buffer.patch
76 76
 Patch52:        0002_PCI_hv_Add_vPCI_version_protocol_negotiation.patch
77 77
 Patch53:        0003_PCI_hv_Use_vPCI_protocol_version_1.2_v4.9.patch
78
-
79
-# NVME PCI patch
80
-Patch61:        0001_nvme_io_irq_without_affinity.patch
78
+# HyperV PCI patches to solve IRQ no handler problem
79
+Patch54:        0004-PCI-hv-Use-effective-affinity-mask.patch
80
+Patch55:        0005-x86-irq-implement-irq_data_get_effective_affinity.patch
81 81
 
82 82
 # Out-of-tree patches from AppArmor:
83 83
 Patch71: 0001-UBUNTU-SAUCE-AppArmor-basic-networking-rules.patch
... ...
@@ -201,7 +201,8 @@ This package contains the 'perf' performance analysis tools for Linux kernel.
201 201
 %patch51 -p1
202 202
 %patch52 -p1
203 203
 %patch53 -p1
204
-%patch61 -p1
204
+%patch54 -p1
205
+%patch55 -p1
205 206
 
206 207
 %patch71 -p1
207 208
 %patch72 -p1
... ...
@@ -376,6 +377,9 @@ ln -sf %{name}-%{uname_r}.cfg /boot/photon.cfg
376 376
 /usr/share/doc/*
377 377
 
378 378
 %changelog
379
+*   Wed Jan 23 2019 Ajay Kaher <akaher@vmware.com> 4.9.140-6
380
+-   Fix IRQ issue by using effective_affinity
381
+-   Remove nvme_io_irq_without_affinity.patch
379 382
 *   Thu Jan 17 2019 Ajay Kaher <akaher@vmware.com> 4.9.140-5
380 383
 -   Fix IRQ issues with NVMe on Azure.
381 384
 *   Tue Jan 15 2019 Alexey Makhalov <amakhalov@vmware.com> 4.9.140-4