The effective_affinity_mask is always set when an interrupt is assigned in
__assign_irq_vector() -> apic->cpu_mask_to_apicid(), e.g. for struct apic
apic_physflat: -> default_cpu_mask_to_apicid() ->
irq_data_update_effective_affinity(), but it looks like d->common->affinity
remains all-1's until user space or the kernel changes it later.
In the early allocation/initialization phase of an IRQ, we should use the
effective_affinity_mask, otherwise Hyper-V may not deliver the interrupt to
the expected CPU.
Change-Id: If1d9f53846dcb281779764aee85d790588493bdb
Reviewed-on: http://photon-jenkins.eng.vmware.com:8082/6568
Tested-by: gerrit-photon <photon-checkins@vmware.com>
Reviewed-by: Srivatsa S. Bhat <srivatsab@vmware.com>
| 1 | 1 |
deleted file mode 100644 |
| ... | ... |
@@ -1,47 +0,0 @@ |
| 1 |
-NVME driver allocates admin and IO vector using pci_alloc_irq_vectors(). |
|
| 2 |
- |
|
| 3 |
-In v4.9, if IO vector allocated using PCI_IRQ_AFFINITY, |
|
| 4 |
-then assignment of IO vector in vector_irq mismatches with actual assignment. |
|
| 5 |
-Due to which some IO INT fails to handle. |
|
| 6 |
- |
|
| 7 |
-If IO vector allocated without using PCI_IRQ_AFFINITY, |
|
| 8 |
-then all IO INT scheduled on CPU 0 and no IO INT fails. |
|
| 9 |
- |
|
| 10 |
-This problem only observed with Azure HyperV NVME, |
|
| 11 |
-so skipping PCI_IRQ_AFFINITY only in this case. |
|
| 12 |
- |
|
| 13 |
-diff -Nurp linux-4.9.140_modified/drivers/nvme/host/pci.c linux-4.9.140/drivers/nvme/host/pci.c |
|
| 14 |
-+++ linux-4.9.140/drivers/nvme/host/pci.c 2019-01-17 01:27:30.002509017 +0530 |
|
| 15 |
-@@ -1450,8 +1450,18 @@ static int nvme_setup_io_queues(struct n |
|
| 16 |
- * setting up the full range we need. |
|
| 17 |
- */ |
|
| 18 |
- pci_free_irq_vectors(pdev); |
|
| 19 |
-- nr_io_queues = pci_alloc_irq_vectors(pdev, 1, nr_io_queues, |
|
| 20 |
-- PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY); |
|
| 21 |
-+ |
|
| 22 |
-+ if (pdev->vendor == PCI_VENDOR_ID_MICROSOFT && |
|
| 23 |
-+ pdev->device == PCI_DEVICE_ID_MICROSOFT_NVME) {
|
|
| 24 |
-+ nr_io_queues = pci_alloc_irq_vectors(pdev, 1, nr_io_queues, |
|
| 25 |
-+ PCI_IRQ_ALL_TYPES); |
|
| 26 |
-+ dev_warn(dev->ctrl.device, "detected MicroSoft NVMe controller, " |
|
| 27 |
-+ "skipping PCI_IRQ_AFFINITY from pci_alloc_irq_vectors\n"); |
|
| 28 |
-+ } else {
|
|
| 29 |
-+ nr_io_queues = pci_alloc_irq_vectors(pdev, 1, nr_io_queues, |
|
| 30 |
-+ PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY); |
|
| 31 |
-+ } |
|
| 32 |
-+ |
|
| 33 |
- if (nr_io_queues <= 0) |
|
| 34 |
- return -EIO; |
|
| 35 |
- dev->max_qid = nr_io_queues; |
|
| 36 |
-diff -Nurp linux-4.9.140_modified/include/linux/pci_ids.h linux-4.9.140/include/linux/pci_ids.h |
|
| 37 |
-+++ linux-4.9.140/include/linux/pci_ids.h 2019-01-17 01:10:34.606536165 +0530 |
|
| 38 |
-@@ -3056,4 +3056,7 @@ |
|
| 39 |
- |
|
| 40 |
- #define PCI_VENDOR_ID_NCUBE 0x10ff |
|
| 41 |
- |
|
| 42 |
-+#define PCI_VENDOR_ID_MICROSOFT 0x1414 |
|
| 43 |
-+#define PCI_DEVICE_ID_MICROSOFT_NVME 0xb111 |
|
| 44 |
-+ |
|
| 45 |
- #endif /* _LINUX_PCI_IDS_H */ |
| 46 | 1 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,72 @@ |
| 0 |
+From ad8b5f060aef55a55156253d93c6ac0071c6019d Mon Sep 17 00:00:00 2001 |
|
| 1 |
+From: Dexuan Cui <decui@microsoft.com> |
|
| 2 |
+Date: Wed, 1 Nov 2017 20:30:53 +0000 |
|
| 3 |
+Subject: [PATCH 137/156] PCI: hv: Use effective affinity mask |
|
| 4 |
+ |
|
| 5 |
+The effective_affinity_mask is always set when an interrupt is assigned in |
|
| 6 |
+__assign_irq_vector() -> apic->cpu_mask_to_apicid(), e.g. for struct apic |
|
| 7 |
+apic_physflat: -> default_cpu_mask_to_apicid() -> |
|
| 8 |
+irq_data_update_effective_affinity(), but it looks d->common->affinity |
|
| 9 |
+remains all-1's before the user space or the kernel changes it later. |
|
| 10 |
+ |
|
| 11 |
+In the early allocation/initialization phase of an IRQ, we should use the |
|
| 12 |
+effective_affinity_mask, otherwise Hyper-V may not deliver the interrupt to |
|
| 13 |
+the expected CPU. Without the patch, if we assign 7 Mellanox ConnectX-3 |
|
| 14 |
+VFs to a 32-vCPU VM, one of the VFs may fail to receive interrupts. |
|
| 15 |
+ |
|
| 16 |
+Signed-off-by: Dexuan Cui <decui@microsoft.com> |
|
| 17 |
+Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> |
|
| 18 |
+Reviewed-by: Jake Oshins <jakeo@microsoft.com> |
|
| 19 |
+Cc: stable@vger.kernel.org |
|
| 20 |
+Cc: Jork Loeser <jloeser@microsoft.com> |
|
| 21 |
+Cc: Stephen Hemminger <sthemmin@microsoft.com> |
|
| 22 |
+Cc: K. Y. Srinivasan <kys@microsoft.com> |
|
| 23 |
+Signed-off-by: Ajay Kaher <akaher@vmware.com> |
|
| 24 |
+--- |
|
| 25 |
+ drivers/pci/host/pci-hyperv.c | 8 +++++--- |
|
| 26 |
+ 1 file changed, 5 insertions(+), 3 deletions(-) |
|
| 27 |
+ |
|
| 28 |
+--- a/drivers/pci/host/pci-hyperv.c 2017-12-05 16:55:53.278586020 -0800 |
|
| 29 |
+@@ -878,7 +878,7 @@ static void hv_irq_unmask(struct irq_dat |
|
| 30 |
+ int cpu; |
|
| 31 |
+ u64 res; |
|
| 32 |
+ |
|
| 33 |
+- dest = irq_data_get_affinity_mask(data); |
|
| 34 |
++ dest = irq_data_get_effective_affinity_mask(data); |
|
| 35 |
+ pdev = msi_desc_to_pci_dev(msi_desc); |
|
| 36 |
+ pbus = pdev->bus; |
|
| 37 |
+ hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); |
|
| 38 |
+@@ -1041,6 +1041,7 @@ static void hv_compose_msi_msg(struct ir |
|
| 39 |
+ struct hv_pci_dev *hpdev; |
|
| 40 |
+ struct pci_bus *pbus; |
|
| 41 |
+ struct pci_dev *pdev; |
|
| 42 |
++ struct cpumask *dest; |
|
| 43 |
+ struct compose_comp_ctxt comp; |
|
| 44 |
+ struct tran_int_desc *int_desc; |
|
| 45 |
+ struct {
|
|
| 46 |
+@@ -1055,6 +1056,7 @@ static void hv_compose_msi_msg(struct ir |
|
| 47 |
+ int ret; |
|
| 48 |
+ |
|
| 49 |
+ pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data)); |
|
| 50 |
++ dest = irq_data_get_effective_affinity_mask(data); |
|
| 51 |
+ pbus = pdev->bus; |
|
| 52 |
+ hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); |
|
| 53 |
+ hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); |
|
| 54 |
+@@ -1080,14 +1082,14 @@ static void hv_compose_msi_msg(struct ir |
|
| 55 |
+ switch (pci_protocol_version) {
|
|
| 56 |
+ case PCI_PROTOCOL_VERSION_1_1: |
|
| 57 |
+ size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1, |
|
| 58 |
+- irq_data_get_affinity_mask(data), |
|
| 59 |
++ dest, |
|
| 60 |
+ hpdev->desc.win_slot.slot, |
|
| 61 |
+ cfg->vector); |
|
| 62 |
+ break; |
|
| 63 |
+ |
|
| 64 |
+ case PCI_PROTOCOL_VERSION_1_2: |
|
| 65 |
+ size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2, |
|
| 66 |
+- irq_data_get_affinity_mask(data), |
|
| 67 |
++ dest, |
|
| 68 |
+ hpdev->desc.win_slot.slot, |
|
| 69 |
+ cfg->vector); |
|
| 70 |
+ break; |
| 0 | 71 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,54 @@ |
| 0 |
+From eba61d21b1ad88f9d4f63b5844f17da1dc8d8930 Mon Sep 17 00:00:00 2001 |
|
| 1 |
+From: Dexuan Cui <decui@microsoft.com> |
|
| 2 |
+Date: Mon, 29 Oct 2018 23:18:48 -0700 |
|
| 3 |
+Subject: [PATCH] x86/irq: implement irq_data_get_effective_affinity_mask() for |
|
| 4 |
+ v4.12.14 |
|
| 5 |
+ |
|
| 6 |
+See __assign_irq_vector(): |
|
| 7 |
+ cpumask_copy(d->domain, vector_cpumask); |
|
| 8 |
+ |
|
| 9 |
+The function is required by Hyper-V pci-hyperv driver. See |
|
| 10 |
+79aa801e8994 ("PCI: hv: Use effective affinity mask")
|
|
| 11 |
+ |
|
| 12 |
+Signed-off-by: Dexuan Cui <decui@microsoft.com> |
|
| 13 |
+Signed-off-by: Ajay Kaher <akaher@vmware.com> |
|
| 14 |
+--- |
|
| 15 |
+ arch/x86/kernel/apic/vector.c | 8 ++++++++ |
|
| 16 |
+ include/linux/irq.h | 2 ++ |
|
| 17 |
+ 2 files changed, 10 insertions(+) |
|
| 18 |
+ |
|
| 19 |
+diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c |
|
| 20 |
+index 26c3769..627985a 100644 |
|
| 21 |
+--- a/arch/x86/kernel/apic/vector.c |
|
| 22 |
+@@ -69,6 +69,14 @@ struct irq_cfg *irqd_cfg(struct irq_data *irq_data) |
|
| 23 |
+ } |
|
| 24 |
+ EXPORT_SYMBOL_GPL(irqd_cfg); |
|
| 25 |
+ |
|
| 26 |
++struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) |
|
| 27 |
++{
|
|
| 28 |
++ struct apic_chip_data *data = apic_chip_data(d); |
|
| 29 |
++ |
|
| 30 |
++ return data ? data->domain : NULL; |
|
| 31 |
++} |
|
| 32 |
++EXPORT_SYMBOL_GPL(irq_data_get_effective_affinity_mask); |
|
| 33 |
++ |
|
| 34 |
+ struct irq_cfg *irq_cfg(unsigned int irq) |
|
| 35 |
+ {
|
|
| 36 |
+ return irqd_cfg(irq_get_irq_data(irq)); |
|
| 37 |
+diff --git a/include/linux/irq.h b/include/linux/irq.h |
|
| 38 |
+index 45b037e..a3c8c0d 100644 |
|
| 39 |
+--- a/include/linux/irq.h |
|
| 40 |
+@@ -695,6 +695,8 @@ static inline struct cpumask *irq_data_get_affinity_mask(struct irq_data *d) |
|
| 41 |
+ return d->common->affinity; |
|
| 42 |
+ } |
|
| 43 |
+ |
|
| 44 |
++struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d); |
|
| 45 |
++ |
|
| 46 |
+ unsigned int arch_dynirq_lower_bound(unsigned int from); |
|
| 47 |
+ |
|
| 48 |
+ int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, |
|
| 49 |
+-- |
|
| 50 |
+2.7.4 |
|
| 51 |
+ |
| ... | ... |
@@ -2,7 +2,7 @@ |
| 2 | 2 |
Summary: Kernel |
| 3 | 3 |
Name: linux |
| 4 | 4 |
Version: 4.9.140 |
| 5 |
-Release: 5%{?kat_build:.%kat_build}%{?dist}
|
|
| 5 |
+Release: 6%{?kat_build:.%kat_build}%{?dist}
|
|
| 6 | 6 |
License: GPLv2 |
| 7 | 7 |
URL: http://www.kernel.org/ |
| 8 | 8 |
Group: System Environment/Kernel |
| ... | ... |
@@ -71,13 +71,13 @@ Patch46: 0001-xfs-move-inode-fork-verifiers-to-xfs-dinode-verify.patch |
| 71 | 71 |
Patch47: 0002-xfs-verify-dinode-header-first.patch |
| 72 | 72 |
Patch48: 0003-xfs-enhance-dinode-verifier.patch |
| 73 | 73 |
|
| 74 |
-#HyperV PCI patches |
|
| 74 |
+# HyperV PCI patches to Use vPCI_protocol_version_1.2 |
|
| 75 | 75 |
Patch51: 0001_PCI_hv_Allocate_physically_contiguous_hypercall_params_buffer.patch |
| 76 | 76 |
Patch52: 0002_PCI_hv_Add_vPCI_version_protocol_negotiation.patch |
| 77 | 77 |
Patch53: 0003_PCI_hv_Use_vPCI_protocol_version_1.2_v4.9.patch |
| 78 |
- |
|
| 79 |
-# NVME PCI patch |
|
| 80 |
-Patch61: 0001_nvme_io_irq_without_affinity.patch |
|
| 78 |
+# HyperV PCI patches to solve IRQ no handler problem |
|
| 79 |
+Patch54: 0004-PCI-hv-Use-effective-affinity-mask.patch |
|
| 80 |
+Patch55: 0005-x86-irq-implement-irq_data_get_effective_affinity.patch |
|
| 81 | 81 |
|
| 82 | 82 |
# Out-of-tree patches from AppArmor: |
| 83 | 83 |
Patch71: 0001-UBUNTU-SAUCE-AppArmor-basic-networking-rules.patch |
| ... | ... |
@@ -201,7 +201,8 @@ This package contains the 'perf' performance analysis tools for Linux kernel. |
| 201 | 201 |
%patch51 -p1 |
| 202 | 202 |
%patch52 -p1 |
| 203 | 203 |
%patch53 -p1 |
| 204 |
-%patch61 -p1 |
|
| 204 |
+%patch54 -p1 |
|
| 205 |
+%patch55 -p1 |
|
| 205 | 206 |
|
| 206 | 207 |
%patch71 -p1 |
| 207 | 208 |
%patch72 -p1 |
| ... | ... |
@@ -376,6 +377,9 @@ ln -sf %{name}-%{uname_r}.cfg /boot/photon.cfg
|
| 376 | 376 |
/usr/share/doc/* |
| 377 | 377 |
|
| 378 | 378 |
%changelog |
| 379 |
+* Wed Jan 23 2019 Ajay Kaher <akaher@vmware.com> 4.9.140-6 |
|
| 380 |
+- Fix IRQ issue by using effective_affinity |
|
| 381 |
+- Remove nvme_io_irq_without_affinity.patch |
|
| 379 | 382 |
* Thu Jan 17 2019 Ajay Kaher <akaher@vmware.com> 4.9.140-5 |
| 380 | 383 |
- Fix IRQ issues with NVMe on Azure. |
| 381 | 384 |
* Tue Jan 15 2019 Alexey Makhalov <amakhalov@vmware.com> 4.9.140-4 |