diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index baf2886d4d6e..0c72f1897063 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -132,5 +132,6 @@ extern struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index); extern int pnv_npu2_init(struct pci_controller *hose); extern int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid, unsigned long msr); +extern int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev); #endif /* __ASM_POWERPC_PCI_H */ diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index ec52b970cefa..102983207734 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -327,31 +327,6 @@ static struct iommu_table_group_ops pnv_pci_npu_ops = { .unset_window = pnv_npu_unset_window, .take_ownership = pnv_npu_take_ownership, }; - -struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe) -{ - struct pnv_phb *phb = npe->phb; - struct pci_bus *pbus = phb->hose->bus; - struct pci_dev *npdev, *gpdev = NULL, *gptmp; - struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev); - - if (!gpe || !gpdev) - return NULL; - - npe->table_group.ops = &pnv_pci_npu_ops; - - list_for_each_entry(npdev, &pbus->devices, bus_list) { - gptmp = pnv_pci_get_gpu_dev(npdev); - - if (gptmp != gpdev) - continue; - - pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev)); - iommu_group_add_device(gpe->table_group.group, &npdev->dev); - } - - return gpe; -} #endif /* !CONFIG_IOMMU_API */ /* @@ -359,6 +334,17 @@ struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe) */ /* Maximum possible number of ATSD MMIO registers per NPU */ #define NV_NMMU_ATSD_REGS 8 +#define NV_NPU_MAX_PE_NUM 16 + +/* + * A compound NPU IOMMU group which might consist of 1 GPU + 2xNPUs (POWER8) or + * up to 3 x (GPU + 2xNPUs) (POWER9). + */ +struct npu_comp { + struct iommu_table_group table_group; + int pe_num; + struct pnv_ioda_pe *pe[NV_NPU_MAX_PE_NUM]; +}; /* An NPU descriptor, valid for POWER9 only */ struct npu { @@ -371,8 +357,263 @@ struct npu { /* Do we need to explicitly flush the nest mmu? */ bool nmmu_flush; + + struct npu_comp npucomp; }; +#ifdef CONFIG_IOMMU_API +static long pnv_npu_peers_create_table_userspace( + struct iommu_table_group *table_group, + int num, __u32 page_shift, __u64 window_size, __u32 levels, + struct iommu_table **ptbl) +{ + struct npu_comp *npucomp = container_of(table_group, struct npu_comp, + table_group); + + if (!npucomp->pe_num || !npucomp->pe[0] || + !npucomp->pe[0]->table_group.ops || + !npucomp->pe[0]->table_group.ops->create_table) + return -EFAULT; + + return npucomp->pe[0]->table_group.ops->create_table( + &npucomp->pe[0]->table_group, num, page_shift, + window_size, levels, ptbl); +} + +static long pnv_npu_peers_set_window(struct iommu_table_group *table_group, + int num, struct iommu_table *tbl) +{ + int i, j; + long ret = 0; + struct npu_comp *npucomp = container_of(table_group, struct npu_comp, + table_group); + + for (i = 0; i < npucomp->pe_num; ++i) { + struct pnv_ioda_pe *pe = npucomp->pe[i]; + + if (!pe->table_group.ops->set_window) + continue; + + ret = pe->table_group.ops->set_window(&pe->table_group, + num, tbl); + if (ret) + break; + } + + if (ret) { + for (j = 0; j < i; ++j) { + struct pnv_ioda_pe *pe = npucomp->pe[j]; + + if (!pe->table_group.ops->unset_window) + continue; + + ret = pe->table_group.ops->unset_window( + &pe->table_group, num); + if (ret) + break; + } + } else { + table_group->tables[num] = iommu_tce_table_get(tbl); + } + + return ret; +} + +static long pnv_npu_peers_unset_window(struct iommu_table_group *table_group, + int num) +{ + int i, j; + long ret = 0; + struct npu_comp *npucomp = container_of(table_group, struct npu_comp, + table_group); + + for (i = 0; i < npucomp->pe_num; ++i) { + struct pnv_ioda_pe *pe = npucomp->pe[i]; + + WARN_ON(npucomp->table_group.tables[num] != + table_group->tables[num]); + if (!npucomp->table_group.tables[num]) + continue; + + if (!pe->table_group.ops->unset_window) + continue; + + ret = pe->table_group.ops->unset_window(&pe->table_group, num); + if (ret) + break; + } + + if (ret) { + for (j = 0; j < i; ++j) { + struct pnv_ioda_pe *pe = npucomp->pe[j]; + + if (!npucomp->table_group.tables[num]) + continue; + + if (!pe->table_group.ops->set_window) + continue; + + ret = pe->table_group.ops->set_window(&pe->table_group, + num, table_group->tables[num]); + if (ret) + break; + } + } else if (table_group->tables[num]) { + iommu_tce_table_put(table_group->tables[num]); + table_group->tables[num] = NULL; + } + + return ret; +} + +static void pnv_npu_peers_take_ownership(struct iommu_table_group *table_group) +{ + int i; + struct npu_comp *npucomp = container_of(table_group, struct npu_comp, + table_group); + + for (i = 0; i < npucomp->pe_num; ++i) { + struct pnv_ioda_pe *pe = npucomp->pe[i]; + + if (!pe->table_group.ops->take_ownership) + continue; + pe->table_group.ops->take_ownership(&pe->table_group); + } +} + +static void pnv_npu_peers_release_ownership( + struct iommu_table_group *table_group) +{ + int i; + struct npu_comp *npucomp = container_of(table_group, struct npu_comp, + table_group); + + for (i = 0; i < npucomp->pe_num; ++i) { + struct pnv_ioda_pe *pe = npucomp->pe[i]; + + if (!pe->table_group.ops->release_ownership) + continue; + pe->table_group.ops->release_ownership(&pe->table_group); + } +} + +static struct iommu_table_group_ops pnv_npu_peers_ops = { + .get_table_size = pnv_pci_ioda2_get_table_size, + .create_table = pnv_npu_peers_create_table_userspace, + .set_window = pnv_npu_peers_set_window, + .unset_window = pnv_npu_peers_unset_window, + .take_ownership = pnv_npu_peers_take_ownership, + .release_ownership = pnv_npu_peers_release_ownership, +}; + +static void pnv_comp_attach_table_group(struct npu_comp *npucomp, + struct pnv_ioda_pe *pe) +{ + if (WARN_ON(npucomp->pe_num == NV_NPU_MAX_PE_NUM)) + return; + + npucomp->pe[npucomp->pe_num] = pe; + ++npucomp->pe_num; +} + +struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe) +{ + struct iommu_table_group *table_group; + struct npu_comp *npucomp; + struct pci_dev *gpdev = NULL; + struct pci_controller *hose; + struct pci_dev *npdev = NULL; + + list_for_each_entry(gpdev, &pe->pbus->devices, bus_list) { + npdev = pnv_pci_get_npu_dev(gpdev, 0); + if (npdev) + break; + } + + if (!npdev) + /* It is not an NPU attached device, skip */ + return NULL; + + hose = pci_bus_to_host(npdev->bus); + + if (hose->npu) { + table_group = &hose->npu->npucomp.table_group; + + if (!table_group->group) { + table_group->ops = &pnv_npu_peers_ops; + iommu_register_group(table_group, + hose->global_number, + pe->pe_number); + } + } else { + /* Create a group for 1 GPU and attached NPUs for POWER8 */ + pe->npucomp = kzalloc(sizeof(pe->npucomp), GFP_KERNEL); + table_group = &pe->npucomp->table_group; + table_group->ops = &pnv_npu_peers_ops; + iommu_register_group(table_group, hose->global_number, + pe->pe_number); + } + + /* Steal capabilities from a GPU PE */ + table_group->max_dynamic_windows_supported = + pe->table_group.max_dynamic_windows_supported; + table_group->tce32_start = pe->table_group.tce32_start; + table_group->tce32_size = pe->table_group.tce32_size; + table_group->max_levels = pe->table_group.max_levels; + if (!table_group->pgsizes) + table_group->pgsizes = pe->table_group.pgsizes; + + npucomp = container_of(table_group, struct npu_comp, table_group); + pnv_comp_attach_table_group(npucomp, pe); + + return table_group; +} + +struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe) +{ + struct iommu_table_group *table_group; + struct npu_comp *npucomp; + struct pci_dev *gpdev = NULL; + struct pci_dev *npdev; + struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(pe, &gpdev); + + WARN_ON(!(pe->flags & PNV_IODA_PE_DEV)); + if (!gpe) + return NULL; + + /* + * IODA2 bridges get this set up from pci_controller_ops::setup_bridge + * but NPU bridges do not have this hook defined so we do it here. + * We do not setup other table group parameters as they won't be used + * anyway - NVLink bridges are subordinate PEs. + */ + pe->table_group.ops = &pnv_pci_npu_ops; + + table_group = iommu_group_get_iommudata( + iommu_group_get(&gpdev->dev)); + + /* + * On P9 NPU PHB and PCI PHB support different page sizes, + * keep only matching. We expect here that NVLink bridge PE pgsizes is + * initialized by the caller. + */ + table_group->pgsizes &= pe->table_group.pgsizes; + npucomp = container_of(table_group, struct npu_comp, table_group); + pnv_comp_attach_table_group(npucomp, pe); + + list_for_each_entry(npdev, &pe->phb->hose->bus->devices, bus_list) { + struct pci_dev *gpdevtmp = pnv_pci_get_gpu_dev(npdev); + + if (gpdevtmp != gpdev) + continue; + + iommu_add_device(table_group, &npdev->dev); + } + + return table_group; +} +#endif /* CONFIG_IOMMU_API */ + /* Maximum number of nvlinks per npu */ #define NV_MAX_LINKS 6 diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 87cc10bccbfa..1d6406a051f1 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -190,7 +190,8 @@ static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe) unsigned int pe_num = pe->pe_number; WARN_ON(pe->pdev); - + WARN_ON(pe->npucomp); /* NPUs are not supposed to be freed */ + kfree(pe->npucomp); memset(pe, 0, sizeof(struct pnv_ioda_pe)); clear_bit(pe_num, phb->ioda.pe_alloc); } @@ -1534,7 +1535,9 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe); #ifdef CONFIG_IOMMU_API -static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe); +static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe, + struct iommu_table_group *table_group, struct pci_bus *bus); + #endif static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) { @@ -1590,7 +1593,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) pnv_pci_ioda2_setup_dma_pe(phb, pe); #ifdef CONFIG_IOMMU_API - pnv_ioda_setup_bus_iommu_group(pe); + pnv_ioda_setup_bus_iommu_group(pe, &pe->table_group, NULL); #endif } } @@ -2552,7 +2555,7 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, #endif #ifdef CONFIG_IOMMU_API -static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, +unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, __u64 window_size, __u32 levels) { unsigned long bytes = 0; @@ -2626,127 +2629,40 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = { .release_ownership = pnv_ioda2_release_ownership, }; -static int gpe_table_group_to_npe_cb(struct device *dev, void *opaque) -{ - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe **ptmppe = opaque; - struct pci_dev *pdev = container_of(dev, struct pci_dev, dev); - struct pci_dn *pdn = pci_get_pdn(pdev); - - if (!pdn || pdn->pe_number == IODA_INVALID_PE) - return 0; - - hose = pci_bus_to_host(pdev->bus); - phb = hose->private_data; - if (phb->type != PNV_PHB_NPU_NVLINK) - return 0; - - *ptmppe = &phb->ioda.pe_array[pdn->pe_number]; - - return 1; -} - -/* - * This returns PE of associated NPU. - * This assumes that NPU is in the same IOMMU group with GPU and there is - * no other PEs. - */ -static struct pnv_ioda_pe *gpe_table_group_to_npe( - struct iommu_table_group *table_group) -{ - struct pnv_ioda_pe *npe = NULL; - int ret = iommu_group_for_each_dev(table_group->group, &npe, - gpe_table_group_to_npe_cb); - - BUG_ON(!ret || !npe); - - return npe; -} - -static long pnv_pci_ioda2_npu_set_window(struct iommu_table_group *table_group, - int num, struct iommu_table *tbl) -{ - struct pnv_ioda_pe *npe = gpe_table_group_to_npe(table_group); - long ret = pnv_pci_ioda2_set_window(table_group, num, tbl); - - if (ret) - return ret; - - ret = npe->table_group.ops->set_window(&npe->table_group, num, tbl); - if (ret) - pnv_pci_ioda2_unset_window(table_group, num); - - return ret; -} - -static long pnv_pci_ioda2_npu_unset_window( - struct iommu_table_group *table_group, - int num) -{ - struct pnv_ioda_pe *npe = gpe_table_group_to_npe(table_group); - long ret = pnv_pci_ioda2_unset_window(table_group, num); - - if (ret) - return ret; - - return npe->table_group.ops->unset_window(&npe->table_group, num); -} - -static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group) -{ - struct pnv_ioda_pe *npe = gpe_table_group_to_npe(table_group); - - npe->table_group.ops->take_ownership(&npe->table_group); - pnv_ioda2_take_ownership(table_group); -} - -static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = { - .get_table_size = pnv_pci_ioda2_get_table_size, - .create_table = pnv_pci_ioda2_create_table_userspace, - .set_window = pnv_pci_ioda2_npu_set_window, - .unset_window = pnv_pci_ioda2_npu_unset_window, - .take_ownership = pnv_ioda2_npu_take_ownership, - .release_ownership = pnv_ioda2_release_ownership, -}; - static void pnv_ioda_setup_bus_iommu_group_add_devices(struct pnv_ioda_pe *pe, + struct iommu_table_group *table_group, struct pci_bus *bus) { struct pci_dev *dev; list_for_each_entry(dev, &bus->devices, bus_list) { - iommu_add_device(&pe->table_group, &dev->dev); + iommu_add_device(table_group, &dev->dev); if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) pnv_ioda_setup_bus_iommu_group_add_devices(pe, - dev->subordinate); + table_group, dev->subordinate); } } -static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe) +static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe, + struct iommu_table_group *table_group, struct pci_bus *bus) { - if (!pnv_pci_ioda_pe_dma_weight(pe)) - return; - iommu_register_group(&pe->table_group, pe->phb->hose->global_number, - pe->pe_number); - - /* - * set_iommu_table_base(&pe->pdev->dev, tbl) should have been called - * by now - */ if (pe->flags & PNV_IODA_PE_DEV) - iommu_add_device(&pe->table_group, &pe->pdev->dev); - else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) - pnv_ioda_setup_bus_iommu_group_add_devices(pe, pe->pbus); + iommu_add_device(table_group, &pe->pdev->dev); + + if ((pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) || bus) + pnv_ioda_setup_bus_iommu_group_add_devices(pe, table_group, + bus); } +static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb); + static void pnv_pci_ioda_setup_iommu_api(void) { - struct pci_controller *hose, *tmp; + struct pci_controller *hose; struct pnv_phb *phb; - struct pnv_ioda_pe *pe, *gpe; + struct pnv_ioda_pe *pe; /* * There are 4 types of PEs: @@ -2768,24 +2684,45 @@ static void pnv_pci_ioda_setup_iommu_api(void) if (phb->type == PNV_PHB_NPU_NVLINK) continue; - list_for_each_entry(pe, &phb->ioda.pe_list, list) - pnv_ioda_setup_bus_iommu_group(pe); + list_for_each_entry(pe, &phb->ioda.pe_list, list) { + struct iommu_table_group *table_group; + + table_group = pnv_try_setup_npu_table_group(pe); + if (!table_group) { + if (!pnv_pci_ioda_pe_dma_weight(pe)) + continue; + + table_group = &pe->table_group; + iommu_register_group(&pe->table_group, + pe->phb->hose->global_number, + pe->pe_number); + } + pnv_ioda_setup_bus_iommu_group(pe, table_group, + pe->pbus); + } } /* * Now we have all PHBs discovered, time to add NPU devices to * the corresponding IOMMU groups. */ - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + list_for_each_entry(hose, &hose_list, list_node) { + unsigned long pgsizes; + phb = hose->private_data; if (phb->type != PNV_PHB_NPU_NVLINK) continue; + pgsizes = pnv_ioda_parse_tce_sizes(phb); list_for_each_entry(pe, &phb->ioda.pe_list, list) { - gpe = pnv_pci_npu_setup_iommu(pe); - if (gpe) - gpe->table_group.ops = &pnv_pci_ioda2_npu_ops; + /* + * IODA2 bridges get this set up from + * pci_controller_ops::setup_bridge but NPU bridges + * do not have this hook defined so we do it here. + */ + pe->table_group.pgsizes = pgsizes; + pnv_npu_compound_attach(pe); } } } diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 534b8cee081b..8e36da379252 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -62,6 +62,7 @@ struct pnv_ioda_pe { /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */ struct iommu_table_group table_group; + struct npu_comp *npucomp; /* 64-bit TCE bypass region */ bool tce_bypass_enabled; @@ -199,6 +200,8 @@ extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev); extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq); extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable); +extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, + __u64 window_size, __u32 levels); extern int pnv_eeh_post_init(void); extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, @@ -214,6 +217,10 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass); extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm); extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe); +extern struct iommu_table_group *pnv_try_setup_npu_table_group( + struct pnv_ioda_pe *pe); +extern struct iommu_table_group *pnv_npu_compound_attach( + struct pnv_ioda_pe *pe); /* pci-ioda-tce.c */ #define POWERNV_IOMMU_DEFAULT_LEVELS 1