forked from Minki/linux
759aaa10c7
Add reset hook for sdm845 based platforms to turn off the wait-for-safe sequence. Understanding how wait-for-safe logic affects USB and UFS performance on MTP845 and DB845 boards: Qcom's implementation of arm,mmu-500 adds WAIT-FOR-SAFE logic to address under-performance issues in real-time clients, such as Display and Camera. On receiving an invalidation request, the SMMU forwards a SAFE request to these clients and waits for the SAFE ack signal from real-time clients. The SAFE signal from such clients is used to qualify the start of invalidation. This logic is controlled by chicken bits, one for each - MDP (display), IFE0, and IFE1 (camera), that can be accessed only from secure software on sdm845. This configuration, however, degrades the performance of non-real time clients, such as USB and UFS. This happens because, with wait-for-safe logic enabled, the hardware tries to throttle non-real time clients while waiting for SAFE ack signals from real-time clients. On mtp845 and db845 devices, with wait-for-safe logic enabled by the bootloaders, we see degraded performance of USB and UFS when the kernel enables the smmu stage-1 translations for these clients. Turning off this wait-for-safe logic from the kernel gets us back the perf of USB and UFS devices until we re-visit this when we start seeing perf issues on display/camera on upstream supported SDM845 platforms. The bootloaders on these boards implement secure monitor callbacks to handle a specific command - QCOM_SCM_SVC_SMMU_PROGRAM with which the logic can be toggled. There are other boards such as cheza whose bootloaders don't enable this logic. Such boards don't implement callbacks to handle the specific SCM call so disabling this logic for such boards will be a no-op. This change is inspired by the downstream change from Patrick Daly to address performance issues with display and camera by handling this wait-for-safe within separate io-pagetable ops to do TLB maintenance. 
So a big thanks to him for the change and for all the offline discussions. Without this change the UFS reads are pretty slow: $ time dd if=/dev/sda of=/dev/zero bs=1048576 count=10 conv=sync 10+0 records in 10+0 records out 10485760 bytes (10.0MB) copied, 22.394903 seconds, 457.2KB/s real 0m 22.39s user 0m 0.00s sys 0m 0.01s With this change they are back to rock! $ time dd if=/dev/sda of=/dev/zero bs=1048576 count=300 conv=sync 300+0 records in 300+0 records out 314572800 bytes (300.0MB) copied, 1.030541 seconds, 291.1MB/s real 0m 1.03s user 0m 0.00s sys 0m 0.54s Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org> Reviewed-by: Robin Murphy <robin.murphy@arm.com> Reviewed-by: Stephen Boyd <swboyd@chromium.org> Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org> Signed-off-by: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org> Signed-off-by: Will Deacon <will@kernel.org>
178 lines
4.4 KiB
C
178 lines
4.4 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
// Miscellaneous Arm SMMU implementation and integration quirks
|
|
// Copyright (C) 2019 Arm Limited
|
|
|
|
#define pr_fmt(fmt) "arm-smmu: " fmt
|
|
|
|
#include <linux/bitfield.h>
|
|
#include <linux/of.h>
|
|
|
|
#include "arm-smmu.h"
|
|
|
|
|
|
static int arm_smmu_gr0_ns(int offset)
|
|
{
|
|
switch(offset) {
|
|
case ARM_SMMU_GR0_sCR0:
|
|
case ARM_SMMU_GR0_sACR:
|
|
case ARM_SMMU_GR0_sGFSR:
|
|
case ARM_SMMU_GR0_sGFSYNR0:
|
|
case ARM_SMMU_GR0_sGFSYNR1:
|
|
case ARM_SMMU_GR0_sGFSYNR2:
|
|
return offset + 0x400;
|
|
default:
|
|
return offset;
|
|
}
|
|
}
|
|
|
|
static u32 arm_smmu_read_ns(struct arm_smmu_device *smmu, int page,
|
|
int offset)
|
|
{
|
|
if (page == ARM_SMMU_GR0)
|
|
offset = arm_smmu_gr0_ns(offset);
|
|
return readl_relaxed(arm_smmu_page(smmu, page) + offset);
|
|
}
|
|
|
|
/*
 * 32-bit register write that routes GR0 accesses through
 * arm_smmu_gr0_ns(); offsets in any other page are used unmodified.
 */
static void arm_smmu_write_ns(struct arm_smmu_device *smmu, int page,
			      int offset, u32 val)
{
	int reg = (page == ARM_SMMU_GR0) ? arm_smmu_gr0_ns(offset) : offset;

	writel_relaxed(val, arm_smmu_page(smmu, page) + reg);
}
|
|
|
|
/* Since we don't care for sGFAR, we can do without 64-bit accessors */
|
|
static const struct arm_smmu_impl calxeda_impl = {
|
|
.read_reg = arm_smmu_read_ns,
|
|
.write_reg = arm_smmu_write_ns,
|
|
};
|
|
|
|
|
|
struct cavium_smmu {
|
|
struct arm_smmu_device smmu;
|
|
u32 id_base;
|
|
};
|
|
|
|
static int cavium_cfg_probe(struct arm_smmu_device *smmu)
|
|
{
|
|
static atomic_t context_count = ATOMIC_INIT(0);
|
|
struct cavium_smmu *cs = container_of(smmu, struct cavium_smmu, smmu);
|
|
/*
|
|
* Cavium CN88xx erratum #27704.
|
|
* Ensure ASID and VMID allocation is unique across all SMMUs in
|
|
* the system.
|
|
*/
|
|
cs->id_base = atomic_fetch_add(smmu->num_context_banks, &context_count);
|
|
dev_notice(smmu->dev, "\tenabling workaround for Cavium erratum 27704\n");
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int cavium_init_context(struct arm_smmu_domain *smmu_domain)
|
|
{
|
|
struct cavium_smmu *cs = container_of(smmu_domain->smmu,
|
|
struct cavium_smmu, smmu);
|
|
|
|
if (smmu_domain->stage == ARM_SMMU_DOMAIN_S2)
|
|
smmu_domain->cfg.vmid += cs->id_base;
|
|
else
|
|
smmu_domain->cfg.asid += cs->id_base;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static const struct arm_smmu_impl cavium_impl = {
|
|
.cfg_probe = cavium_cfg_probe,
|
|
.init_context = cavium_init_context,
|
|
};
|
|
|
|
static struct arm_smmu_device *cavium_smmu_impl_init(struct arm_smmu_device *smmu)
|
|
{
|
|
struct cavium_smmu *cs;
|
|
|
|
cs = devm_kzalloc(smmu->dev, sizeof(*cs), GFP_KERNEL);
|
|
if (!cs)
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
cs->smmu = *smmu;
|
|
cs->smmu.impl = &cavium_impl;
|
|
|
|
devm_kfree(smmu->dev, smmu);
|
|
|
|
return &cs->smmu;
|
|
}
|
|
|
|
|
|
/* Context bank ACTLR: cleared by arm_mmu500_reset() to disable the next-page prefetcher */
#define ARM_MMU500_ACTLR_CPRE		(1 << 1)

/* sACR: must be cleared on r2p0 onwards before context bank ACTLR writes stick */
#define ARM_MMU500_ACR_CACHE_LOCK	(1 << 26)
/* sACR: set together so unmatched Stream IDs may allocate bypass TLB entries */
#define ARM_MMU500_ACR_S2CRB_TLBEN	(1 << 10)
#define ARM_MMU500_ACR_SMTNMB_TLBEN	(1 << 8)
|
|
|
|
int arm_mmu500_reset(struct arm_smmu_device *smmu)
|
|
{
|
|
u32 reg, major;
|
|
int i;
|
|
/*
|
|
* On MMU-500 r2p0 onwards we need to clear ACR.CACHE_LOCK before
|
|
* writes to the context bank ACTLRs will stick. And we just hope that
|
|
* Secure has also cleared SACR.CACHE_LOCK for this to take effect...
|
|
*/
|
|
reg = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_ID7);
|
|
major = FIELD_GET(ID7_MAJOR, reg);
|
|
reg = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_sACR);
|
|
if (major >= 2)
|
|
reg &= ~ARM_MMU500_ACR_CACHE_LOCK;
|
|
/*
|
|
* Allow unmatched Stream IDs to allocate bypass
|
|
* TLB entries for reduced latency.
|
|
*/
|
|
reg |= ARM_MMU500_ACR_SMTNMB_TLBEN | ARM_MMU500_ACR_S2CRB_TLBEN;
|
|
arm_smmu_gr0_write(smmu, ARM_SMMU_GR0_sACR, reg);
|
|
|
|
/*
|
|
* Disable MMU-500's not-particularly-beneficial next-page
|
|
* prefetcher for the sake of errata #841119 and #826419.
|
|
*/
|
|
for (i = 0; i < smmu->num_context_banks; ++i) {
|
|
reg = arm_smmu_cb_read(smmu, i, ARM_SMMU_CB_ACTLR);
|
|
reg &= ~ARM_MMU500_ACTLR_CPRE;
|
|
arm_smmu_cb_write(smmu, i, ARM_SMMU_CB_ACTLR, reg);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static const struct arm_smmu_impl arm_mmu500_impl = {
|
|
.reset = arm_mmu500_reset,
|
|
};
|
|
|
|
|
|
struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu)
|
|
{
|
|
/*
|
|
* We will inevitably have to combine model-specific implementation
|
|
* quirks with platform-specific integration quirks, but everything
|
|
* we currently support happens to work out as straightforward
|
|
* mutually-exclusive assignments.
|
|
*/
|
|
switch (smmu->model) {
|
|
case ARM_MMU500:
|
|
smmu->impl = &arm_mmu500_impl;
|
|
break;
|
|
case CAVIUM_SMMUV2:
|
|
return cavium_smmu_impl_init(smmu);
|
|
default:
|
|
break;
|
|
}
|
|
|
|
if (of_property_read_bool(smmu->dev->of_node,
|
|
"calxeda,smmu-secure-config-access"))
|
|
smmu->impl = &calxeda_impl;
|
|
|
|
if (of_device_is_compatible(smmu->dev->of_node, "qcom,sdm845-smmu-500"))
|
|
return qcom_smmu_impl_init(smmu);
|
|
|
|
return smmu;
|
|
}
|