mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git
synced 2026-04-23 14:02:06 -04:00
Merge tag 'drm-next-2024-05-15' of https://gitlab.freedesktop.org/drm/kernel
Pull drm updates from Dave Airlie:
"This is the main pull request for the drm subsystems for 6.10.
In drivers the main thing is a new driver for ARM Mali firmware based
GPUs, otherwise there are a lot of changes to amdgpu/xe/i915/msm and
scattered changes to everything else.
In the core a bunch of headers and Kconfig was refactored, along with
the addition of a new panic handler which is meant to provide a user
friendly message when a panic happens and graphical display is
enabled.
New drivers:
- panthor: ARM Mali/Immortalis CSF-based GPU driver
Core:
- add a CONFIG_DRM_WERROR option
- make more headers self-contained
- grab resv lock in pin/unpin
- fix vmap resv locking
- EDID/eDP panel matching
- Kconfig cleanups
- DT sound bindings
- Add SIZE_HINTS property for cursor planes
- Add struct drm_edid_product_id and helpers.
- Use drm device based logging in more drm functions.
- drop seq_file.h from a bunch of places
- use drm_edid driver conversions
dp:
- DP Tunnel documentation
- MST read sideband cap
- Adaptive sync SDP prep work
ttm:
- improve placement for TTM BOs in idle/busy handling
panic:
- Fixes for drm-panic, and option to test it.
- Add drm panic to simpledrm, mgag200, imx, ast
bridge:
- improve init ordering
- adv7511: allow GPIO pin sharing
- tc358775: add tc358675 support
panel:
- AUO B120XAN01.0
- Samsung s6e3fa7
- BOE NT116WHM-N44
- CMN N116BCA-EA1,
- CrystalClear CMT430B19N00
- Startek KD050HDFIA020-C020A
- powertip PH128800T006-ZHC01
- Innolux G121X1-L03
- LG sw43408
- Khadas TS050 V2
- EDO RM69380 OLED
- CSOT MNB601LS1-1
amdgpu:
- HDCP/ODM/RAS fixes
- Devcoredump improvements
- Expose VCN activity via sysfs
- SMU 13.0.x updates
- Enable fast updates on DCN 3.1.4
- Add dclk and vclk reporting on additional devices
- Add ACA RAS infrastructure
- Implement TLB flush fence
- EEPROM handling fixes
- SMUIO 14.0.2 support
- SMU 14.0.1 Updates
- SMU 14.0.2 support
- Sync page table freeing with TLB flushes
- DML2 refactor
- DC debug improvements
- DCN 3.5.x Updates
- GPU reset fixes
- HDP fix for second GFX pipe on GC 10.x
- Enable secondary GFX pipe on GC 10.3
- Refactor and clean up BACO/BOCO/BAMACO handling
- Remove invalid TTM resource start check
- UAF fix in VA IOCTL
- GPUVM page fault redirection to secondary IH rings for IH 6.x
- Initial support for mapping kernel queues via MES
- Fix VRAM memory accounting
amdkfd:
- MQD handling cleanup
- Preemption handling fixes for XCDs
- TLB flush fix for GC 9.4.2
- Properly clean up workqueue during module unload
- Fix memory leak in process create failure case
- Range check CP bad op exception targets to avoid reporting invalid exceptions to userspace
- Fix eviction fence handling
- Fix leak in GPU memory allocation failure case
- DMABuf import handling fix
- Enable SQ watchpoint for gfx10
i915:
- Adding new DG2 PCI ID
- add context hints for GT frequency
- enable only one CCS for compute workloads
- new workarounds
- Fix UAF on destroy against retire race and remove two earlier partial fixes
- Limit the reserved VM space to only the platforms that need it
- Fix gt reset when GuC submission is disabled
- Add and use gt_to_guc() wrapper
i915/xe display:
- Lunar Lake display enabling, including cdclk and other refactors
- BIOS/VBT/opregion related refactor
- Digital port related refactor/clean-up
- Fix 2s boot time regression on DP panel replay init
- Remove duplication on audio enable/disable on SDVO and g4x+ DP
- Disable AuxCCS framebuffers if built for Xe
- Make crtc disable more atomic
- Increase DP idle pattern wait timeout to 2ms
- Start using container_of_const() for some extra const safety
- Fix Jasper Lake boot freeze
- Enable MST mode for 128b/132b single-stream sideband
- Enable Adaptive Sync SDP Support for DP
- Fix MTL supported DP rates - removal of UHBR13.5
- PLL refactoring
- Limit eDP MSO pipe only for display version 20
- More display refactor towards independence from i915 dev_priv
- Convert i915/xe fbdev to DRM client
- More initial work to make display code more independent from i915
xe:
- improved error capture
- clean up some uAPI leftovers
- devcoredump update
- Add BMG mocs table
- Handle GSCCS ER interrupt
- Implement xe2- and GuC workarounds
- struct xe_device cleanup
- Hwmon updates
- Add LRC parsing for more GPU instructions
- Increase VM_BIND number of per-ioctl Ops
- drm/xe: Add XE_BO_GGTT_INVALIDATE flag
- Initial development for SR-IOV support
- Add new PCI IDs to DG2 platform
- Move userptr over to start using hmm_range_fault
msm:
- Switched to generating register header files during build process
instead of shipping pre-generated headers
- Merged DPU and MDP4 format databases.
- DP:
- Stop using compat string to distinguish DP and eDP cases
- Added support for X Elite platform (X1E80100)
- Reworked DP aux/audio support
- Added SM6350 DP to the bindings
- GPU:
- a7xx perfcntr reg fixes
- MAINTAINERS updates
- a750 devcoredump support
radeon:
- Silence UBSAN warnings related to flexible arrays
nouveau:
- move some uAPI objects to uapi headers
omapdrm:
- console fix
ast:
- add i2c polling
qaic:
- add debugfs entries
exynos:
- fix platform_driver .owner
- drop cleanup code
mediatek:
- Use devm_platform_get_and_ioremap_resource() in mtk_hdmi_ddc_probe()
- Add GAMMA 12-bit LUT support for MT8188
- Rename mtk_drm_* to mtk_*
- Drop driver owner initialization
- Correct calculation formula of PHY Timing"
* tag 'drm-next-2024-05-15' of https://gitlab.freedesktop.org/drm/kernel: (1477 commits)
drm/xe/ads: Use flexible-array
drm/xe: Use ordered WQ for G2H handler
drm/msm/gen_header: allow skipping the validation
drm/msm/a6xx: Cleanup indexed regs const'ness
drm/msm: Add devcoredump support for a750
drm/msm: Adjust a7xx GBIF debugbus dumping
drm/msm: Update a6xx registers XML
drm/msm: Fix imported a750 snapshot header for upstream
drm/msm: Import a750 snapshot registers from kgsl
MAINTAINERS: Add Konrad Dybcio as a reviewer for the Adreno driver
MAINTAINERS: Add a separate entry for Qualcomm Adreno GPU drivers
drm/msm/a6xx: Avoid a nullptr dereference when speedbin setting fails
drm/msm/adreno: fix CP cycles stat retrieval on a7xx
drm/msm/a7xx: allow writing to CP_BV counter selection registers
drm: zynqmp_dpsub: Always register bridge
Revert "drm/bridge: ti-sn65dsi83: Fix enable error path"
drm/fb_dma: Add checks in drm_fb_dma_get_scanout_buffer()
drm/fbdev-generic: Do not set physical framebuffer address
drm/panthor: Fix the FW reset logic
drm/panthor: Make sure we handle 'unknown group state' case properly
...
This commit is contained in:
@@ -70,7 +70,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \
|
||||
amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \
|
||||
atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \
|
||||
atombios_encoders.o amdgpu_sa.o atombios_i2c.o \
|
||||
amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o amdgpu_pll.o \
|
||||
amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_vm_tlb_fence.o \
|
||||
amdgpu_ib.o amdgpu_pll.o \
|
||||
amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
|
||||
amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \
|
||||
amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
|
||||
@@ -80,7 +81,7 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \
|
||||
amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
|
||||
amdgpu_fw_attestation.o amdgpu_securedisplay.o \
|
||||
amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \
|
||||
amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o
|
||||
amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o amdgpu_dev_coredump.o
|
||||
|
||||
amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o
|
||||
|
||||
@@ -247,7 +248,8 @@ amdgpu-y += \
|
||||
smuio_v11_0_6.o \
|
||||
smuio_v13_0.o \
|
||||
smuio_v13_0_3.o \
|
||||
smuio_v13_0_6.o
|
||||
smuio_v13_0_6.o \
|
||||
smuio_v14_0_2.o
|
||||
|
||||
# add reset block
|
||||
amdgpu-y += \
|
||||
|
||||
@@ -97,7 +97,7 @@ static int aldebaran_mode2_suspend_ip(struct amdgpu_device *adev)
|
||||
adev->ip_blocks[i].status.hw = false;
|
||||
}
|
||||
|
||||
return r;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
|
||||
@@ -139,6 +139,14 @@ enum amdgpu_ss {
|
||||
AMDGPU_SS_DRV_UNLOAD
|
||||
};
|
||||
|
||||
struct amdgpu_hwip_reg_entry {
|
||||
u32 hwip;
|
||||
u32 inst;
|
||||
u32 seg;
|
||||
u32 reg_offset;
|
||||
const char *reg_name;
|
||||
};
|
||||
|
||||
struct amdgpu_watchdog_timer {
|
||||
bool timeout_fatal_disable;
|
||||
uint32_t period; /* maxCycles = (1 << period), the number of cycles before a timeout */
|
||||
@@ -494,6 +502,7 @@ struct amdgpu_wb {
|
||||
uint64_t gpu_addr;
|
||||
u32 num_wb; /* Number of wb slots actually reserved for amdgpu. */
|
||||
unsigned long used[DIV_ROUND_UP(AMDGPU_MAX_WB, BITS_PER_LONG)];
|
||||
spinlock_t lock;
|
||||
};
|
||||
|
||||
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb);
|
||||
@@ -606,7 +615,7 @@ struct amdgpu_asic_funcs {
|
||||
/* PCIe replay counter */
|
||||
uint64_t (*get_pcie_replay_count)(struct amdgpu_device *adev);
|
||||
/* device supports BACO */
|
||||
bool (*supports_baco)(struct amdgpu_device *adev);
|
||||
int (*supports_baco)(struct amdgpu_device *adev);
|
||||
/* pre asic_init quirks */
|
||||
void (*pre_asic_init)(struct amdgpu_device *adev);
|
||||
/* enter/exit umd stable pstate */
|
||||
@@ -1408,7 +1417,8 @@ bool amdgpu_device_supports_atpx(struct drm_device *dev);
|
||||
bool amdgpu_device_supports_px(struct drm_device *dev);
|
||||
bool amdgpu_device_supports_boco(struct drm_device *dev);
|
||||
bool amdgpu_device_supports_smart_shift(struct drm_device *dev);
|
||||
bool amdgpu_device_supports_baco(struct drm_device *dev);
|
||||
int amdgpu_device_supports_baco(struct drm_device *dev);
|
||||
void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev);
|
||||
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
|
||||
struct amdgpu_device *peer_adev);
|
||||
int amdgpu_device_baco_enter(struct drm_device *dev);
|
||||
|
||||
@@ -28,7 +28,7 @@
|
||||
|
||||
#define ACA_BANK_HWID(type, hwid, mcatype) [ACA_HWIP_TYPE_##type] = {hwid, mcatype}
|
||||
|
||||
typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type, void *data);
|
||||
typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data);
|
||||
|
||||
struct aca_banks {
|
||||
int nr_banks;
|
||||
@@ -86,7 +86,7 @@ static void aca_banks_release(struct aca_banks *banks)
|
||||
}
|
||||
}
|
||||
|
||||
static int aca_smu_get_valid_aca_count(struct amdgpu_device *adev, enum aca_error_type type, u32 *count)
|
||||
static int aca_smu_get_valid_aca_count(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count)
|
||||
{
|
||||
struct amdgpu_aca *aca = &adev->aca;
|
||||
const struct aca_smu_funcs *smu_funcs = aca->smu_funcs;
|
||||
@@ -116,20 +116,22 @@ static struct aca_regs_dump {
|
||||
{"CONTROL_MASK", ACA_REG_IDX_CTL_MASK},
|
||||
};
|
||||
|
||||
static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, struct aca_bank *bank)
|
||||
static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, struct aca_bank *bank,
|
||||
struct ras_query_context *qctx)
|
||||
{
|
||||
u64 event_id = qctx ? qctx->event_id : 0ULL;
|
||||
int i;
|
||||
|
||||
dev_info(adev->dev, HW_ERR "Accelerator Check Architecture events logged\n");
|
||||
RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n");
|
||||
/* plus 1 for output format, e.g: ACA[08/08]: xxxx */
|
||||
for (i = 0; i < ARRAY_SIZE(aca_regs); i++)
|
||||
dev_info(adev->dev, HW_ERR "ACA[%02d/%02d].%s=0x%016llx\n",
|
||||
idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]);
|
||||
RAS_EVENT_LOG(adev, event_id, HW_ERR "ACA[%02d/%02d].%s=0x%016llx\n",
|
||||
idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]);
|
||||
}
|
||||
|
||||
static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_error_type type,
|
||||
static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_type type,
|
||||
int start, int count,
|
||||
struct aca_banks *banks)
|
||||
struct aca_banks *banks, struct ras_query_context *qctx)
|
||||
{
|
||||
struct amdgpu_aca *aca = &adev->aca;
|
||||
const struct aca_smu_funcs *smu_funcs = aca->smu_funcs;
|
||||
@@ -143,13 +145,12 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_erro
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
switch (type) {
|
||||
case ACA_ERROR_TYPE_UE:
|
||||
case ACA_SMU_TYPE_UE:
|
||||
max_count = smu_funcs->max_ue_bank_count;
|
||||
break;
|
||||
case ACA_ERROR_TYPE_CE:
|
||||
case ACA_SMU_TYPE_CE:
|
||||
max_count = smu_funcs->max_ce_bank_count;
|
||||
break;
|
||||
case ACA_ERROR_TYPE_DEFERRED:
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
@@ -164,7 +165,9 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_erro
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
aca_smu_bank_dump(adev, i, count, &bank);
|
||||
bank.type = type;
|
||||
|
||||
aca_smu_bank_dump(adev, i, count, &bank, qctx);
|
||||
|
||||
ret = aca_banks_add_bank(banks, &bank);
|
||||
if (ret)
|
||||
@@ -195,7 +198,7 @@ static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum aca_hwip_type t
|
||||
return hwip->hwid == hwid && hwip->mcatype == mcatype;
|
||||
}
|
||||
|
||||
static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type)
|
||||
static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type)
|
||||
{
|
||||
const struct aca_bank_ops *bank_ops = handle->bank_ops;
|
||||
|
||||
@@ -273,59 +276,49 @@ static struct aca_bank_error *get_bank_error(struct aca_error *aerr, struct aca_
|
||||
return new_bank_error(aerr, info);
|
||||
}
|
||||
|
||||
static int aca_log_errors(struct aca_handle *handle, enum aca_error_type type,
|
||||
struct aca_bank_report *report)
|
||||
int aca_error_cache_log_bank_error(struct aca_handle *handle, struct aca_bank_info *info,
|
||||
enum aca_error_type type, u64 count)
|
||||
{
|
||||
struct aca_error_cache *error_cache = &handle->error_cache;
|
||||
struct aca_bank_error *bank_error;
|
||||
struct aca_error *aerr;
|
||||
|
||||
if (!handle || !report)
|
||||
if (!handle || !info || type >= ACA_ERROR_TYPE_COUNT)
|
||||
return -EINVAL;
|
||||
|
||||
if (!report->count[type])
|
||||
if (!count)
|
||||
return 0;
|
||||
|
||||
aerr = &error_cache->errors[type];
|
||||
bank_error = get_bank_error(aerr, &report->info);
|
||||
bank_error = get_bank_error(aerr, info);
|
||||
if (!bank_error)
|
||||
return -ENOMEM;
|
||||
|
||||
bank_error->count[type] += report->count[type];
|
||||
bank_error->count += count;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int aca_generate_bank_report(struct aca_handle *handle, struct aca_bank *bank,
|
||||
enum aca_error_type type, struct aca_bank_report *report)
|
||||
static int aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type)
|
||||
{
|
||||
const struct aca_bank_ops *bank_ops = handle->bank_ops;
|
||||
|
||||
if (!bank || !report)
|
||||
if (!bank)
|
||||
return -EINVAL;
|
||||
|
||||
if (!bank_ops->aca_bank_generate_report)
|
||||
if (!bank_ops->aca_bank_parser)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
memset(report, 0, sizeof(*report));
|
||||
return bank_ops->aca_bank_generate_report(handle, bank, type,
|
||||
report, handle->data);
|
||||
return bank_ops->aca_bank_parser(handle, bank, type,
|
||||
handle->data);
|
||||
}
|
||||
|
||||
static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank *bank,
|
||||
enum aca_error_type type, void *data)
|
||||
enum aca_smu_type type, void *data)
|
||||
{
|
||||
struct aca_bank_report report;
|
||||
int ret;
|
||||
|
||||
ret = aca_generate_bank_report(handle, bank, type, &report);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (!report.count[type])
|
||||
return 0;
|
||||
|
||||
ret = aca_log_errors(handle, type, &report);
|
||||
ret = aca_bank_parser(handle, bank, type);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
@@ -333,7 +326,7 @@ static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank
|
||||
}
|
||||
|
||||
static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *bank,
|
||||
enum aca_error_type type, bank_handler_t handler, void *data)
|
||||
enum aca_smu_type type, bank_handler_t handler, void *data)
|
||||
{
|
||||
struct aca_handle *handle;
|
||||
int ret;
|
||||
@@ -354,7 +347,7 @@ static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *ba
|
||||
}
|
||||
|
||||
static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks *banks,
|
||||
enum aca_error_type type, bank_handler_t handler, void *data)
|
||||
enum aca_smu_type type, bank_handler_t handler, void *data)
|
||||
{
|
||||
struct aca_bank_node *node;
|
||||
struct aca_bank *bank;
|
||||
@@ -378,8 +371,28 @@ static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks *
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type,
|
||||
bank_handler_t handler, void *data)
|
||||
static bool aca_bank_should_update(struct amdgpu_device *adev, enum aca_smu_type type)
|
||||
{
|
||||
struct amdgpu_aca *aca = &adev->aca;
|
||||
bool ret = true;
|
||||
|
||||
/*
|
||||
* Because the UE Valid MCA count will only be cleared after reset,
|
||||
* in order to avoid repeated counting of the error count,
|
||||
* the aca bank is only updated once during the gpu recovery stage.
|
||||
*/
|
||||
if (type == ACA_SMU_TYPE_UE) {
|
||||
if (amdgpu_ras_intr_triggered())
|
||||
ret = atomic_cmpxchg(&aca->ue_update_flag, 0, 1) == 0;
|
||||
else
|
||||
atomic_set(&aca->ue_update_flag, 0);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type,
|
||||
bank_handler_t handler, struct ras_query_context *qctx, void *data)
|
||||
{
|
||||
struct amdgpu_aca *aca = &adev->aca;
|
||||
struct aca_banks banks;
|
||||
@@ -389,9 +402,8 @@ static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type
|
||||
if (list_empty(&aca->mgr.list))
|
||||
return 0;
|
||||
|
||||
/* NOTE: pmfw is only support UE and CE */
|
||||
if (type == ACA_ERROR_TYPE_DEFERRED)
|
||||
type = ACA_ERROR_TYPE_CE;
|
||||
if (!aca_bank_should_update(adev, type))
|
||||
return 0;
|
||||
|
||||
ret = aca_smu_get_valid_aca_count(adev, type, &count);
|
||||
if (ret)
|
||||
@@ -402,7 +414,7 @@ static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type
|
||||
|
||||
aca_banks_init(&banks);
|
||||
|
||||
ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks);
|
||||
ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks, qctx);
|
||||
if (ret)
|
||||
goto err_release_banks;
|
||||
|
||||
@@ -431,7 +443,7 @@ static int aca_log_aca_error_data(struct aca_bank_error *bank_error, enum aca_er
|
||||
if (type >= ACA_ERROR_TYPE_COUNT)
|
||||
return -EINVAL;
|
||||
|
||||
count = bank_error->count[type];
|
||||
count = bank_error->count;
|
||||
if (!count)
|
||||
return 0;
|
||||
|
||||
@@ -447,6 +459,8 @@ static int aca_log_aca_error_data(struct aca_bank_error *bank_error, enum aca_er
|
||||
amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, count);
|
||||
break;
|
||||
case ACA_ERROR_TYPE_DEFERRED:
|
||||
amdgpu_ras_error_statistic_de_count(err_data, &mcm_info, NULL, count);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -477,12 +491,25 @@ out_unlock:
|
||||
}
|
||||
|
||||
static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, enum aca_error_type type,
|
||||
struct ras_err_data *err_data)
|
||||
struct ras_err_data *err_data, struct ras_query_context *qctx)
|
||||
{
|
||||
enum aca_smu_type smu_type;
|
||||
int ret;
|
||||
|
||||
switch (type) {
|
||||
case ACA_ERROR_TYPE_UE:
|
||||
smu_type = ACA_SMU_TYPE_UE;
|
||||
break;
|
||||
case ACA_ERROR_TYPE_CE:
|
||||
case ACA_ERROR_TYPE_DEFERRED:
|
||||
smu_type = ACA_SMU_TYPE_CE;
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* udpate aca bank to aca source error_cache first */
|
||||
ret = aca_banks_update(adev, type, handler_aca_log_bank_error, NULL);
|
||||
ret = aca_banks_update(adev, smu_type, handler_aca_log_bank_error, qctx, NULL);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
@@ -498,10 +525,9 @@ static bool aca_handle_is_valid(struct aca_handle *handle)
|
||||
}
|
||||
|
||||
int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle,
|
||||
enum aca_error_type type, void *data)
|
||||
enum aca_error_type type, struct ras_err_data *err_data,
|
||||
struct ras_query_context *qctx)
|
||||
{
|
||||
struct ras_err_data *err_data = (struct ras_err_data *)data;
|
||||
|
||||
if (!handle || !err_data)
|
||||
return -EINVAL;
|
||||
|
||||
@@ -511,7 +537,7 @@ int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *han
|
||||
if (!(BIT(type) & handle->mask))
|
||||
return 0;
|
||||
|
||||
return __aca_get_error_data(adev, handle, type, err_data);
|
||||
return __aca_get_error_data(adev, handle, type, err_data, qctx);
|
||||
}
|
||||
|
||||
static void aca_error_init(struct aca_error *aerr, enum aca_error_type type)
|
||||
@@ -668,6 +694,8 @@ int amdgpu_aca_init(struct amdgpu_device *adev)
|
||||
struct amdgpu_aca *aca = &adev->aca;
|
||||
int ret;
|
||||
|
||||
atomic_set(&aca->ue_update_flag, 0);
|
||||
|
||||
ret = aca_manager_init(&aca->mgr);
|
||||
if (ret)
|
||||
return ret;
|
||||
@@ -680,6 +708,8 @@ void amdgpu_aca_fini(struct amdgpu_device *adev)
|
||||
struct amdgpu_aca *aca = &adev->aca;
|
||||
|
||||
aca_manager_fini(&aca->mgr);
|
||||
|
||||
atomic_set(&aca->ue_update_flag, 0);
|
||||
}
|
||||
|
||||
int amdgpu_aca_reset(struct amdgpu_device *adev)
|
||||
@@ -723,23 +753,13 @@ int aca_bank_info_decode(struct aca_bank *bank, struct aca_bank_info *info)
|
||||
|
||||
static int aca_bank_get_error_code(struct amdgpu_device *adev, struct aca_bank *bank)
|
||||
{
|
||||
int error_code;
|
||||
struct amdgpu_aca *aca = &adev->aca;
|
||||
const struct aca_smu_funcs *smu_funcs = aca->smu_funcs;
|
||||
|
||||
switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
|
||||
case IP_VERSION(13, 0, 6):
|
||||
if (!(adev->flags & AMD_IS_APU) && adev->pm.fw_version >= 0x00555600) {
|
||||
error_code = ACA_REG__SYND__ERRORINFORMATION(bank->regs[ACA_REG_IDX_SYND]);
|
||||
return error_code & 0xff;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
if (!smu_funcs || !smu_funcs->parse_error_code)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
/* NOTE: the true error code is encoded in status.errorcode[0:7] */
|
||||
error_code = ACA_REG__STATUS__ERRORCODE(bank->regs[ACA_REG_IDX_STATUS]);
|
||||
|
||||
return error_code & 0xff;
|
||||
return smu_funcs->parse_error_code(adev, bank);
|
||||
}
|
||||
|
||||
int aca_bank_check_error_codes(struct amdgpu_device *adev, struct aca_bank *bank, int *err_codes, int size)
|
||||
@@ -750,6 +770,9 @@ int aca_bank_check_error_codes(struct amdgpu_device *adev, struct aca_bank *bank
|
||||
return -EINVAL;
|
||||
|
||||
error_code = aca_bank_get_error_code(adev, bank);
|
||||
if (error_code < 0)
|
||||
return error_code;
|
||||
|
||||
for (i = 0; i < size; i++) {
|
||||
if (err_codes[i] == error_code)
|
||||
return 0;
|
||||
@@ -784,7 +807,7 @@ static int amdgpu_aca_smu_debug_mode_set(void *data, u64 val)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_error_type type, int idx)
|
||||
static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_smu_type type, int idx)
|
||||
{
|
||||
struct aca_bank_info info;
|
||||
int i, ret;
|
||||
@@ -793,7 +816,7 @@ static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_e
|
||||
if (ret)
|
||||
return;
|
||||
|
||||
seq_printf(m, "aca entry[%d].type: %s\n", idx, type == ACA_ERROR_TYPE_UE ? "UE" : "CE");
|
||||
seq_printf(m, "aca entry[%d].type: %s\n", idx, type == ACA_SMU_TYPE_UE ? "UE" : "CE");
|
||||
seq_printf(m, "aca entry[%d].info: socketid:%d aid:%d hwid:0x%03x mcatype:0x%04x\n",
|
||||
idx, info.socket_id, info.die_id, info.hwid, info.mcatype);
|
||||
|
||||
@@ -807,7 +830,7 @@ struct aca_dump_context {
|
||||
};
|
||||
|
||||
static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *bank,
|
||||
enum aca_error_type type, void *data)
|
||||
enum aca_smu_type type, void *data)
|
||||
{
|
||||
struct aca_dump_context *ctx = (struct aca_dump_context *)data;
|
||||
|
||||
@@ -816,7 +839,7 @@ static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *ban
|
||||
return handler_aca_log_bank_error(handle, bank, type, NULL);
|
||||
}
|
||||
|
||||
static int aca_dump_show(struct seq_file *m, enum aca_error_type type)
|
||||
static int aca_dump_show(struct seq_file *m, enum aca_smu_type type)
|
||||
{
|
||||
struct amdgpu_device *adev = (struct amdgpu_device *)m->private;
|
||||
struct aca_dump_context context = {
|
||||
@@ -824,12 +847,12 @@ static int aca_dump_show(struct seq_file *m, enum aca_error_type type)
|
||||
.idx = 0,
|
||||
};
|
||||
|
||||
return aca_banks_update(adev, type, handler_aca_bank_dump, (void *)&context);
|
||||
return aca_banks_update(adev, type, handler_aca_bank_dump, NULL, (void *)&context);
|
||||
}
|
||||
|
||||
static int aca_dump_ce_show(struct seq_file *m, void *unused)
|
||||
{
|
||||
return aca_dump_show(m, ACA_ERROR_TYPE_CE);
|
||||
return aca_dump_show(m, ACA_SMU_TYPE_CE);
|
||||
}
|
||||
|
||||
static int aca_dump_ce_open(struct inode *inode, struct file *file)
|
||||
@@ -847,7 +870,7 @@ static const struct file_operations aca_ce_dump_debug_fops = {
|
||||
|
||||
static int aca_dump_ue_show(struct seq_file *m, void *unused)
|
||||
{
|
||||
return aca_dump_show(m, ACA_ERROR_TYPE_UE);
|
||||
return aca_dump_show(m, ACA_SMU_TYPE_UE);
|
||||
}
|
||||
|
||||
static int aca_dump_ue_open(struct inode *inode, struct file *file)
|
||||
|
||||
@@ -26,6 +26,9 @@
|
||||
|
||||
#include <linux/list.h>
|
||||
|
||||
struct ras_err_data;
|
||||
struct ras_query_context;
|
||||
|
||||
#define ACA_MAX_REGS_COUNT (16)
|
||||
|
||||
#define ACA_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> l)
|
||||
@@ -99,7 +102,14 @@ enum aca_error_type {
|
||||
ACA_ERROR_TYPE_COUNT
|
||||
};
|
||||
|
||||
enum aca_smu_type {
|
||||
ACA_SMU_TYPE_UE = 0,
|
||||
ACA_SMU_TYPE_CE,
|
||||
ACA_SMU_TYPE_COUNT,
|
||||
};
|
||||
|
||||
struct aca_bank {
|
||||
enum aca_smu_type type;
|
||||
u64 regs[ACA_MAX_REGS_COUNT];
|
||||
};
|
||||
|
||||
@@ -115,15 +125,10 @@ struct aca_bank_info {
|
||||
int mcatype;
|
||||
};
|
||||
|
||||
struct aca_bank_report {
|
||||
struct aca_bank_info info;
|
||||
u64 count[ACA_ERROR_TYPE_COUNT];
|
||||
};
|
||||
|
||||
struct aca_bank_error {
|
||||
struct list_head node;
|
||||
struct aca_bank_info info;
|
||||
u64 count[ACA_ERROR_TYPE_COUNT];
|
||||
u64 count;
|
||||
};
|
||||
|
||||
struct aca_error {
|
||||
@@ -157,9 +162,8 @@ struct aca_handle {
|
||||
};
|
||||
|
||||
struct aca_bank_ops {
|
||||
int (*aca_bank_generate_report)(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type,
|
||||
struct aca_bank_report *report, void *data);
|
||||
bool (*aca_bank_is_valid)(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type,
|
||||
int (*aca_bank_parser)(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data);
|
||||
bool (*aca_bank_is_valid)(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type,
|
||||
void *data);
|
||||
};
|
||||
|
||||
@@ -167,13 +171,15 @@ struct aca_smu_funcs {
|
||||
int max_ue_bank_count;
|
||||
int max_ce_bank_count;
|
||||
int (*set_debug_mode)(struct amdgpu_device *adev, bool enable);
|
||||
int (*get_valid_aca_count)(struct amdgpu_device *adev, enum aca_error_type type, u32 *count);
|
||||
int (*get_valid_aca_bank)(struct amdgpu_device *adev, enum aca_error_type type, int idx, struct aca_bank *bank);
|
||||
int (*get_valid_aca_count)(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count);
|
||||
int (*get_valid_aca_bank)(struct amdgpu_device *adev, enum aca_smu_type type, int idx, struct aca_bank *bank);
|
||||
int (*parse_error_code)(struct amdgpu_device *adev, struct aca_bank *bank);
|
||||
};
|
||||
|
||||
struct amdgpu_aca {
|
||||
struct aca_handle_manager mgr;
|
||||
const struct aca_smu_funcs *smu_funcs;
|
||||
atomic_t ue_update_flag;
|
||||
bool is_enabled;
|
||||
};
|
||||
|
||||
@@ -196,7 +202,10 @@ int amdgpu_aca_add_handle(struct amdgpu_device *adev, struct aca_handle *handle,
|
||||
const char *name, const struct aca_info *aca_info, void *data);
|
||||
void amdgpu_aca_remove_handle(struct aca_handle *handle);
|
||||
int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle,
|
||||
enum aca_error_type type, void *data);
|
||||
enum aca_error_type type, struct ras_err_data *err_data,
|
||||
struct ras_query_context *qctx);
|
||||
int amdgpu_aca_smu_set_debug_mode(struct amdgpu_device *adev, bool en);
|
||||
void amdgpu_aca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root);
|
||||
int aca_error_cache_log_bank_error(struct aca_handle *handle, struct aca_bank_info *info,
|
||||
enum aca_error_type type, u64 count);
|
||||
#endif
|
||||
|
||||
@@ -637,6 +637,8 @@ static const struct amd_ip_funcs acp_ip_funcs = {
|
||||
.soft_reset = acp_soft_reset,
|
||||
.set_clockgating_state = acp_set_clockgating_state,
|
||||
.set_powergating_state = acp_set_powergating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
const struct amdgpu_ip_block_version acp_ip_block = {
|
||||
|
||||
@@ -747,10 +747,17 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev)
|
||||
return amdgpu_ras_get_fed_status(adev);
|
||||
}
|
||||
|
||||
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
|
||||
enum amdgpu_ras_block block, bool reset)
|
||||
void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev,
|
||||
enum amdgpu_ras_block block, uint16_t pasid,
|
||||
pasid_notify pasid_fn, void *data, uint32_t reset)
|
||||
{
|
||||
amdgpu_umc_poison_handler(adev, block, reset);
|
||||
amdgpu_umc_pasid_poison_handler(adev, block, pasid, pasid_fn, data, reset);
|
||||
}
|
||||
|
||||
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
|
||||
enum amdgpu_ras_block block, uint32_t reset)
|
||||
{
|
||||
amdgpu_umc_pasid_poison_handler(adev, block, 0, NULL, NULL, reset);
|
||||
}
|
||||
|
||||
int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
|
||||
@@ -769,12 +776,20 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev)
|
||||
bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
|
||||
int hub_inst, int hub_type)
|
||||
{
|
||||
if (adev->gfx.ras && adev->gfx.ras->query_utcl2_poison_status)
|
||||
return adev->gfx.ras->query_utcl2_poison_status(adev);
|
||||
else
|
||||
return false;
|
||||
if (!hub_type) {
|
||||
if (adev->gfxhub.funcs->query_utcl2_poison_status)
|
||||
return adev->gfxhub.funcs->query_utcl2_poison_status(adev, hub_inst);
|
||||
else
|
||||
return false;
|
||||
} else {
|
||||
if (adev->mmhub.funcs->query_utcl2_poison_status)
|
||||
return adev->mmhub.funcs->query_utcl2_poison_status(adev, hub_inst);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
int amdgpu_amdkfd_check_and_lock_kfd(struct amdgpu_device *adev)
|
||||
|
||||
@@ -336,12 +336,18 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev);
|
||||
int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
|
||||
struct tile_config *config);
|
||||
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
|
||||
enum amdgpu_ras_block block, bool reset);
|
||||
enum amdgpu_ras_block block, uint32_t reset);
|
||||
|
||||
void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev,
|
||||
enum amdgpu_ras_block block, uint16_t pasid,
|
||||
pasid_notify pasid_fn, void *data, uint32_t reset);
|
||||
|
||||
bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
|
||||
bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
|
||||
void amdgpu_amdkfd_block_mmu_notifications(void *p);
|
||||
int amdgpu_amdkfd_criu_resume(void *p);
|
||||
bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev);
|
||||
bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
|
||||
int hub_inst, int hub_type);
|
||||
int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
|
||||
uint64_t size, u32 alloc_flag, int8_t xcp_id);
|
||||
void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
|
||||
|
||||
@@ -881,6 +881,7 @@ uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,
|
||||
}
|
||||
|
||||
#define TCP_WATCH_STRIDE (mmTCP_WATCH1_ADDR_H - mmTCP_WATCH0_ADDR_H)
|
||||
#define SQ_WATCH_STRIDE (mmSQ_WATCH1_ADDR_H - mmSQ_WATCH0_ADDR_H)
|
||||
uint32_t kgd_gfx_v10_set_address_watch(struct amdgpu_device *adev,
|
||||
uint64_t watch_address,
|
||||
uint32_t watch_address_mask,
|
||||
@@ -889,55 +890,93 @@ uint32_t kgd_gfx_v10_set_address_watch(struct amdgpu_device *adev,
|
||||
uint32_t debug_vmid,
|
||||
uint32_t inst)
|
||||
{
|
||||
/* SQ_WATCH?_ADDR_* and TCP_WATCH?_ADDR_* are programmed with the
|
||||
* same values.
|
||||
*/
|
||||
uint32_t watch_address_high;
|
||||
uint32_t watch_address_low;
|
||||
uint32_t watch_address_cntl;
|
||||
|
||||
watch_address_cntl = 0;
|
||||
uint32_t tcp_watch_address_cntl;
|
||||
uint32_t sq_watch_address_cntl;
|
||||
|
||||
watch_address_low = lower_32_bits(watch_address);
|
||||
watch_address_high = upper_32_bits(watch_address) & 0xffff;
|
||||
|
||||
watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
|
||||
tcp_watch_address_cntl = 0;
|
||||
tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,
|
||||
TCP_WATCH0_CNTL,
|
||||
VMID,
|
||||
debug_vmid);
|
||||
watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
|
||||
tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,
|
||||
TCP_WATCH0_CNTL,
|
||||
MODE,
|
||||
watch_mode);
|
||||
watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
|
||||
tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,
|
||||
TCP_WATCH0_CNTL,
|
||||
MASK,
|
||||
watch_address_mask >> 7);
|
||||
|
||||
sq_watch_address_cntl = 0;
|
||||
sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl,
|
||||
SQ_WATCH0_CNTL,
|
||||
VMID,
|
||||
debug_vmid);
|
||||
sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl,
|
||||
SQ_WATCH0_CNTL,
|
||||
MODE,
|
||||
watch_mode);
|
||||
sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl,
|
||||
SQ_WATCH0_CNTL,
|
||||
MASK,
|
||||
watch_address_mask >> 6);
|
||||
|
||||
/* Turning off this watch point until we set all the registers */
|
||||
watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
|
||||
tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,
|
||||
TCP_WATCH0_CNTL,
|
||||
VALID,
|
||||
0);
|
||||
|
||||
WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
|
||||
(watch_id * TCP_WATCH_STRIDE)),
|
||||
watch_address_cntl);
|
||||
tcp_watch_address_cntl);
|
||||
|
||||
sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl,
|
||||
SQ_WATCH0_CNTL,
|
||||
VALID,
|
||||
0);
|
||||
WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_CNTL) +
|
||||
(watch_id * SQ_WATCH_STRIDE)),
|
||||
sq_watch_address_cntl);
|
||||
|
||||
/* Program {TCP,SQ}_WATCH?_ADDR* */
|
||||
WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) +
|
||||
(watch_id * TCP_WATCH_STRIDE)),
|
||||
watch_address_high);
|
||||
|
||||
WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_L) +
|
||||
(watch_id * TCP_WATCH_STRIDE)),
|
||||
watch_address_low);
|
||||
|
||||
WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_ADDR_H) +
|
||||
(watch_id * SQ_WATCH_STRIDE)),
|
||||
watch_address_high);
|
||||
WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_ADDR_L) +
|
||||
(watch_id * SQ_WATCH_STRIDE)),
|
||||
watch_address_low);
|
||||
|
||||
/* Enable the watch point */
|
||||
watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
|
||||
tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,
|
||||
TCP_WATCH0_CNTL,
|
||||
VALID,
|
||||
1);
|
||||
|
||||
WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
|
||||
(watch_id * TCP_WATCH_STRIDE)),
|
||||
watch_address_cntl);
|
||||
tcp_watch_address_cntl);
|
||||
|
||||
sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl,
|
||||
SQ_WATCH0_CNTL,
|
||||
VALID,
|
||||
1);
|
||||
WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_CNTL) +
|
||||
(watch_id * SQ_WATCH_STRIDE)),
|
||||
sq_watch_address_cntl);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -953,8 +992,14 @@ uint32_t kgd_gfx_v10_clear_address_watch(struct amdgpu_device *adev,
|
||||
(watch_id * TCP_WATCH_STRIDE)),
|
||||
watch_address_cntl);
|
||||
|
||||
WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_CNTL) +
|
||||
(watch_id * SQ_WATCH_STRIDE)),
|
||||
watch_address_cntl);
|
||||
|
||||
return 0;
|
||||
}
|
||||
#undef TCP_WATCH_STRIDE
|
||||
#undef SQ_WATCH_STRIDE
|
||||
|
||||
|
||||
/* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
|
||||
|
||||
@@ -34,6 +34,7 @@ union firmware_info {
|
||||
struct atom_firmware_info_v3_2 v32;
|
||||
struct atom_firmware_info_v3_3 v33;
|
||||
struct atom_firmware_info_v3_4 v34;
|
||||
struct atom_firmware_info_v3_5 v35;
|
||||
};
|
||||
|
||||
/*
|
||||
@@ -872,6 +873,10 @@ int amdgpu_atomfirmware_get_fw_reserved_fb_size(struct amdgpu_device *adev)
|
||||
fw_reserved_fb_size =
|
||||
(firmware_info->v34.fw_reserved_size_in_kb << 10);
|
||||
break;
|
||||
case 5:
|
||||
fw_reserved_fb_size =
|
||||
(firmware_info->v35.fw_reserved_size_in_kb << 10);
|
||||
break;
|
||||
default:
|
||||
fw_reserved_fb_size = 0;
|
||||
break;
|
||||
|
||||
@@ -39,7 +39,7 @@ static int amdgpu_benchmark_do_move(struct amdgpu_device *adev, unsigned size,
|
||||
for (i = 0; i < n; i++) {
|
||||
struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
|
||||
r = amdgpu_copy_buffer(ring, saddr, daddr, size, NULL, &fence,
|
||||
false, false, false);
|
||||
false, false, 0);
|
||||
if (r)
|
||||
goto exit_do_move;
|
||||
r = dma_fence_wait(fence, false);
|
||||
|
||||
@@ -2065,12 +2065,13 @@ static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
|
||||
struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
|
||||
char reg_offset[11];
|
||||
uint32_t *new = NULL, *tmp = NULL;
|
||||
int ret, i = 0, len = 0;
|
||||
unsigned int len = 0;
|
||||
int ret, i = 0;
|
||||
|
||||
do {
|
||||
memset(reg_offset, 0, 11);
|
||||
if (copy_from_user(reg_offset, buf + len,
|
||||
min(10, ((int)size-len)))) {
|
||||
min(10, (size-len)))) {
|
||||
ret = -EFAULT;
|
||||
goto error_free;
|
||||
}
|
||||
|
||||
360
drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
Normal file
360
drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
Normal file
@@ -0,0 +1,360 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
/*
|
||||
* Copyright 2024 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <generated/utsrelease.h>
|
||||
#include <linux/devcoredump.h>
|
||||
#include "amdgpu_dev_coredump.h"
|
||||
#include "atom.h"
|
||||
|
||||
#ifndef CONFIG_DEV_COREDUMP
|
||||
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
|
||||
struct amdgpu_reset_context *reset_context)
|
||||
{
|
||||
}
|
||||
#else
|
||||
|
||||
const char *hw_ip_names[MAX_HWIP] = {
|
||||
[GC_HWIP] = "GC",
|
||||
[HDP_HWIP] = "HDP",
|
||||
[SDMA0_HWIP] = "SDMA0",
|
||||
[SDMA1_HWIP] = "SDMA1",
|
||||
[SDMA2_HWIP] = "SDMA2",
|
||||
[SDMA3_HWIP] = "SDMA3",
|
||||
[SDMA4_HWIP] = "SDMA4",
|
||||
[SDMA5_HWIP] = "SDMA5",
|
||||
[SDMA6_HWIP] = "SDMA6",
|
||||
[SDMA7_HWIP] = "SDMA7",
|
||||
[LSDMA_HWIP] = "LSDMA",
|
||||
[MMHUB_HWIP] = "MMHUB",
|
||||
[ATHUB_HWIP] = "ATHUB",
|
||||
[NBIO_HWIP] = "NBIO",
|
||||
[MP0_HWIP] = "MP0",
|
||||
[MP1_HWIP] = "MP1",
|
||||
[UVD_HWIP] = "UVD/JPEG/VCN",
|
||||
[VCN1_HWIP] = "VCN1",
|
||||
[VCE_HWIP] = "VCE",
|
||||
[VPE_HWIP] = "VPE",
|
||||
[DF_HWIP] = "DF",
|
||||
[DCE_HWIP] = "DCE",
|
||||
[OSSSYS_HWIP] = "OSSSYS",
|
||||
[SMUIO_HWIP] = "SMUIO",
|
||||
[PWR_HWIP] = "PWR",
|
||||
[NBIF_HWIP] = "NBIF",
|
||||
[THM_HWIP] = "THM",
|
||||
[CLK_HWIP] = "CLK",
|
||||
[UMC_HWIP] = "UMC",
|
||||
[RSMU_HWIP] = "RSMU",
|
||||
[XGMI_HWIP] = "XGMI",
|
||||
[DCI_HWIP] = "DCI",
|
||||
[PCIE_HWIP] = "PCIE",
|
||||
};
|
||||
|
||||
static void amdgpu_devcoredump_fw_info(struct amdgpu_device *adev,
|
||||
struct drm_printer *p)
|
||||
{
|
||||
uint32_t version;
|
||||
uint32_t feature;
|
||||
uint8_t smu_program, smu_major, smu_minor, smu_debug;
|
||||
struct atom_context *ctx = adev->mode_info.atom_context;
|
||||
|
||||
drm_printf(p, "VCE feature version: %u, fw version: 0x%08x\n",
|
||||
adev->vce.fb_version, adev->vce.fw_version);
|
||||
drm_printf(p, "UVD feature version: %u, fw version: 0x%08x\n", 0,
|
||||
adev->uvd.fw_version);
|
||||
drm_printf(p, "GMC feature version: %u, fw version: 0x%08x\n", 0,
|
||||
adev->gmc.fw_version);
|
||||
drm_printf(p, "ME feature version: %u, fw version: 0x%08x\n",
|
||||
adev->gfx.me_feature_version, adev->gfx.me_fw_version);
|
||||
drm_printf(p, "PFP feature version: %u, fw version: 0x%08x\n",
|
||||
adev->gfx.pfp_feature_version, adev->gfx.pfp_fw_version);
|
||||
drm_printf(p, "CE feature version: %u, fw version: 0x%08x\n",
|
||||
adev->gfx.ce_feature_version, adev->gfx.ce_fw_version);
|
||||
drm_printf(p, "RLC feature version: %u, fw version: 0x%08x\n",
|
||||
adev->gfx.rlc_feature_version, adev->gfx.rlc_fw_version);
|
||||
|
||||
drm_printf(p, "RLC SRLC feature version: %u, fw version: 0x%08x\n",
|
||||
adev->gfx.rlc_srlc_feature_version,
|
||||
adev->gfx.rlc_srlc_fw_version);
|
||||
drm_printf(p, "RLC SRLG feature version: %u, fw version: 0x%08x\n",
|
||||
adev->gfx.rlc_srlg_feature_version,
|
||||
adev->gfx.rlc_srlg_fw_version);
|
||||
drm_printf(p, "RLC SRLS feature version: %u, fw version: 0x%08x\n",
|
||||
adev->gfx.rlc_srls_feature_version,
|
||||
adev->gfx.rlc_srls_fw_version);
|
||||
drm_printf(p, "RLCP feature version: %u, fw version: 0x%08x\n",
|
||||
adev->gfx.rlcp_ucode_feature_version,
|
||||
adev->gfx.rlcp_ucode_version);
|
||||
drm_printf(p, "RLCV feature version: %u, fw version: 0x%08x\n",
|
||||
adev->gfx.rlcv_ucode_feature_version,
|
||||
adev->gfx.rlcv_ucode_version);
|
||||
drm_printf(p, "MEC feature version: %u, fw version: 0x%08x\n",
|
||||
adev->gfx.mec_feature_version, adev->gfx.mec_fw_version);
|
||||
|
||||
if (adev->gfx.mec2_fw)
|
||||
drm_printf(p, "MEC2 feature version: %u, fw version: 0x%08x\n",
|
||||
adev->gfx.mec2_feature_version,
|
||||
adev->gfx.mec2_fw_version);
|
||||
|
||||
drm_printf(p, "IMU feature version: %u, fw version: 0x%08x\n", 0,
|
||||
adev->gfx.imu_fw_version);
|
||||
drm_printf(p, "PSP SOS feature version: %u, fw version: 0x%08x\n",
|
||||
adev->psp.sos.feature_version, adev->psp.sos.fw_version);
|
||||
drm_printf(p, "PSP ASD feature version: %u, fw version: 0x%08x\n",
|
||||
adev->psp.asd_context.bin_desc.feature_version,
|
||||
adev->psp.asd_context.bin_desc.fw_version);
|
||||
|
||||
drm_printf(p, "TA XGMI feature version: 0x%08x, fw version: 0x%08x\n",
|
||||
adev->psp.xgmi_context.context.bin_desc.feature_version,
|
||||
adev->psp.xgmi_context.context.bin_desc.fw_version);
|
||||
drm_printf(p, "TA RAS feature version: 0x%08x, fw version: 0x%08x\n",
|
||||
adev->psp.ras_context.context.bin_desc.feature_version,
|
||||
adev->psp.ras_context.context.bin_desc.fw_version);
|
||||
drm_printf(p, "TA HDCP feature version: 0x%08x, fw version: 0x%08x\n",
|
||||
adev->psp.hdcp_context.context.bin_desc.feature_version,
|
||||
adev->psp.hdcp_context.context.bin_desc.fw_version);
|
||||
drm_printf(p, "TA DTM feature version: 0x%08x, fw version: 0x%08x\n",
|
||||
adev->psp.dtm_context.context.bin_desc.feature_version,
|
||||
adev->psp.dtm_context.context.bin_desc.fw_version);
|
||||
drm_printf(p, "TA RAP feature version: 0x%08x, fw version: 0x%08x\n",
|
||||
adev->psp.rap_context.context.bin_desc.feature_version,
|
||||
adev->psp.rap_context.context.bin_desc.fw_version);
|
||||
drm_printf(p,
|
||||
"TA SECURE DISPLAY feature version: 0x%08x, fw version: 0x%08x\n",
|
||||
adev->psp.securedisplay_context.context.bin_desc.feature_version,
|
||||
adev->psp.securedisplay_context.context.bin_desc.fw_version);
|
||||
|
||||
/* SMC firmware */
|
||||
version = adev->pm.fw_version;
|
||||
|
||||
smu_program = (version >> 24) & 0xff;
|
||||
smu_major = (version >> 16) & 0xff;
|
||||
smu_minor = (version >> 8) & 0xff;
|
||||
smu_debug = (version >> 0) & 0xff;
|
||||
drm_printf(p,
|
||||
"SMC feature version: %u, program: %d, fw version: 0x%08x (%d.%d.%d)\n",
|
||||
0, smu_program, version, smu_major, smu_minor, smu_debug);
|
||||
|
||||
/* SDMA firmware */
|
||||
for (int i = 0; i < adev->sdma.num_instances; i++) {
|
||||
drm_printf(p,
|
||||
"SDMA%d feature version: %u, firmware version: 0x%08x\n",
|
||||
i, adev->sdma.instance[i].feature_version,
|
||||
adev->sdma.instance[i].fw_version);
|
||||
}
|
||||
|
||||
drm_printf(p, "VCN feature version: %u, fw version: 0x%08x\n", 0,
|
||||
adev->vcn.fw_version);
|
||||
drm_printf(p, "DMCU feature version: %u, fw version: 0x%08x\n", 0,
|
||||
adev->dm.dmcu_fw_version);
|
||||
drm_printf(p, "DMCUB feature version: %u, fw version: 0x%08x\n", 0,
|
||||
adev->dm.dmcub_fw_version);
|
||||
drm_printf(p, "PSP TOC feature version: %u, fw version: 0x%08x\n",
|
||||
adev->psp.toc.feature_version, adev->psp.toc.fw_version);
|
||||
|
||||
version = adev->mes.kiq_version & AMDGPU_MES_VERSION_MASK;
|
||||
feature = (adev->mes.kiq_version & AMDGPU_MES_FEAT_VERSION_MASK) >>
|
||||
AMDGPU_MES_FEAT_VERSION_SHIFT;
|
||||
drm_printf(p, "MES_KIQ feature version: %u, fw version: 0x%08x\n",
|
||||
feature, version);
|
||||
|
||||
version = adev->mes.sched_version & AMDGPU_MES_VERSION_MASK;
|
||||
feature = (adev->mes.sched_version & AMDGPU_MES_FEAT_VERSION_MASK) >>
|
||||
AMDGPU_MES_FEAT_VERSION_SHIFT;
|
||||
drm_printf(p, "MES feature version: %u, fw version: 0x%08x\n", feature,
|
||||
version);
|
||||
|
||||
drm_printf(p, "VPE feature version: %u, fw version: 0x%08x\n",
|
||||
adev->vpe.feature_version, adev->vpe.fw_version);
|
||||
|
||||
drm_printf(p, "\nVBIOS Information\n");
|
||||
drm_printf(p, "vbios name : %s\n", ctx->name);
|
||||
drm_printf(p, "vbios pn : %s\n", ctx->vbios_pn);
|
||||
drm_printf(p, "vbios version : %d\n", ctx->version);
|
||||
drm_printf(p, "vbios ver_str : %s\n", ctx->vbios_ver_str);
|
||||
drm_printf(p, "vbios date : %s\n", ctx->date);
|
||||
}
|
||||
|
||||
static ssize_t
|
||||
amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
|
||||
void *data, size_t datalen)
|
||||
{
|
||||
struct drm_printer p;
|
||||
struct amdgpu_coredump_info *coredump = data;
|
||||
struct drm_print_iterator iter;
|
||||
struct amdgpu_vm_fault_info *fault_info;
|
||||
int i, ver;
|
||||
|
||||
iter.data = buffer;
|
||||
iter.offset = 0;
|
||||
iter.start = offset;
|
||||
iter.remain = count;
|
||||
|
||||
p = drm_coredump_printer(&iter);
|
||||
|
||||
drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
|
||||
drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n");
|
||||
drm_printf(&p, "kernel: " UTS_RELEASE "\n");
|
||||
drm_printf(&p, "module: " KBUILD_MODNAME "\n");
|
||||
drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec,
|
||||
coredump->reset_time.tv_nsec);
|
||||
|
||||
if (coredump->reset_task_info.pid)
|
||||
drm_printf(&p, "process_name: %s PID: %d\n",
|
||||
coredump->reset_task_info.process_name,
|
||||
coredump->reset_task_info.pid);
|
||||
|
||||
/* GPU IP's information of the SOC */
|
||||
drm_printf(&p, "\nIP Information\n");
|
||||
drm_printf(&p, "SOC Family: %d\n", coredump->adev->family);
|
||||
drm_printf(&p, "SOC Revision id: %d\n", coredump->adev->rev_id);
|
||||
drm_printf(&p, "SOC External Revision id: %d\n", coredump->adev->external_rev_id);
|
||||
|
||||
for (int i = 1; i < MAX_HWIP; i++) {
|
||||
for (int j = 0; j < HWIP_MAX_INSTANCE; j++) {
|
||||
ver = coredump->adev->ip_versions[i][j];
|
||||
if (ver)
|
||||
drm_printf(&p, "HWIP: %s[%d][%d]: v%d.%d.%d.%d.%d\n",
|
||||
hw_ip_names[i], i, j,
|
||||
IP_VERSION_MAJ(ver),
|
||||
IP_VERSION_MIN(ver),
|
||||
IP_VERSION_REV(ver),
|
||||
IP_VERSION_VARIANT(ver),
|
||||
IP_VERSION_SUBREV(ver));
|
||||
}
|
||||
}
|
||||
|
||||
/* IP firmware information */
|
||||
drm_printf(&p, "\nIP Firmwares\n");
|
||||
amdgpu_devcoredump_fw_info(coredump->adev, &p);
|
||||
|
||||
if (coredump->ring) {
|
||||
drm_printf(&p, "\nRing timed out details\n");
|
||||
drm_printf(&p, "IP Type: %d Ring Name: %s\n",
|
||||
coredump->ring->funcs->type,
|
||||
coredump->ring->name);
|
||||
}
|
||||
|
||||
/* Add page fault information */
|
||||
fault_info = &coredump->adev->vm_manager.fault_info;
|
||||
drm_printf(&p, "\n[%s] Page fault observed\n",
|
||||
fault_info->vmhub ? "mmhub" : "gfxhub");
|
||||
drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr);
|
||||
drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status);
|
||||
|
||||
/* dump the ip state for each ip */
|
||||
drm_printf(&p, "IP Dump\n");
|
||||
for (int i = 0; i < coredump->adev->num_ip_blocks; i++) {
|
||||
if (coredump->adev->ip_blocks[i].version->funcs->print_ip_state) {
|
||||
drm_printf(&p, "IP: %s\n",
|
||||
coredump->adev->ip_blocks[i]
|
||||
.version->funcs->name);
|
||||
coredump->adev->ip_blocks[i]
|
||||
.version->funcs->print_ip_state(
|
||||
(void *)coredump->adev, &p);
|
||||
drm_printf(&p, "\n");
|
||||
}
|
||||
}
|
||||
|
||||
/* Add ring buffer information */
|
||||
drm_printf(&p, "Ring buffer information\n");
|
||||
for (int i = 0; i < coredump->adev->num_rings; i++) {
|
||||
int j = 0;
|
||||
struct amdgpu_ring *ring = coredump->adev->rings[i];
|
||||
|
||||
drm_printf(&p, "ring name: %s\n", ring->name);
|
||||
drm_printf(&p, "Rptr: 0x%llx Wptr: 0x%llx RB mask: %x\n",
|
||||
amdgpu_ring_get_rptr(ring),
|
||||
amdgpu_ring_get_wptr(ring),
|
||||
ring->buf_mask);
|
||||
drm_printf(&p, "Ring size in dwords: %d\n",
|
||||
ring->ring_size / 4);
|
||||
drm_printf(&p, "Ring contents\n");
|
||||
drm_printf(&p, "Offset \t Value\n");
|
||||
|
||||
while (j < ring->ring_size) {
|
||||
drm_printf(&p, "0x%x \t 0x%x\n", j, ring->ring[j / 4]);
|
||||
j += 4;
|
||||
}
|
||||
}
|
||||
|
||||
if (coredump->reset_vram_lost)
|
||||
drm_printf(&p, "VRAM is lost due to GPU reset!\n");
|
||||
if (coredump->adev->reset_info.num_regs) {
|
||||
drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
|
||||
|
||||
for (i = 0; i < coredump->adev->reset_info.num_regs; i++)
|
||||
drm_printf(&p, "0x%08x: 0x%08x\n",
|
||||
coredump->adev->reset_info.reset_dump_reg_list[i],
|
||||
coredump->adev->reset_info.reset_dump_reg_value[i]);
|
||||
}
|
||||
|
||||
return count - iter.remain;
|
||||
}
|
||||
|
||||
static void amdgpu_devcoredump_free(void *data)
|
||||
{
|
||||
kfree(data);
|
||||
}
|
||||
|
||||
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
|
||||
struct amdgpu_reset_context *reset_context)
|
||||
{
|
||||
struct amdgpu_coredump_info *coredump;
|
||||
struct drm_device *dev = adev_to_drm(adev);
|
||||
struct amdgpu_job *job = reset_context->job;
|
||||
struct drm_sched_job *s_job;
|
||||
|
||||
coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
|
||||
|
||||
if (!coredump) {
|
||||
DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
coredump->reset_vram_lost = vram_lost;
|
||||
|
||||
if (reset_context->job && reset_context->job->vm) {
|
||||
struct amdgpu_task_info *ti;
|
||||
struct amdgpu_vm *vm = reset_context->job->vm;
|
||||
|
||||
ti = amdgpu_vm_get_task_info_vm(vm);
|
||||
if (ti) {
|
||||
coredump->reset_task_info = *ti;
|
||||
amdgpu_vm_put_task_info(ti);
|
||||
}
|
||||
}
|
||||
|
||||
if (job) {
|
||||
s_job = &job->base;
|
||||
coredump->ring = to_amdgpu_ring(s_job->sched);
|
||||
}
|
||||
|
||||
coredump->adev = adev;
|
||||
|
||||
ktime_get_ts64(&coredump->reset_time);
|
||||
|
||||
dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
|
||||
amdgpu_devcoredump_read, amdgpu_devcoredump_free);
|
||||
}
|
||||
#endif
|
||||
47
drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
Normal file
47
drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
Normal file
@@ -0,0 +1,47 @@
|
||||
/* SPDX-License-Identifier: MIT */
|
||||
/*
|
||||
* Copyright 2024 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __AMDGPU_DEV_COREDUMP_H__
|
||||
#define __AMDGPU_DEV_COREDUMP_H__
|
||||
|
||||
#include "amdgpu.h"
|
||||
#include "amdgpu_reset.h"
|
||||
|
||||
#ifdef CONFIG_DEV_COREDUMP
|
||||
|
||||
#define AMDGPU_COREDUMP_VERSION "1"
|
||||
|
||||
struct amdgpu_coredump_info {
|
||||
struct amdgpu_device *adev;
|
||||
struct amdgpu_task_info reset_task_info;
|
||||
struct timespec64 reset_time;
|
||||
bool reset_vram_lost;
|
||||
struct amdgpu_ring *ring;
|
||||
};
|
||||
#endif
|
||||
|
||||
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
|
||||
struct amdgpu_reset_context *reset_context);
|
||||
|
||||
#endif
|
||||
@@ -74,6 +74,7 @@
|
||||
#include "amdgpu_fru_eeprom.h"
|
||||
#include "amdgpu_reset.h"
|
||||
#include "amdgpu_virt.h"
|
||||
#include "amdgpu_dev_coredump.h"
|
||||
|
||||
#include <linux/suspend.h>
|
||||
#include <drm/task_barrier.h>
|
||||
@@ -143,6 +144,8 @@ const char *amdgpu_asic_name[] = {
|
||||
"LAST",
|
||||
};
|
||||
|
||||
static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
|
||||
|
||||
/**
|
||||
* DOC: pcie_replay_count
|
||||
*
|
||||
@@ -335,16 +338,93 @@ bool amdgpu_device_supports_boco(struct drm_device *dev)
|
||||
*
|
||||
* @dev: drm_device pointer
|
||||
*
|
||||
* Returns true if the device supporte BACO,
|
||||
* otherwise return false.
|
||||
* Return:
|
||||
* 1 if the device supporte BACO;
|
||||
* 3 if the device support MACO (only works if BACO is supported)
|
||||
* otherwise return 0.
|
||||
*/
|
||||
bool amdgpu_device_supports_baco(struct drm_device *dev)
|
||||
int amdgpu_device_supports_baco(struct drm_device *dev)
|
||||
{
|
||||
struct amdgpu_device *adev = drm_to_adev(dev);
|
||||
|
||||
return amdgpu_asic_supports_baco(adev);
|
||||
}
|
||||
|
||||
void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
|
||||
{
|
||||
struct drm_device *dev;
|
||||
int bamaco_support;
|
||||
|
||||
dev = adev_to_drm(adev);
|
||||
|
||||
adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
|
||||
bamaco_support = amdgpu_device_supports_baco(dev);
|
||||
|
||||
switch (amdgpu_runtime_pm) {
|
||||
case 2:
|
||||
if (bamaco_support & MACO_SUPPORT) {
|
||||
adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
|
||||
dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
|
||||
} else if (bamaco_support == BACO_SUPPORT) {
|
||||
adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
|
||||
dev_info(adev->dev, "Requested mode BAMACO not available,fallback to use BACO\n");
|
||||
}
|
||||
break;
|
||||
case 1:
|
||||
if (bamaco_support & BACO_SUPPORT) {
|
||||
adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
|
||||
dev_info(adev->dev, "Forcing BACO for runtime pm\n");
|
||||
}
|
||||
break;
|
||||
case -1:
|
||||
case -2:
|
||||
if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
|
||||
adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
|
||||
dev_info(adev->dev, "Using ATPX for runtime pm\n");
|
||||
} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
|
||||
adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
|
||||
dev_info(adev->dev, "Using BOCO for runtime pm\n");
|
||||
} else {
|
||||
if (!bamaco_support)
|
||||
goto no_runtime_pm;
|
||||
|
||||
switch (adev->asic_type) {
|
||||
case CHIP_VEGA20:
|
||||
case CHIP_ARCTURUS:
|
||||
/* BACO are not supported on vega20 and arctrus */
|
||||
break;
|
||||
case CHIP_VEGA10:
|
||||
/* enable BACO as runpm mode if noretry=0 */
|
||||
if (!adev->gmc.noretry)
|
||||
adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
|
||||
break;
|
||||
default:
|
||||
/* enable BACO as runpm mode on CI+ */
|
||||
adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
|
||||
break;
|
||||
}
|
||||
|
||||
if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
|
||||
if (bamaco_support & MACO_SUPPORT) {
|
||||
adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
|
||||
dev_info(adev->dev, "Using BAMACO for runtime pm\n");
|
||||
} else {
|
||||
dev_info(adev->dev, "Using BACO for runtime pm\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 0:
|
||||
dev_info(adev->dev, "runtime pm is manually disabled\n");
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
no_runtime_pm:
|
||||
if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
|
||||
dev_info(adev->dev, "Runtime PM not available\n");
|
||||
}
|
||||
/**
|
||||
* amdgpu_device_supports_smart_shift - Is the device dGPU with
|
||||
* smart shift support
|
||||
@@ -1402,13 +1482,17 @@ static int amdgpu_device_wb_init(struct amdgpu_device *adev)
|
||||
*/
|
||||
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
|
||||
{
|
||||
unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
|
||||
unsigned long flags, offset;
|
||||
|
||||
spin_lock_irqsave(&adev->wb.lock, flags);
|
||||
offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
|
||||
if (offset < adev->wb.num_wb) {
|
||||
__set_bit(offset, adev->wb.used);
|
||||
spin_unlock_irqrestore(&adev->wb.lock, flags);
|
||||
*wb = offset << 3; /* convert to dw offset */
|
||||
return 0;
|
||||
} else {
|
||||
spin_unlock_irqrestore(&adev->wb.lock, flags);
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
@@ -1423,9 +1507,13 @@ int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
|
||||
*/
|
||||
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
wb >>= 3;
|
||||
spin_lock_irqsave(&adev->wb.lock, flags);
|
||||
if (wb < adev->wb.num_wb)
|
||||
__clear_bit(wb, adev->wb.used);
|
||||
spin_unlock_irqrestore(&adev->wb.lock, flags);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -1455,7 +1543,7 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
|
||||
|
||||
/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
|
||||
if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
|
||||
DRM_WARN("System can't access extended configuration space,please check!!\n");
|
||||
DRM_WARN("System can't access extended configuration space, please check!!\n");
|
||||
|
||||
/* skip if the bios has already enabled large BAR */
|
||||
if (adev->gmc.real_vram_size &&
|
||||
@@ -3981,6 +4069,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
|
||||
spin_lock_init(&adev->se_cac_idx_lock);
|
||||
spin_lock_init(&adev->audio_endpt_idx_lock);
|
||||
spin_lock_init(&adev->mm_stats.lock);
|
||||
spin_lock_init(&adev->wb.lock);
|
||||
|
||||
INIT_LIST_HEAD(&adev->shadow_list);
|
||||
mutex_init(&adev->shadow_list_lock);
|
||||
@@ -4069,6 +4158,13 @@ int amdgpu_device_init(struct amdgpu_device *adev,
|
||||
/* Enable TMZ based on IP_VERSION */
|
||||
amdgpu_gmc_tmz_set(adev);
|
||||
|
||||
if (amdgpu_sriov_vf(adev) &&
|
||||
amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
|
||||
/* VF MMIO access (except mailbox range) from CPU
|
||||
* will be blocked during sriov runtime
|
||||
*/
|
||||
adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
|
||||
|
||||
amdgpu_gmc_noretry_set(adev);
|
||||
/* Need to get xgmi info early to decide the reset behavior*/
|
||||
if (adev->gmc.xgmi.supported) {
|
||||
@@ -4974,12 +5070,15 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
|
||||
retry:
|
||||
amdgpu_amdkfd_pre_reset(adev);
|
||||
|
||||
amdgpu_device_stop_pending_resets(adev);
|
||||
|
||||
if (from_hypervisor)
|
||||
r = amdgpu_virt_request_full_gpu(adev, true);
|
||||
else
|
||||
r = amdgpu_virt_reset_gpu(adev);
|
||||
if (r)
|
||||
return r;
|
||||
amdgpu_ras_set_fed(adev, false);
|
||||
amdgpu_irq_gpu_reset_resume_helper(adev);
|
||||
|
||||
/* some sw clean up VF needs to do before recover */
|
||||
@@ -5263,11 +5362,21 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
|
||||
struct amdgpu_device *tmp_adev = NULL;
|
||||
bool need_full_reset, skip_hw_reset, vram_lost = false;
|
||||
int r = 0;
|
||||
uint32_t i;
|
||||
|
||||
/* Try reset handler method first */
|
||||
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
|
||||
reset_list);
|
||||
amdgpu_reset_reg_dumps(tmp_adev);
|
||||
|
||||
if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
|
||||
amdgpu_reset_reg_dumps(tmp_adev);
|
||||
|
||||
/* Trigger ip dump before we reset the asic */
|
||||
for (i = 0; i < tmp_adev->num_ip_blocks; i++)
|
||||
if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
|
||||
tmp_adev->ip_blocks[i].version->funcs
|
||||
->dump_ip_state((void *)tmp_adev);
|
||||
}
|
||||
|
||||
reset_context->reset_device_list = device_list_handle;
|
||||
r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
|
||||
@@ -5340,7 +5449,8 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
|
||||
|
||||
vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
|
||||
|
||||
amdgpu_coredump(tmp_adev, vram_lost, reset_context);
|
||||
if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
|
||||
amdgpu_coredump(tmp_adev, vram_lost, reset_context);
|
||||
|
||||
if (vram_lost) {
|
||||
DRM_INFO("VRAM is lost due to GPU reset!\n");
|
||||
@@ -5538,6 +5648,23 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
|
||||
|
||||
}
|
||||
|
||||
static int amdgpu_device_health_check(struct list_head *device_list_handle)
|
||||
{
|
||||
struct amdgpu_device *tmp_adev;
|
||||
int ret = 0;
|
||||
u32 status;
|
||||
|
||||
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
|
||||
pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
|
||||
if (PCI_POSSIBLE_ERROR(status)) {
|
||||
dev_err(tmp_adev->dev, "device lost from bus!");
|
||||
ret = -ENODEV;
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_device_gpu_recover - reset the asic and recover scheduler
|
||||
*
|
||||
@@ -5609,6 +5736,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||
device_list_handle = &device_list;
|
||||
}
|
||||
|
||||
if (!amdgpu_sriov_vf(adev)) {
|
||||
r = amdgpu_device_health_check(device_list_handle);
|
||||
if (r)
|
||||
goto end_reset;
|
||||
}
|
||||
|
||||
/* We need to lock reset domain only once both for XGMI and single device */
|
||||
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
|
||||
reset_list);
|
||||
@@ -5691,11 +5824,12 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
|
||||
tmp_adev->asic_reset_res = r;
|
||||
}
|
||||
|
||||
/*
|
||||
* Drop all pending non scheduler resets. Scheduler resets
|
||||
* were already dropped during drm_sched_stop
|
||||
*/
|
||||
amdgpu_device_stop_pending_resets(tmp_adev);
|
||||
if (!amdgpu_sriov_vf(tmp_adev))
|
||||
/*
|
||||
* Drop all pending non scheduler resets. Scheduler resets
|
||||
* were already dropped during drm_sched_stop
|
||||
*/
|
||||
amdgpu_device_stop_pending_resets(tmp_adev);
|
||||
}
|
||||
|
||||
/* Actual ASIC resets if needed.*/
|
||||
@@ -5774,6 +5908,7 @@ skip_sched_resume:
|
||||
reset_list);
|
||||
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
|
||||
|
||||
end_reset:
|
||||
if (hive) {
|
||||
mutex_unlock(&hive->hive_lock);
|
||||
amdgpu_put_xgmi_hive(hive);
|
||||
|
||||
@@ -97,6 +97,7 @@
|
||||
#include "smuio_v13_0.h"
|
||||
#include "smuio_v13_0_3.h"
|
||||
#include "smuio_v13_0_6.h"
|
||||
#include "smuio_v14_0_2.h"
|
||||
#include "vcn_v5_0_0.h"
|
||||
#include "jpeg_v5_0_0.h"
|
||||
|
||||
@@ -245,6 +246,9 @@ static int amdgpu_discovery_read_binary_from_sysmem(struct amdgpu_device *adev,
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
#define IP_DISCOVERY_V2 2
|
||||
#define IP_DISCOVERY_V4 4
|
||||
|
||||
static int amdgpu_discovery_read_binary_from_mem(struct amdgpu_device *adev,
|
||||
uint8_t *binary)
|
||||
{
|
||||
@@ -259,14 +263,14 @@ static int amdgpu_discovery_read_binary_from_mem(struct amdgpu_device *adev,
|
||||
* wait for this to complete. Once the C2PMSG is updated, we can
|
||||
* continue.
|
||||
*/
|
||||
if (dev_is_removable(&adev->pdev->dev)) {
|
||||
for (i = 0; i < 1000; i++) {
|
||||
msg = RREG32(mmMP0_SMN_C2PMSG_33);
|
||||
if (msg & 0x80000000)
|
||||
break;
|
||||
msleep(1);
|
||||
}
|
||||
|
||||
for (i = 0; i < 1000; i++) {
|
||||
msg = RREG32(mmMP0_SMN_C2PMSG_33);
|
||||
if (msg & 0x80000000)
|
||||
break;
|
||||
usleep_range(1000, 1100);
|
||||
}
|
||||
|
||||
vram_size = (uint64_t)RREG32(mmRCC_CONFIG_MEMSIZE) << 20;
|
||||
|
||||
if (vram_size) {
|
||||
@@ -1897,6 +1901,8 @@ static int amdgpu_discovery_set_smu_ip_blocks(struct amdgpu_device *adev)
|
||||
break;
|
||||
case IP_VERSION(14, 0, 0):
|
||||
case IP_VERSION(14, 0, 1):
|
||||
case IP_VERSION(14, 0, 2):
|
||||
case IP_VERSION(14, 0, 3):
|
||||
amdgpu_device_ip_block_add(adev, &smu_v14_0_ip_block);
|
||||
break;
|
||||
default:
|
||||
@@ -2678,6 +2684,9 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev)
|
||||
case IP_VERSION(14, 0, 1):
|
||||
adev->smuio.funcs = &smuio_v13_0_6_funcs;
|
||||
break;
|
||||
case IP_VERSION(14, 0, 2):
|
||||
adev->smuio.funcs = &smuio_v14_0_2_funcs;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -925,7 +925,7 @@ module_param_named(freesync_video, amdgpu_freesync_vid_mode, uint, 0444);
|
||||
* GPU reset method (-1 = auto (default), 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco)
|
||||
*/
|
||||
MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco/bamaco)");
|
||||
module_param_named(reset_method, amdgpu_reset_method, int, 0444);
|
||||
module_param_named(reset_method, amdgpu_reset_method, int, 0644);
|
||||
|
||||
/**
|
||||
* DOC: bad_page_threshold (int) Bad page threshold is specifies the
|
||||
@@ -2481,6 +2481,7 @@ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work)
|
||||
|
||||
/* Use a common context, just need to make sure full reset is done */
|
||||
set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
|
||||
set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
|
||||
r = amdgpu_do_asic_reset(&device_list, &reset_context);
|
||||
|
||||
if (r) {
|
||||
@@ -2744,7 +2745,8 @@ static int amdgpu_pmops_runtime_suspend(struct device *dev)
|
||||
drm_dev->switch_power_state = DRM_SWITCH_POWER_DYNAMIC_OFF;
|
||||
} else if (adev->pm.rpm_mode == AMDGPU_RUNPM_BOCO) {
|
||||
/* nothing to do */
|
||||
} else if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
|
||||
} else if ((adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) ||
|
||||
(adev->pm.rpm_mode == AMDGPU_RUNPM_BAMACO)) {
|
||||
amdgpu_device_baco_enter(drm_dev);
|
||||
}
|
||||
|
||||
@@ -2784,7 +2786,8 @@ static int amdgpu_pmops_runtime_resume(struct device *dev)
|
||||
* PCI core handles it for _PR3.
|
||||
*/
|
||||
pci_set_master(pdev);
|
||||
} else if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
|
||||
} else if ((adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) ||
|
||||
(adev->pm.rpm_mode == AMDGPU_RUNPM_BAMACO)) {
|
||||
amdgpu_device_baco_exit(drm_dev);
|
||||
}
|
||||
ret = amdgpu_device_resume(drm_dev, false);
|
||||
|
||||
@@ -1206,7 +1206,8 @@ void amdgpu_gfx_cp_init_microcode(struct amdgpu_device *adev,
|
||||
fw_size = le32_to_cpu(cp_hdr_v2_0->data_size_bytes);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
dev_err(adev->dev, "Invalid ucode id %u\n", ucode_id);
|
||||
return;
|
||||
}
|
||||
|
||||
if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) {
|
||||
|
||||
@@ -259,7 +259,6 @@ struct amdgpu_cu_info {
|
||||
struct amdgpu_gfx_ras {
|
||||
struct amdgpu_ras_block_object ras_block;
|
||||
void (*enable_watchdog_timer)(struct amdgpu_device *adev);
|
||||
bool (*query_utcl2_poison_status)(struct amdgpu_device *adev);
|
||||
int (*rlc_gc_fed_irq)(struct amdgpu_device *adev,
|
||||
struct amdgpu_irq_src *source,
|
||||
struct amdgpu_iv_entry *entry);
|
||||
@@ -434,6 +433,10 @@ struct amdgpu_gfx {
|
||||
uint32_t num_xcc_per_xcp;
|
||||
struct mutex partition_mutex;
|
||||
bool mcbp; /* mid command buffer preemption */
|
||||
|
||||
/* IP reg dump */
|
||||
uint32_t *ip_dump;
|
||||
uint32_t reg_count;
|
||||
};
|
||||
|
||||
struct amdgpu_gfx_ras_reg_entry {
|
||||
|
||||
@@ -38,6 +38,8 @@ struct amdgpu_gfxhub_funcs {
|
||||
void (*mode2_save_regs)(struct amdgpu_device *adev);
|
||||
void (*mode2_restore_regs)(struct amdgpu_device *adev);
|
||||
void (*halt)(struct amdgpu_device *adev);
|
||||
bool (*query_utcl2_poison_status)(struct amdgpu_device *adev,
|
||||
int xcc_id);
|
||||
};
|
||||
|
||||
struct amdgpu_gfxhub {
|
||||
|
||||
@@ -279,7 +279,7 @@ amdgpu_i2c_lookup(struct amdgpu_device *adev,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void amdgpu_i2c_get_byte(struct amdgpu_i2c_chan *i2c_bus,
|
||||
static int amdgpu_i2c_get_byte(struct amdgpu_i2c_chan *i2c_bus,
|
||||
u8 slave_addr,
|
||||
u8 addr,
|
||||
u8 *val)
|
||||
@@ -304,16 +304,18 @@ static void amdgpu_i2c_get_byte(struct amdgpu_i2c_chan *i2c_bus,
|
||||
out_buf[0] = addr;
|
||||
out_buf[1] = 0;
|
||||
|
||||
if (i2c_transfer(&i2c_bus->adapter, msgs, 2) == 2) {
|
||||
*val = in_buf[0];
|
||||
DRM_DEBUG("val = 0x%02x\n", *val);
|
||||
} else {
|
||||
DRM_DEBUG("i2c 0x%02x 0x%02x read failed\n",
|
||||
addr, *val);
|
||||
if (i2c_transfer(&i2c_bus->adapter, msgs, 2) != 2) {
|
||||
DRM_DEBUG("i2c 0x%02x read failed\n", addr);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
*val = in_buf[0];
|
||||
DRM_DEBUG("val = 0x%02x\n", *val);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void amdgpu_i2c_put_byte(struct amdgpu_i2c_chan *i2c_bus,
|
||||
static int amdgpu_i2c_put_byte(struct amdgpu_i2c_chan *i2c_bus,
|
||||
u8 slave_addr,
|
||||
u8 addr,
|
||||
u8 val)
|
||||
@@ -329,9 +331,12 @@ static void amdgpu_i2c_put_byte(struct amdgpu_i2c_chan *i2c_bus,
|
||||
out_buf[0] = addr;
|
||||
out_buf[1] = val;
|
||||
|
||||
if (i2c_transfer(&i2c_bus->adapter, &msg, 1) != 1)
|
||||
DRM_DEBUG("i2c 0x%02x 0x%02x write failed\n",
|
||||
addr, val);
|
||||
if (i2c_transfer(&i2c_bus->adapter, &msg, 1) != 1) {
|
||||
DRM_DEBUG("i2c 0x%02x 0x%02x write failed\n", addr, val);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ddc router switching */
|
||||
@@ -346,16 +351,18 @@ amdgpu_i2c_router_select_ddc_port(const struct amdgpu_connector *amdgpu_connecto
|
||||
if (!amdgpu_connector->router_bus)
|
||||
return;
|
||||
|
||||
amdgpu_i2c_get_byte(amdgpu_connector->router_bus,
|
||||
if (amdgpu_i2c_get_byte(amdgpu_connector->router_bus,
|
||||
amdgpu_connector->router.i2c_addr,
|
||||
0x3, &val);
|
||||
0x3, &val))
|
||||
return;
|
||||
val &= ~amdgpu_connector->router.ddc_mux_control_pin;
|
||||
amdgpu_i2c_put_byte(amdgpu_connector->router_bus,
|
||||
amdgpu_connector->router.i2c_addr,
|
||||
0x3, val);
|
||||
amdgpu_i2c_get_byte(amdgpu_connector->router_bus,
|
||||
if (amdgpu_i2c_get_byte(amdgpu_connector->router_bus,
|
||||
amdgpu_connector->router.i2c_addr,
|
||||
0x1, &val);
|
||||
0x1, &val))
|
||||
return;
|
||||
val &= ~amdgpu_connector->router.ddc_mux_control_pin;
|
||||
val |= amdgpu_connector->router.ddc_mux_state;
|
||||
amdgpu_i2c_put_byte(amdgpu_connector->router_bus,
|
||||
@@ -375,16 +382,18 @@ amdgpu_i2c_router_select_cd_port(const struct amdgpu_connector *amdgpu_connector
|
||||
if (!amdgpu_connector->router_bus)
|
||||
return;
|
||||
|
||||
amdgpu_i2c_get_byte(amdgpu_connector->router_bus,
|
||||
if (amdgpu_i2c_get_byte(amdgpu_connector->router_bus,
|
||||
amdgpu_connector->router.i2c_addr,
|
||||
0x3, &val);
|
||||
0x3, &val))
|
||||
return;
|
||||
val &= ~amdgpu_connector->router.cd_mux_control_pin;
|
||||
amdgpu_i2c_put_byte(amdgpu_connector->router_bus,
|
||||
amdgpu_connector->router.i2c_addr,
|
||||
0x3, val);
|
||||
amdgpu_i2c_get_byte(amdgpu_connector->router_bus,
|
||||
if (amdgpu_i2c_get_byte(amdgpu_connector->router_bus,
|
||||
amdgpu_connector->router.i2c_addr,
|
||||
0x1, &val);
|
||||
0x1, &val))
|
||||
return;
|
||||
val &= ~amdgpu_connector->router.cd_mux_control_pin;
|
||||
val |= amdgpu_connector->router.cd_mux_state;
|
||||
amdgpu_i2c_put_byte(amdgpu_connector->router_bus,
|
||||
|
||||
@@ -445,6 +445,14 @@ void amdgpu_irq_dispatch(struct amdgpu_device *adev,
|
||||
|
||||
entry.ih = ih;
|
||||
entry.iv_entry = (const uint32_t *)&ih->ring[ring_index];
|
||||
|
||||
/*
|
||||
* timestamp is not supported on some legacy SOCs (cik, cz, iceland,
|
||||
* si and tonga), so initialize timestamp and timestamp_src to 0
|
||||
*/
|
||||
entry.timestamp = 0;
|
||||
entry.timestamp_src = 0;
|
||||
|
||||
amdgpu_ih_decode_iv(adev, &entry);
|
||||
|
||||
trace_amdgpu_iv(ih - &adev->irq.ih, &entry);
|
||||
|
||||
@@ -149,38 +149,7 @@ int amdgpu_driver_load_kms(struct amdgpu_device *adev, unsigned long flags)
|
||||
goto out;
|
||||
}
|
||||
|
||||
adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
|
||||
if (amdgpu_device_supports_px(dev) &&
|
||||
(amdgpu_runtime_pm != 0)) { /* enable PX as runtime mode */
|
||||
adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
|
||||
dev_info(adev->dev, "Using ATPX for runtime pm\n");
|
||||
} else if (amdgpu_device_supports_boco(dev) &&
|
||||
(amdgpu_runtime_pm != 0)) { /* enable boco as runtime mode */
|
||||
adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
|
||||
dev_info(adev->dev, "Using BOCO for runtime pm\n");
|
||||
} else if (amdgpu_device_supports_baco(dev) &&
|
||||
(amdgpu_runtime_pm != 0)) {
|
||||
switch (adev->asic_type) {
|
||||
case CHIP_VEGA20:
|
||||
case CHIP_ARCTURUS:
|
||||
/* enable BACO as runpm mode if runpm=1 */
|
||||
if (amdgpu_runtime_pm > 0)
|
||||
adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
|
||||
break;
|
||||
case CHIP_VEGA10:
|
||||
/* enable BACO as runpm mode if noretry=0 */
|
||||
if (!adev->gmc.noretry)
|
||||
adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
|
||||
break;
|
||||
default:
|
||||
/* enable BACO as runpm mode on CI+ */
|
||||
adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
|
||||
break;
|
||||
}
|
||||
|
||||
if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO)
|
||||
dev_info(adev->dev, "Using BACO for runtime pm\n");
|
||||
}
|
||||
amdgpu_device_detect_runtime_pm_mode(adev);
|
||||
|
||||
/* Call ACPI methods: require modeset init
|
||||
* but failure is not fatal
|
||||
|
||||
@@ -210,22 +210,26 @@ int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
|
||||
static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry)
|
||||
static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry,
|
||||
struct ras_query_context *qctx)
|
||||
{
|
||||
dev_info(adev->dev, HW_ERR "Accelerator Check Architecture events logged\n");
|
||||
dev_info(adev->dev, HW_ERR "aca entry[%02d].STATUS=0x%016llx\n",
|
||||
idx, entry->regs[MCA_REG_IDX_STATUS]);
|
||||
dev_info(adev->dev, HW_ERR "aca entry[%02d].ADDR=0x%016llx\n",
|
||||
idx, entry->regs[MCA_REG_IDX_ADDR]);
|
||||
dev_info(adev->dev, HW_ERR "aca entry[%02d].MISC0=0x%016llx\n",
|
||||
idx, entry->regs[MCA_REG_IDX_MISC0]);
|
||||
dev_info(adev->dev, HW_ERR "aca entry[%02d].IPID=0x%016llx\n",
|
||||
idx, entry->regs[MCA_REG_IDX_IPID]);
|
||||
dev_info(adev->dev, HW_ERR "aca entry[%02d].SYND=0x%016llx\n",
|
||||
idx, entry->regs[MCA_REG_IDX_SYND]);
|
||||
u64 event_id = qctx->event_id;
|
||||
|
||||
RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n");
|
||||
RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].STATUS=0x%016llx\n",
|
||||
idx, entry->regs[MCA_REG_IDX_STATUS]);
|
||||
RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].ADDR=0x%016llx\n",
|
||||
idx, entry->regs[MCA_REG_IDX_ADDR]);
|
||||
RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].MISC0=0x%016llx\n",
|
||||
idx, entry->regs[MCA_REG_IDX_MISC0]);
|
||||
RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].IPID=0x%016llx\n",
|
||||
idx, entry->regs[MCA_REG_IDX_IPID]);
|
||||
RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].SYND=0x%016llx\n",
|
||||
idx, entry->regs[MCA_REG_IDX_SYND]);
|
||||
}
|
||||
|
||||
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data)
|
||||
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
|
||||
struct ras_err_data *err_data, struct ras_query_context *qctx)
|
||||
{
|
||||
struct amdgpu_smuio_mcm_config_info mcm_info;
|
||||
struct ras_err_addr err_addr = {0};
|
||||
@@ -244,7 +248,7 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo
|
||||
list_for_each_entry(node, &mca_set.list, node) {
|
||||
entry = &node->entry;
|
||||
|
||||
amdgpu_mca_smu_mca_bank_dump(adev, i++, entry);
|
||||
amdgpu_mca_smu_mca_bank_dump(adev, i++, entry, qctx);
|
||||
|
||||
count = 0;
|
||||
ret = amdgpu_mca_smu_parse_mca_error_count(adev, blk, type, entry, &count);
|
||||
|
||||
@@ -169,6 +169,7 @@ void amdgpu_mca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root
|
||||
void amdgpu_mca_bank_set_init(struct mca_bank_set *mca_set);
|
||||
int amdgpu_mca_bank_set_add_entry(struct mca_bank_set *mca_set, struct mca_bank_entry *entry);
|
||||
void amdgpu_mca_bank_set_release(struct mca_bank_set *mca_set);
|
||||
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data);
|
||||
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
|
||||
struct ras_err_data *err_data, struct ras_query_context *qctx);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -32,6 +32,18 @@
|
||||
#define AMDGPU_MES_MAX_NUM_OF_QUEUES_PER_PROCESS 1024
|
||||
#define AMDGPU_ONE_DOORBELL_SIZE 8
|
||||
|
||||
signed long amdgpu_mes_fence_wait_polling(u64 *fence,
|
||||
u64 wait_seq,
|
||||
signed long timeout)
|
||||
{
|
||||
|
||||
while ((s64)(wait_seq - *fence) > 0 && timeout > 0) {
|
||||
udelay(2);
|
||||
timeout -= 2;
|
||||
}
|
||||
return timeout > 0 ? timeout : 0;
|
||||
}
|
||||
|
||||
int amdgpu_mes_doorbell_process_slice(struct amdgpu_device *adev)
|
||||
{
|
||||
return roundup(AMDGPU_ONE_DOORBELL_SIZE *
|
||||
@@ -40,7 +52,6 @@ int amdgpu_mes_doorbell_process_slice(struct amdgpu_device *adev)
|
||||
}
|
||||
|
||||
static int amdgpu_mes_kernel_doorbell_get(struct amdgpu_device *adev,
|
||||
struct amdgpu_mes_process *process,
|
||||
int ip_type, uint64_t *doorbell_index)
|
||||
{
|
||||
unsigned int offset, found;
|
||||
@@ -65,7 +76,6 @@ static int amdgpu_mes_kernel_doorbell_get(struct amdgpu_device *adev,
|
||||
}
|
||||
|
||||
static void amdgpu_mes_kernel_doorbell_free(struct amdgpu_device *adev,
|
||||
struct amdgpu_mes_process *process,
|
||||
uint32_t doorbell_index)
|
||||
{
|
||||
unsigned int old, rel_index;
|
||||
@@ -656,7 +666,7 @@ int amdgpu_mes_add_hw_queue(struct amdgpu_device *adev, int gang_id,
|
||||
*queue_id = queue->queue_id = r;
|
||||
|
||||
/* allocate a doorbell index for the queue */
|
||||
r = amdgpu_mes_kernel_doorbell_get(adev, gang->process,
|
||||
r = amdgpu_mes_kernel_doorbell_get(adev,
|
||||
qprops->queue_type,
|
||||
&qprops->doorbell_off);
|
||||
if (r)
|
||||
@@ -714,8 +724,7 @@ int amdgpu_mes_add_hw_queue(struct amdgpu_device *adev, int gang_id,
|
||||
return 0;
|
||||
|
||||
clean_up_doorbell:
|
||||
amdgpu_mes_kernel_doorbell_free(adev, gang->process,
|
||||
qprops->doorbell_off);
|
||||
amdgpu_mes_kernel_doorbell_free(adev, qprops->doorbell_off);
|
||||
clean_up_queue_id:
|
||||
spin_lock_irqsave(&adev->mes.queue_id_lock, flags);
|
||||
idr_remove(&adev->mes.queue_id_idr, queue->queue_id);
|
||||
@@ -769,8 +778,7 @@ int amdgpu_mes_remove_hw_queue(struct amdgpu_device *adev, int queue_id)
|
||||
queue_id);
|
||||
|
||||
list_del(&queue->list);
|
||||
amdgpu_mes_kernel_doorbell_free(adev, gang->process,
|
||||
queue->doorbell_off);
|
||||
amdgpu_mes_kernel_doorbell_free(adev, queue->doorbell_off);
|
||||
amdgpu_mes_unlock(&adev->mes);
|
||||
|
||||
amdgpu_mes_queue_free_mqd(queue);
|
||||
@@ -778,6 +786,28 @@ int amdgpu_mes_remove_hw_queue(struct amdgpu_device *adev, int queue_id)
|
||||
return 0;
|
||||
}
|
||||
|
||||
int amdgpu_mes_map_legacy_queue(struct amdgpu_device *adev,
|
||||
struct amdgpu_ring *ring)
|
||||
{
|
||||
struct mes_map_legacy_queue_input queue_input;
|
||||
int r;
|
||||
|
||||
memset(&queue_input, 0, sizeof(queue_input));
|
||||
|
||||
queue_input.queue_type = ring->funcs->type;
|
||||
queue_input.doorbell_offset = ring->doorbell_index;
|
||||
queue_input.pipe_id = ring->pipe;
|
||||
queue_input.queue_id = ring->queue;
|
||||
queue_input.mqd_addr = amdgpu_bo_gpu_offset(ring->mqd_obj);
|
||||
queue_input.wptr_addr = ring->wptr_gpu_addr;
|
||||
|
||||
r = adev->mes.funcs->map_legacy_queue(&adev->mes, &queue_input);
|
||||
if (r)
|
||||
DRM_ERROR("failed to map legacy queue\n");
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
int amdgpu_mes_unmap_legacy_queue(struct amdgpu_device *adev,
|
||||
struct amdgpu_ring *ring,
|
||||
enum amdgpu_unmap_queues_action action,
|
||||
@@ -1475,7 +1505,7 @@ int amdgpu_mes_init_microcode(struct amdgpu_device *adev, int pipe)
|
||||
const struct mes_firmware_header_v1_0 *mes_hdr;
|
||||
struct amdgpu_firmware_info *info;
|
||||
char ucode_prefix[30];
|
||||
char fw_name[40];
|
||||
char fw_name[50];
|
||||
bool need_retry = false;
|
||||
int r;
|
||||
|
||||
|
||||
@@ -141,6 +141,12 @@ struct amdgpu_mes {
|
||||
|
||||
/* ip specific functions */
|
||||
const struct amdgpu_mes_funcs *funcs;
|
||||
|
||||
/* mes resource_1 bo*/
|
||||
struct amdgpu_bo *resource_1;
|
||||
uint64_t resource_1_gpu_addr;
|
||||
void *resource_1_addr;
|
||||
|
||||
};
|
||||
|
||||
struct amdgpu_mes_process {
|
||||
@@ -242,6 +248,15 @@ struct mes_remove_queue_input {
|
||||
uint64_t gang_context_addr;
|
||||
};
|
||||
|
||||
struct mes_map_legacy_queue_input {
|
||||
uint32_t queue_type;
|
||||
uint32_t doorbell_offset;
|
||||
uint32_t pipe_id;
|
||||
uint32_t queue_id;
|
||||
uint64_t mqd_addr;
|
||||
uint64_t wptr_addr;
|
||||
};
|
||||
|
||||
struct mes_unmap_legacy_queue_input {
|
||||
enum amdgpu_unmap_queues_action action;
|
||||
uint32_t queue_type;
|
||||
@@ -318,6 +333,9 @@ struct amdgpu_mes_funcs {
|
||||
int (*remove_hw_queue)(struct amdgpu_mes *mes,
|
||||
struct mes_remove_queue_input *input);
|
||||
|
||||
int (*map_legacy_queue)(struct amdgpu_mes *mes,
|
||||
struct mes_map_legacy_queue_input *input);
|
||||
|
||||
int (*unmap_legacy_queue)(struct amdgpu_mes *mes,
|
||||
struct mes_unmap_legacy_queue_input *input);
|
||||
|
||||
@@ -334,6 +352,10 @@ struct amdgpu_mes_funcs {
|
||||
#define amdgpu_mes_kiq_hw_init(adev) (adev)->mes.kiq_hw_init((adev))
|
||||
#define amdgpu_mes_kiq_hw_fini(adev) (adev)->mes.kiq_hw_fini((adev))
|
||||
|
||||
signed long amdgpu_mes_fence_wait_polling(u64 *fence,
|
||||
u64 wait_seq,
|
||||
signed long timeout);
|
||||
|
||||
int amdgpu_mes_ctx_get_offs(struct amdgpu_ring *ring, unsigned int id_offs);
|
||||
|
||||
int amdgpu_mes_init_microcode(struct amdgpu_device *adev, int pipe);
|
||||
@@ -357,6 +379,8 @@ int amdgpu_mes_add_hw_queue(struct amdgpu_device *adev, int gang_id,
|
||||
int *queue_id);
|
||||
int amdgpu_mes_remove_hw_queue(struct amdgpu_device *adev, int queue_id);
|
||||
|
||||
int amdgpu_mes_map_legacy_queue(struct amdgpu_device *adev,
|
||||
struct amdgpu_ring *ring);
|
||||
int amdgpu_mes_unmap_legacy_queue(struct amdgpu_device *adev,
|
||||
struct amdgpu_ring *ring,
|
||||
enum amdgpu_unmap_queues_action action,
|
||||
|
||||
@@ -63,6 +63,8 @@ struct amdgpu_mmhub_funcs {
|
||||
uint64_t page_table_base);
|
||||
void (*update_power_gating)(struct amdgpu_device *adev,
|
||||
bool enable);
|
||||
bool (*query_utcl2_poison_status)(struct amdgpu_device *adev,
|
||||
int hub_inst);
|
||||
};
|
||||
|
||||
struct amdgpu_mmhub {
|
||||
|
||||
@@ -39,6 +39,7 @@
|
||||
#include "amdgpu.h"
|
||||
#include "amdgpu_trace.h"
|
||||
#include "amdgpu_amdkfd.h"
|
||||
#include "amdgpu_vram_mgr.h"
|
||||
|
||||
/**
|
||||
* DOC: amdgpu_object
|
||||
@@ -153,8 +154,10 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)
|
||||
else
|
||||
places[c].flags |= TTM_PL_FLAG_TOPDOWN;
|
||||
|
||||
if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
|
||||
if (abo->tbo.type == ttm_bo_type_kernel &&
|
||||
flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
|
||||
places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
|
||||
|
||||
c++;
|
||||
}
|
||||
|
||||
@@ -173,6 +176,12 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)
|
||||
abo->flags & AMDGPU_GEM_CREATE_PREEMPTIBLE ?
|
||||
AMDGPU_PL_PREEMPT : TTM_PL_TT;
|
||||
places[c].flags = 0;
|
||||
/*
|
||||
* When GTT is just an alternative to VRAM make sure that we
|
||||
* only use it as fallback and still try to fill up VRAM first.
|
||||
*/
|
||||
if (domain & abo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM)
|
||||
places[c].flags |= TTM_PL_FLAG_FALLBACK;
|
||||
c++;
|
||||
}
|
||||
|
||||
@@ -595,8 +604,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
|
||||
if (!amdgpu_bo_support_uswc(bo->flags))
|
||||
bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC;
|
||||
|
||||
if (adev->ras_enabled)
|
||||
bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
|
||||
bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
|
||||
|
||||
bo->tbo.bdev = &adev->mman.bdev;
|
||||
if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA |
|
||||
@@ -629,7 +637,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
|
||||
bo->tbo.resource->mem_type == TTM_PL_VRAM) {
|
||||
struct dma_fence *fence;
|
||||
|
||||
r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, &fence, true);
|
||||
r = amdgpu_ttm_clear_buffer(bo, bo->tbo.base.resv, &fence);
|
||||
if (unlikely(r))
|
||||
goto fail_unreserve;
|
||||
|
||||
@@ -759,7 +767,7 @@ int amdgpu_bo_restore_shadow(struct amdgpu_bo *shadow, struct dma_fence **fence)
|
||||
|
||||
return amdgpu_copy_buffer(ring, shadow_addr, parent_addr,
|
||||
amdgpu_bo_size(shadow), NULL, fence,
|
||||
true, false, false);
|
||||
true, false, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -961,6 +969,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
|
||||
if (!bo->placements[i].lpfn ||
|
||||
(lpfn && lpfn < bo->placements[i].lpfn))
|
||||
bo->placements[i].lpfn = lpfn;
|
||||
|
||||
if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
|
||||
bo->placements[i].mem_type == TTM_PL_VRAM)
|
||||
bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
|
||||
}
|
||||
|
||||
r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
|
||||
@@ -1366,8 +1378,9 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo)
|
||||
if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv)))
|
||||
return;
|
||||
|
||||
r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, &fence, true);
|
||||
r = amdgpu_fill_buffer(abo, 0, bo->base.resv, &fence, true);
|
||||
if (!WARN_ON(r)) {
|
||||
amdgpu_vram_mgr_set_cleared(bo->resource);
|
||||
amdgpu_bo_fence(abo, fence, false);
|
||||
dma_fence_put(fence);
|
||||
}
|
||||
|
||||
@@ -1053,6 +1053,11 @@ static int psp_asd_initialize(struct psp_context *psp)
|
||||
if (amdgpu_sriov_vf(psp->adev) || !psp->asd_context.bin_desc.size_bytes)
|
||||
return 0;
|
||||
|
||||
/* bypass asd if display hardware is not available */
|
||||
if (!amdgpu_device_has_display_hardware(psp->adev) &&
|
||||
amdgpu_ip_version(psp->adev, MP0_HWIP, 0) >= IP_VERSION(13, 0, 10))
|
||||
return 0;
|
||||
|
||||
psp->asd_context.mem_context.shared_mc_addr = 0;
|
||||
psp->asd_context.mem_context.shared_mem_size = PSP_ASD_SHARED_MEM_SIZE;
|
||||
psp->asd_context.ta_load_type = GFX_CMD_ID_LOAD_ASD;
|
||||
@@ -2260,6 +2265,15 @@ static int psp_hw_start(struct psp_context *psp)
|
||||
}
|
||||
}
|
||||
|
||||
if ((is_psp_fw_valid(psp->ipkeymgr_drv)) &&
|
||||
(psp->funcs->bootloader_load_ipkeymgr_drv != NULL)) {
|
||||
ret = psp_bootloader_load_ipkeymgr_drv(psp);
|
||||
if (ret) {
|
||||
dev_err(adev->dev, "PSP load ipkeymgr_drv failed!\n");
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
if ((is_psp_fw_valid(psp->sos)) &&
|
||||
(psp->funcs->bootloader_load_sos != NULL)) {
|
||||
ret = psp_bootloader_load_sos(psp);
|
||||
@@ -2617,7 +2631,8 @@ static int psp_load_p2s_table(struct psp_context *psp)
|
||||
struct amdgpu_firmware_info *ucode =
|
||||
&adev->firmware.ucode[AMDGPU_UCODE_ID_P2S_TABLE];
|
||||
|
||||
if (adev->in_runpm && (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO))
|
||||
if (adev->in_runpm && ((adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) ||
|
||||
(adev->pm.rpm_mode == AMDGPU_RUNPM_BAMACO)))
|
||||
return 0;
|
||||
|
||||
if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) {
|
||||
@@ -2647,7 +2662,8 @@ static int psp_load_smu_fw(struct psp_context *psp)
|
||||
* Skip SMU FW reloading in case of using BACO for runpm only,
|
||||
* as SMU is always alive.
|
||||
*/
|
||||
if (adev->in_runpm && (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO))
|
||||
if (adev->in_runpm && ((adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) ||
|
||||
(adev->pm.rpm_mode == AMDGPU_RUNPM_BAMACO)))
|
||||
return 0;
|
||||
|
||||
if (!ucode->fw || amdgpu_sriov_vf(psp->adev))
|
||||
@@ -3273,6 +3289,12 @@ static int parse_sos_bin_descriptor(struct psp_context *psp,
|
||||
psp->ras_drv.size_bytes = le32_to_cpu(desc->size_bytes);
|
||||
psp->ras_drv.start_addr = ucode_start_addr;
|
||||
break;
|
||||
case PSP_FW_TYPE_PSP_IPKEYMGR_DRV:
|
||||
psp->ipkeymgr_drv.fw_version = le32_to_cpu(desc->fw_version);
|
||||
psp->ipkeymgr_drv.feature_version = le32_to_cpu(desc->fw_version);
|
||||
psp->ipkeymgr_drv.size_bytes = le32_to_cpu(desc->size_bytes);
|
||||
psp->ipkeymgr_drv.start_addr = ucode_start_addr;
|
||||
break;
|
||||
default:
|
||||
dev_warn(psp->adev->dev, "Unsupported PSP FW type: %d\n", desc->fw_type);
|
||||
break;
|
||||
|
||||
@@ -73,8 +73,10 @@ enum psp_bootloader_cmd {
|
||||
PSP_BL__LOAD_KEY_DATABASE = 0x80000,
|
||||
PSP_BL__LOAD_SOCDRV = 0xB0000,
|
||||
PSP_BL__LOAD_DBGDRV = 0xC0000,
|
||||
PSP_BL__LOAD_HADDRV = PSP_BL__LOAD_DBGDRV,
|
||||
PSP_BL__LOAD_INTFDRV = 0xD0000,
|
||||
PSP_BL__LOAD_RASDRV = 0xE0000,
|
||||
PSP_BL__LOAD_RASDRV = 0xE0000,
|
||||
PSP_BL__LOAD_IPKEYMGRDRV = 0xF0000,
|
||||
PSP_BL__DRAM_LONG_TRAIN = 0x100000,
|
||||
PSP_BL__DRAM_SHORT_TRAIN = 0x200000,
|
||||
PSP_BL__LOAD_TOS_SPL_TABLE = 0x10000000,
|
||||
@@ -117,6 +119,7 @@ struct psp_funcs {
|
||||
int (*bootloader_load_intf_drv)(struct psp_context *psp);
|
||||
int (*bootloader_load_dbg_drv)(struct psp_context *psp);
|
||||
int (*bootloader_load_ras_drv)(struct psp_context *psp);
|
||||
int (*bootloader_load_ipkeymgr_drv)(struct psp_context *psp);
|
||||
int (*bootloader_load_sos)(struct psp_context *psp);
|
||||
int (*ring_create)(struct psp_context *psp,
|
||||
enum psp_ring_type ring_type);
|
||||
@@ -336,6 +339,7 @@ struct psp_context {
|
||||
struct psp_bin_desc intf_drv;
|
||||
struct psp_bin_desc dbg_drv;
|
||||
struct psp_bin_desc ras_drv;
|
||||
struct psp_bin_desc ipkeymgr_drv;
|
||||
|
||||
/* tmr buffer */
|
||||
struct amdgpu_bo *tmr_bo;
|
||||
@@ -424,6 +428,9 @@ struct amdgpu_psp_funcs {
|
||||
#define psp_bootloader_load_ras_drv(psp) \
|
||||
((psp)->funcs->bootloader_load_ras_drv ? \
|
||||
(psp)->funcs->bootloader_load_ras_drv((psp)) : 0)
|
||||
#define psp_bootloader_load_ipkeymgr_drv(psp) \
|
||||
((psp)->funcs->bootloader_load_ipkeymgr_drv ? \
|
||||
(psp)->funcs->bootloader_load_ipkeymgr_drv((psp)) : 0)
|
||||
#define psp_bootloader_load_sos(psp) \
|
||||
((psp)->funcs->bootloader_load_sos ? (psp)->funcs->bootloader_load_sos((psp)) : 0)
|
||||
#define psp_smu_reload_quirk(psp) \
|
||||
|
||||
@@ -122,6 +122,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
|
||||
|
||||
#define MAX_UMC_POISON_POLLING_TIME_ASYNC 100 //ms
|
||||
|
||||
#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms
|
||||
|
||||
enum amdgpu_ras_retire_page_reservation {
|
||||
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
|
||||
AMDGPU_RAS_RETIRE_PAGE_PENDING,
|
||||
@@ -1045,6 +1047,7 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d
|
||||
static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
|
||||
struct ras_manager *ras_mgr,
|
||||
struct ras_err_data *err_data,
|
||||
struct ras_query_context *qctx,
|
||||
const char *blk_name,
|
||||
bool is_ue,
|
||||
bool is_de)
|
||||
@@ -1052,27 +1055,28 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
|
||||
struct amdgpu_smuio_mcm_config_info *mcm_info;
|
||||
struct ras_err_node *err_node;
|
||||
struct ras_err_info *err_info;
|
||||
u64 event_id = qctx->event_id;
|
||||
|
||||
if (is_ue) {
|
||||
for_each_ras_error(err_node, err_data) {
|
||||
err_info = &err_node->err_info;
|
||||
mcm_info = &err_info->mcm_info;
|
||||
if (err_info->ue_count) {
|
||||
dev_info(adev->dev, "socket: %d, die: %d, "
|
||||
"%lld new uncorrectable hardware errors detected in %s block\n",
|
||||
mcm_info->socket_id,
|
||||
mcm_info->die_id,
|
||||
err_info->ue_count,
|
||||
blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
|
||||
"%lld new uncorrectable hardware errors detected in %s block\n",
|
||||
mcm_info->socket_id,
|
||||
mcm_info->die_id,
|
||||
err_info->ue_count,
|
||||
blk_name);
|
||||
}
|
||||
}
|
||||
|
||||
for_each_ras_error(err_node, &ras_mgr->err_data) {
|
||||
err_info = &err_node->err_info;
|
||||
mcm_info = &err_info->mcm_info;
|
||||
dev_info(adev->dev, "socket: %d, die: %d, "
|
||||
"%lld uncorrectable hardware errors detected in total in %s block\n",
|
||||
mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
|
||||
"%lld uncorrectable hardware errors detected in total in %s block\n",
|
||||
mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name);
|
||||
}
|
||||
|
||||
} else {
|
||||
@@ -1081,44 +1085,44 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
|
||||
err_info = &err_node->err_info;
|
||||
mcm_info = &err_info->mcm_info;
|
||||
if (err_info->de_count) {
|
||||
dev_info(adev->dev, "socket: %d, die: %d, "
|
||||
"%lld new deferred hardware errors detected in %s block\n",
|
||||
mcm_info->socket_id,
|
||||
mcm_info->die_id,
|
||||
err_info->de_count,
|
||||
blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
|
||||
"%lld new deferred hardware errors detected in %s block\n",
|
||||
mcm_info->socket_id,
|
||||
mcm_info->die_id,
|
||||
err_info->de_count,
|
||||
blk_name);
|
||||
}
|
||||
}
|
||||
|
||||
for_each_ras_error(err_node, &ras_mgr->err_data) {
|
||||
err_info = &err_node->err_info;
|
||||
mcm_info = &err_info->mcm_info;
|
||||
dev_info(adev->dev, "socket: %d, die: %d, "
|
||||
"%lld deferred hardware errors detected in total in %s block\n",
|
||||
mcm_info->socket_id, mcm_info->die_id,
|
||||
err_info->de_count, blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
|
||||
"%lld deferred hardware errors detected in total in %s block\n",
|
||||
mcm_info->socket_id, mcm_info->die_id,
|
||||
err_info->de_count, blk_name);
|
||||
}
|
||||
} else {
|
||||
for_each_ras_error(err_node, err_data) {
|
||||
err_info = &err_node->err_info;
|
||||
mcm_info = &err_info->mcm_info;
|
||||
if (err_info->ce_count) {
|
||||
dev_info(adev->dev, "socket: %d, die: %d, "
|
||||
"%lld new correctable hardware errors detected in %s block\n",
|
||||
mcm_info->socket_id,
|
||||
mcm_info->die_id,
|
||||
err_info->ce_count,
|
||||
blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
|
||||
"%lld new correctable hardware errors detected in %s block\n",
|
||||
mcm_info->socket_id,
|
||||
mcm_info->die_id,
|
||||
err_info->ce_count,
|
||||
blk_name);
|
||||
}
|
||||
}
|
||||
|
||||
for_each_ras_error(err_node, &ras_mgr->err_data) {
|
||||
err_info = &err_node->err_info;
|
||||
mcm_info = &err_info->mcm_info;
|
||||
dev_info(adev->dev, "socket: %d, die: %d, "
|
||||
"%lld correctable hardware errors detected in total in %s block\n",
|
||||
mcm_info->socket_id, mcm_info->die_id,
|
||||
err_info->ce_count, blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
|
||||
"%lld correctable hardware errors detected in total in %s block\n",
|
||||
mcm_info->socket_id, mcm_info->die_id,
|
||||
err_info->ce_count, blk_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1131,77 +1135,79 @@ static inline bool err_data_has_source_info(struct ras_err_data *data)
|
||||
|
||||
static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
|
||||
struct ras_query_if *query_if,
|
||||
struct ras_err_data *err_data)
|
||||
struct ras_err_data *err_data,
|
||||
struct ras_query_context *qctx)
|
||||
{
|
||||
struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head);
|
||||
const char *blk_name = get_ras_block_str(&query_if->head);
|
||||
u64 event_id = qctx->event_id;
|
||||
|
||||
if (err_data->ce_count) {
|
||||
if (err_data_has_source_info(err_data)) {
|
||||
amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
|
||||
amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
|
||||
blk_name, false, false);
|
||||
} else if (!adev->aid_mask &&
|
||||
adev->smuio.funcs &&
|
||||
adev->smuio.funcs->get_socket_id &&
|
||||
adev->smuio.funcs->get_die_id) {
|
||||
dev_info(adev->dev, "socket: %d, die: %d "
|
||||
"%ld correctable hardware errors "
|
||||
"detected in %s block\n",
|
||||
adev->smuio.funcs->get_socket_id(adev),
|
||||
adev->smuio.funcs->get_die_id(adev),
|
||||
ras_mgr->err_data.ce_count,
|
||||
blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
|
||||
"%ld correctable hardware errors "
|
||||
"detected in %s block\n",
|
||||
adev->smuio.funcs->get_socket_id(adev),
|
||||
adev->smuio.funcs->get_die_id(adev),
|
||||
ras_mgr->err_data.ce_count,
|
||||
blk_name);
|
||||
} else {
|
||||
dev_info(adev->dev, "%ld correctable hardware errors "
|
||||
"detected in %s block\n",
|
||||
ras_mgr->err_data.ce_count,
|
||||
blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "%ld correctable hardware errors "
|
||||
"detected in %s block\n",
|
||||
ras_mgr->err_data.ce_count,
|
||||
blk_name);
|
||||
}
|
||||
}
|
||||
|
||||
if (err_data->ue_count) {
|
||||
if (err_data_has_source_info(err_data)) {
|
||||
amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
|
||||
amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
|
||||
blk_name, true, false);
|
||||
} else if (!adev->aid_mask &&
|
||||
adev->smuio.funcs &&
|
||||
adev->smuio.funcs->get_socket_id &&
|
||||
adev->smuio.funcs->get_die_id) {
|
||||
dev_info(adev->dev, "socket: %d, die: %d "
|
||||
"%ld uncorrectable hardware errors "
|
||||
"detected in %s block\n",
|
||||
adev->smuio.funcs->get_socket_id(adev),
|
||||
adev->smuio.funcs->get_die_id(adev),
|
||||
ras_mgr->err_data.ue_count,
|
||||
blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
|
||||
"%ld uncorrectable hardware errors "
|
||||
"detected in %s block\n",
|
||||
adev->smuio.funcs->get_socket_id(adev),
|
||||
adev->smuio.funcs->get_die_id(adev),
|
||||
ras_mgr->err_data.ue_count,
|
||||
blk_name);
|
||||
} else {
|
||||
dev_info(adev->dev, "%ld uncorrectable hardware errors "
|
||||
"detected in %s block\n",
|
||||
ras_mgr->err_data.ue_count,
|
||||
blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "%ld uncorrectable hardware errors "
|
||||
"detected in %s block\n",
|
||||
ras_mgr->err_data.ue_count,
|
||||
blk_name);
|
||||
}
|
||||
}
|
||||
|
||||
if (err_data->de_count) {
|
||||
if (err_data_has_source_info(err_data)) {
|
||||
amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
|
||||
amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
|
||||
blk_name, false, true);
|
||||
} else if (!adev->aid_mask &&
|
||||
adev->smuio.funcs &&
|
||||
adev->smuio.funcs->get_socket_id &&
|
||||
adev->smuio.funcs->get_die_id) {
|
||||
dev_info(adev->dev, "socket: %d, die: %d "
|
||||
"%ld deferred hardware errors "
|
||||
"detected in %s block\n",
|
||||
adev->smuio.funcs->get_socket_id(adev),
|
||||
adev->smuio.funcs->get_die_id(adev),
|
||||
ras_mgr->err_data.de_count,
|
||||
blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
|
||||
"%ld deferred hardware errors "
|
||||
"detected in %s block\n",
|
||||
adev->smuio.funcs->get_socket_id(adev),
|
||||
adev->smuio.funcs->get_die_id(adev),
|
||||
ras_mgr->err_data.de_count,
|
||||
blk_name);
|
||||
} else {
|
||||
dev_info(adev->dev, "%ld deferred hardware errors "
|
||||
"detected in %s block\n",
|
||||
ras_mgr->err_data.de_count,
|
||||
blk_name);
|
||||
RAS_EVENT_LOG(adev, event_id, "%ld deferred hardware errors "
|
||||
"detected in %s block\n",
|
||||
ras_mgr->err_data.de_count,
|
||||
blk_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1244,6 +1250,10 @@ int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
|
||||
{
|
||||
struct ras_manager *obj;
|
||||
|
||||
/* in resume phase, no need to create aca fs node */
|
||||
if (adev->in_suspend || amdgpu_in_reset(adev))
|
||||
return 0;
|
||||
|
||||
obj = get_ras_manager(adev, blk);
|
||||
if (!obj)
|
||||
return -EINVAL;
|
||||
@@ -1265,7 +1275,8 @@ int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
|
||||
}
|
||||
|
||||
static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
|
||||
enum aca_error_type type, struct ras_err_data *err_data)
|
||||
enum aca_error_type type, struct ras_err_data *err_data,
|
||||
struct ras_query_context *qctx)
|
||||
{
|
||||
struct ras_manager *obj;
|
||||
|
||||
@@ -1273,7 +1284,7 @@ static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu
|
||||
if (!obj)
|
||||
return -EINVAL;
|
||||
|
||||
return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data);
|
||||
return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data, qctx);
|
||||
}
|
||||
|
||||
ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr,
|
||||
@@ -1287,13 +1298,14 @@ ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *a
|
||||
if (amdgpu_ras_query_error_status(obj->adev, &info))
|
||||
return -EINVAL;
|
||||
|
||||
return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
|
||||
"ce", info.ce_count);
|
||||
return sysfs_emit(buf, "%s: %lu\n%s: %lu\n%s: %lu\n", "ue", info.ue_count,
|
||||
"ce", info.ce_count, "de", info.ue_count);
|
||||
}
|
||||
|
||||
static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
|
||||
struct ras_query_if *info,
|
||||
struct ras_err_data *err_data,
|
||||
struct ras_query_context *qctx,
|
||||
unsigned int error_query_mode)
|
||||
{
|
||||
enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
|
||||
@@ -1329,17 +1341,21 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
|
||||
}
|
||||
} else {
|
||||
if (amdgpu_aca_is_enabled(adev)) {
|
||||
ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data);
|
||||
ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data, qctx);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data);
|
||||
ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data, qctx);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_DEFERRED, err_data, qctx);
|
||||
if (ret)
|
||||
return ret;
|
||||
} else {
|
||||
/* FIXME: add code to check return value later */
|
||||
amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data);
|
||||
amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data);
|
||||
amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data, qctx);
|
||||
amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data, qctx);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1351,6 +1367,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i
|
||||
{
|
||||
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
|
||||
struct ras_err_data err_data;
|
||||
struct ras_query_context qctx;
|
||||
unsigned int error_query_mode;
|
||||
int ret;
|
||||
|
||||
@@ -1364,8 +1381,12 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i
|
||||
if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode))
|
||||
return -EINVAL;
|
||||
|
||||
memset(&qctx, 0, sizeof(qctx));
|
||||
qctx.event_id = amdgpu_ras_acquire_event_id(adev, amdgpu_ras_intr_triggered() ?
|
||||
RAS_EVENT_TYPE_ISR : RAS_EVENT_TYPE_INVALID);
|
||||
ret = amdgpu_ras_query_error_status_helper(adev, info,
|
||||
&err_data,
|
||||
&qctx,
|
||||
error_query_mode);
|
||||
if (ret)
|
||||
goto out_fini_err_data;
|
||||
@@ -1376,7 +1397,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i
|
||||
info->ce_count = obj->err_data.ce_count;
|
||||
info->de_count = obj->err_data.de_count;
|
||||
|
||||
amdgpu_ras_error_generate_report(adev, info, &err_data);
|
||||
amdgpu_ras_error_generate_report(adev, info, &err_data, &qctx);
|
||||
|
||||
out_fini_err_data:
|
||||
amdgpu_ras_error_data_fini(&err_data);
|
||||
@@ -2041,7 +2062,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
|
||||
}
|
||||
}
|
||||
|
||||
amdgpu_umc_poison_handler(adev, obj->head.block, false);
|
||||
amdgpu_umc_poison_handler(adev, obj->head.block, 0);
|
||||
|
||||
if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
|
||||
poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
|
||||
@@ -2061,6 +2082,17 @@ static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj
|
||||
{
|
||||
dev_info(obj->adev->dev,
|
||||
"Poison is created\n");
|
||||
|
||||
if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) {
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev);
|
||||
|
||||
amdgpu_ras_put_poison_req(obj->adev,
|
||||
AMDGPU_RAS_BLOCK__UMC, 0, NULL, NULL, false);
|
||||
|
||||
atomic_inc(&con->page_retirement_req_cnt);
|
||||
|
||||
wake_up(&con->page_retirement_wq);
|
||||
}
|
||||
}
|
||||
|
||||
static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
|
||||
@@ -2371,7 +2403,7 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
|
||||
.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
|
||||
};
|
||||
status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
|
||||
data->bps[i].retired_page);
|
||||
data->bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT);
|
||||
if (status == -EBUSY)
|
||||
(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
|
||||
else if (status == -ENOENT)
|
||||
@@ -2384,6 +2416,19 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev,
|
||||
struct amdgpu_hive_info *hive, bool status)
|
||||
{
|
||||
struct amdgpu_device *tmp_adev;
|
||||
|
||||
if (hive) {
|
||||
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
|
||||
amdgpu_ras_set_fed(tmp_adev, status);
|
||||
} else {
|
||||
amdgpu_ras_set_fed(adev, status);
|
||||
}
|
||||
}
|
||||
|
||||
static void amdgpu_ras_do_recovery(struct work_struct *work)
|
||||
{
|
||||
struct amdgpu_ras *ras =
|
||||
@@ -2393,8 +2438,21 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
|
||||
struct list_head device_list, *device_list_handle = NULL;
|
||||
struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
|
||||
|
||||
if (hive)
|
||||
if (hive) {
|
||||
atomic_set(&hive->ras_recovery, 1);
|
||||
|
||||
/* If any device which is part of the hive received RAS fatal
|
||||
* error interrupt, set fatal error status on all. This
|
||||
* condition will need a recovery, and flag will be cleared
|
||||
* as part of recovery.
|
||||
*/
|
||||
list_for_each_entry(remote_adev, &hive->device_list,
|
||||
gmc.xgmi.head)
|
||||
if (amdgpu_ras_get_fed_status(remote_adev)) {
|
||||
amdgpu_ras_set_fed_all(adev, hive, true);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!ras->disable_ras_err_cnt_harvest) {
|
||||
|
||||
/* Build list of devices to query RAS related errors */
|
||||
@@ -2439,18 +2497,6 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
|
||||
ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
|
||||
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
||||
|
||||
/* For any RAS error that needs a full reset to
|
||||
* recover, set the fatal error status
|
||||
*/
|
||||
if (hive) {
|
||||
list_for_each_entry(remote_adev,
|
||||
&hive->device_list,
|
||||
gmc.xgmi.head)
|
||||
amdgpu_ras_set_fed(remote_adev,
|
||||
true);
|
||||
} else {
|
||||
amdgpu_ras_set_fed(adev, true);
|
||||
}
|
||||
psp_fatal_error_recovery_quirk(&adev->psp);
|
||||
}
|
||||
}
|
||||
@@ -2516,9 +2562,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
|
||||
goto out;
|
||||
}
|
||||
|
||||
amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
|
||||
bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
|
||||
AMDGPU_GPU_PAGE_SIZE);
|
||||
amdgpu_ras_reserve_page(adev, bps[i].retired_page);
|
||||
|
||||
memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
|
||||
data->count++;
|
||||
@@ -2674,10 +2718,167 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
|
||||
}
|
||||
}
|
||||
|
||||
int amdgpu_ras_put_poison_req(struct amdgpu_device *adev,
|
||||
enum amdgpu_ras_block block, uint16_t pasid,
|
||||
pasid_notify pasid_fn, void *data, uint32_t reset)
|
||||
{
|
||||
int ret = 0;
|
||||
struct ras_poison_msg poison_msg;
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
|
||||
memset(&poison_msg, 0, sizeof(poison_msg));
|
||||
poison_msg.block = block;
|
||||
poison_msg.pasid = pasid;
|
||||
poison_msg.reset = reset;
|
||||
poison_msg.pasid_fn = pasid_fn;
|
||||
poison_msg.data = data;
|
||||
|
||||
ret = kfifo_put(&con->poison_fifo, poison_msg);
|
||||
if (!ret) {
|
||||
dev_err(adev->dev, "Poison message fifo is full!\n");
|
||||
return -ENOSPC;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int amdgpu_ras_get_poison_req(struct amdgpu_device *adev,
|
||||
struct ras_poison_msg *poison_msg)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
|
||||
return kfifo_get(&con->poison_fifo, poison_msg);
|
||||
}
|
||||
|
||||
static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)
|
||||
{
|
||||
mutex_init(&ecc_log->lock);
|
||||
|
||||
/* Set any value as siphash key */
|
||||
memset(&ecc_log->ecc_key, 0xad, sizeof(ecc_log->ecc_key));
|
||||
|
||||
INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
|
||||
ecc_log->de_updated = false;
|
||||
}
|
||||
|
||||
static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
|
||||
{
|
||||
struct radix_tree_iter iter;
|
||||
void __rcu **slot;
|
||||
struct ras_ecc_err *ecc_err;
|
||||
|
||||
mutex_lock(&ecc_log->lock);
|
||||
radix_tree_for_each_slot(slot, &ecc_log->de_page_tree, &iter, 0) {
|
||||
ecc_err = radix_tree_deref_slot(slot);
|
||||
kfree(ecc_err->err_pages.pfn);
|
||||
kfree(ecc_err);
|
||||
radix_tree_iter_delete(&ecc_log->de_page_tree, &iter, slot);
|
||||
}
|
||||
mutex_unlock(&ecc_log->lock);
|
||||
|
||||
mutex_destroy(&ecc_log->lock);
|
||||
ecc_log->de_updated = false;
|
||||
}
|
||||
|
||||
static void amdgpu_ras_do_page_retirement(struct work_struct *work)
|
||||
{
|
||||
struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
|
||||
page_retirement_dwork.work);
|
||||
struct amdgpu_device *adev = con->adev;
|
||||
struct ras_err_data err_data;
|
||||
|
||||
if (amdgpu_in_reset(adev) || atomic_read(&con->in_recovery))
|
||||
return;
|
||||
|
||||
amdgpu_ras_error_data_init(&err_data);
|
||||
|
||||
amdgpu_umc_handle_bad_pages(adev, &err_data);
|
||||
|
||||
amdgpu_ras_error_data_fini(&err_data);
|
||||
|
||||
mutex_lock(&con->umc_ecc_log.lock);
|
||||
if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
|
||||
UMC_ECC_NEW_DETECTED_TAG))
|
||||
schedule_delayed_work(&con->page_retirement_dwork,
|
||||
msecs_to_jiffies(AMDGPU_RAS_RETIRE_PAGE_INTERVAL));
|
||||
mutex_unlock(&con->umc_ecc_log.lock);
|
||||
}
|
||||
|
||||
static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
|
||||
enum amdgpu_ras_block ras_block, uint32_t timeout_ms)
|
||||
{
|
||||
int ret = 0;
|
||||
struct ras_ecc_log_info *ecc_log;
|
||||
struct ras_query_if info;
|
||||
uint32_t timeout = timeout_ms;
|
||||
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
|
||||
|
||||
memset(&info, 0, sizeof(info));
|
||||
info.head.block = ras_block;
|
||||
|
||||
ecc_log = &ras->umc_ecc_log;
|
||||
ecc_log->de_updated = false;
|
||||
do {
|
||||
ret = amdgpu_ras_query_error_status(adev, &info);
|
||||
if (ret) {
|
||||
dev_err(adev->dev, "Failed to query ras error! ret:%d\n", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (timeout && !ecc_log->de_updated) {
|
||||
msleep(1);
|
||||
timeout--;
|
||||
}
|
||||
} while (timeout && !ecc_log->de_updated);
|
||||
|
||||
if (timeout_ms && !timeout) {
|
||||
dev_warn(adev->dev, "Can't find deferred error\n");
|
||||
return -ETIMEDOUT;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
|
||||
uint32_t timeout)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
int ret;
|
||||
|
||||
ret = amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout);
|
||||
if (!ret)
|
||||
schedule_delayed_work(&con->page_retirement_dwork, 0);
|
||||
}
|
||||
|
||||
static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
|
||||
struct ras_poison_msg *poison_msg)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
uint32_t reset = poison_msg->reset;
|
||||
uint16_t pasid = poison_msg->pasid;
|
||||
|
||||
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
|
||||
|
||||
if (poison_msg->pasid_fn)
|
||||
poison_msg->pasid_fn(adev, pasid, poison_msg->data);
|
||||
|
||||
if (reset) {
|
||||
flush_delayed_work(&con->page_retirement_dwork);
|
||||
|
||||
con->gpu_reset_flags |= reset;
|
||||
amdgpu_ras_reset_gpu(adev);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int amdgpu_ras_page_retirement_thread(void *param)
|
||||
{
|
||||
struct amdgpu_device *adev = (struct amdgpu_device *)param;
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
struct ras_poison_msg poison_msg;
|
||||
enum amdgpu_ras_block ras_block;
|
||||
bool poison_creation_is_handled = false;
|
||||
|
||||
while (!kthread_should_stop()) {
|
||||
|
||||
@@ -2688,13 +2889,34 @@ static int amdgpu_ras_page_retirement_thread(void *param)
|
||||
if (kthread_should_stop())
|
||||
break;
|
||||
|
||||
dev_info(adev->dev, "Start processing page retirement. request:%d\n",
|
||||
atomic_read(&con->page_retirement_req_cnt));
|
||||
|
||||
atomic_dec(&con->page_retirement_req_cnt);
|
||||
|
||||
amdgpu_umc_bad_page_polling_timeout(adev,
|
||||
false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
|
||||
if (!amdgpu_ras_get_poison_req(adev, &poison_msg))
|
||||
continue;
|
||||
|
||||
ras_block = poison_msg.block;
|
||||
|
||||
dev_info(adev->dev, "Start processing ras block %s(%d)\n",
|
||||
ras_block_str(ras_block), ras_block);
|
||||
|
||||
if (ras_block == AMDGPU_RAS_BLOCK__UMC) {
|
||||
amdgpu_ras_poison_creation_handler(adev,
|
||||
MAX_UMC_POISON_POLLING_TIME_ASYNC);
|
||||
poison_creation_is_handled = true;
|
||||
} else {
|
||||
/* poison_creation_is_handled:
|
||||
* false: no poison creation interrupt, but it has poison
|
||||
* consumption interrupt.
|
||||
* true: It has poison creation interrupt at the beginning,
|
||||
* but it has no poison creation interrupt later.
|
||||
*/
|
||||
amdgpu_ras_poison_creation_handler(adev,
|
||||
poison_creation_is_handled ?
|
||||
0 : MAX_UMC_POISON_POLLING_TIME_ASYNC);
|
||||
|
||||
amdgpu_ras_poison_consumption_handler(adev, &poison_msg);
|
||||
poison_creation_is_handled = false;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
@@ -2763,6 +2985,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
|
||||
}
|
||||
}
|
||||
|
||||
mutex_init(&con->page_rsv_lock);
|
||||
INIT_KFIFO(con->poison_fifo);
|
||||
mutex_init(&con->page_retirement_lock);
|
||||
init_waitqueue_head(&con->page_retirement_wq);
|
||||
atomic_set(&con->page_retirement_req_cnt, 0);
|
||||
@@ -2773,6 +2997,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
|
||||
dev_warn(adev->dev, "Failed to create umc_page_retirement thread!!!\n");
|
||||
}
|
||||
|
||||
INIT_DELAYED_WORK(&con->page_retirement_dwork, amdgpu_ras_do_page_retirement);
|
||||
amdgpu_ras_ecc_log_init(&con->umc_ecc_log);
|
||||
#ifdef CONFIG_X86_MCE_AMD
|
||||
if ((adev->asic_type == CHIP_ALDEBARAN) &&
|
||||
(adev->gmc.xgmi.connected_to_cpu))
|
||||
@@ -2813,8 +3039,14 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
|
||||
|
||||
atomic_set(&con->page_retirement_req_cnt, 0);
|
||||
|
||||
mutex_destroy(&con->page_rsv_lock);
|
||||
|
||||
cancel_work_sync(&con->recovery_work);
|
||||
|
||||
cancel_delayed_work_sync(&con->page_retirement_dwork);
|
||||
|
||||
amdgpu_ras_ecc_log_fini(&con->umc_ecc_log);
|
||||
|
||||
mutex_lock(&con->recovery_lock);
|
||||
con->eh_data = NULL;
|
||||
kfree(data->bps);
|
||||
@@ -3036,6 +3268,35 @@ static int amdgpu_get_ras_schema(struct amdgpu_device *adev)
|
||||
AMDGPU_RAS_ERROR__PARITY;
|
||||
}
|
||||
|
||||
static void ras_event_mgr_init(struct ras_event_manager *mgr)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(mgr->seqnos); i++)
|
||||
atomic64_set(&mgr->seqnos[i], 0);
|
||||
}
|
||||
|
||||
static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev)
|
||||
{
|
||||
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
|
||||
struct amdgpu_hive_info *hive;
|
||||
|
||||
if (!ras)
|
||||
return;
|
||||
|
||||
hive = amdgpu_get_xgmi_hive(adev);
|
||||
ras->event_mgr = hive ? &hive->event_mgr : &ras->__event_mgr;
|
||||
|
||||
/* init event manager with node 0 on xgmi system */
|
||||
if (!amdgpu_in_reset(adev)) {
|
||||
if (!hive || adev->gmc.xgmi.node_id == 0)
|
||||
ras_event_mgr_init(ras->event_mgr);
|
||||
}
|
||||
|
||||
if (hive)
|
||||
amdgpu_put_xgmi_hive(hive);
|
||||
}
|
||||
|
||||
int amdgpu_ras_init(struct amdgpu_device *adev)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
@@ -3356,6 +3617,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
|
||||
if (amdgpu_sriov_vf(adev))
|
||||
return 0;
|
||||
|
||||
amdgpu_ras_event_mgr_init(adev);
|
||||
|
||||
if (amdgpu_aca_is_enabled(adev)) {
|
||||
if (amdgpu_in_reset(adev))
|
||||
r = amdgpu_aca_reset(adev);
|
||||
@@ -3472,14 +3735,39 @@ void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status)
|
||||
atomic_set(&ras->fed, !!status);
|
||||
}
|
||||
|
||||
bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id)
|
||||
{
|
||||
return !(id & BIT_ULL(63));
|
||||
}
|
||||
|
||||
u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type)
|
||||
{
|
||||
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
|
||||
u64 id;
|
||||
|
||||
switch (type) {
|
||||
case RAS_EVENT_TYPE_ISR:
|
||||
id = (u64)atomic64_read(&ras->event_mgr->seqnos[type]);
|
||||
break;
|
||||
case RAS_EVENT_TYPE_INVALID:
|
||||
default:
|
||||
id = BIT_ULL(63) | 0ULL;
|
||||
break;
|
||||
}
|
||||
|
||||
return id;
|
||||
}
|
||||
|
||||
void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
|
||||
{
|
||||
if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
|
||||
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
|
||||
u64 event_id = (u64)atomic64_inc_return(&ras->event_mgr->seqnos[RAS_EVENT_TYPE_ISR]);
|
||||
|
||||
dev_info(adev->dev, "uncorrectable hardware error"
|
||||
"(ERREVENT_ATHUB_INTERRUPT) detected!\n");
|
||||
RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error"
|
||||
"(ERREVENT_ATHUB_INTERRUPT) detected!\n");
|
||||
|
||||
amdgpu_ras_set_fed(adev, true);
|
||||
ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
|
||||
amdgpu_ras_reset_gpu(adev);
|
||||
}
|
||||
@@ -3998,6 +4286,8 @@ void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info, struct ras_err_a
|
||||
{
|
||||
struct ras_err_addr *mca_err_addr;
|
||||
|
||||
/* This function will be retired. */
|
||||
return;
|
||||
mca_err_addr = kzalloc(sizeof(*mca_err_addr), GFP_KERNEL);
|
||||
if (!mca_err_addr)
|
||||
return;
|
||||
@@ -4195,3 +4485,19 @@ void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances)
|
||||
amdgpu_ras_boot_time_error_reporting(adev, i, boot_error);
|
||||
}
|
||||
}
|
||||
|
||||
int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr;
|
||||
uint64_t start = pfn << AMDGPU_GPU_PAGE_SHIFT;
|
||||
int ret = 0;
|
||||
|
||||
mutex_lock(&con->page_rsv_lock);
|
||||
ret = amdgpu_vram_mgr_query_page_status(mgr, start);
|
||||
if (ret == -ENOENT)
|
||||
ret = amdgpu_vram_mgr_reserve_range(mgr, start, AMDGPU_GPU_PAGE_SIZE);
|
||||
mutex_unlock(&con->page_rsv_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -26,6 +26,9 @@
|
||||
|
||||
#include <linux/debugfs.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/kfifo.h>
|
||||
#include <linux/radix-tree.h>
|
||||
#include <linux/siphash.h>
|
||||
#include "ta_ras_if.h"
|
||||
#include "amdgpu_ras_eeprom.h"
|
||||
#include "amdgpu_smuio.h"
|
||||
@@ -64,6 +67,14 @@ struct amdgpu_iv_entry;
|
||||
/* The high three bits indicates socketid */
|
||||
#define AMDGPU_RAS_GET_FEATURES(val) ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK)
|
||||
|
||||
#define RAS_EVENT_LOG(_adev, _id, _fmt, ...) \
|
||||
do { \
|
||||
if (amdgpu_ras_event_id_is_valid((_adev), (_id))) \
|
||||
dev_info((_adev)->dev, "{%llu}" _fmt, (_id), ##__VA_ARGS__); \
|
||||
else \
|
||||
dev_info((_adev)->dev, _fmt, ##__VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
enum amdgpu_ras_block {
|
||||
AMDGPU_RAS_BLOCK__UMC = 0,
|
||||
AMDGPU_RAS_BLOCK__SDMA,
|
||||
@@ -419,6 +430,52 @@ struct umc_ecc_info {
|
||||
int record_ce_addr_supported;
|
||||
};
|
||||
|
||||
enum ras_event_type {
|
||||
RAS_EVENT_TYPE_INVALID = -1,
|
||||
RAS_EVENT_TYPE_ISR = 0,
|
||||
RAS_EVENT_TYPE_COUNT,
|
||||
};
|
||||
|
||||
struct ras_event_manager {
|
||||
atomic64_t seqnos[RAS_EVENT_TYPE_COUNT];
|
||||
};
|
||||
|
||||
struct ras_query_context {
|
||||
enum ras_event_type type;
|
||||
u64 event_id;
|
||||
};
|
||||
|
||||
typedef int (*pasid_notify)(struct amdgpu_device *adev,
|
||||
uint16_t pasid, void *data);
|
||||
|
||||
struct ras_poison_msg {
|
||||
enum amdgpu_ras_block block;
|
||||
uint16_t pasid;
|
||||
uint32_t reset;
|
||||
pasid_notify pasid_fn;
|
||||
void *data;
|
||||
};
|
||||
|
||||
struct ras_err_pages {
|
||||
uint32_t count;
|
||||
uint64_t *pfn;
|
||||
};
|
||||
|
||||
struct ras_ecc_err {
|
||||
u64 hash_index;
|
||||
uint64_t status;
|
||||
uint64_t ipid;
|
||||
uint64_t addr;
|
||||
struct ras_err_pages err_pages;
|
||||
};
|
||||
|
||||
struct ras_ecc_log_info {
|
||||
struct mutex lock;
|
||||
siphash_key_t ecc_key;
|
||||
struct radix_tree_root de_page_tree;
|
||||
bool de_updated;
|
||||
};
|
||||
|
||||
struct amdgpu_ras {
|
||||
/* ras infrastructure */
|
||||
/* for ras itself. */
|
||||
@@ -477,8 +534,18 @@ struct amdgpu_ras {
|
||||
wait_queue_head_t page_retirement_wq;
|
||||
struct mutex page_retirement_lock;
|
||||
atomic_t page_retirement_req_cnt;
|
||||
struct mutex page_rsv_lock;
|
||||
DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
|
||||
struct ras_ecc_log_info umc_ecc_log;
|
||||
struct delayed_work page_retirement_dwork;
|
||||
|
||||
/* Fatal error detected flag */
|
||||
atomic_t fed;
|
||||
|
||||
/* RAS event manager */
|
||||
struct ras_event_manager __event_mgr;
|
||||
struct ras_event_manager *event_mgr;
|
||||
|
||||
};
|
||||
|
||||
struct ras_fs_data {
|
||||
@@ -512,6 +579,7 @@ struct ras_err_data {
|
||||
unsigned long de_count;
|
||||
unsigned long err_addr_cnt;
|
||||
struct eeprom_table_record *err_addr;
|
||||
unsigned long err_addr_len;
|
||||
u32 err_list_count;
|
||||
struct list_head err_node_list;
|
||||
};
|
||||
@@ -879,4 +947,13 @@ void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info,
|
||||
void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status);
|
||||
bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev);
|
||||
|
||||
bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id);
|
||||
u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type);
|
||||
|
||||
int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn);
|
||||
|
||||
int amdgpu_ras_put_poison_req(struct amdgpu_device *adev,
|
||||
enum amdgpu_ras_block block, uint16_t pasid,
|
||||
pasid_notify pasid_fn, void *data, uint32_t reset);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -404,6 +404,22 @@ static int amdgpu_ras_eeprom_correct_header_tag(
|
||||
return res;
|
||||
}
|
||||
|
||||
static void amdgpu_ras_set_eeprom_table_version(struct amdgpu_ras_eeprom_control *control)
|
||||
{
|
||||
struct amdgpu_device *adev = to_amdgpu_device(control);
|
||||
struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
|
||||
|
||||
switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
|
||||
case IP_VERSION(8, 10, 0):
|
||||
case IP_VERSION(12, 0, 0):
|
||||
hdr->version = RAS_TABLE_VER_V2_1;
|
||||
return;
|
||||
default:
|
||||
hdr->version = RAS_TABLE_VER_V1;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_ras_eeprom_reset_table -- Reset the RAS EEPROM table
|
||||
* @control: pointer to control structure
|
||||
@@ -423,11 +439,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
|
||||
mutex_lock(&control->ras_tbl_mutex);
|
||||
|
||||
hdr->header = RAS_TABLE_HDR_VAL;
|
||||
if (adev->umc.ras &&
|
||||
adev->umc.ras->set_eeprom_table_version)
|
||||
adev->umc.ras->set_eeprom_table_version(hdr);
|
||||
else
|
||||
hdr->version = RAS_TABLE_VER_V1;
|
||||
amdgpu_ras_set_eeprom_table_version(control);
|
||||
|
||||
if (hdr->version == RAS_TABLE_VER_V2_1) {
|
||||
hdr->first_rec_offset = RAS_RECORD_START_V2_1;
|
||||
|
||||
@@ -164,4 +164,29 @@ static inline void amdgpu_res_next(struct amdgpu_res_cursor *cur, uint64_t size)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_res_cleared - check if blocks are cleared
|
||||
*
|
||||
* @cur: the cursor to extract the block
|
||||
*
|
||||
* Check if the @cur block is cleared
|
||||
*/
|
||||
static inline bool amdgpu_res_cleared(struct amdgpu_res_cursor *cur)
|
||||
{
|
||||
struct drm_buddy_block *block;
|
||||
|
||||
switch (cur->mem_type) {
|
||||
case TTM_PL_VRAM:
|
||||
block = cur->node;
|
||||
|
||||
if (!amdgpu_vram_mgr_is_cleared(block))
|
||||
return false;
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -21,9 +21,6 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/devcoredump.h>
|
||||
#include <generated/utsrelease.h>
|
||||
|
||||
#include "amdgpu_reset.h"
|
||||
#include "aldebaran.h"
|
||||
#include "sienna_cichlid.h"
|
||||
@@ -161,105 +158,3 @@ void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain)
|
||||
atomic_set(&reset_domain->in_gpu_reset, 0);
|
||||
up_write(&reset_domain->sem);
|
||||
}
|
||||
|
||||
#ifndef CONFIG_DEV_COREDUMP
|
||||
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
|
||||
struct amdgpu_reset_context *reset_context)
|
||||
{
|
||||
}
|
||||
#else
|
||||
static ssize_t
|
||||
amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
|
||||
void *data, size_t datalen)
|
||||
{
|
||||
struct drm_printer p;
|
||||
struct amdgpu_coredump_info *coredump = data;
|
||||
struct drm_print_iterator iter;
|
||||
int i;
|
||||
|
||||
iter.data = buffer;
|
||||
iter.offset = 0;
|
||||
iter.start = offset;
|
||||
iter.remain = count;
|
||||
|
||||
p = drm_coredump_printer(&iter);
|
||||
|
||||
drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
|
||||
drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n");
|
||||
drm_printf(&p, "kernel: " UTS_RELEASE "\n");
|
||||
drm_printf(&p, "module: " KBUILD_MODNAME "\n");
|
||||
drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec,
|
||||
coredump->reset_time.tv_nsec);
|
||||
|
||||
if (coredump->reset_task_info.pid)
|
||||
drm_printf(&p, "process_name: %s PID: %d\n",
|
||||
coredump->reset_task_info.process_name,
|
||||
coredump->reset_task_info.pid);
|
||||
|
||||
if (coredump->ring) {
|
||||
drm_printf(&p, "\nRing timed out details\n");
|
||||
drm_printf(&p, "IP Type: %d Ring Name: %s\n",
|
||||
coredump->ring->funcs->type,
|
||||
coredump->ring->name);
|
||||
}
|
||||
|
||||
if (coredump->reset_vram_lost)
|
||||
drm_printf(&p, "VRAM is lost due to GPU reset!\n");
|
||||
if (coredump->adev->reset_info.num_regs) {
|
||||
drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
|
||||
|
||||
for (i = 0; i < coredump->adev->reset_info.num_regs; i++)
|
||||
drm_printf(&p, "0x%08x: 0x%08x\n",
|
||||
coredump->adev->reset_info.reset_dump_reg_list[i],
|
||||
coredump->adev->reset_info.reset_dump_reg_value[i]);
|
||||
}
|
||||
|
||||
return count - iter.remain;
|
||||
}
|
||||
|
||||
static void amdgpu_devcoredump_free(void *data)
|
||||
{
|
||||
kfree(data);
|
||||
}
|
||||
|
||||
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
|
||||
struct amdgpu_reset_context *reset_context)
|
||||
{
|
||||
struct amdgpu_coredump_info *coredump;
|
||||
struct drm_device *dev = adev_to_drm(adev);
|
||||
struct amdgpu_job *job = reset_context->job;
|
||||
struct drm_sched_job *s_job;
|
||||
|
||||
coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
|
||||
|
||||
if (!coredump) {
|
||||
DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
coredump->reset_vram_lost = vram_lost;
|
||||
|
||||
if (reset_context->job && reset_context->job->vm) {
|
||||
struct amdgpu_task_info *ti;
|
||||
struct amdgpu_vm *vm = reset_context->job->vm;
|
||||
|
||||
ti = amdgpu_vm_get_task_info_vm(vm);
|
||||
if (ti) {
|
||||
coredump->reset_task_info = *ti;
|
||||
amdgpu_vm_put_task_info(ti);
|
||||
}
|
||||
}
|
||||
|
||||
if (job) {
|
||||
s_job = &job->base;
|
||||
coredump->ring = to_amdgpu_ring(s_job->sched);
|
||||
}
|
||||
|
||||
coredump->adev = adev;
|
||||
|
||||
ktime_get_ts64(&coredump->reset_time);
|
||||
|
||||
dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
|
||||
amdgpu_devcoredump_read, amdgpu_devcoredump_free);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -32,6 +32,7 @@ enum AMDGPU_RESET_FLAGS {
|
||||
|
||||
AMDGPU_NEED_FULL_RESET = 0,
|
||||
AMDGPU_SKIP_HW_RESET = 1,
|
||||
AMDGPU_SKIP_COREDUMP = 2,
|
||||
};
|
||||
|
||||
struct amdgpu_reset_context {
|
||||
@@ -88,19 +89,6 @@ struct amdgpu_reset_domain {
|
||||
atomic_t reset_res;
|
||||
};
|
||||
|
||||
#ifdef CONFIG_DEV_COREDUMP
|
||||
|
||||
#define AMDGPU_COREDUMP_VERSION "1"
|
||||
|
||||
struct amdgpu_coredump_info {
|
||||
struct amdgpu_device *adev;
|
||||
struct amdgpu_task_info reset_task_info;
|
||||
struct timespec64 reset_time;
|
||||
bool reset_vram_lost;
|
||||
struct amdgpu_ring *ring;
|
||||
};
|
||||
#endif
|
||||
|
||||
int amdgpu_reset_init(struct amdgpu_device *adev);
|
||||
int amdgpu_reset_fini(struct amdgpu_device *adev);
|
||||
|
||||
@@ -141,9 +129,6 @@ void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain);
|
||||
|
||||
void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain);
|
||||
|
||||
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
|
||||
struct amdgpu_reset_context *reset_context);
|
||||
|
||||
#define for_each_handler(i, handler, reset_ctl) \
|
||||
for (i = 0; (i < AMDGPU_RESET_MAX_HANDLERS) && \
|
||||
(handler = (*reset_ctl->reset_handlers)[i]); \
|
||||
|
||||
@@ -132,7 +132,7 @@ struct amdgpu_buffer_funcs {
|
||||
uint64_t dst_offset,
|
||||
/* number of byte to transfer */
|
||||
uint32_t byte_count,
|
||||
bool tmz);
|
||||
uint32_t copy_flags);
|
||||
|
||||
/* maximum bytes in a single operation */
|
||||
uint32_t fill_max_bytes;
|
||||
|
||||
@@ -44,6 +44,7 @@ struct amdgpu_smuio_funcs {
|
||||
u32 (*get_socket_id)(struct amdgpu_device *adev);
|
||||
enum amdgpu_pkg_type (*get_pkg_type)(struct amdgpu_device *adev);
|
||||
bool (*is_host_gpu_xgmi_supported)(struct amdgpu_device *adev);
|
||||
u64 (*get_gpu_clock_counter)(struct amdgpu_device *adev);
|
||||
};
|
||||
|
||||
struct amdgpu_smuio {
|
||||
|
||||
@@ -236,7 +236,7 @@ static int amdgpu_ttm_map_buffer(struct ttm_buffer_object *bo,
|
||||
dst_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
|
||||
dst_addr += window * AMDGPU_GTT_MAX_TRANSFER_SIZE * 8;
|
||||
amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr,
|
||||
dst_addr, num_bytes, false);
|
||||
dst_addr, num_bytes, 0);
|
||||
|
||||
amdgpu_ring_pad_ib(ring, &job->ibs[0]);
|
||||
WARN_ON(job->ibs[0].length_dw > num_dw);
|
||||
@@ -296,6 +296,8 @@ int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev,
|
||||
struct dma_fence *fence = NULL;
|
||||
int r = 0;
|
||||
|
||||
uint32_t copy_flags = 0;
|
||||
|
||||
if (!adev->mman.buffer_funcs_enabled) {
|
||||
DRM_ERROR("Trying to move memory with ring turned off.\n");
|
||||
return -EINVAL;
|
||||
@@ -323,8 +325,11 @@ int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev,
|
||||
if (r)
|
||||
goto error;
|
||||
|
||||
r = amdgpu_copy_buffer(ring, from, to, cur_size,
|
||||
resv, &next, false, true, tmz);
|
||||
if (tmz)
|
||||
copy_flags |= AMDGPU_COPY_FLAGS_TMZ;
|
||||
|
||||
r = amdgpu_copy_buffer(ring, from, to, cur_size, resv,
|
||||
&next, false, true, copy_flags);
|
||||
if (r)
|
||||
goto error;
|
||||
|
||||
@@ -378,11 +383,12 @@ static int amdgpu_move_blit(struct ttm_buffer_object *bo,
|
||||
(abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE)) {
|
||||
struct dma_fence *wipe_fence = NULL;
|
||||
|
||||
r = amdgpu_fill_buffer(abo, AMDGPU_POISON, NULL, &wipe_fence,
|
||||
false);
|
||||
r = amdgpu_fill_buffer(abo, 0, NULL, &wipe_fence,
|
||||
false);
|
||||
if (r) {
|
||||
goto error;
|
||||
} else if (wipe_fence) {
|
||||
amdgpu_vram_mgr_set_cleared(bo->resource);
|
||||
dma_fence_put(fence);
|
||||
fence = wipe_fence;
|
||||
}
|
||||
@@ -1492,7 +1498,7 @@ static int amdgpu_ttm_access_memory_sdma(struct ttm_buffer_object *bo,
|
||||
swap(src_addr, dst_addr);
|
||||
|
||||
amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr, dst_addr,
|
||||
PAGE_SIZE, false);
|
||||
PAGE_SIZE, 0);
|
||||
|
||||
amdgpu_ring_pad_ib(adev->mman.buffer_funcs_ring, &job->ibs[0]);
|
||||
WARN_ON(job->ibs[0].length_dw > num_dw);
|
||||
@@ -2143,7 +2149,7 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
|
||||
uint64_t dst_offset, uint32_t byte_count,
|
||||
struct dma_resv *resv,
|
||||
struct dma_fence **fence, bool direct_submit,
|
||||
bool vm_needs_flush, bool tmz)
|
||||
bool vm_needs_flush, uint32_t copy_flags)
|
||||
{
|
||||
struct amdgpu_device *adev = ring->adev;
|
||||
unsigned int num_loops, num_dw;
|
||||
@@ -2169,8 +2175,7 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
|
||||
uint32_t cur_size_in_bytes = min(byte_count, max_bytes);
|
||||
|
||||
amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_offset,
|
||||
dst_offset, cur_size_in_bytes, tmz);
|
||||
|
||||
dst_offset, cur_size_in_bytes, copy_flags);
|
||||
src_offset += cur_size_in_bytes;
|
||||
dst_offset += cur_size_in_bytes;
|
||||
byte_count -= cur_size_in_bytes;
|
||||
@@ -2230,6 +2235,71 @@ static int amdgpu_ttm_fill_mem(struct amdgpu_ring *ring, uint32_t src_data,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_ttm_clear_buffer - clear memory buffers
|
||||
* @bo: amdgpu buffer object
|
||||
* @resv: reservation object
|
||||
* @fence: dma_fence associated with the operation
|
||||
*
|
||||
* Clear the memory buffer resource.
|
||||
*
|
||||
* Returns:
|
||||
* 0 for success or a negative error code on failure.
|
||||
*/
|
||||
int amdgpu_ttm_clear_buffer(struct amdgpu_bo *bo,
|
||||
struct dma_resv *resv,
|
||||
struct dma_fence **fence)
|
||||
{
|
||||
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
|
||||
struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
|
||||
struct amdgpu_res_cursor cursor;
|
||||
u64 addr;
|
||||
int r;
|
||||
|
||||
if (!adev->mman.buffer_funcs_enabled)
|
||||
return -EINVAL;
|
||||
|
||||
if (!fence)
|
||||
return -EINVAL;
|
||||
|
||||
*fence = dma_fence_get_stub();
|
||||
|
||||
amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &cursor);
|
||||
|
||||
mutex_lock(&adev->mman.gtt_window_lock);
|
||||
while (cursor.remaining) {
|
||||
struct dma_fence *next = NULL;
|
||||
u64 size;
|
||||
|
||||
if (amdgpu_res_cleared(&cursor)) {
|
||||
amdgpu_res_next(&cursor, cursor.size);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Never clear more than 256MiB at once to avoid timeouts */
|
||||
size = min(cursor.size, 256ULL << 20);
|
||||
|
||||
r = amdgpu_ttm_map_buffer(&bo->tbo, bo->tbo.resource, &cursor,
|
||||
1, ring, false, &size, &addr);
|
||||
if (r)
|
||||
goto err;
|
||||
|
||||
r = amdgpu_ttm_fill_mem(ring, 0, addr, size, resv,
|
||||
&next, true, true);
|
||||
if (r)
|
||||
goto err;
|
||||
|
||||
dma_fence_put(*fence);
|
||||
*fence = next;
|
||||
|
||||
amdgpu_res_next(&cursor, size);
|
||||
}
|
||||
err:
|
||||
mutex_unlock(&adev->mman.gtt_window_lock);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
int amdgpu_fill_buffer(struct amdgpu_bo *bo,
|
||||
uint32_t src_data,
|
||||
struct dma_resv *resv,
|
||||
|
||||
@@ -38,8 +38,6 @@
|
||||
#define AMDGPU_GTT_MAX_TRANSFER_SIZE 512
|
||||
#define AMDGPU_GTT_NUM_TRANSFER_WINDOWS 2
|
||||
|
||||
#define AMDGPU_POISON 0xd0bed0be
|
||||
|
||||
extern const struct attribute_group amdgpu_vram_mgr_attr_group;
|
||||
extern const struct attribute_group amdgpu_gtt_mgr_attr_group;
|
||||
|
||||
@@ -111,6 +109,8 @@ struct amdgpu_copy_mem {
|
||||
unsigned long offset;
|
||||
};
|
||||
|
||||
#define AMDGPU_COPY_FLAGS_TMZ (1 << 0)
|
||||
|
||||
int amdgpu_gtt_mgr_init(struct amdgpu_device *adev, uint64_t gtt_size);
|
||||
void amdgpu_gtt_mgr_fini(struct amdgpu_device *adev);
|
||||
int amdgpu_preempt_mgr_init(struct amdgpu_device *adev);
|
||||
@@ -151,13 +151,16 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
|
||||
uint64_t dst_offset, uint32_t byte_count,
|
||||
struct dma_resv *resv,
|
||||
struct dma_fence **fence, bool direct_submit,
|
||||
bool vm_needs_flush, bool tmz);
|
||||
bool vm_needs_flush, uint32_t copy_flags);
|
||||
int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev,
|
||||
const struct amdgpu_copy_mem *src,
|
||||
const struct amdgpu_copy_mem *dst,
|
||||
uint64_t size, bool tmz,
|
||||
struct dma_resv *resv,
|
||||
struct dma_fence **f);
|
||||
int amdgpu_ttm_clear_buffer(struct amdgpu_bo *bo,
|
||||
struct dma_resv *resv,
|
||||
struct dma_fence **fence);
|
||||
int amdgpu_fill_buffer(struct amdgpu_bo *bo,
|
||||
uint32_t src_data,
|
||||
struct dma_resv *resv,
|
||||
|
||||
@@ -125,6 +125,7 @@ enum psp_fw_type {
|
||||
PSP_FW_TYPE_PSP_INTF_DRV,
|
||||
PSP_FW_TYPE_PSP_DBG_DRV,
|
||||
PSP_FW_TYPE_PSP_RAS_DRV,
|
||||
PSP_FW_TYPE_PSP_IPKEYMGR_DRV,
|
||||
PSP_FW_TYPE_MAX_INDEX,
|
||||
};
|
||||
|
||||
|
||||
@@ -21,10 +21,13 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/sort.h>
|
||||
#include "amdgpu.h"
|
||||
#include "umc_v6_7.h"
|
||||
#define MAX_UMC_POISON_POLLING_TIME_SYNC 20 //ms
|
||||
|
||||
#define MAX_UMC_HASH_STRING_SIZE 256
|
||||
|
||||
static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
|
||||
struct ras_err_data *err_data, uint64_t err_addr,
|
||||
uint32_t ch_inst, uint32_t umc_inst)
|
||||
@@ -63,6 +66,8 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
|
||||
goto out_fini_err_data;
|
||||
}
|
||||
|
||||
err_data.err_addr_len = adev->umc.max_ras_err_cnt_per_query;
|
||||
|
||||
/*
|
||||
* Translate UMC channel address to Physical address
|
||||
*/
|
||||
@@ -86,7 +91,7 @@ out_fini_err_data:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
|
||||
void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
|
||||
void *ras_error_status)
|
||||
{
|
||||
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
|
||||
@@ -118,6 +123,8 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
|
||||
if(!err_data->err_addr)
|
||||
dev_warn(adev->dev, "Failed to alloc memory for "
|
||||
"umc error address record!\n");
|
||||
else
|
||||
err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;
|
||||
|
||||
/* umc query_ras_error_address is also responsible for clearing
|
||||
* error status
|
||||
@@ -143,6 +150,8 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
|
||||
if(!err_data->err_addr)
|
||||
dev_warn(adev->dev, "Failed to alloc memory for "
|
||||
"umc error address record!\n");
|
||||
else
|
||||
err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;
|
||||
|
||||
/* umc query_ras_error_address is also responsible for clearing
|
||||
* error status
|
||||
@@ -170,6 +179,7 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
|
||||
}
|
||||
|
||||
kfree(err_data->err_addr);
|
||||
err_data->err_addr = NULL;
|
||||
|
||||
mutex_unlock(&con->page_retirement_lock);
|
||||
}
|
||||
@@ -177,7 +187,7 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
|
||||
static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
|
||||
void *ras_error_status,
|
||||
struct amdgpu_iv_entry *entry,
|
||||
bool reset)
|
||||
uint32_t reset)
|
||||
{
|
||||
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
@@ -186,9 +196,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
|
||||
amdgpu_umc_handle_bad_pages(adev, ras_error_status);
|
||||
|
||||
if (err_data->ue_count && reset) {
|
||||
/* use mode-2 reset for poison consumption */
|
||||
if (!entry)
|
||||
con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
|
||||
con->gpu_reset_flags |= reset;
|
||||
amdgpu_ras_reset_gpu(adev);
|
||||
}
|
||||
|
||||
@@ -196,7 +204,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
|
||||
}
|
||||
|
||||
int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
|
||||
bool reset, uint32_t timeout_ms)
|
||||
uint32_t reset, uint32_t timeout_ms)
|
||||
{
|
||||
struct ras_err_data err_data;
|
||||
struct ras_common_if head = {
|
||||
@@ -238,16 +246,16 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
|
||||
if (reset) {
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
|
||||
/* use mode-2 reset for poison consumption */
|
||||
con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
|
||||
con->gpu_reset_flags |= reset;
|
||||
amdgpu_ras_reset_gpu(adev);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
|
||||
enum amdgpu_ras_block block, bool reset)
|
||||
int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
|
||||
enum amdgpu_ras_block block, uint16_t pasid,
|
||||
pasid_notify pasid_fn, void *data, uint32_t reset)
|
||||
{
|
||||
int ret = AMDGPU_RAS_SUCCESS;
|
||||
|
||||
@@ -285,16 +293,14 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
|
||||
|
||||
amdgpu_ras_error_data_fini(&err_data);
|
||||
} else {
|
||||
if (reset) {
|
||||
amdgpu_umc_bad_page_polling_timeout(adev,
|
||||
reset, MAX_UMC_POISON_POLLING_TIME_SYNC);
|
||||
} else {
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
|
||||
amdgpu_ras_put_poison_req(adev,
|
||||
block, pasid, pasid_fn, data, reset);
|
||||
|
||||
atomic_inc(&con->page_retirement_req_cnt);
|
||||
|
||||
wake_up(&con->page_retirement_wq);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
|
||||
@@ -307,11 +313,19 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
|
||||
return ret;
|
||||
}
|
||||
|
||||
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
|
||||
enum amdgpu_ras_block block, uint32_t reset)
|
||||
{
|
||||
return amdgpu_umc_pasid_poison_handler(adev,
|
||||
block, 0, NULL, NULL, reset);
|
||||
}
|
||||
|
||||
int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
|
||||
void *ras_error_status,
|
||||
struct amdgpu_iv_entry *entry)
|
||||
{
|
||||
return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
|
||||
return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry,
|
||||
AMDGPU_RAS_GPU_RESET_MODE1_RESET);
|
||||
}
|
||||
|
||||
int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev)
|
||||
@@ -388,14 +402,20 @@ int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
|
||||
return 0;
|
||||
}
|
||||
|
||||
void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
|
||||
int amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
|
||||
uint64_t err_addr,
|
||||
uint64_t retired_page,
|
||||
uint32_t channel_index,
|
||||
uint32_t umc_inst)
|
||||
{
|
||||
struct eeprom_table_record *err_rec =
|
||||
&err_data->err_addr[err_data->err_addr_cnt];
|
||||
struct eeprom_table_record *err_rec;
|
||||
|
||||
if (!err_data ||
|
||||
!err_data->err_addr ||
|
||||
(err_data->err_addr_cnt >= err_data->err_addr_len))
|
||||
return -EINVAL;
|
||||
|
||||
err_rec = &err_data->err_addr[err_data->err_addr_cnt];
|
||||
|
||||
err_rec->address = err_addr;
|
||||
/* page frame address is saved */
|
||||
@@ -407,6 +427,8 @@ void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
|
||||
err_rec->mcumc_id = umc_inst;
|
||||
|
||||
err_data->err_addr_cnt++;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
|
||||
@@ -439,3 +461,76 @@ int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
|
||||
uint64_t status, uint64_t ipid, uint64_t addr)
|
||||
{
|
||||
if (adev->umc.ras->update_ecc_status)
|
||||
return adev->umc.ras->update_ecc_status(adev,
|
||||
status, ipid, addr);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int amdgpu_umc_uint64_cmp(const void *a, const void *b)
|
||||
{
|
||||
uint64_t *addr_a = (uint64_t *)a;
|
||||
uint64_t *addr_b = (uint64_t *)b;
|
||||
|
||||
if (*addr_a > *addr_b)
|
||||
return 1;
|
||||
else if (*addr_a < *addr_b)
|
||||
return -1;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Use string hash to avoid logging the same bad pages repeatedly */
|
||||
int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev,
|
||||
uint64_t *pfns, int len, uint64_t *val)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
char buf[MAX_UMC_HASH_STRING_SIZE] = {0};
|
||||
int offset = 0, i = 0;
|
||||
uint64_t hash_val;
|
||||
|
||||
if (!pfns || !len)
|
||||
return -EINVAL;
|
||||
|
||||
sort(pfns, len, sizeof(uint64_t), amdgpu_umc_uint64_cmp, NULL);
|
||||
|
||||
for (i = 0; i < len; i++)
|
||||
offset += snprintf(&buf[offset], sizeof(buf) - offset, "%llx", pfns[i]);
|
||||
|
||||
hash_val = siphash(buf, offset, &con->umc_ecc_log.ecc_key);
|
||||
|
||||
*val = hash_val;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
|
||||
struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
struct ras_ecc_log_info *ecc_log;
|
||||
int ret;
|
||||
|
||||
ecc_log = &con->umc_ecc_log;
|
||||
|
||||
mutex_lock(&ecc_log->lock);
|
||||
ret = radix_tree_insert(ecc_tree, ecc_err->hash_index, ecc_err);
|
||||
if (!ret) {
|
||||
struct ras_err_pages *err_pages = &ecc_err->err_pages;
|
||||
int i;
|
||||
|
||||
/* Reserve memory */
|
||||
for (i = 0; i < err_pages->count; i++)
|
||||
amdgpu_ras_reserve_page(adev, err_pages->pfn[i]);
|
||||
|
||||
radix_tree_tag_set(ecc_tree,
|
||||
ecc_err->hash_index, UMC_ECC_NEW_DETECTED_TAG);
|
||||
}
|
||||
mutex_unlock(&ecc_log->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -52,6 +52,8 @@
|
||||
#define LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) \
|
||||
LOOP_UMC_NODE_INST((node_inst)) LOOP_UMC_INST_AND_CH((umc_inst), (ch_inst))
|
||||
|
||||
/* Page retirement tag */
|
||||
#define UMC_ECC_NEW_DETECTED_TAG 0x1
|
||||
|
||||
typedef int (*umc_func)(struct amdgpu_device *adev, uint32_t node_inst,
|
||||
uint32_t umc_inst, uint32_t ch_inst, void *data);
|
||||
@@ -66,8 +68,8 @@ struct amdgpu_umc_ras {
|
||||
void *ras_error_status);
|
||||
bool (*check_ecc_err_status)(struct amdgpu_device *adev,
|
||||
enum amdgpu_mca_error_type type, void *ras_error_status);
|
||||
/* support different eeprom table version for different asic */
|
||||
void (*set_eeprom_table_version)(struct amdgpu_ras_eeprom_table_header *hdr);
|
||||
int (*update_ecc_status)(struct amdgpu_device *adev,
|
||||
uint64_t status, uint64_t ipid, uint64_t addr);
|
||||
};
|
||||
|
||||
struct amdgpu_umc_funcs {
|
||||
@@ -103,11 +105,14 @@ struct amdgpu_umc {
|
||||
int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev);
|
||||
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block);
|
||||
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
|
||||
enum amdgpu_ras_block block, bool reset);
|
||||
enum amdgpu_ras_block block, uint32_t reset);
|
||||
int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
|
||||
enum amdgpu_ras_block block, uint16_t pasid,
|
||||
pasid_notify pasid_fn, void *data, uint32_t reset);
|
||||
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
|
||||
struct amdgpu_irq_src *source,
|
||||
struct amdgpu_iv_entry *entry);
|
||||
void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
|
||||
int amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
|
||||
uint64_t err_addr,
|
||||
uint64_t retired_page,
|
||||
uint32_t channel_index,
|
||||
@@ -123,5 +128,15 @@ int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
|
||||
umc_func func, void *data);
|
||||
|
||||
int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
|
||||
bool reset, uint32_t timeout_ms);
|
||||
uint32_t reset, uint32_t timeout_ms);
|
||||
|
||||
int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
|
||||
uint64_t status, uint64_t ipid, uint64_t addr);
|
||||
int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev,
|
||||
uint64_t *pfns, int len, uint64_t *val);
|
||||
int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
|
||||
struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err);
|
||||
|
||||
void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
|
||||
void *ras_error_status);
|
||||
#endif
|
||||
|
||||
@@ -878,6 +878,8 @@ static const struct amd_ip_funcs umsch_mm_v4_0_ip_funcs = {
|
||||
.hw_fini = umsch_mm_hw_fini,
|
||||
.suspend = umsch_mm_suspend,
|
||||
.resume = umsch_mm_resume,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
const struct amdgpu_ip_block_version umsch_mm_v4_0_ip_block = {
|
||||
|
||||
@@ -743,7 +743,8 @@ int amdgpu_vce_ring_parse_cs(struct amdgpu_cs_parser *p,
|
||||
uint32_t created = 0;
|
||||
uint32_t allocated = 0;
|
||||
uint32_t tmp, handle = 0;
|
||||
uint32_t *size = &tmp;
|
||||
uint32_t dummy = 0xffffffff;
|
||||
uint32_t *size = &dummy;
|
||||
unsigned int idx;
|
||||
int i, r = 0;
|
||||
|
||||
|
||||
@@ -93,7 +93,7 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work);
|
||||
|
||||
int amdgpu_vcn_early_init(struct amdgpu_device *adev)
|
||||
{
|
||||
char ucode_prefix[30];
|
||||
char ucode_prefix[25];
|
||||
char fw_name[40];
|
||||
int r, i;
|
||||
|
||||
@@ -185,7 +185,10 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev)
|
||||
if (adev->firmware.load_type != AMDGPU_FW_LOAD_PSP)
|
||||
bo_size += AMDGPU_GPU_PAGE_ALIGN(le32_to_cpu(hdr->ucode_size_bytes) + 8);
|
||||
|
||||
if (amdgpu_ip_version(adev, UVD_HWIP, 0) >= IP_VERSION(4, 0, 0)) {
|
||||
if (amdgpu_ip_version(adev, UVD_HWIP, 0) >= IP_VERSION(5, 0, 0)) {
|
||||
fw_shared_size = AMDGPU_GPU_PAGE_ALIGN(sizeof(struct amdgpu_vcn5_fw_shared));
|
||||
log_offset = offsetof(struct amdgpu_vcn5_fw_shared, fw_log);
|
||||
} else if (amdgpu_ip_version(adev, UVD_HWIP, 0) >= IP_VERSION(4, 0, 0)) {
|
||||
fw_shared_size = AMDGPU_GPU_PAGE_ALIGN(sizeof(struct amdgpu_vcn4_fw_shared));
|
||||
log_offset = offsetof(struct amdgpu_vcn4_fw_shared, fw_log);
|
||||
} else {
|
||||
|
||||
@@ -454,6 +454,16 @@ struct amdgpu_vcn_rb_metadata {
|
||||
uint8_t pad[26];
|
||||
};
|
||||
|
||||
struct amdgpu_vcn5_fw_shared {
|
||||
uint32_t present_flag_0;
|
||||
uint8_t pad[12];
|
||||
struct amdgpu_fw_shared_unified_queue_struct sq;
|
||||
uint8_t pad1[8];
|
||||
struct amdgpu_fw_shared_fw_logging fw_log;
|
||||
struct amdgpu_fw_shared_rb_setup rb_setup;
|
||||
uint8_t pad2[4];
|
||||
};
|
||||
|
||||
#define VCN_BLOCK_ENCODE_DISABLE_MASK 0x80
|
||||
#define VCN_BLOCK_DECODE_DISABLE_MASK 0x40
|
||||
#define VCN_BLOCK_QUEUE_DISABLE_MASK 0xC0
|
||||
|
||||
@@ -32,6 +32,7 @@
|
||||
|
||||
#include "amdgpu.h"
|
||||
#include "amdgpu_ras.h"
|
||||
#include "amdgpu_reset.h"
|
||||
#include "vi.h"
|
||||
#include "soc15.h"
|
||||
#include "nv.h"
|
||||
@@ -424,7 +425,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
|
||||
return -EINVAL;
|
||||
|
||||
if (pf2vf_info->size > 1024) {
|
||||
DRM_ERROR("invalid pf2vf message size\n");
|
||||
dev_err(adev->dev, "invalid pf2vf message size: 0x%x\n", pf2vf_info->size);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
@@ -435,7 +436,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
|
||||
adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size,
|
||||
adev->virt.fw_reserve.checksum_key, checksum);
|
||||
if (checksum != checkval) {
|
||||
DRM_ERROR("invalid pf2vf message\n");
|
||||
dev_err(adev->dev,
|
||||
"invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n",
|
||||
checksum, checkval);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
@@ -449,7 +452,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
|
||||
adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size,
|
||||
0, checksum);
|
||||
if (checksum != checkval) {
|
||||
DRM_ERROR("invalid pf2vf message\n");
|
||||
dev_err(adev->dev,
|
||||
"invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n",
|
||||
checksum, checkval);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
@@ -485,7 +490,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
|
||||
((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->uuid;
|
||||
break;
|
||||
default:
|
||||
DRM_ERROR("invalid pf2vf version\n");
|
||||
dev_err(adev->dev, "invalid pf2vf version: 0x%x\n", pf2vf_info->version);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
@@ -571,6 +576,11 @@ static int amdgpu_virt_write_vf2pf_data(struct amdgpu_device *adev)
|
||||
vf2pf_info->decode_usage = 0;
|
||||
|
||||
vf2pf_info->dummy_page_addr = (uint64_t)adev->dummy_page_addr;
|
||||
vf2pf_info->mes_info_addr = (uint64_t)adev->mes.resource_1_gpu_addr;
|
||||
|
||||
if (adev->mes.resource_1) {
|
||||
vf2pf_info->mes_info_size = adev->mes.resource_1->tbo.base.size;
|
||||
}
|
||||
vf2pf_info->checksum =
|
||||
amd_sriov_msg_checksum(
|
||||
vf2pf_info, vf2pf_info->header.size, 0, 0);
|
||||
@@ -584,8 +594,22 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work)
|
||||
int ret;
|
||||
|
||||
ret = amdgpu_virt_read_pf2vf_data(adev);
|
||||
if (ret)
|
||||
if (ret) {
|
||||
adev->virt.vf2pf_update_retry_cnt++;
|
||||
if ((adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) &&
|
||||
amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) {
|
||||
amdgpu_ras_set_fed(adev, true);
|
||||
if (amdgpu_reset_domain_schedule(adev->reset_domain,
|
||||
&adev->virt.flr_work))
|
||||
return;
|
||||
else
|
||||
dev_err(adev->dev, "Failed to queue work! at %s", __func__);
|
||||
}
|
||||
|
||||
goto out;
|
||||
}
|
||||
|
||||
adev->virt.vf2pf_update_retry_cnt = 0;
|
||||
amdgpu_virt_write_vf2pf_data(adev);
|
||||
|
||||
out:
|
||||
@@ -606,6 +630,7 @@ void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev)
|
||||
adev->virt.fw_reserve.p_pf2vf = NULL;
|
||||
adev->virt.fw_reserve.p_vf2pf = NULL;
|
||||
adev->virt.vf2pf_update_interval_ms = 0;
|
||||
adev->virt.vf2pf_update_retry_cnt = 0;
|
||||
|
||||
if (adev->mman.fw_vram_usage_va && adev->mman.drv_vram_usage_va) {
|
||||
DRM_WARN("Currently fw_vram and drv_vram should not have values at the same time!");
|
||||
@@ -705,12 +730,6 @@ void amdgpu_detect_virtualization(struct amdgpu_device *adev)
|
||||
adev->virt.caps |= AMDGPU_PASSTHROUGH_MODE;
|
||||
}
|
||||
|
||||
if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
|
||||
/* VF MMIO access (except mailbox range) from CPU
|
||||
* will be blocked during sriov runtime
|
||||
*/
|
||||
adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
|
||||
|
||||
/* we have the ability to check now */
|
||||
if (amdgpu_sriov_vf(adev)) {
|
||||
switch (adev->asic_type) {
|
||||
|
||||
@@ -52,6 +52,8 @@
|
||||
/* tonga/fiji use this offset */
|
||||
#define mmBIF_IOV_FUNC_IDENTIFIER 0x1503
|
||||
|
||||
#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 5
|
||||
|
||||
enum amdgpu_sriov_vf_mode {
|
||||
SRIOV_VF_MODE_BARE_METAL = 0,
|
||||
SRIOV_VF_MODE_ONE_VF,
|
||||
@@ -130,6 +132,8 @@ enum AMDGIM_FEATURE_FLAG {
|
||||
AMDGIM_FEATURE_AV1_SUPPORT = (1 << 6),
|
||||
/* VCN RB decouple */
|
||||
AMDGIM_FEATURE_VCN_RB_DECOUPLE = (1 << 7),
|
||||
/* MES info */
|
||||
AMDGIM_FEATURE_MES_INFO_ENABLE = (1 << 8),
|
||||
};
|
||||
|
||||
enum AMDGIM_REG_ACCESS_FLAG {
|
||||
@@ -257,6 +261,7 @@ struct amdgpu_virt {
|
||||
/* vf2pf message */
|
||||
struct delayed_work vf2pf_work;
|
||||
uint32_t vf2pf_update_interval_ms;
|
||||
int vf2pf_update_retry_cnt;
|
||||
|
||||
/* multimedia bandwidth config */
|
||||
bool is_mm_bw_enabled;
|
||||
@@ -332,6 +337,8 @@ static inline bool is_virtual_machine(void)
|
||||
((adev)->virt.gim_feature & AMDGIM_FEATURE_AV1_SUPPORT)
|
||||
#define amdgpu_sriov_is_vcn_rb_decouple(adev) \
|
||||
((adev)->virt.gim_feature & AMDGIM_FEATURE_VCN_RB_DECOUPLE)
|
||||
#define amdgpu_sriov_is_mes_info_enable(adev) \
|
||||
((adev)->virt.gim_feature & AMDGIM_FEATURE_MES_INFO_ENABLE)
|
||||
bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev);
|
||||
void amdgpu_virt_init_setting(struct amdgpu_device *adev);
|
||||
int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
|
||||
|
||||
@@ -658,6 +658,8 @@ static const struct amd_ip_funcs amdgpu_vkms_ip_funcs = {
|
||||
.soft_reset = amdgpu_vkms_soft_reset,
|
||||
.set_clockgating_state = amdgpu_vkms_set_clockgating_state,
|
||||
.set_powergating_state = amdgpu_vkms_set_powergating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
const struct amdgpu_ip_block_version amdgpu_vkms_ip_block = {
|
||||
|
||||
@@ -885,6 +885,44 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence,
|
||||
kfree(tlb_cb);
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_vm_tlb_flush - prepare TLB flush
|
||||
*
|
||||
* @params: parameters for update
|
||||
* @fence: input fence to sync TLB flush with
|
||||
* @tlb_cb: the callback structure
|
||||
*
|
||||
* Increments the tlb sequence to make sure that future CS execute a VM flush.
|
||||
*/
|
||||
static void
|
||||
amdgpu_vm_tlb_flush(struct amdgpu_vm_update_params *params,
|
||||
struct dma_fence **fence,
|
||||
struct amdgpu_vm_tlb_seq_struct *tlb_cb)
|
||||
{
|
||||
struct amdgpu_vm *vm = params->vm;
|
||||
|
||||
if (!fence || !*fence)
|
||||
return;
|
||||
|
||||
tlb_cb->vm = vm;
|
||||
if (!dma_fence_add_callback(*fence, &tlb_cb->cb,
|
||||
amdgpu_vm_tlb_seq_cb)) {
|
||||
dma_fence_put(vm->last_tlb_flush);
|
||||
vm->last_tlb_flush = dma_fence_get(*fence);
|
||||
} else {
|
||||
amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
|
||||
}
|
||||
|
||||
/* Prepare a TLB flush fence to be attached to PTs */
|
||||
if (!params->unlocked && vm->is_compute_context) {
|
||||
amdgpu_vm_tlb_fence_create(params->adev, vm, fence);
|
||||
|
||||
/* Makes sure no PD/PT is freed before the flush */
|
||||
dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence,
|
||||
DMA_RESV_USAGE_BOOKKEEP);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_vm_update_range - update a range in the vm page table
|
||||
*
|
||||
@@ -916,8 +954,8 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
|
||||
struct ttm_resource *res, dma_addr_t *pages_addr,
|
||||
struct dma_fence **fence)
|
||||
{
|
||||
struct amdgpu_vm_update_params params;
|
||||
struct amdgpu_vm_tlb_seq_struct *tlb_cb;
|
||||
struct amdgpu_vm_update_params params;
|
||||
struct amdgpu_res_cursor cursor;
|
||||
enum amdgpu_sync_mode sync_mode;
|
||||
int r, idx;
|
||||
@@ -927,8 +965,8 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
|
||||
|
||||
tlb_cb = kmalloc(sizeof(*tlb_cb), GFP_KERNEL);
|
||||
if (!tlb_cb) {
|
||||
r = -ENOMEM;
|
||||
goto error_unlock;
|
||||
drm_dev_exit(idx);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
/* Vega20+XGMI where PTEs get inadvertently cached in L2 texture cache,
|
||||
@@ -948,7 +986,9 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
|
||||
params.immediate = immediate;
|
||||
params.pages_addr = pages_addr;
|
||||
params.unlocked = unlocked;
|
||||
params.needs_flush = flush_tlb;
|
||||
params.allow_override = allow_override;
|
||||
INIT_LIST_HEAD(¶ms.tlb_flush_waitlist);
|
||||
|
||||
/* Implicitly sync to command submissions in the same VM before
|
||||
* unmapping. Sync to moving fences before mapping.
|
||||
@@ -1031,24 +1071,18 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
|
||||
}
|
||||
|
||||
r = vm->update_funcs->commit(¶ms, fence);
|
||||
if (r)
|
||||
goto error_free;
|
||||
|
||||
if (flush_tlb || params.table_freed) {
|
||||
tlb_cb->vm = vm;
|
||||
if (fence && *fence &&
|
||||
!dma_fence_add_callback(*fence, &tlb_cb->cb,
|
||||
amdgpu_vm_tlb_seq_cb)) {
|
||||
dma_fence_put(vm->last_tlb_flush);
|
||||
vm->last_tlb_flush = dma_fence_get(*fence);
|
||||
} else {
|
||||
amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
|
||||
}
|
||||
if (params.needs_flush) {
|
||||
amdgpu_vm_tlb_flush(¶ms, fence, tlb_cb);
|
||||
tlb_cb = NULL;
|
||||
}
|
||||
|
||||
amdgpu_vm_pt_free_list(adev, ¶ms);
|
||||
|
||||
error_free:
|
||||
kfree(tlb_cb);
|
||||
|
||||
error_unlock:
|
||||
amdgpu_vm_eviction_unlock(vm);
|
||||
drm_dev_exit(idx);
|
||||
return r;
|
||||
@@ -2411,6 +2445,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
|
||||
|
||||
mutex_init(&vm->eviction_lock);
|
||||
vm->evicting = false;
|
||||
vm->tlb_fence_context = dma_fence_context_alloc(1);
|
||||
|
||||
r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level,
|
||||
false, &root, xcp_id);
|
||||
@@ -2944,6 +2979,14 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
|
||||
if (vm && status) {
|
||||
vm->fault_info.addr = addr;
|
||||
vm->fault_info.status = status;
|
||||
/*
|
||||
* Update the fault information globally for later usage
|
||||
* when vm could be stale or freed.
|
||||
*/
|
||||
adev->vm_manager.fault_info.addr = addr;
|
||||
adev->vm_manager.fault_info.vmhub = vmhub;
|
||||
adev->vm_manager.fault_info.status = status;
|
||||
|
||||
if (AMDGPU_IS_GFXHUB(vmhub)) {
|
||||
vm->fault_info.vmhub = AMDGPU_VMHUB_TYPE_GFX;
|
||||
vm->fault_info.vmhub |=
|
||||
|
||||
@@ -257,15 +257,20 @@ struct amdgpu_vm_update_params {
|
||||
unsigned int num_dw_left;
|
||||
|
||||
/**
|
||||
* @table_freed: return true if page table is freed when updating
|
||||
* @needs_flush: true whenever we need to invalidate the TLB
|
||||
*/
|
||||
bool table_freed;
|
||||
bool needs_flush;
|
||||
|
||||
/**
|
||||
* @allow_override: true for memory that is not uncached: allows MTYPE
|
||||
* to be overridden for NUMA local memory.
|
||||
*/
|
||||
bool allow_override;
|
||||
|
||||
/**
|
||||
* @tlb_flush_waitlist: temporary storage for BOs until tlb_flush
|
||||
*/
|
||||
struct list_head tlb_flush_waitlist;
|
||||
};
|
||||
|
||||
struct amdgpu_vm_update_funcs {
|
||||
@@ -342,6 +347,7 @@ struct amdgpu_vm {
|
||||
atomic64_t tlb_seq;
|
||||
struct dma_fence *last_tlb_flush;
|
||||
atomic64_t kfd_last_flushed_seq;
|
||||
uint64_t tlb_fence_context;
|
||||
|
||||
/* How many times we had to re-generate the page tables */
|
||||
uint64_t generation;
|
||||
@@ -422,6 +428,8 @@ struct amdgpu_vm_manager {
|
||||
* look up VM of a page fault
|
||||
*/
|
||||
struct xarray pasids;
|
||||
/* Global registration of recent page fault information */
|
||||
struct amdgpu_vm_fault_info fault_info;
|
||||
};
|
||||
|
||||
struct amdgpu_bo_va_mapping;
|
||||
@@ -544,6 +552,8 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
|
||||
uint64_t start, uint64_t end,
|
||||
uint64_t dst, uint64_t flags);
|
||||
void amdgpu_vm_pt_free_work(struct work_struct *work);
|
||||
void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
|
||||
struct amdgpu_vm_update_params *params);
|
||||
|
||||
#if defined(CONFIG_DEBUG_FS)
|
||||
void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m);
|
||||
@@ -609,5 +619,8 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
|
||||
uint64_t addr,
|
||||
uint32_t status,
|
||||
unsigned int vmhub);
|
||||
void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev,
|
||||
struct amdgpu_vm *vm,
|
||||
struct dma_fence **fence);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -108,7 +108,9 @@ static int amdgpu_vm_cpu_update(struct amdgpu_vm_update_params *p,
|
||||
static int amdgpu_vm_cpu_commit(struct amdgpu_vm_update_params *p,
|
||||
struct dma_fence **fence)
|
||||
{
|
||||
/* Flush HDP */
|
||||
if (p->needs_flush)
|
||||
atomic64_inc(&p->vm->tlb_seq);
|
||||
|
||||
mb();
|
||||
amdgpu_device_flush_hdp(p->adev, NULL);
|
||||
return 0;
|
||||
|
||||
@@ -622,40 +622,58 @@ void amdgpu_vm_pt_free_work(struct work_struct *work)
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_vm_pt_free_dfs - free PD/PT levels
|
||||
* amdgpu_vm_pt_free_list - free PD/PT levels
|
||||
*
|
||||
* @adev: amdgpu device structure
|
||||
* @vm: amdgpu vm structure
|
||||
* @start: optional cursor where to start freeing PDs/PTs
|
||||
* @unlocked: vm resv unlock status
|
||||
* @params: see amdgpu_vm_update_params definition
|
||||
*
|
||||
* Free the page directory or page table level and all sub levels.
|
||||
* Free the page directory objects saved in the flush list
|
||||
*/
|
||||
static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev,
|
||||
struct amdgpu_vm *vm,
|
||||
struct amdgpu_vm_pt_cursor *start,
|
||||
bool unlocked)
|
||||
void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
|
||||
struct amdgpu_vm_update_params *params)
|
||||
{
|
||||
struct amdgpu_vm_pt_cursor cursor;
|
||||
struct amdgpu_vm_bo_base *entry;
|
||||
struct amdgpu_vm_bo_base *entry, *next;
|
||||
struct amdgpu_vm *vm = params->vm;
|
||||
bool unlocked = params->unlocked;
|
||||
|
||||
if (list_empty(¶ms->tlb_flush_waitlist))
|
||||
return;
|
||||
|
||||
if (unlocked) {
|
||||
spin_lock(&vm->status_lock);
|
||||
for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
|
||||
list_move(&entry->vm_status, &vm->pt_freed);
|
||||
|
||||
if (start)
|
||||
list_move(&start->entry->vm_status, &vm->pt_freed);
|
||||
list_splice_init(¶ms->tlb_flush_waitlist, &vm->pt_freed);
|
||||
spin_unlock(&vm->status_lock);
|
||||
schedule_work(&vm->pt_free_work);
|
||||
return;
|
||||
}
|
||||
|
||||
for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
|
||||
list_for_each_entry_safe(entry, next, ¶ms->tlb_flush_waitlist, vm_status)
|
||||
amdgpu_vm_pt_free(entry);
|
||||
}
|
||||
|
||||
if (start)
|
||||
amdgpu_vm_pt_free(start->entry);
|
||||
/**
|
||||
* amdgpu_vm_pt_add_list - add PD/PT level to the flush list
|
||||
*
|
||||
* @params: parameters for the update
|
||||
* @cursor: first PT entry to start DF search from, non NULL
|
||||
*
|
||||
* This list will be freed after TLB flush.
|
||||
*/
|
||||
static void amdgpu_vm_pt_add_list(struct amdgpu_vm_update_params *params,
|
||||
struct amdgpu_vm_pt_cursor *cursor)
|
||||
{
|
||||
struct amdgpu_vm_pt_cursor seek;
|
||||
struct amdgpu_vm_bo_base *entry;
|
||||
|
||||
spin_lock(¶ms->vm->status_lock);
|
||||
for_each_amdgpu_vm_pt_dfs_safe(params->adev, params->vm, cursor, seek, entry) {
|
||||
if (entry && entry->bo)
|
||||
list_move(&entry->vm_status, ¶ms->tlb_flush_waitlist);
|
||||
}
|
||||
|
||||
/* enter start node now */
|
||||
list_move(&cursor->entry->vm_status, ¶ms->tlb_flush_waitlist);
|
||||
spin_unlock(¶ms->vm->status_lock);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -667,7 +685,13 @@ static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev,
|
||||
*/
|
||||
void amdgpu_vm_pt_free_root(struct amdgpu_device *adev, struct amdgpu_vm *vm)
|
||||
{
|
||||
amdgpu_vm_pt_free_dfs(adev, vm, NULL, false);
|
||||
struct amdgpu_vm_pt_cursor cursor;
|
||||
struct amdgpu_vm_bo_base *entry;
|
||||
|
||||
for_each_amdgpu_vm_pt_dfs_safe(adev, vm, NULL, cursor, entry) {
|
||||
if (entry)
|
||||
amdgpu_vm_pt_free(entry);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -972,10 +996,8 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
|
||||
while (cursor.pfn < frag_start) {
|
||||
/* Make sure previous mapping is freed */
|
||||
if (cursor.entry->bo) {
|
||||
params->table_freed = true;
|
||||
amdgpu_vm_pt_free_dfs(adev, params->vm,
|
||||
&cursor,
|
||||
params->unlocked);
|
||||
params->needs_flush = true;
|
||||
amdgpu_vm_pt_add_list(params, &cursor);
|
||||
}
|
||||
amdgpu_vm_pt_next(adev, &cursor);
|
||||
}
|
||||
|
||||
@@ -126,6 +126,10 @@ static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,
|
||||
|
||||
WARN_ON(ib->length_dw == 0);
|
||||
amdgpu_ring_pad_ib(ring, ib);
|
||||
|
||||
if (p->needs_flush)
|
||||
atomic64_inc(&p->vm->tlb_seq);
|
||||
|
||||
WARN_ON(ib->length_dw > p->num_dw_left);
|
||||
f = amdgpu_job_submit(p->job);
|
||||
|
||||
|
||||
112
drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
Normal file
112
drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
Normal file
@@ -0,0 +1,112 @@
|
||||
// SPDX-License-Identifier: GPL-2.0 OR MIT
|
||||
/*
|
||||
* Copyright 2023 Advanced Micro Devices, Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <linux/dma-fence.h>
|
||||
#include <linux/workqueue.h>
|
||||
|
||||
#include "amdgpu.h"
|
||||
#include "amdgpu_vm.h"
|
||||
#include "amdgpu_gmc.h"
|
||||
|
||||
struct amdgpu_tlb_fence {
|
||||
struct dma_fence base;
|
||||
struct amdgpu_device *adev;
|
||||
struct dma_fence *dependency;
|
||||
struct work_struct work;
|
||||
spinlock_t lock;
|
||||
uint16_t pasid;
|
||||
|
||||
};
|
||||
|
||||
static const char *amdgpu_tlb_fence_get_driver_name(struct dma_fence *fence)
|
||||
{
|
||||
return "amdgpu tlb fence";
|
||||
}
|
||||
|
||||
static const char *amdgpu_tlb_fence_get_timeline_name(struct dma_fence *f)
|
||||
{
|
||||
return "amdgpu tlb timeline";
|
||||
}
|
||||
|
||||
static void amdgpu_tlb_fence_work(struct work_struct *work)
|
||||
{
|
||||
struct amdgpu_tlb_fence *f = container_of(work, typeof(*f), work);
|
||||
int r;
|
||||
|
||||
if (f->dependency) {
|
||||
dma_fence_wait(f->dependency, false);
|
||||
dma_fence_put(f->dependency);
|
||||
f->dependency = NULL;
|
||||
}
|
||||
|
||||
r = amdgpu_gmc_flush_gpu_tlb_pasid(f->adev, f->pasid, 2, true, 0);
|
||||
if (r) {
|
||||
dev_err(f->adev->dev, "TLB flush failed for PASID %d.\n",
|
||||
f->pasid);
|
||||
dma_fence_set_error(&f->base, r);
|
||||
}
|
||||
|
||||
dma_fence_signal(&f->base);
|
||||
dma_fence_put(&f->base);
|
||||
}
|
||||
|
||||
static const struct dma_fence_ops amdgpu_tlb_fence_ops = {
|
||||
.use_64bit_seqno = true,
|
||||
.get_driver_name = amdgpu_tlb_fence_get_driver_name,
|
||||
.get_timeline_name = amdgpu_tlb_fence_get_timeline_name
|
||||
};
|
||||
|
||||
void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, struct amdgpu_vm *vm,
|
||||
struct dma_fence **fence)
|
||||
{
|
||||
struct amdgpu_tlb_fence *f;
|
||||
|
||||
f = kmalloc(sizeof(*f), GFP_KERNEL);
|
||||
if (!f) {
|
||||
/*
|
||||
* We can't fail since the PDEs and PTEs are already updated, so
|
||||
* just block for the dependency and execute the TLB flush
|
||||
*/
|
||||
if (*fence)
|
||||
dma_fence_wait(*fence, false);
|
||||
|
||||
amdgpu_gmc_flush_gpu_tlb_pasid(adev, vm->pasid, 2, true, 0);
|
||||
*fence = dma_fence_get_stub();
|
||||
return;
|
||||
}
|
||||
|
||||
f->adev = adev;
|
||||
f->dependency = *fence;
|
||||
f->pasid = vm->pasid;
|
||||
INIT_WORK(&f->work, amdgpu_tlb_fence_work);
|
||||
spin_lock_init(&f->lock);
|
||||
|
||||
dma_fence_init(&f->base, &amdgpu_tlb_fence_ops, &f->lock,
|
||||
vm->tlb_fence_context, atomic64_read(&vm->tlb_seq));
|
||||
|
||||
/* TODO: We probably need a separate wq here */
|
||||
dma_fence_get(&f->base);
|
||||
schedule_work(&f->work);
|
||||
|
||||
*fence = &f->base;
|
||||
}
|
||||
@@ -450,6 +450,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
|
||||
{
|
||||
struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
|
||||
struct amdgpu_device *adev = to_amdgpu_device(mgr);
|
||||
struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
|
||||
u64 vis_usage = 0, max_bytes, min_block_size;
|
||||
struct amdgpu_vram_mgr_resource *vres;
|
||||
u64 size, remaining_size, lpfn, fpfn;
|
||||
@@ -468,7 +469,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
|
||||
if (tbo->type != ttm_bo_type_kernel)
|
||||
max_bytes -= AMDGPU_VM_RESERVED_VRAM;
|
||||
|
||||
if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
|
||||
if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
|
||||
pages_per_block = ~0ul;
|
||||
} else {
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
@@ -477,7 +478,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
|
||||
/* default to 2MB */
|
||||
pages_per_block = 2UL << (20UL - PAGE_SHIFT);
|
||||
#endif
|
||||
pages_per_block = max_t(uint32_t, pages_per_block,
|
||||
pages_per_block = max_t(u32, pages_per_block,
|
||||
tbo->page_alignment);
|
||||
}
|
||||
|
||||
@@ -498,9 +499,12 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
|
||||
if (place->flags & TTM_PL_FLAG_TOPDOWN)
|
||||
vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION;
|
||||
|
||||
if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
|
||||
if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
|
||||
vres->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION;
|
||||
|
||||
if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED)
|
||||
vres->flags |= DRM_BUDDY_CLEAR_ALLOCATION;
|
||||
|
||||
if (fpfn || lpfn != mgr->mm.size)
|
||||
/* Allocate blocks in desired range */
|
||||
vres->flags |= DRM_BUDDY_RANGE_ALLOCATION;
|
||||
@@ -514,21 +518,31 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
|
||||
else
|
||||
min_block_size = mgr->default_page_size;
|
||||
|
||||
BUG_ON(min_block_size < mm->chunk_size);
|
||||
|
||||
/* Limit maximum size to 2GiB due to SG table limitations */
|
||||
size = min(remaining_size, 2ULL << 30);
|
||||
|
||||
if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
|
||||
!(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
|
||||
!(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
|
||||
min_block_size = (u64)pages_per_block << PAGE_SHIFT;
|
||||
|
||||
BUG_ON(min_block_size < mm->chunk_size);
|
||||
|
||||
r = drm_buddy_alloc_blocks(mm, fpfn,
|
||||
lpfn,
|
||||
size,
|
||||
min_block_size,
|
||||
&vres->blocks,
|
||||
vres->flags);
|
||||
|
||||
if (unlikely(r == -ENOSPC) && pages_per_block == ~0ul &&
|
||||
!(place->flags & TTM_PL_FLAG_CONTIGUOUS)) {
|
||||
vres->flags &= ~DRM_BUDDY_CONTIGUOUS_ALLOCATION;
|
||||
pages_per_block = max_t(u32, 2UL << (20UL - PAGE_SHIFT),
|
||||
tbo->page_alignment);
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (unlikely(r))
|
||||
goto error_free_blocks;
|
||||
|
||||
@@ -571,7 +585,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
|
||||
return 0;
|
||||
|
||||
error_free_blocks:
|
||||
drm_buddy_free_list(mm, &vres->blocks);
|
||||
drm_buddy_free_list(mm, &vres->blocks, 0);
|
||||
mutex_unlock(&mgr->lock);
|
||||
error_fini:
|
||||
ttm_resource_fini(man, &vres->base);
|
||||
@@ -604,7 +618,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man,
|
||||
|
||||
amdgpu_vram_mgr_do_reserve(man);
|
||||
|
||||
drm_buddy_free_list(mm, &vres->blocks);
|
||||
drm_buddy_free_list(mm, &vres->blocks, vres->flags);
|
||||
mutex_unlock(&mgr->lock);
|
||||
|
||||
atomic64_sub(vis_usage, &mgr->vis_usage);
|
||||
@@ -912,7 +926,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev)
|
||||
kfree(rsv);
|
||||
|
||||
list_for_each_entry_safe(rsv, temp, &mgr->reserved_pages, blocks) {
|
||||
drm_buddy_free_list(&mgr->mm, &rsv->allocated);
|
||||
drm_buddy_free_list(&mgr->mm, &rsv->allocated, 0);
|
||||
kfree(rsv);
|
||||
}
|
||||
if (!adev->gmc.is_app_apu)
|
||||
|
||||
@@ -53,10 +53,20 @@ static inline u64 amdgpu_vram_mgr_block_size(struct drm_buddy_block *block)
|
||||
return (u64)PAGE_SIZE << drm_buddy_block_order(block);
|
||||
}
|
||||
|
||||
static inline bool amdgpu_vram_mgr_is_cleared(struct drm_buddy_block *block)
|
||||
{
|
||||
return drm_buddy_block_is_clear(block);
|
||||
}
|
||||
|
||||
static inline struct amdgpu_vram_mgr_resource *
|
||||
to_amdgpu_vram_mgr_resource(struct ttm_resource *res)
|
||||
{
|
||||
return container_of(res, struct amdgpu_vram_mgr_resource, base);
|
||||
}
|
||||
|
||||
static inline void amdgpu_vram_mgr_set_cleared(struct ttm_resource *res)
|
||||
{
|
||||
to_amdgpu_vram_mgr_resource(res)->flags |= DRM_BUDDY_CLEARED;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1035,15 +1035,16 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type,
|
||||
struct aca_bank_report *report, void *data)
|
||||
static int xgmi_v6_4_0_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank,
|
||||
enum aca_smu_type type, void *data)
|
||||
{
|
||||
struct amdgpu_device *adev = handle->adev;
|
||||
struct aca_bank_info info;
|
||||
const char *error_str;
|
||||
u64 status;
|
||||
u64 status, count;
|
||||
int ret, ext_error_code;
|
||||
|
||||
ret = aca_bank_info_decode(bank, &report->info);
|
||||
ret = aca_bank_info_decode(bank, &info);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
@@ -1055,15 +1056,28 @@ static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, struc
|
||||
if (error_str)
|
||||
dev_info(adev->dev, "%s detected\n", error_str);
|
||||
|
||||
if ((type == ACA_ERROR_TYPE_UE && ext_error_code == 0) ||
|
||||
(type == ACA_ERROR_TYPE_CE && ext_error_code == 6))
|
||||
report->count[type] = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]);
|
||||
count = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]);
|
||||
|
||||
return 0;
|
||||
switch (type) {
|
||||
case ACA_SMU_TYPE_UE:
|
||||
if (ext_error_code != 0 && ext_error_code != 9)
|
||||
count = 0ULL;
|
||||
|
||||
ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE, count);
|
||||
break;
|
||||
case ACA_SMU_TYPE_CE:
|
||||
count = ext_error_code == 6 ? count : 0ULL;
|
||||
ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_CE, count);
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const struct aca_bank_ops xgmi_v6_4_0_aca_bank_ops = {
|
||||
.aca_bank_generate_report = xgmi_v6_4_0_aca_bank_generate_report,
|
||||
.aca_bank_parser = xgmi_v6_4_0_aca_bank_parser,
|
||||
};
|
||||
|
||||
static const struct aca_info xgmi_v6_4_0_aca_info = {
|
||||
|
||||
@@ -44,6 +44,7 @@ struct amdgpu_hive_info {
|
||||
|
||||
struct amdgpu_reset_domain *reset_domain;
|
||||
atomic_t ras_recovery;
|
||||
struct ras_event_manager event_mgr;
|
||||
};
|
||||
|
||||
struct amdgpu_pcs_ras_field {
|
||||
|
||||
@@ -94,7 +94,8 @@ union amd_sriov_msg_feature_flags {
|
||||
uint32_t reg_indirect_acc : 1;
|
||||
uint32_t av1_support : 1;
|
||||
uint32_t vcn_rb_decouple : 1;
|
||||
uint32_t reserved : 24;
|
||||
uint32_t mes_info_enable : 1;
|
||||
uint32_t reserved : 23;
|
||||
} flags;
|
||||
uint32_t all;
|
||||
};
|
||||
@@ -157,7 +158,7 @@ struct amd_sriov_msg_pf2vf_info_header {
|
||||
uint32_t reserved[2];
|
||||
};
|
||||
|
||||
#define AMD_SRIOV_MSG_PF2VF_INFO_FILLED_SIZE (48)
|
||||
#define AMD_SRIOV_MSG_PF2VF_INFO_FILLED_SIZE (49)
|
||||
struct amd_sriov_msg_pf2vf_info {
|
||||
/* header contains size and version */
|
||||
struct amd_sriov_msg_pf2vf_info_header header;
|
||||
@@ -208,6 +209,8 @@ struct amd_sriov_msg_pf2vf_info {
|
||||
struct amd_sriov_msg_uuid_info uuid_info;
|
||||
/* PCIE atomic ops support flag */
|
||||
uint32_t pcie_atomic_ops_support_flags;
|
||||
/* Portion of GPU memory occupied by VF. MAX value is 65535, but set to uint32_t to maintain alignment with reserved size */
|
||||
uint32_t gpu_capacity;
|
||||
/* reserved */
|
||||
uint32_t reserved[256 - AMD_SRIOV_MSG_PF2VF_INFO_FILLED_SIZE];
|
||||
};
|
||||
@@ -221,7 +224,7 @@ struct amd_sriov_msg_vf2pf_info_header {
|
||||
uint32_t reserved[2];
|
||||
};
|
||||
|
||||
#define AMD_SRIOV_MSG_VF2PF_INFO_FILLED_SIZE (70)
|
||||
#define AMD_SRIOV_MSG_VF2PF_INFO_FILLED_SIZE (73)
|
||||
struct amd_sriov_msg_vf2pf_info {
|
||||
/* header contains size and version */
|
||||
struct amd_sriov_msg_vf2pf_info_header header;
|
||||
@@ -265,7 +268,9 @@ struct amd_sriov_msg_vf2pf_info {
|
||||
uint32_t version;
|
||||
} ucode_info[AMD_SRIOV_MSG_RESERVE_UCODE];
|
||||
uint64_t dummy_page_addr;
|
||||
|
||||
/* FB allocated for guest MES to record UQ info */
|
||||
uint64_t mes_info_addr;
|
||||
uint32_t mes_info_size;
|
||||
/* reserved */
|
||||
uint32_t reserved[256 - AMD_SRIOV_MSG_VF2PF_INFO_FILLED_SIZE];
|
||||
};
|
||||
|
||||
@@ -630,7 +630,7 @@ static int aqua_vanjaram_xcp_mgr_init(struct amdgpu_device *adev)
|
||||
|
||||
int aqua_vanjaram_init_soc_config(struct amdgpu_device *adev)
|
||||
{
|
||||
u32 mask, inst_mask = adev->sdma.sdma_mask;
|
||||
u32 mask, avail_inst, inst_mask = adev->sdma.sdma_mask;
|
||||
int ret, i;
|
||||
|
||||
/* generally 1 AID supports 4 instances */
|
||||
@@ -642,7 +642,9 @@ int aqua_vanjaram_init_soc_config(struct amdgpu_device *adev)
|
||||
|
||||
for (mask = (1 << adev->sdma.num_inst_per_aid) - 1; inst_mask;
|
||||
inst_mask >>= adev->sdma.num_inst_per_aid, ++i) {
|
||||
if ((inst_mask & mask) == mask)
|
||||
avail_inst = inst_mask & mask;
|
||||
if (avail_inst == mask || avail_inst == 0x3 ||
|
||||
avail_inst == 0xc)
|
||||
adev->aid_mask |= (1 << i);
|
||||
}
|
||||
|
||||
|
||||
@@ -1243,6 +1243,7 @@ static int amdgpu_atom_execute_table_locked(struct atom_context *ctx, int index,
|
||||
ectx.ps_size = params_size;
|
||||
ectx.abort = false;
|
||||
ectx.last_jump = 0;
|
||||
ectx.last_jump_jiffies = 0;
|
||||
if (ws) {
|
||||
ectx.ws = kcalloc(4, ws, GFP_KERNEL);
|
||||
ectx.ws_size = ws;
|
||||
|
||||
@@ -1375,14 +1375,14 @@ static int cik_asic_pci_config_reset(struct amdgpu_device *adev)
|
||||
return r;
|
||||
}
|
||||
|
||||
static bool cik_asic_supports_baco(struct amdgpu_device *adev)
|
||||
static int cik_asic_supports_baco(struct amdgpu_device *adev)
|
||||
{
|
||||
switch (adev->asic_type) {
|
||||
case CHIP_BONAIRE:
|
||||
case CHIP_HAWAII:
|
||||
return amdgpu_dpm_is_baco_supported(adev);
|
||||
default:
|
||||
return false;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2210,6 +2210,8 @@ static const struct amd_ip_funcs cik_common_ip_funcs = {
|
||||
.soft_reset = cik_common_soft_reset,
|
||||
.set_clockgating_state = cik_common_set_clockgating_state,
|
||||
.set_powergating_state = cik_common_set_powergating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ip_block_version cik_common_ip_block =
|
||||
|
||||
@@ -435,6 +435,8 @@ static const struct amd_ip_funcs cik_ih_ip_funcs = {
|
||||
.soft_reset = cik_ih_soft_reset,
|
||||
.set_clockgating_state = cik_ih_set_clockgating_state,
|
||||
.set_powergating_state = cik_ih_set_powergating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ih_funcs cik_ih_funcs = {
|
||||
|
||||
@@ -1228,6 +1228,8 @@ static const struct amd_ip_funcs cik_sdma_ip_funcs = {
|
||||
.soft_reset = cik_sdma_soft_reset,
|
||||
.set_clockgating_state = cik_sdma_set_clockgating_state,
|
||||
.set_powergating_state = cik_sdma_set_powergating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ring_funcs cik_sdma_ring_funcs = {
|
||||
@@ -1290,7 +1292,7 @@ static void cik_sdma_set_irq_funcs(struct amdgpu_device *adev)
|
||||
* @src_offset: src GPU address
|
||||
* @dst_offset: dst GPU address
|
||||
* @byte_count: number of bytes to xfer
|
||||
* @tmz: is this a secure operation
|
||||
* @copy_flags: unused
|
||||
*
|
||||
* Copy GPU buffers using the DMA engine (CIK).
|
||||
* Used by the amdgpu ttm implementation to move pages if
|
||||
@@ -1300,7 +1302,7 @@ static void cik_sdma_emit_copy_buffer(struct amdgpu_ib *ib,
|
||||
uint64_t src_offset,
|
||||
uint64_t dst_offset,
|
||||
uint32_t byte_count,
|
||||
bool tmz)
|
||||
uint32_t copy_flags)
|
||||
{
|
||||
ib->ptr[ib->length_dw++] = SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, 0);
|
||||
ib->ptr[ib->length_dw++] = byte_count;
|
||||
|
||||
@@ -433,6 +433,8 @@ static const struct amd_ip_funcs cz_ih_ip_funcs = {
|
||||
.soft_reset = cz_ih_soft_reset,
|
||||
.set_clockgating_state = cz_ih_set_clockgating_state,
|
||||
.set_powergating_state = cz_ih_set_powergating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ih_funcs cz_ih_funcs = {
|
||||
|
||||
@@ -3333,6 +3333,8 @@ static const struct amd_ip_funcs dce_v10_0_ip_funcs = {
|
||||
.soft_reset = dce_v10_0_soft_reset,
|
||||
.set_clockgating_state = dce_v10_0_set_clockgating_state,
|
||||
.set_powergating_state = dce_v10_0_set_powergating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static void
|
||||
|
||||
@@ -3464,6 +3464,8 @@ static const struct amd_ip_funcs dce_v11_0_ip_funcs = {
|
||||
.soft_reset = dce_v11_0_soft_reset,
|
||||
.set_clockgating_state = dce_v11_0_set_clockgating_state,
|
||||
.set_powergating_state = dce_v11_0_set_powergating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static void
|
||||
|
||||
@@ -3154,6 +3154,8 @@ static const struct amd_ip_funcs dce_v6_0_ip_funcs = {
|
||||
.soft_reset = dce_v6_0_soft_reset,
|
||||
.set_clockgating_state = dce_v6_0_set_clockgating_state,
|
||||
.set_powergating_state = dce_v6_0_set_powergating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static void
|
||||
|
||||
@@ -3242,6 +3242,8 @@ static const struct amd_ip_funcs dce_v8_0_ip_funcs = {
|
||||
.soft_reset = dce_v8_0_soft_reset,
|
||||
.set_clockgating_state = dce_v8_0_set_clockgating_state,
|
||||
.set_powergating_state = dce_v8_0_set_powergating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static void
|
||||
|
||||
@@ -276,6 +276,99 @@ MODULE_FIRMWARE("amdgpu/gc_10_3_7_mec.bin");
|
||||
MODULE_FIRMWARE("amdgpu/gc_10_3_7_mec2.bin");
|
||||
MODULE_FIRMWARE("amdgpu/gc_10_3_7_rlc.bin");
|
||||
|
||||
static const struct amdgpu_hwip_reg_entry gc_reg_list_10_1[] = {
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS2),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS3),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_STALLED_STAT1),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_STALLED_STAT2),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_STALLED_STAT1),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_STALLED_STAT1),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_BUSY_STAT),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_BUSY_STAT),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_BUSY_STAT),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_BUSY_STAT2),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_BUSY_STAT2),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_STATUS),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_GFX_ERROR),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_GFX_HPD_STATUS0),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB_BASE),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB_RPTR),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB_WPTR),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB0_BASE),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB0_RPTR),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB0_WPTR),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB1_BASE),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB1_RPTR),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB1_WPTR),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB2_BASE),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB2_WPTR),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB2_WPTR),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_CMD_BUFSZ),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_CMD_BUFSZ),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_CMD_BUFSZ),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_CMD_BUFSZ),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_BASE_LO),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_BASE_HI),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_BUFSZ),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_BASE_LO),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_BASE_HI),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_BUFSZ),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_BASE_LO),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_BASE_HI),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_BUFSZ),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_BASE_LO),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_BASE_HI),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_BUFSZ),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCPF_UTCL1_STATUS),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCPC_UTCL1_STATUS),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCPG_UTCL1_STATUS),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmGDS_PROTECTION_FAULT),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmGDS_VM_PROTECTION_FAULT),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmIA_UTCL1_STATUS),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmIA_UTCL1_STATUS_2),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmPA_CL_CNTL_STATUS),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_UTCL1_STATUS),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmRMI_UTCL1_STATUS),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmSQC_DCACHE_UTCL0_STATUS),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmSQC_ICACHE_UTCL0_STATUS),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmSQG_UTCL0_STATUS),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmTCP_UTCL0_STATUS),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmWD_UTCL1_STATUS),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmGCVM_L2_PROTECTION_FAULT_CNTL),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmGCVM_L2_PROTECTION_FAULT_STATUS),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_DEBUG),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC_CNTL),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_MES_CNTL),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_INSTR_PNTR),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC1_INSTR_PNTR),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC2_INSTR_PNTR),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_MES_DEBUG_INTERRUPT_INSTR_PNTR),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_MES_INSTR_PNTR),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_ME_INSTR_PNTR),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_PFP_INSTR_PNTR),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_STATUS),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_STAT),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_COMMAND),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_MESSAGE),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_1),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_2),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_3),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_4),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmSMU_RLC_RESPONSE),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SAFE_MODE),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_SAFE_MODE),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_RLCS_GPM_STAT_2),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SPP_STATUS),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_RLCS_BOOTLOAD_STATUS),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_INT_STAT),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_GPM_GENERAL_6),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_GPM_DEBUG_INST_A),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_GPM_DEBUG_INST_B),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_GPM_DEBUG_INST_ADDR),
|
||||
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_LX6_CORE_PDEBUG_INST)
|
||||
};
|
||||
|
||||
static const struct soc15_reg_golden golden_settings_gc_10_1[] = {
|
||||
SOC15_REG_GOLDEN_VALUE(GC, 0, mmCB_HW_CONTROL_4, 0xffffffff, 0x00400014),
|
||||
SOC15_REG_GOLDEN_VALUE(GC, 0, mmCGTT_CPF_CLK_CTRL, 0xfcff8fff, 0xf8000100),
|
||||
@@ -3964,7 +4057,7 @@ static void gfx_v10_0_check_gfxoff_flag(struct amdgpu_device *adev)
|
||||
|
||||
static int gfx_v10_0_init_microcode(struct amdgpu_device *adev)
|
||||
{
|
||||
char fw_name[40];
|
||||
char fw_name[53];
|
||||
char ucode_prefix[30];
|
||||
const char *wks = "";
|
||||
int err;
|
||||
@@ -4490,6 +4583,22 @@ static int gfx_v10_0_compute_ring_init(struct amdgpu_device *adev, int ring_id,
|
||||
hw_prio, NULL);
|
||||
}
|
||||
|
||||
static void gfx_v10_0_alloc_dump_mem(struct amdgpu_device *adev)
|
||||
{
|
||||
uint32_t reg_count = ARRAY_SIZE(gc_reg_list_10_1);
|
||||
uint32_t *ptr;
|
||||
|
||||
ptr = kcalloc(reg_count, sizeof(uint32_t), GFP_KERNEL);
|
||||
if (ptr == NULL) {
|
||||
DRM_ERROR("Failed to allocate memory for IP Dump\n");
|
||||
adev->gfx.ip_dump = NULL;
|
||||
adev->gfx.reg_count = 0;
|
||||
} else {
|
||||
adev->gfx.ip_dump = ptr;
|
||||
adev->gfx.reg_count = reg_count;
|
||||
}
|
||||
}
|
||||
|
||||
static int gfx_v10_0_sw_init(void *handle)
|
||||
{
|
||||
int i, j, k, r, ring_id = 0;
|
||||
@@ -4518,7 +4627,7 @@ static int gfx_v10_0_sw_init(void *handle)
|
||||
case IP_VERSION(10, 3, 3):
|
||||
case IP_VERSION(10, 3, 7):
|
||||
adev->gfx.me.num_me = 1;
|
||||
adev->gfx.me.num_pipe_per_me = 1;
|
||||
adev->gfx.me.num_pipe_per_me = 2;
|
||||
adev->gfx.me.num_queue_per_pipe = 1;
|
||||
adev->gfx.mec.num_mec = 2;
|
||||
adev->gfx.mec.num_pipe_per_mec = 4;
|
||||
@@ -4642,6 +4751,8 @@ static int gfx_v10_0_sw_init(void *handle)
|
||||
|
||||
gfx_v10_0_gpu_early_init(adev);
|
||||
|
||||
gfx_v10_0_alloc_dump_mem(adev);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -4694,6 +4805,8 @@ static int gfx_v10_0_sw_fini(void *handle)
|
||||
|
||||
gfx_v10_0_free_microcode(adev);
|
||||
|
||||
kfree(adev->gfx.ip_dump);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -8317,7 +8430,7 @@ static void gfx_v10_0_ring_emit_hdp_flush(struct amdgpu_ring *ring)
|
||||
}
|
||||
reg_mem_engine = 0;
|
||||
} else {
|
||||
ref_and_mask = nbio_hf_reg->ref_and_mask_cp0;
|
||||
ref_and_mask = nbio_hf_reg->ref_and_mask_cp0 << ring->pipe;
|
||||
reg_mem_engine = 1; /* pfp */
|
||||
}
|
||||
|
||||
@@ -9154,6 +9267,36 @@ static void gfx_v10_0_emit_mem_sync(struct amdgpu_ring *ring)
|
||||
amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */
|
||||
}
|
||||
|
||||
static void gfx_v10_ip_print(void *handle, struct drm_printer *p)
|
||||
{
|
||||
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
|
||||
uint32_t i;
|
||||
uint32_t reg_count = ARRAY_SIZE(gc_reg_list_10_1);
|
||||
|
||||
if (!adev->gfx.ip_dump)
|
||||
return;
|
||||
|
||||
for (i = 0; i < reg_count; i++)
|
||||
drm_printf(p, "%-50s \t 0x%08x\n",
|
||||
gc_reg_list_10_1[i].reg_name,
|
||||
adev->gfx.ip_dump[i]);
|
||||
}
|
||||
|
||||
static void gfx_v10_ip_dump(void *handle)
|
||||
{
|
||||
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
|
||||
uint32_t i;
|
||||
uint32_t reg_count = ARRAY_SIZE(gc_reg_list_10_1);
|
||||
|
||||
if (!adev->gfx.ip_dump)
|
||||
return;
|
||||
|
||||
amdgpu_gfx_off_ctrl(adev, false);
|
||||
for (i = 0; i < reg_count; i++)
|
||||
adev->gfx.ip_dump[i] = RREG32(SOC15_REG_ENTRY_OFFSET(gc_reg_list_10_1[i]));
|
||||
amdgpu_gfx_off_ctrl(adev, true);
|
||||
}
|
||||
|
||||
static const struct amd_ip_funcs gfx_v10_0_ip_funcs = {
|
||||
.name = "gfx_v10_0",
|
||||
.early_init = gfx_v10_0_early_init,
|
||||
@@ -9170,6 +9313,8 @@ static const struct amd_ip_funcs gfx_v10_0_ip_funcs = {
|
||||
.set_clockgating_state = gfx_v10_0_set_clockgating_state,
|
||||
.set_powergating_state = gfx_v10_0_set_powergating_state,
|
||||
.get_clockgating_state = gfx_v10_0_get_clockgating_state,
|
||||
.dump_ip_state = gfx_v10_ip_dump,
|
||||
.print_ip_state = gfx_v10_ip_print,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
|
||||
|
||||
@@ -510,7 +510,7 @@ static void gfx_v11_0_check_fw_cp_gfx_shadow(struct amdgpu_device *adev)
|
||||
static int gfx_v11_0_init_microcode(struct amdgpu_device *adev)
|
||||
{
|
||||
char fw_name[40];
|
||||
char ucode_prefix[30];
|
||||
char ucode_prefix[25];
|
||||
int err;
|
||||
const struct rlc_firmware_header_v2_0 *rlc_hdr;
|
||||
uint16_t version_major;
|
||||
@@ -4506,14 +4506,11 @@ static int gfx_v11_0_soft_reset(void *handle)
|
||||
|
||||
gfx_v11_0_set_safe_mode(adev, 0);
|
||||
|
||||
mutex_lock(&adev->srbm_mutex);
|
||||
for (i = 0; i < adev->gfx.mec.num_mec; ++i) {
|
||||
for (j = 0; j < adev->gfx.mec.num_queue_per_pipe; j++) {
|
||||
for (k = 0; k < adev->gfx.mec.num_pipe_per_mec; k++) {
|
||||
tmp = RREG32_SOC15(GC, 0, regGRBM_GFX_CNTL);
|
||||
tmp = REG_SET_FIELD(tmp, GRBM_GFX_CNTL, MEID, i);
|
||||
tmp = REG_SET_FIELD(tmp, GRBM_GFX_CNTL, QUEUEID, j);
|
||||
tmp = REG_SET_FIELD(tmp, GRBM_GFX_CNTL, PIPEID, k);
|
||||
WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, tmp);
|
||||
soc21_grbm_select(adev, i, k, j, 0);
|
||||
|
||||
WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 0x2);
|
||||
WREG32_SOC15(GC, 0, regSPI_COMPUTE_QUEUE_RESET, 0x1);
|
||||
@@ -4523,16 +4520,14 @@ static int gfx_v11_0_soft_reset(void *handle)
|
||||
for (i = 0; i < adev->gfx.me.num_me; ++i) {
|
||||
for (j = 0; j < adev->gfx.me.num_queue_per_pipe; j++) {
|
||||
for (k = 0; k < adev->gfx.me.num_pipe_per_me; k++) {
|
||||
tmp = RREG32_SOC15(GC, 0, regGRBM_GFX_CNTL);
|
||||
tmp = REG_SET_FIELD(tmp, GRBM_GFX_CNTL, MEID, i);
|
||||
tmp = REG_SET_FIELD(tmp, GRBM_GFX_CNTL, QUEUEID, j);
|
||||
tmp = REG_SET_FIELD(tmp, GRBM_GFX_CNTL, PIPEID, k);
|
||||
WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, tmp);
|
||||
soc21_grbm_select(adev, i, k, j, 0);
|
||||
|
||||
WREG32_SOC15(GC, 0, regCP_GFX_HQD_DEQUEUE_REQUEST, 0x1);
|
||||
}
|
||||
}
|
||||
}
|
||||
soc21_grbm_select(adev, 0, 0, 0, 0);
|
||||
mutex_unlock(&adev->srbm_mutex);
|
||||
|
||||
/* Try to acquire the gfx mutex before access to CP_VMID_RESET */
|
||||
r = gfx_v11_0_request_gfx_index_mutex(adev, 1);
|
||||
@@ -6174,6 +6169,8 @@ static const struct amd_ip_funcs gfx_v11_0_ip_funcs = {
|
||||
.set_clockgating_state = gfx_v11_0_set_clockgating_state,
|
||||
.set_powergating_state = gfx_v11_0_set_powergating_state,
|
||||
.get_clockgating_state = gfx_v11_0_get_clockgating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ring_funcs gfx_v11_0_ring_funcs_gfx = {
|
||||
|
||||
@@ -3457,6 +3457,8 @@ static const struct amd_ip_funcs gfx_v6_0_ip_funcs = {
|
||||
.soft_reset = gfx_v6_0_soft_reset,
|
||||
.set_clockgating_state = gfx_v6_0_set_clockgating_state,
|
||||
.set_powergating_state = gfx_v6_0_set_powergating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ring_funcs gfx_v6_0_ring_funcs_gfx = {
|
||||
|
||||
@@ -4977,6 +4977,8 @@ static const struct amd_ip_funcs gfx_v7_0_ip_funcs = {
|
||||
.soft_reset = gfx_v7_0_soft_reset,
|
||||
.set_clockgating_state = gfx_v7_0_set_clockgating_state,
|
||||
.set_powergating_state = gfx_v7_0_set_powergating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
|
||||
|
||||
@@ -6878,6 +6878,8 @@ static const struct amd_ip_funcs gfx_v8_0_ip_funcs = {
|
||||
.set_clockgating_state = gfx_v8_0_set_clockgating_state,
|
||||
.set_powergating_state = gfx_v8_0_set_powergating_state,
|
||||
.get_clockgating_state = gfx_v8_0_get_clockgating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
|
||||
|
||||
@@ -1249,7 +1249,7 @@ static void gfx_v9_0_check_if_need_gfxoff(struct amdgpu_device *adev)
|
||||
static int gfx_v9_0_init_cp_gfx_microcode(struct amdgpu_device *adev,
|
||||
char *chip_name)
|
||||
{
|
||||
char fw_name[30];
|
||||
char fw_name[50];
|
||||
int err;
|
||||
|
||||
snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_pfp.bin", chip_name);
|
||||
@@ -1282,7 +1282,7 @@ out:
|
||||
static int gfx_v9_0_init_rlc_microcode(struct amdgpu_device *adev,
|
||||
char *chip_name)
|
||||
{
|
||||
char fw_name[30];
|
||||
char fw_name[53];
|
||||
int err;
|
||||
const struct rlc_firmware_header_v2_0 *rlc_hdr;
|
||||
uint16_t version_major;
|
||||
@@ -1337,7 +1337,7 @@ static bool gfx_v9_0_load_mec2_fw_bin_support(struct amdgpu_device *adev)
|
||||
static int gfx_v9_0_init_cp_compute_microcode(struct amdgpu_device *adev,
|
||||
char *chip_name)
|
||||
{
|
||||
char fw_name[30];
|
||||
char fw_name[50];
|
||||
int err;
|
||||
|
||||
if (amdgpu_sriov_vf(adev) && (adev->asic_type == CHIP_ALDEBARAN))
|
||||
@@ -6856,6 +6856,8 @@ static const struct amd_ip_funcs gfx_v9_0_ip_funcs = {
|
||||
.set_clockgating_state = gfx_v9_0_set_clockgating_state,
|
||||
.set_powergating_state = gfx_v9_0_set_powergating_state,
|
||||
.get_clockgating_state = gfx_v9_0_get_clockgating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
|
||||
|
||||
@@ -1909,18 +1909,7 @@ static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev)
|
||||
mutex_unlock(&adev->grbm_idx_mutex);
|
||||
}
|
||||
|
||||
static bool gfx_v9_4_2_query_uctl2_poison_status(struct amdgpu_device *adev)
|
||||
{
|
||||
u32 status = 0;
|
||||
struct amdgpu_vmhub *hub;
|
||||
|
||||
hub = &adev->vmhub[AMDGPU_GFXHUB(0)];
|
||||
status = RREG32(hub->vm_l2_pro_fault_status);
|
||||
/* reset page fault status */
|
||||
WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
|
||||
|
||||
return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
|
||||
}
|
||||
|
||||
struct amdgpu_ras_block_hw_ops gfx_v9_4_2_ras_ops = {
|
||||
.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
|
||||
@@ -1934,5 +1923,4 @@ struct amdgpu_gfx_ras gfx_v9_4_2_ras = {
|
||||
.hw_ops = &gfx_v9_4_2_ras_ops,
|
||||
},
|
||||
.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
|
||||
.query_utcl2_poison_status = gfx_v9_4_2_query_uctl2_poison_status,
|
||||
};
|
||||
|
||||
@@ -431,16 +431,16 @@ out:
|
||||
|
||||
static int gfx_v9_4_3_init_microcode(struct amdgpu_device *adev)
|
||||
{
|
||||
const char *chip_name;
|
||||
char ucode_prefix[15];
|
||||
int r;
|
||||
|
||||
chip_name = "gc_9_4_3";
|
||||
amdgpu_ucode_ip_version_decode(adev, GC_HWIP, ucode_prefix, sizeof(ucode_prefix));
|
||||
|
||||
r = gfx_v9_4_3_init_rlc_microcode(adev, chip_name);
|
||||
r = gfx_v9_4_3_init_rlc_microcode(adev, ucode_prefix);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
r = gfx_v9_4_3_init_cp_compute_microcode(adev, chip_name);
|
||||
r = gfx_v9_4_3_init_cp_compute_microcode(adev, ucode_prefix);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
@@ -680,38 +680,44 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_3_gfx_funcs = {
|
||||
.ih_node_to_logical_xcc = &gfx_v9_4_3_ih_to_xcc_inst,
|
||||
};
|
||||
|
||||
static int gfx_v9_4_3_aca_bank_generate_report(struct aca_handle *handle,
|
||||
struct aca_bank *bank, enum aca_error_type type,
|
||||
struct aca_bank_report *report, void *data)
|
||||
static int gfx_v9_4_3_aca_bank_parser(struct aca_handle *handle,
|
||||
struct aca_bank *bank, enum aca_smu_type type,
|
||||
void *data)
|
||||
{
|
||||
u64 status, misc0;
|
||||
struct aca_bank_info info;
|
||||
u64 misc0;
|
||||
u32 instlo;
|
||||
int ret;
|
||||
|
||||
status = bank->regs[ACA_REG_IDX_STATUS];
|
||||
if ((type == ACA_ERROR_TYPE_UE &&
|
||||
ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_FAULT) ||
|
||||
(type == ACA_ERROR_TYPE_CE &&
|
||||
ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_CE)) {
|
||||
ret = aca_bank_info_decode(bank, &info);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = aca_bank_info_decode(bank, &report->info);
|
||||
if (ret)
|
||||
return ret;
|
||||
/* NOTE: overwrite info.die_id with xcd id for gfx */
|
||||
instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]);
|
||||
instlo &= GENMASK(31, 1);
|
||||
info.die_id = instlo == mmSMNAID_XCD0_MCA_SMU ? 0 : 1;
|
||||
|
||||
/* NOTE: overwrite info.die_id with xcd id for gfx */
|
||||
instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]);
|
||||
instlo &= GENMASK(31, 1);
|
||||
report->info.die_id = instlo == mmSMNAID_XCD0_MCA_SMU ? 0 : 1;
|
||||
misc0 = bank->regs[ACA_REG_IDX_MISC0];
|
||||
|
||||
misc0 = bank->regs[ACA_REG_IDX_MISC0];
|
||||
report->count[type] = ACA_REG__MISC0__ERRCNT(misc0);
|
||||
switch (type) {
|
||||
case ACA_SMU_TYPE_UE:
|
||||
ret = aca_error_cache_log_bank_error(handle, &info,
|
||||
ACA_ERROR_TYPE_UE, 1ULL);
|
||||
break;
|
||||
case ACA_SMU_TYPE_CE:
|
||||
ret = aca_error_cache_log_bank_error(handle, &info,
|
||||
ACA_ERROR_TYPE_CE, ACA_REG__MISC0__ERRCNT(misc0));
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool gfx_v9_4_3_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
|
||||
enum aca_error_type type, void *data)
|
||||
enum aca_smu_type type, void *data)
|
||||
{
|
||||
u32 instlo;
|
||||
|
||||
@@ -730,7 +736,7 @@ static bool gfx_v9_4_3_aca_bank_is_valid(struct aca_handle *handle, struct aca_b
|
||||
}
|
||||
|
||||
static const struct aca_bank_ops gfx_v9_4_3_aca_bank_ops = {
|
||||
.aca_bank_generate_report = gfx_v9_4_3_aca_bank_generate_report,
|
||||
.aca_bank_parser = gfx_v9_4_3_aca_bank_parser,
|
||||
.aca_bank_is_valid = gfx_v9_4_3_aca_bank_is_valid,
|
||||
};
|
||||
|
||||
@@ -2398,10 +2404,10 @@ gfx_v9_4_3_xcc_update_coarse_grain_clock_gating(struct amdgpu_device *adev,
|
||||
if (def != data)
|
||||
WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGTT_MGCG_OVERRIDE, data);
|
||||
|
||||
/* enable cgcg FSM(0x0000363F) */
|
||||
/* CGCG Hysteresis: 400us */
|
||||
def = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGCG_CGLS_CTRL);
|
||||
|
||||
data = (0x36
|
||||
data = (0x2710
|
||||
<< RLC_CGCG_CGLS_CTRL__CGCG_GFX_IDLE_THRESHOLD__SHIFT) |
|
||||
RLC_CGCG_CGLS_CTRL__CGCG_EN_MASK;
|
||||
if (adev->cg_flags & AMD_CG_SUPPORT_GFX_CGLS)
|
||||
@@ -2410,10 +2416,10 @@ gfx_v9_4_3_xcc_update_coarse_grain_clock_gating(struct amdgpu_device *adev,
|
||||
if (def != data)
|
||||
WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGCG_CGLS_CTRL, data);
|
||||
|
||||
/* set IDLE_POLL_COUNT(0x00900100) */
|
||||
/* set IDLE_POLL_COUNT(0x33450100)*/
|
||||
def = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_RB_WPTR_POLL_CNTL);
|
||||
data = (0x0100 << CP_RB_WPTR_POLL_CNTL__POLL_FREQUENCY__SHIFT) |
|
||||
(0x0090 << CP_RB_WPTR_POLL_CNTL__IDLE_POLL_COUNT__SHIFT);
|
||||
(0x3345 << CP_RB_WPTR_POLL_CNTL__IDLE_POLL_COUNT__SHIFT);
|
||||
if (def != data)
|
||||
WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_RB_WPTR_POLL_CNTL, data);
|
||||
} else {
|
||||
@@ -4010,6 +4016,8 @@ static const struct amd_ip_funcs gfx_v9_4_3_ip_funcs = {
|
||||
.set_clockgating_state = gfx_v9_4_3_set_clockgating_state,
|
||||
.set_powergating_state = gfx_v9_4_3_set_powergating_state,
|
||||
.get_clockgating_state = gfx_v9_4_3_get_clockgating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ring_funcs gfx_v9_4_3_ring_funcs_compute = {
|
||||
|
||||
@@ -443,6 +443,22 @@ static void gfxhub_v1_0_init(struct amdgpu_device *adev)
|
||||
mmVM_INVALIDATE_ENG0_ADDR_RANGE_LO32;
|
||||
}
|
||||
|
||||
static bool gfxhub_v1_0_query_utcl2_poison_status(struct amdgpu_device *adev,
|
||||
int xcc_id)
|
||||
{
|
||||
u32 status = 0;
|
||||
struct amdgpu_vmhub *hub;
|
||||
|
||||
if (amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(9, 4, 2))
|
||||
return false;
|
||||
|
||||
hub = &adev->vmhub[AMDGPU_GFXHUB(0)];
|
||||
status = RREG32(hub->vm_l2_pro_fault_status);
|
||||
/* reset page fault status */
|
||||
WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
|
||||
|
||||
return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
|
||||
}
|
||||
|
||||
const struct amdgpu_gfxhub_funcs gfxhub_v1_0_funcs = {
|
||||
.get_mc_fb_offset = gfxhub_v1_0_get_mc_fb_offset,
|
||||
@@ -452,4 +468,5 @@ const struct amdgpu_gfxhub_funcs gfxhub_v1_0_funcs = {
|
||||
.set_fault_enable_default = gfxhub_v1_0_set_fault_enable_default,
|
||||
.init = gfxhub_v1_0_init,
|
||||
.get_xgmi_info = gfxhub_v1_1_get_xgmi_info,
|
||||
.query_utcl2_poison_status = gfxhub_v1_0_query_utcl2_poison_status,
|
||||
};
|
||||
|
||||
@@ -620,6 +620,20 @@ static int gfxhub_v1_2_get_xgmi_info(struct amdgpu_device *adev)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool gfxhub_v1_2_query_utcl2_poison_status(struct amdgpu_device *adev,
|
||||
int xcc_id)
|
||||
{
|
||||
u32 fed, status;
|
||||
|
||||
status = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regVM_L2_PROTECTION_FAULT_STATUS);
|
||||
fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
|
||||
/* reset page fault status */
|
||||
WREG32_P(SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id),
|
||||
regVM_L2_PROTECTION_FAULT_STATUS), 1, ~1);
|
||||
|
||||
return fed;
|
||||
}
|
||||
|
||||
const struct amdgpu_gfxhub_funcs gfxhub_v1_2_funcs = {
|
||||
.get_mc_fb_offset = gfxhub_v1_2_get_mc_fb_offset,
|
||||
.setup_vm_pt_regs = gfxhub_v1_2_setup_vm_pt_regs,
|
||||
@@ -628,6 +642,7 @@ const struct amdgpu_gfxhub_funcs gfxhub_v1_2_funcs = {
|
||||
.set_fault_enable_default = gfxhub_v1_2_set_fault_enable_default,
|
||||
.init = gfxhub_v1_2_init,
|
||||
.get_xgmi_info = gfxhub_v1_2_get_xgmi_info,
|
||||
.query_utcl2_poison_status = gfxhub_v1_2_query_utcl2_poison_status,
|
||||
};
|
||||
|
||||
static int gfxhub_v1_2_xcp_resume(void *handle, uint32_t inst_mask)
|
||||
|
||||
@@ -1115,6 +1115,8 @@ static const struct amd_ip_funcs gmc_v6_0_ip_funcs = {
|
||||
.soft_reset = gmc_v6_0_soft_reset,
|
||||
.set_clockgating_state = gmc_v6_0_set_clockgating_state,
|
||||
.set_powergating_state = gmc_v6_0_set_powergating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_gmc_funcs gmc_v6_0_gmc_funcs = {
|
||||
|
||||
@@ -1354,6 +1354,8 @@ static const struct amd_ip_funcs gmc_v7_0_ip_funcs = {
|
||||
.soft_reset = gmc_v7_0_soft_reset,
|
||||
.set_clockgating_state = gmc_v7_0_set_clockgating_state,
|
||||
.set_powergating_state = gmc_v7_0_set_powergating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_gmc_funcs gmc_v7_0_gmc_funcs = {
|
||||
|
||||
@@ -1717,6 +1717,8 @@ static const struct amd_ip_funcs gmc_v8_0_ip_funcs = {
|
||||
.set_clockgating_state = gmc_v8_0_set_clockgating_state,
|
||||
.set_powergating_state = gmc_v8_0_set_powergating_state,
|
||||
.get_clockgating_state = gmc_v8_0_get_clockgating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_gmc_funcs gmc_v8_0_gmc_funcs = {
|
||||
|
||||
@@ -548,7 +548,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
|
||||
{
|
||||
bool retry_fault = !!(entry->src_data[1] & 0x80);
|
||||
bool write_fault = !!(entry->src_data[1] & 0x20);
|
||||
uint32_t status = 0, cid = 0, rw = 0;
|
||||
uint32_t status = 0, cid = 0, rw = 0, fed = 0;
|
||||
struct amdgpu_task_info *task_info;
|
||||
struct amdgpu_vmhub *hub;
|
||||
const char *mmhub_cid;
|
||||
@@ -664,6 +664,13 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
|
||||
status = RREG32(hub->vm_l2_pro_fault_status);
|
||||
cid = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, CID);
|
||||
rw = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, RW);
|
||||
fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
|
||||
|
||||
/* for fed error, kfd will handle it, return directly */
|
||||
if (fed && amdgpu_ras_is_poison_mode_supported(adev) &&
|
||||
(amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2)))
|
||||
return 0;
|
||||
|
||||
WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
|
||||
|
||||
amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, vmhub);
|
||||
@@ -1450,7 +1457,6 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device *adev)
|
||||
adev->umc.channel_offs = UMC_V12_0_PER_CHANNEL_OFFSET;
|
||||
adev->umc.active_mask = adev->aid_mask;
|
||||
adev->umc.retire_unit = UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL;
|
||||
adev->umc.channel_idx_tbl = &umc_v12_0_channel_idx_tbl[0][0][0];
|
||||
if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu)
|
||||
adev->umc.ras = &umc_v12_0_ras;
|
||||
break;
|
||||
|
||||
@@ -425,6 +425,8 @@ static const struct amd_ip_funcs iceland_ih_ip_funcs = {
|
||||
.soft_reset = iceland_ih_soft_reset,
|
||||
.set_clockgating_state = iceland_ih_set_clockgating_state,
|
||||
.set_powergating_state = iceland_ih_set_powergating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ih_funcs iceland_ih_funcs = {
|
||||
|
||||
@@ -346,6 +346,21 @@ static int ih_v6_0_irq_init(struct amdgpu_device *adev)
|
||||
DELAY, 3);
|
||||
WREG32_SOC15(OSSSYS, 0, regIH_MSI_STORM_CTRL, tmp);
|
||||
|
||||
/* Redirect the interrupts to IH RB1 for dGPU */
|
||||
if (adev->irq.ih1.ring_size) {
|
||||
tmp = RREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_INDEX);
|
||||
tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_INDEX, INDEX, 0);
|
||||
WREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_INDEX, tmp);
|
||||
|
||||
tmp = RREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_DATA);
|
||||
tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_DATA, CLIENT_ID, 0xa);
|
||||
tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_DATA, SOURCE_ID, 0x0);
|
||||
tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_DATA,
|
||||
SOURCE_ID_MATCH_ENABLE, 0x1);
|
||||
|
||||
WREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_DATA, tmp);
|
||||
}
|
||||
|
||||
pci_set_master(adev->pdev);
|
||||
|
||||
/* enable interrupts */
|
||||
@@ -549,8 +564,15 @@ static int ih_v6_0_sw_init(void *handle)
|
||||
adev->irq.ih.use_doorbell = true;
|
||||
adev->irq.ih.doorbell_index = adev->doorbell_index.ih << 1;
|
||||
|
||||
adev->irq.ih1.ring_size = 0;
|
||||
adev->irq.ih2.ring_size = 0;
|
||||
if (!(adev->flags & AMD_IS_APU)) {
|
||||
r = amdgpu_ih_ring_init(adev, &adev->irq.ih1, IH_RING_SIZE,
|
||||
use_bus_addr);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
adev->irq.ih1.use_doorbell = true;
|
||||
adev->irq.ih1.doorbell_index = (adev->doorbell_index.ih + 1) << 1;
|
||||
}
|
||||
|
||||
/* initialize ih control register offset */
|
||||
ih_v6_0_init_register_offset(adev);
|
||||
@@ -748,6 +770,8 @@ static const struct amd_ip_funcs ih_v6_0_ip_funcs = {
|
||||
.set_clockgating_state = ih_v6_0_set_clockgating_state,
|
||||
.set_powergating_state = ih_v6_0_set_powergating_state,
|
||||
.get_clockgating_state = ih_v6_0_get_clockgating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ih_funcs ih_v6_0_funcs = {
|
||||
|
||||
@@ -346,6 +346,21 @@ static int ih_v6_1_irq_init(struct amdgpu_device *adev)
|
||||
DELAY, 3);
|
||||
WREG32_SOC15(OSSSYS, 0, regIH_MSI_STORM_CTRL, tmp);
|
||||
|
||||
/* Redirect the interrupts to IH RB1 for dGPU */
|
||||
if (adev->irq.ih1.ring_size) {
|
||||
tmp = RREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_INDEX);
|
||||
tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_INDEX, INDEX, 0);
|
||||
WREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_INDEX, tmp);
|
||||
|
||||
tmp = RREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_DATA);
|
||||
tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_DATA, CLIENT_ID, 0xa);
|
||||
tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_DATA, SOURCE_ID, 0x0);
|
||||
tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_DATA,
|
||||
SOURCE_ID_MATCH_ENABLE, 0x1);
|
||||
|
||||
WREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_DATA, tmp);
|
||||
}
|
||||
|
||||
pci_set_master(adev->pdev);
|
||||
|
||||
/* enable interrupts */
|
||||
@@ -550,8 +565,15 @@ static int ih_v6_1_sw_init(void *handle)
|
||||
adev->irq.ih.use_doorbell = true;
|
||||
adev->irq.ih.doorbell_index = adev->doorbell_index.ih << 1;
|
||||
|
||||
adev->irq.ih1.ring_size = 0;
|
||||
adev->irq.ih2.ring_size = 0;
|
||||
if (!(adev->flags & AMD_IS_APU)) {
|
||||
r = amdgpu_ih_ring_init(adev, &adev->irq.ih1, IH_RING_SIZE,
|
||||
use_bus_addr);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
adev->irq.ih1.use_doorbell = true;
|
||||
adev->irq.ih1.doorbell_index = (adev->doorbell_index.ih + 1) << 1;
|
||||
}
|
||||
|
||||
/* initialize ih control register offset */
|
||||
ih_v6_1_init_register_offset(adev);
|
||||
@@ -753,6 +775,8 @@ static const struct amd_ip_funcs ih_v6_1_ip_funcs = {
|
||||
.set_clockgating_state = ih_v6_1_set_clockgating_state,
|
||||
.set_powergating_state = ih_v6_1_set_powergating_state,
|
||||
.get_clockgating_state = ih_v6_1_get_clockgating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ih_funcs ih_v6_1_funcs = {
|
||||
|
||||
@@ -749,6 +749,8 @@ static const struct amd_ip_funcs ih_v7_0_ip_funcs = {
|
||||
.set_clockgating_state = ih_v7_0_set_clockgating_state,
|
||||
.set_powergating_state = ih_v7_0_set_powergating_state,
|
||||
.get_clockgating_state = ih_v7_0_get_clockgating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ih_funcs ih_v7_0_funcs = {
|
||||
|
||||
@@ -759,6 +759,8 @@ static const struct amd_ip_funcs jpeg_v2_0_ip_funcs = {
|
||||
.post_soft_reset = NULL,
|
||||
.set_clockgating_state = jpeg_v2_0_set_clockgating_state,
|
||||
.set_powergating_state = jpeg_v2_0_set_powergating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ring_funcs jpeg_v2_0_dec_ring_vm_funcs = {
|
||||
|
||||
@@ -632,6 +632,8 @@ static const struct amd_ip_funcs jpeg_v2_5_ip_funcs = {
|
||||
.post_soft_reset = NULL,
|
||||
.set_clockgating_state = jpeg_v2_5_set_clockgating_state,
|
||||
.set_powergating_state = jpeg_v2_5_set_powergating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amd_ip_funcs jpeg_v2_6_ip_funcs = {
|
||||
@@ -652,6 +654,8 @@ static const struct amd_ip_funcs jpeg_v2_6_ip_funcs = {
|
||||
.post_soft_reset = NULL,
|
||||
.set_clockgating_state = jpeg_v2_5_set_clockgating_state,
|
||||
.set_powergating_state = jpeg_v2_5_set_powergating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ring_funcs jpeg_v2_5_dec_ring_vm_funcs = {
|
||||
|
||||
@@ -557,6 +557,8 @@ static const struct amd_ip_funcs jpeg_v3_0_ip_funcs = {
|
||||
.post_soft_reset = NULL,
|
||||
.set_clockgating_state = jpeg_v3_0_set_clockgating_state,
|
||||
.set_powergating_state = jpeg_v3_0_set_powergating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ring_funcs jpeg_v3_0_dec_ring_vm_funcs = {
|
||||
|
||||
@@ -719,6 +719,8 @@ static const struct amd_ip_funcs jpeg_v4_0_ip_funcs = {
|
||||
.post_soft_reset = NULL,
|
||||
.set_clockgating_state = jpeg_v4_0_set_clockgating_state,
|
||||
.set_powergating_state = jpeg_v4_0_set_powergating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ring_funcs jpeg_v4_0_dec_ring_vm_funcs = {
|
||||
|
||||
@@ -1053,6 +1053,8 @@ static const struct amd_ip_funcs jpeg_v4_0_3_ip_funcs = {
|
||||
.post_soft_reset = NULL,
|
||||
.set_clockgating_state = jpeg_v4_0_3_set_clockgating_state,
|
||||
.set_powergating_state = jpeg_v4_0_3_set_powergating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ring_funcs jpeg_v4_0_3_dec_ring_vm_funcs = {
|
||||
|
||||
@@ -762,6 +762,8 @@ static const struct amd_ip_funcs jpeg_v4_0_5_ip_funcs = {
|
||||
.post_soft_reset = NULL,
|
||||
.set_clockgating_state = jpeg_v4_0_5_set_clockgating_state,
|
||||
.set_powergating_state = jpeg_v4_0_5_set_powergating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ring_funcs jpeg_v4_0_5_dec_ring_vm_funcs = {
|
||||
|
||||
@@ -513,6 +513,8 @@ static const struct amd_ip_funcs jpeg_v5_0_0_ip_funcs = {
|
||||
.post_soft_reset = NULL,
|
||||
.set_clockgating_state = jpeg_v5_0_0_set_clockgating_state,
|
||||
.set_powergating_state = jpeg_v5_0_0_set_powergating_state,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ring_funcs jpeg_v5_0_0_dec_ring_vm_funcs = {
|
||||
|
||||
@@ -1176,6 +1176,8 @@ static const struct amd_ip_funcs mes_v10_1_ip_funcs = {
|
||||
.hw_fini = mes_v10_1_hw_fini,
|
||||
.suspend = mes_v10_1_suspend,
|
||||
.resume = mes_v10_1_resume,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
const struct amdgpu_ip_block_version mes_v10_1_ip_block = {
|
||||
|
||||
@@ -100,18 +100,76 @@ static const struct amdgpu_ring_funcs mes_v11_0_ring_funcs = {
|
||||
.insert_nop = amdgpu_ring_insert_nop,
|
||||
};
|
||||
|
||||
static const char *mes_v11_0_opcodes[] = {
|
||||
"SET_HW_RSRC",
|
||||
"SET_SCHEDULING_CONFIG",
|
||||
"ADD_QUEUE",
|
||||
"REMOVE_QUEUE",
|
||||
"PERFORM_YIELD",
|
||||
"SET_GANG_PRIORITY_LEVEL",
|
||||
"SUSPEND",
|
||||
"RESUME",
|
||||
"RESET",
|
||||
"SET_LOG_BUFFER",
|
||||
"CHANGE_GANG_PRORITY",
|
||||
"QUERY_SCHEDULER_STATUS",
|
||||
"PROGRAM_GDS",
|
||||
"SET_DEBUG_VMID",
|
||||
"MISC",
|
||||
"UPDATE_ROOT_PAGE_TABLE",
|
||||
"AMD_LOG",
|
||||
};
|
||||
|
||||
static const char *mes_v11_0_misc_opcodes[] = {
|
||||
"WRITE_REG",
|
||||
"INV_GART",
|
||||
"QUERY_STATUS",
|
||||
"READ_REG",
|
||||
"WAIT_REG_MEM",
|
||||
"SET_SHADER_DEBUGGER",
|
||||
};
|
||||
|
||||
static const char *mes_v11_0_get_op_string(union MESAPI__MISC *x_pkt)
|
||||
{
|
||||
const char *op_str = NULL;
|
||||
|
||||
if (x_pkt->header.opcode < ARRAY_SIZE(mes_v11_0_opcodes))
|
||||
op_str = mes_v11_0_opcodes[x_pkt->header.opcode];
|
||||
|
||||
return op_str;
|
||||
}
|
||||
|
||||
static const char *mes_v11_0_get_misc_op_string(union MESAPI__MISC *x_pkt)
|
||||
{
|
||||
const char *op_str = NULL;
|
||||
|
||||
if ((x_pkt->header.opcode == MES_SCH_API_MISC) &&
|
||||
(x_pkt->opcode < ARRAY_SIZE(mes_v11_0_misc_opcodes)))
|
||||
op_str = mes_v11_0_misc_opcodes[x_pkt->opcode];
|
||||
|
||||
return op_str;
|
||||
}
|
||||
|
||||
static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
|
||||
void *pkt, int size,
|
||||
int api_status_off)
|
||||
{
|
||||
int ndw = size / 4;
|
||||
signed long r;
|
||||
union MESAPI__ADD_QUEUE *x_pkt = pkt;
|
||||
union MESAPI__MISC *x_pkt = pkt;
|
||||
struct MES_API_STATUS *api_status;
|
||||
struct amdgpu_device *adev = mes->adev;
|
||||
struct amdgpu_ring *ring = &mes->ring;
|
||||
unsigned long flags;
|
||||
signed long timeout = adev->usec_timeout;
|
||||
signed long timeout = 3000000; /* 3000 ms */
|
||||
const char *op_str, *misc_op_str;
|
||||
u32 fence_offset;
|
||||
u64 fence_gpu_addr;
|
||||
u64 *fence_ptr;
|
||||
int ret;
|
||||
|
||||
if (x_pkt->header.opcode >= MES_SCH_API_MAX)
|
||||
return -EINVAL;
|
||||
|
||||
if (amdgpu_emu_mode) {
|
||||
timeout *= 100;
|
||||
@@ -121,27 +179,52 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
|
||||
}
|
||||
BUG_ON(size % 4 != 0);
|
||||
|
||||
ret = amdgpu_device_wb_get(adev, &fence_offset);
|
||||
if (ret)
|
||||
return ret;
|
||||
fence_gpu_addr =
|
||||
adev->wb.gpu_addr + (fence_offset * 4);
|
||||
fence_ptr = (u64 *)&adev->wb.wb[fence_offset];
|
||||
*fence_ptr = 0;
|
||||
|
||||
spin_lock_irqsave(&mes->ring_lock, flags);
|
||||
if (amdgpu_ring_alloc(ring, ndw)) {
|
||||
spin_unlock_irqrestore(&mes->ring_lock, flags);
|
||||
amdgpu_device_wb_free(adev, fence_offset);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
|
||||
api_status->api_completion_fence_addr = mes->ring.fence_drv.gpu_addr;
|
||||
api_status->api_completion_fence_value = ++mes->ring.fence_drv.sync_seq;
|
||||
api_status->api_completion_fence_addr = fence_gpu_addr;
|
||||
api_status->api_completion_fence_value = 1;
|
||||
|
||||
amdgpu_ring_write_multiple(ring, pkt, ndw);
|
||||
amdgpu_ring_commit(ring);
|
||||
spin_unlock_irqrestore(&mes->ring_lock, flags);
|
||||
|
||||
DRM_DEBUG("MES msg=%d was emitted\n", x_pkt->header.opcode);
|
||||
op_str = mes_v11_0_get_op_string(x_pkt);
|
||||
misc_op_str = mes_v11_0_get_misc_op_string(x_pkt);
|
||||
|
||||
r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
|
||||
timeout);
|
||||
if (misc_op_str)
|
||||
dev_dbg(adev->dev, "MES msg=%s (%s) was emitted\n", op_str, misc_op_str);
|
||||
else if (op_str)
|
||||
dev_dbg(adev->dev, "MES msg=%s was emitted\n", op_str);
|
||||
else
|
||||
dev_dbg(adev->dev, "MES msg=%d was emitted\n", x_pkt->header.opcode);
|
||||
|
||||
r = amdgpu_mes_fence_wait_polling(fence_ptr, (u64)1, timeout);
|
||||
amdgpu_device_wb_free(adev, fence_offset);
|
||||
if (r < 1) {
|
||||
DRM_ERROR("MES failed to response msg=%d\n",
|
||||
x_pkt->header.opcode);
|
||||
|
||||
if (misc_op_str)
|
||||
dev_err(adev->dev, "MES failed to respond to msg=%s (%s)\n",
|
||||
op_str, misc_op_str);
|
||||
else if (op_str)
|
||||
dev_err(adev->dev, "MES failed to respond to msg=%s\n",
|
||||
op_str);
|
||||
else
|
||||
dev_err(adev->dev, "MES failed to respond to msg=%d\n",
|
||||
x_pkt->header.opcode);
|
||||
|
||||
while (halt_if_hws_hang)
|
||||
schedule();
|
||||
@@ -422,6 +505,36 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes)
|
||||
offsetof(union MESAPI_SET_HW_RESOURCES, api_status));
|
||||
}
|
||||
|
||||
static int mes_v11_0_set_hw_resources_1(struct amdgpu_mes *mes)
|
||||
{
|
||||
int size = 128 * PAGE_SIZE;
|
||||
int ret = 0;
|
||||
struct amdgpu_device *adev = mes->adev;
|
||||
union MESAPI_SET_HW_RESOURCES_1 mes_set_hw_res_pkt;
|
||||
memset(&mes_set_hw_res_pkt, 0, sizeof(mes_set_hw_res_pkt));
|
||||
|
||||
mes_set_hw_res_pkt.header.type = MES_API_TYPE_SCHEDULER;
|
||||
mes_set_hw_res_pkt.header.opcode = MES_SCH_API_SET_HW_RSRC_1;
|
||||
mes_set_hw_res_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
|
||||
mes_set_hw_res_pkt.enable_mes_info_ctx = 1;
|
||||
|
||||
ret = amdgpu_bo_create_kernel(adev, size, PAGE_SIZE,
|
||||
AMDGPU_GEM_DOMAIN_VRAM,
|
||||
&mes->resource_1,
|
||||
&mes->resource_1_gpu_addr,
|
||||
&mes->resource_1_addr);
|
||||
if (ret) {
|
||||
dev_err(adev->dev, "(%d) failed to create mes resource_1 bo\n", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
mes_set_hw_res_pkt.mes_info_ctx_mc_addr = mes->resource_1_gpu_addr;
|
||||
mes_set_hw_res_pkt.mes_info_ctx_size = mes->resource_1->tbo.base.size;
|
||||
return mes_v11_0_submit_pkt_and_poll_completion(mes,
|
||||
&mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt),
|
||||
offsetof(union MESAPI_SET_HW_RESOURCES_1, api_status));
|
||||
}
|
||||
|
||||
static const struct amdgpu_mes_funcs mes_v11_0_funcs = {
|
||||
.add_hw_queue = mes_v11_0_add_hw_queue,
|
||||
.remove_hw_queue = mes_v11_0_remove_hw_queue,
|
||||
@@ -1203,6 +1316,14 @@ static int mes_v11_0_hw_init(void *handle)
|
||||
if (r)
|
||||
goto failure;
|
||||
|
||||
if (amdgpu_sriov_is_mes_info_enable(adev)) {
|
||||
r = mes_v11_0_set_hw_resources_1(&adev->mes);
|
||||
if (r) {
|
||||
DRM_ERROR("failed mes_v11_0_set_hw_resources_1, r=%d\n", r);
|
||||
goto failure;
|
||||
}
|
||||
}
|
||||
|
||||
r = mes_v11_0_query_sched_status(&adev->mes);
|
||||
if (r) {
|
||||
DRM_ERROR("MES is busy\n");
|
||||
@@ -1226,6 +1347,11 @@ failure:
|
||||
|
||||
static int mes_v11_0_hw_fini(void *handle)
|
||||
{
|
||||
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
|
||||
if (amdgpu_sriov_is_mes_info_enable(adev)) {
|
||||
amdgpu_bo_free_kernel(&adev->mes.resource_1, &adev->mes.resource_1_gpu_addr,
|
||||
&adev->mes.resource_1_addr);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1291,6 +1417,8 @@ static const struct amd_ip_funcs mes_v11_0_ip_funcs = {
|
||||
.hw_fini = mes_v11_0_hw_fini,
|
||||
.suspend = mes_v11_0_suspend,
|
||||
.resume = mes_v11_0_resume,
|
||||
.dump_ip_state = NULL,
|
||||
.print_ip_state = NULL,
|
||||
};
|
||||
|
||||
const struct amdgpu_ip_block_version mes_v11_0_ip_block = {
|
||||
|
||||
@@ -559,6 +559,20 @@ static void mmhub_v1_8_get_clockgating(struct amdgpu_device *adev, u64 *flags)
|
||||
|
||||
}
|
||||
|
||||
static bool mmhub_v1_8_query_utcl2_poison_status(struct amdgpu_device *adev,
|
||||
int hub_inst)
|
||||
{
|
||||
u32 fed, status;
|
||||
|
||||
status = RREG32_SOC15(MMHUB, hub_inst, regVM_L2_PROTECTION_FAULT_STATUS);
|
||||
fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
|
||||
/* reset page fault status */
|
||||
WREG32_P(SOC15_REG_OFFSET(MMHUB, hub_inst,
|
||||
regVM_L2_PROTECTION_FAULT_STATUS), 1, ~1);
|
||||
|
||||
return fed;
|
||||
}
|
||||
|
||||
const struct amdgpu_mmhub_funcs mmhub_v1_8_funcs = {
|
||||
.get_fb_location = mmhub_v1_8_get_fb_location,
|
||||
.init = mmhub_v1_8_init,
|
||||
@@ -568,6 +582,7 @@ const struct amdgpu_mmhub_funcs mmhub_v1_8_funcs = {
|
||||
.setup_vm_pt_regs = mmhub_v1_8_setup_vm_pt_regs,
|
||||
.set_clockgating = mmhub_v1_8_set_clockgating,
|
||||
.get_clockgating = mmhub_v1_8_get_clockgating,
|
||||
.query_utcl2_poison_status = mmhub_v1_8_query_utcl2_poison_status,
|
||||
};
|
||||
|
||||
static const struct amdgpu_ras_err_status_reg_entry mmhub_v1_8_ce_reg_list[] = {
|
||||
@@ -706,28 +721,32 @@ static const struct amdgpu_ras_block_hw_ops mmhub_v1_8_ras_hw_ops = {
|
||||
.reset_ras_error_count = mmhub_v1_8_reset_ras_error_count,
|
||||
};
|
||||
|
||||
static int mmhub_v1_8_aca_bank_generate_report(struct aca_handle *handle,
|
||||
struct aca_bank *bank, enum aca_error_type type,
|
||||
struct aca_bank_report *report, void *data)
|
||||
static int mmhub_v1_8_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank,
|
||||
enum aca_smu_type type, void *data)
|
||||
{
|
||||
u64 status, misc0;
|
||||
struct aca_bank_info info;
|
||||
u64 misc0;
|
||||
int ret;
|
||||
|
||||
status = bank->regs[ACA_REG_IDX_STATUS];
|
||||
if ((type == ACA_ERROR_TYPE_UE &&
|
||||
ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_FAULT) ||
|
||||
(type == ACA_ERROR_TYPE_CE &&
|
||||
ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_CE)) {
|
||||
ret = aca_bank_info_decode(bank, &info);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = aca_bank_info_decode(bank, &report->info);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
misc0 = bank->regs[ACA_REG_IDX_MISC0];
|
||||
report->count[type] = ACA_REG__MISC0__ERRCNT(misc0);
|
||||
misc0 = bank->regs[ACA_REG_IDX_MISC0];
|
||||
switch (type) {
|
||||
case ACA_SMU_TYPE_UE:
|
||||
ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE,
|
||||
1ULL);
|
||||
break;
|
||||
case ACA_SMU_TYPE_CE:
|
||||
ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_CE,
|
||||
ACA_REG__MISC0__ERRCNT(misc0));
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* reference to smu driver if header file */
|
||||
@@ -741,7 +760,7 @@ static int mmhub_v1_8_err_codes[] = {
|
||||
};
|
||||
|
||||
static bool mmhub_v1_8_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
|
||||
enum aca_error_type type, void *data)
|
||||
enum aca_smu_type type, void *data)
|
||||
{
|
||||
u32 instlo;
|
||||
|
||||
@@ -760,7 +779,7 @@ static bool mmhub_v1_8_aca_bank_is_valid(struct aca_handle *handle, struct aca_b
|
||||
}
|
||||
|
||||
static const struct aca_bank_ops mmhub_v1_8_aca_bank_ops = {
|
||||
.aca_bank_generate_report = mmhub_v1_8_aca_bank_generate_report,
|
||||
.aca_bank_parser = mmhub_v1_8_aca_bank_parser,
|
||||
.aca_bank_is_valid = mmhub_v1_8_aca_bank_is_valid,
|
||||
};
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user