Pull drm updates from Dave Airlie:
 "This is the main pull request for the drm subsystems for 6.10.

  In drivers the main thing is a new driver for ARM Mali firmware based
  GPUs, otherwise there are a lot of changes to amdgpu/xe/i915/msm and
  scattered changes to everything else.

  In the core a bunch of headers and Kconfig was refactored, along with
  the addition of a new panic handler which is meant to provide a user
  friendly message when a panic happens and graphical display is
  enabled.

  New drivers:
   - panthor: ARM Mali/Immortalis CSF-based GPU driver

  Core:
   - add a CONFIG_DRM_WERROR option
   - make more headers self-contained
   - grab resv lock in pin/unpin
   - fix vmap resv locking
   - EDID/eDP panel matching
   - Kconfig cleanups
   - DT sound bindings
   - Add SIZE_HINTS property for cursor planes
   - Add struct drm_edid_product_id and helpers.
   - Use drm device based logging in more drm functions.
   - drop seq_file.h from a bunch of places
   - use drm_edid driver conversions

  dp:
   - DP Tunnel documentation
   - MST read sideband cap
   - Adaptive sync SDP prep work

  ttm:
   - improve placement for TTM BOs in idle/busy handling

  panic:
   - Fixes for drm-panic, and option to test it.
   - Add drm panic to simpledrm, mgag200, imx, ast

  bridge:
   - improve init ordering
   - adv7511: allow GPIO pin sharing
   - tc358775: add tc358675 support

  panel:
   - AUO B120XAN01.0
   - Samsung s6e3fa7
   - BOE NT116WHM-N44
   - CMN N116BCA-EA1,
   - CrystalClear CMT430B19N00
   - Startek KD050HDFIA020-C020A
   - powertip PH128800T006-ZHC01
   - Innolux G121X1-L03
   - LG sw43408
   - Khadas TS050 V2
   - EDO RM69380 OLED
   - CSOT MNB601LS1-1

  amdgpu:
   - HDCP/ODM/RAS fixes
   - Devcoredump improvements
   - Expose VCN activity via sysfs
   - SMY 13.0.x updates
   - Enable fast updates on DCN 3.1.4
   - Add dclk and vclk reporting on additional devices
   - Add ACA RAS infrastructure
   - Implement TLB flush fence
   - EEPROM handling fixes
   - SMUIO 14.0.2 support
   - SMU 14.0.1 Updates
   - SMU 14.0.2 support
   - Sync page table freeing with TLB flushes
   - DML2 refactor
   - DC debug improvements
   - DCN 3.5.x Updates
   - GPU reset fixes
   - HDP fix for second GFX pipe on GC 10.x
   - Enable secondary GFX pipe on GC 10.3
   - Refactor and clean up BACO/BOCO/BAMACO handling
   - Remove invalid TTM resource start check
   - UAF fix in VA IOCTL
   - GPUVM page fault redirection to secondary IH rings for IH 6.x
   - Initial support for mapping kernel queues via MES
   - Fix VRAM memory accounting

  amdkfd:
   - MQD handling cleanup
   - Preemption handling fixes for XCDs
   - TLB flush fix for GC 9.4.2
   - Properly clean up workqueue during module unload
   - Fix memory leak process create failure
   - Range check CP bad op exception targets to avoid reporting invalid exceptions to userspace
   - Fix eviction fence handling
   - Fix leak in GPU memory allocation failure case
   - DMABuf import handling fix
   - Enable SQ watchpoint for gfx10

  i915:
   - Adding new DG2 PCI ID
   - add context hints for GT frequency
   - enable only one CCS for compute workloads
   - new workarounds
   - Fix UAF on destroy against retire race and remove two earlier partial fixes
   - Limit the reserved VM space to only the platforms that need it
   - Fix gt reset with GuC submission is disable
   - Add and use gt_to_guc() wrapper

  i915/xe display:
   - Lunar Lake display enabling, including cdclk and other refactors
   - BIOS/VBT/opregion related refactor
   - Digital port related refactor/clean-up
   - Fix 2s boot time regression on DP panel replay init
   - Remove duplication on audio enable/disable on SDVO and g4x+ DP
   - Disable AuxCCS framebuffers if built for Xe
   - Make crtc disable more atomic
   - Increase DP idle pattern wait timeout to 2ms
   - Start using container_of_const() for some extra const safety
   - Fix Jasper Lake boot freeze
   - Enable MST mode for 128b/132b single-stream sideband
   - Enable Adaptive Sync SDP Support for DP
   - Fix MTL supported DP rates - removal of UHBR13.5
   - PLL refactoring
   - Limit eDP MSO pipe only for display version 20
   - More display refactor towards independence from i915 dev_priv
   - Convert i915/xe fbdev to DRM client
   - More initial work to make display code more independent from i915

  xe:
   - improved error capture
   - clean up some uAPI leftovers
   - devcoredump update
   - Add BMG mocs table
   - Handle GSCCS ER interrupt
   - Implement xe2- and GuC workarounds
   - struct xe_device cleanup
   - Hwmon updates
   - Add LRC parsing for more GPU instruction
   - Increase VM_BIND number of per-ioctl Ops
   - drm/xe: Add XE_BO_GGTT_INVALIDATE flag
   - Initial development for SR-IOV support
   - Add new PCI IDs to DG2 platform
   - Move userptr over to start using hmm_range_fault

  msm:
   - Switched to generating register header files during build process
     instead of shipping pre-generated headers
   - Merged DPU and MDP4 format databases.
   - DP:
     - Stop using compat string to distinguish DP and eDP cases
     - Added support for X Elite platform (X1E80100)
     - Reworked DP aux/audio support
     - Added SM6350 DP to the bindings
   - GPU:
     - a7xx perfcntr reg fixes
     - MAINTAINERS updates
     - a750 devcoredump support

  radeon:
   - Silence UBSAN warnings related to flexible arrays

  nouveau:
   - move some uAPI objects to uapi headers

  omapdrm:
   - console fix

  ast:
   - add i2c polling

  qaic:
   - add debugfs entries

  exynos:
   - fix platform_driver .owner
   - drop cleanup code

  mediatek:
   - Use devm_platform_get_and_ioremap_resource() in mtk_hdmi_ddc_probe()
   - Add GAMMA 12-bit LUT support for MT8188
   - Rename mtk_drm_* to mtk_*
   - Drop driver owner initialization
   - Correct calculation formula of PHY Timing"

* tag 'drm-next-2024-05-15' of https://gitlab.freedesktop.org/drm/kernel: (1477 commits)
  drm/xe/ads: Use flexible-array
  drm/xe: Use ordered WQ for G2H handler
  drm/msm/gen_header: allow skipping the validation
  drm/msm/a6xx: Cleanup indexed regs const'ness
  drm/msm: Add devcoredump support for a750
  drm/msm: Adjust a7xx GBIF debugbus dumping
  drm/msm: Update a6xx registers XML
  drm/msm: Fix imported a750 snapshot header for upstream
  drm/msm: Import a750 snapshot registers from kgsl
  MAINTAINERS: Add Konrad Dybcio as a reviewer for the Adreno driver
  MAINTAINERS: Add a separate entry for Qualcomm Adreno GPU drivers
  drm/msm/a6xx: Avoid a nullptr dereference when speedbin setting fails
  drm/msm/adreno: fix CP cycles stat retrieval on a7xx
  drm/msm/a7xx: allow writing to CP_BV counter selection registers
  drm: zynqmp_dpsub: Always register bridge
  Revert "drm/bridge: ti-sn65dsi83: Fix enable error path"
  drm/fb_dma: Add checks in drm_fb_dma_get_scanout_buffer()
  drm/fbdev-generic: Do not set physical framebuffer address
  drm/panthor: Fix the FW reset logic
  drm/panthor: Make sure we handle 'unknown group state' case properly
  ...
This commit is contained in:
Linus Torvalds
2024-05-15 09:43:42 -07:00
1373 changed files with 77527 additions and 55665 deletions

View File

@@ -70,7 +70,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \
amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \
atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \
atombios_encoders.o amdgpu_sa.o atombios_i2c.o \
amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o amdgpu_pll.o \
amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_vm_tlb_fence.o \
amdgpu_ib.o amdgpu_pll.o \
amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \
amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
@@ -80,7 +81,7 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \
amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
amdgpu_fw_attestation.o amdgpu_securedisplay.o \
amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \
amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o
amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o amdgpu_dev_coredump.o
amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o
@@ -247,7 +248,8 @@ amdgpu-y += \
smuio_v11_0_6.o \
smuio_v13_0.o \
smuio_v13_0_3.o \
smuio_v13_0_6.o
smuio_v13_0_6.o \
smuio_v14_0_2.o
# add reset block
amdgpu-y += \

View File

@@ -97,7 +97,7 @@ static int aldebaran_mode2_suspend_ip(struct amdgpu_device *adev)
adev->ip_blocks[i].status.hw = false;
}
return r;
return 0;
}
static int

View File

@@ -139,6 +139,14 @@ enum amdgpu_ss {
AMDGPU_SS_DRV_UNLOAD
};
struct amdgpu_hwip_reg_entry {
u32 hwip;
u32 inst;
u32 seg;
u32 reg_offset;
const char *reg_name;
};
struct amdgpu_watchdog_timer {
bool timeout_fatal_disable;
uint32_t period; /* maxCycles = (1 << period), the number of cycles before a timeout */
@@ -494,6 +502,7 @@ struct amdgpu_wb {
uint64_t gpu_addr;
u32 num_wb; /* Number of wb slots actually reserved for amdgpu. */
unsigned long used[DIV_ROUND_UP(AMDGPU_MAX_WB, BITS_PER_LONG)];
spinlock_t lock;
};
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb);
@@ -606,7 +615,7 @@ struct amdgpu_asic_funcs {
/* PCIe replay counter */
uint64_t (*get_pcie_replay_count)(struct amdgpu_device *adev);
/* device supports BACO */
bool (*supports_baco)(struct amdgpu_device *adev);
int (*supports_baco)(struct amdgpu_device *adev);
/* pre asic_init quirks */
void (*pre_asic_init)(struct amdgpu_device *adev);
/* enter/exit umd stable pstate */
@@ -1408,7 +1417,8 @@ bool amdgpu_device_supports_atpx(struct drm_device *dev);
bool amdgpu_device_supports_px(struct drm_device *dev);
bool amdgpu_device_supports_boco(struct drm_device *dev);
bool amdgpu_device_supports_smart_shift(struct drm_device *dev);
bool amdgpu_device_supports_baco(struct drm_device *dev);
int amdgpu_device_supports_baco(struct drm_device *dev);
void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev);
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
struct amdgpu_device *peer_adev);
int amdgpu_device_baco_enter(struct drm_device *dev);

View File

@@ -28,7 +28,7 @@
#define ACA_BANK_HWID(type, hwid, mcatype) [ACA_HWIP_TYPE_##type] = {hwid, mcatype}
typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type, void *data);
typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data);
struct aca_banks {
int nr_banks;
@@ -86,7 +86,7 @@ static void aca_banks_release(struct aca_banks *banks)
}
}
static int aca_smu_get_valid_aca_count(struct amdgpu_device *adev, enum aca_error_type type, u32 *count)
static int aca_smu_get_valid_aca_count(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count)
{
struct amdgpu_aca *aca = &adev->aca;
const struct aca_smu_funcs *smu_funcs = aca->smu_funcs;
@@ -116,20 +116,22 @@ static struct aca_regs_dump {
{"CONTROL_MASK", ACA_REG_IDX_CTL_MASK},
};
static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, struct aca_bank *bank)
static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, struct aca_bank *bank,
struct ras_query_context *qctx)
{
u64 event_id = qctx ? qctx->event_id : 0ULL;
int i;
dev_info(adev->dev, HW_ERR "Accelerator Check Architecture events logged\n");
RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n");
/* plus 1 for output format, e.g: ACA[08/08]: xxxx */
for (i = 0; i < ARRAY_SIZE(aca_regs); i++)
dev_info(adev->dev, HW_ERR "ACA[%02d/%02d].%s=0x%016llx\n",
idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]);
RAS_EVENT_LOG(adev, event_id, HW_ERR "ACA[%02d/%02d].%s=0x%016llx\n",
idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]);
}
static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_error_type type,
static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_type type,
int start, int count,
struct aca_banks *banks)
struct aca_banks *banks, struct ras_query_context *qctx)
{
struct amdgpu_aca *aca = &adev->aca;
const struct aca_smu_funcs *smu_funcs = aca->smu_funcs;
@@ -143,13 +145,12 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_erro
return -EOPNOTSUPP;
switch (type) {
case ACA_ERROR_TYPE_UE:
case ACA_SMU_TYPE_UE:
max_count = smu_funcs->max_ue_bank_count;
break;
case ACA_ERROR_TYPE_CE:
case ACA_SMU_TYPE_CE:
max_count = smu_funcs->max_ce_bank_count;
break;
case ACA_ERROR_TYPE_DEFERRED:
default:
return -EINVAL;
}
@@ -164,7 +165,9 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_erro
if (ret)
return ret;
aca_smu_bank_dump(adev, i, count, &bank);
bank.type = type;
aca_smu_bank_dump(adev, i, count, &bank, qctx);
ret = aca_banks_add_bank(banks, &bank);
if (ret)
@@ -195,7 +198,7 @@ static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum aca_hwip_type t
return hwip->hwid == hwid && hwip->mcatype == mcatype;
}
static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type)
static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type)
{
const struct aca_bank_ops *bank_ops = handle->bank_ops;
@@ -273,59 +276,49 @@ static struct aca_bank_error *get_bank_error(struct aca_error *aerr, struct aca_
return new_bank_error(aerr, info);
}
static int aca_log_errors(struct aca_handle *handle, enum aca_error_type type,
struct aca_bank_report *report)
int aca_error_cache_log_bank_error(struct aca_handle *handle, struct aca_bank_info *info,
enum aca_error_type type, u64 count)
{
struct aca_error_cache *error_cache = &handle->error_cache;
struct aca_bank_error *bank_error;
struct aca_error *aerr;
if (!handle || !report)
if (!handle || !info || type >= ACA_ERROR_TYPE_COUNT)
return -EINVAL;
if (!report->count[type])
if (!count)
return 0;
aerr = &error_cache->errors[type];
bank_error = get_bank_error(aerr, &report->info);
bank_error = get_bank_error(aerr, info);
if (!bank_error)
return -ENOMEM;
bank_error->count[type] += report->count[type];
bank_error->count += count;
return 0;
}
static int aca_generate_bank_report(struct aca_handle *handle, struct aca_bank *bank,
enum aca_error_type type, struct aca_bank_report *report)
static int aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type)
{
const struct aca_bank_ops *bank_ops = handle->bank_ops;
if (!bank || !report)
if (!bank)
return -EINVAL;
if (!bank_ops->aca_bank_generate_report)
if (!bank_ops->aca_bank_parser)
return -EOPNOTSUPP;
memset(report, 0, sizeof(*report));
return bank_ops->aca_bank_generate_report(handle, bank, type,
report, handle->data);
return bank_ops->aca_bank_parser(handle, bank, type,
handle->data);
}
static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank *bank,
enum aca_error_type type, void *data)
enum aca_smu_type type, void *data)
{
struct aca_bank_report report;
int ret;
ret = aca_generate_bank_report(handle, bank, type, &report);
if (ret)
return ret;
if (!report.count[type])
return 0;
ret = aca_log_errors(handle, type, &report);
ret = aca_bank_parser(handle, bank, type);
if (ret)
return ret;
@@ -333,7 +326,7 @@ static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank
}
static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *bank,
enum aca_error_type type, bank_handler_t handler, void *data)
enum aca_smu_type type, bank_handler_t handler, void *data)
{
struct aca_handle *handle;
int ret;
@@ -354,7 +347,7 @@ static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *ba
}
static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks *banks,
enum aca_error_type type, bank_handler_t handler, void *data)
enum aca_smu_type type, bank_handler_t handler, void *data)
{
struct aca_bank_node *node;
struct aca_bank *bank;
@@ -378,8 +371,28 @@ static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks *
return 0;
}
static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type,
bank_handler_t handler, void *data)
static bool aca_bank_should_update(struct amdgpu_device *adev, enum aca_smu_type type)
{
struct amdgpu_aca *aca = &adev->aca;
bool ret = true;
/*
* Because the UE Valid MCA count will only be cleared after reset,
* in order to avoid repeated counting of the error count,
* the aca bank is only updated once during the gpu recovery stage.
*/
if (type == ACA_SMU_TYPE_UE) {
if (amdgpu_ras_intr_triggered())
ret = atomic_cmpxchg(&aca->ue_update_flag, 0, 1) == 0;
else
atomic_set(&aca->ue_update_flag, 0);
}
return ret;
}
static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type,
bank_handler_t handler, struct ras_query_context *qctx, void *data)
{
struct amdgpu_aca *aca = &adev->aca;
struct aca_banks banks;
@@ -389,9 +402,8 @@ static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type
if (list_empty(&aca->mgr.list))
return 0;
/* NOTE: pmfw is only support UE and CE */
if (type == ACA_ERROR_TYPE_DEFERRED)
type = ACA_ERROR_TYPE_CE;
if (!aca_bank_should_update(adev, type))
return 0;
ret = aca_smu_get_valid_aca_count(adev, type, &count);
if (ret)
@@ -402,7 +414,7 @@ static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type
aca_banks_init(&banks);
ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks);
ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks, qctx);
if (ret)
goto err_release_banks;
@@ -431,7 +443,7 @@ static int aca_log_aca_error_data(struct aca_bank_error *bank_error, enum aca_er
if (type >= ACA_ERROR_TYPE_COUNT)
return -EINVAL;
count = bank_error->count[type];
count = bank_error->count;
if (!count)
return 0;
@@ -447,6 +459,8 @@ static int aca_log_aca_error_data(struct aca_bank_error *bank_error, enum aca_er
amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, count);
break;
case ACA_ERROR_TYPE_DEFERRED:
amdgpu_ras_error_statistic_de_count(err_data, &mcm_info, NULL, count);
break;
default:
break;
}
@@ -477,12 +491,25 @@ out_unlock:
}
static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, enum aca_error_type type,
struct ras_err_data *err_data)
struct ras_err_data *err_data, struct ras_query_context *qctx)
{
enum aca_smu_type smu_type;
int ret;
switch (type) {
case ACA_ERROR_TYPE_UE:
smu_type = ACA_SMU_TYPE_UE;
break;
case ACA_ERROR_TYPE_CE:
case ACA_ERROR_TYPE_DEFERRED:
smu_type = ACA_SMU_TYPE_CE;
break;
default:
return -EINVAL;
}
/* udpate aca bank to aca source error_cache first */
ret = aca_banks_update(adev, type, handler_aca_log_bank_error, NULL);
ret = aca_banks_update(adev, smu_type, handler_aca_log_bank_error, qctx, NULL);
if (ret)
return ret;
@@ -498,10 +525,9 @@ static bool aca_handle_is_valid(struct aca_handle *handle)
}
int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle,
enum aca_error_type type, void *data)
enum aca_error_type type, struct ras_err_data *err_data,
struct ras_query_context *qctx)
{
struct ras_err_data *err_data = (struct ras_err_data *)data;
if (!handle || !err_data)
return -EINVAL;
@@ -511,7 +537,7 @@ int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *han
if (!(BIT(type) & handle->mask))
return 0;
return __aca_get_error_data(adev, handle, type, err_data);
return __aca_get_error_data(adev, handle, type, err_data, qctx);
}
static void aca_error_init(struct aca_error *aerr, enum aca_error_type type)
@@ -668,6 +694,8 @@ int amdgpu_aca_init(struct amdgpu_device *adev)
struct amdgpu_aca *aca = &adev->aca;
int ret;
atomic_set(&aca->ue_update_flag, 0);
ret = aca_manager_init(&aca->mgr);
if (ret)
return ret;
@@ -680,6 +708,8 @@ void amdgpu_aca_fini(struct amdgpu_device *adev)
struct amdgpu_aca *aca = &adev->aca;
aca_manager_fini(&aca->mgr);
atomic_set(&aca->ue_update_flag, 0);
}
int amdgpu_aca_reset(struct amdgpu_device *adev)
@@ -723,23 +753,13 @@ int aca_bank_info_decode(struct aca_bank *bank, struct aca_bank_info *info)
static int aca_bank_get_error_code(struct amdgpu_device *adev, struct aca_bank *bank)
{
int error_code;
struct amdgpu_aca *aca = &adev->aca;
const struct aca_smu_funcs *smu_funcs = aca->smu_funcs;
switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
case IP_VERSION(13, 0, 6):
if (!(adev->flags & AMD_IS_APU) && adev->pm.fw_version >= 0x00555600) {
error_code = ACA_REG__SYND__ERRORINFORMATION(bank->regs[ACA_REG_IDX_SYND]);
return error_code & 0xff;
}
break;
default:
break;
}
if (!smu_funcs || !smu_funcs->parse_error_code)
return -EOPNOTSUPP;
/* NOTE: the true error code is encoded in status.errorcode[0:7] */
error_code = ACA_REG__STATUS__ERRORCODE(bank->regs[ACA_REG_IDX_STATUS]);
return error_code & 0xff;
return smu_funcs->parse_error_code(adev, bank);
}
int aca_bank_check_error_codes(struct amdgpu_device *adev, struct aca_bank *bank, int *err_codes, int size)
@@ -750,6 +770,9 @@ int aca_bank_check_error_codes(struct amdgpu_device *adev, struct aca_bank *bank
return -EINVAL;
error_code = aca_bank_get_error_code(adev, bank);
if (error_code < 0)
return error_code;
for (i = 0; i < size; i++) {
if (err_codes[i] == error_code)
return 0;
@@ -784,7 +807,7 @@ static int amdgpu_aca_smu_debug_mode_set(void *data, u64 val)
return 0;
}
static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_error_type type, int idx)
static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_smu_type type, int idx)
{
struct aca_bank_info info;
int i, ret;
@@ -793,7 +816,7 @@ static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_e
if (ret)
return;
seq_printf(m, "aca entry[%d].type: %s\n", idx, type == ACA_ERROR_TYPE_UE ? "UE" : "CE");
seq_printf(m, "aca entry[%d].type: %s\n", idx, type == ACA_SMU_TYPE_UE ? "UE" : "CE");
seq_printf(m, "aca entry[%d].info: socketid:%d aid:%d hwid:0x%03x mcatype:0x%04x\n",
idx, info.socket_id, info.die_id, info.hwid, info.mcatype);
@@ -807,7 +830,7 @@ struct aca_dump_context {
};
static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *bank,
enum aca_error_type type, void *data)
enum aca_smu_type type, void *data)
{
struct aca_dump_context *ctx = (struct aca_dump_context *)data;
@@ -816,7 +839,7 @@ static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *ban
return handler_aca_log_bank_error(handle, bank, type, NULL);
}
static int aca_dump_show(struct seq_file *m, enum aca_error_type type)
static int aca_dump_show(struct seq_file *m, enum aca_smu_type type)
{
struct amdgpu_device *adev = (struct amdgpu_device *)m->private;
struct aca_dump_context context = {
@@ -824,12 +847,12 @@ static int aca_dump_show(struct seq_file *m, enum aca_error_type type)
.idx = 0,
};
return aca_banks_update(adev, type, handler_aca_bank_dump, (void *)&context);
return aca_banks_update(adev, type, handler_aca_bank_dump, NULL, (void *)&context);
}
static int aca_dump_ce_show(struct seq_file *m, void *unused)
{
return aca_dump_show(m, ACA_ERROR_TYPE_CE);
return aca_dump_show(m, ACA_SMU_TYPE_CE);
}
static int aca_dump_ce_open(struct inode *inode, struct file *file)
@@ -847,7 +870,7 @@ static const struct file_operations aca_ce_dump_debug_fops = {
static int aca_dump_ue_show(struct seq_file *m, void *unused)
{
return aca_dump_show(m, ACA_ERROR_TYPE_UE);
return aca_dump_show(m, ACA_SMU_TYPE_UE);
}
static int aca_dump_ue_open(struct inode *inode, struct file *file)

View File

@@ -26,6 +26,9 @@
#include <linux/list.h>
struct ras_err_data;
struct ras_query_context;
#define ACA_MAX_REGS_COUNT (16)
#define ACA_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> l)
@@ -99,7 +102,14 @@ enum aca_error_type {
ACA_ERROR_TYPE_COUNT
};
enum aca_smu_type {
ACA_SMU_TYPE_UE = 0,
ACA_SMU_TYPE_CE,
ACA_SMU_TYPE_COUNT,
};
struct aca_bank {
enum aca_smu_type type;
u64 regs[ACA_MAX_REGS_COUNT];
};
@@ -115,15 +125,10 @@ struct aca_bank_info {
int mcatype;
};
struct aca_bank_report {
struct aca_bank_info info;
u64 count[ACA_ERROR_TYPE_COUNT];
};
struct aca_bank_error {
struct list_head node;
struct aca_bank_info info;
u64 count[ACA_ERROR_TYPE_COUNT];
u64 count;
};
struct aca_error {
@@ -157,9 +162,8 @@ struct aca_handle {
};
struct aca_bank_ops {
int (*aca_bank_generate_report)(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type,
struct aca_bank_report *report, void *data);
bool (*aca_bank_is_valid)(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type,
int (*aca_bank_parser)(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data);
bool (*aca_bank_is_valid)(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type,
void *data);
};
@@ -167,13 +171,15 @@ struct aca_smu_funcs {
int max_ue_bank_count;
int max_ce_bank_count;
int (*set_debug_mode)(struct amdgpu_device *adev, bool enable);
int (*get_valid_aca_count)(struct amdgpu_device *adev, enum aca_error_type type, u32 *count);
int (*get_valid_aca_bank)(struct amdgpu_device *adev, enum aca_error_type type, int idx, struct aca_bank *bank);
int (*get_valid_aca_count)(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count);
int (*get_valid_aca_bank)(struct amdgpu_device *adev, enum aca_smu_type type, int idx, struct aca_bank *bank);
int (*parse_error_code)(struct amdgpu_device *adev, struct aca_bank *bank);
};
struct amdgpu_aca {
struct aca_handle_manager mgr;
const struct aca_smu_funcs *smu_funcs;
atomic_t ue_update_flag;
bool is_enabled;
};
@@ -196,7 +202,10 @@ int amdgpu_aca_add_handle(struct amdgpu_device *adev, struct aca_handle *handle,
const char *name, const struct aca_info *aca_info, void *data);
void amdgpu_aca_remove_handle(struct aca_handle *handle);
int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle,
enum aca_error_type type, void *data);
enum aca_error_type type, struct ras_err_data *err_data,
struct ras_query_context *qctx);
int amdgpu_aca_smu_set_debug_mode(struct amdgpu_device *adev, bool en);
void amdgpu_aca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root);
int aca_error_cache_log_bank_error(struct aca_handle *handle, struct aca_bank_info *info,
enum aca_error_type type, u64 count);
#endif

View File

@@ -637,6 +637,8 @@ static const struct amd_ip_funcs acp_ip_funcs = {
.soft_reset = acp_soft_reset,
.set_clockgating_state = acp_set_clockgating_state,
.set_powergating_state = acp_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
const struct amdgpu_ip_block_version acp_ip_block = {

View File

@@ -747,10 +747,17 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev)
return amdgpu_ras_get_fed_status(adev);
}
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, bool reset)
void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, uint16_t pasid,
pasid_notify pasid_fn, void *data, uint32_t reset)
{
amdgpu_umc_poison_handler(adev, block, reset);
amdgpu_umc_pasid_poison_handler(adev, block, pasid, pasid_fn, data, reset);
}
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, uint32_t reset)
{
amdgpu_umc_pasid_poison_handler(adev, block, 0, NULL, NULL, reset);
}
int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
@@ -769,12 +776,20 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
return 0;
}
bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev)
bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
int hub_inst, int hub_type)
{
if (adev->gfx.ras && adev->gfx.ras->query_utcl2_poison_status)
return adev->gfx.ras->query_utcl2_poison_status(adev);
else
return false;
if (!hub_type) {
if (adev->gfxhub.funcs->query_utcl2_poison_status)
return adev->gfxhub.funcs->query_utcl2_poison_status(adev, hub_inst);
else
return false;
} else {
if (adev->mmhub.funcs->query_utcl2_poison_status)
return adev->mmhub.funcs->query_utcl2_poison_status(adev, hub_inst);
else
return false;
}
}
int amdgpu_amdkfd_check_and_lock_kfd(struct amdgpu_device *adev)

View File

@@ -336,12 +336,18 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev);
int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
struct tile_config *config);
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, bool reset);
enum amdgpu_ras_block block, uint32_t reset);
void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, uint16_t pasid,
pasid_notify pasid_fn, void *data, uint32_t reset);
bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
void amdgpu_amdkfd_block_mmu_notifications(void *p);
int amdgpu_amdkfd_criu_resume(void *p);
bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev);
bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev,
int hub_inst, int hub_type);
int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag, int8_t xcp_id);
void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,

View File

@@ -881,6 +881,7 @@ uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,
}
#define TCP_WATCH_STRIDE (mmTCP_WATCH1_ADDR_H - mmTCP_WATCH0_ADDR_H)
#define SQ_WATCH_STRIDE (mmSQ_WATCH1_ADDR_H - mmSQ_WATCH0_ADDR_H)
uint32_t kgd_gfx_v10_set_address_watch(struct amdgpu_device *adev,
uint64_t watch_address,
uint32_t watch_address_mask,
@@ -889,55 +890,93 @@ uint32_t kgd_gfx_v10_set_address_watch(struct amdgpu_device *adev,
uint32_t debug_vmid,
uint32_t inst)
{
/* SQ_WATCH?_ADDR_* and TCP_WATCH?_ADDR_* are programmed with the
* same values.
*/
uint32_t watch_address_high;
uint32_t watch_address_low;
uint32_t watch_address_cntl;
watch_address_cntl = 0;
uint32_t tcp_watch_address_cntl;
uint32_t sq_watch_address_cntl;
watch_address_low = lower_32_bits(watch_address);
watch_address_high = upper_32_bits(watch_address) & 0xffff;
watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
tcp_watch_address_cntl = 0;
tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,
TCP_WATCH0_CNTL,
VMID,
debug_vmid);
watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,
TCP_WATCH0_CNTL,
MODE,
watch_mode);
watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,
TCP_WATCH0_CNTL,
MASK,
watch_address_mask >> 7);
sq_watch_address_cntl = 0;
sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl,
SQ_WATCH0_CNTL,
VMID,
debug_vmid);
sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl,
SQ_WATCH0_CNTL,
MODE,
watch_mode);
sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl,
SQ_WATCH0_CNTL,
MASK,
watch_address_mask >> 6);
/* Turning off this watch point until we set all the registers */
watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,
TCP_WATCH0_CNTL,
VALID,
0);
WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
(watch_id * TCP_WATCH_STRIDE)),
watch_address_cntl);
tcp_watch_address_cntl);
sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl,
SQ_WATCH0_CNTL,
VALID,
0);
WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_CNTL) +
(watch_id * SQ_WATCH_STRIDE)),
sq_watch_address_cntl);
/* Program {TCP,SQ}_WATCH?_ADDR* */
WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) +
(watch_id * TCP_WATCH_STRIDE)),
watch_address_high);
WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_L) +
(watch_id * TCP_WATCH_STRIDE)),
watch_address_low);
WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_ADDR_H) +
(watch_id * SQ_WATCH_STRIDE)),
watch_address_high);
WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_ADDR_L) +
(watch_id * SQ_WATCH_STRIDE)),
watch_address_low);
/* Enable the watch point */
watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,
TCP_WATCH0_CNTL,
VALID,
1);
WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
(watch_id * TCP_WATCH_STRIDE)),
watch_address_cntl);
tcp_watch_address_cntl);
sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl,
SQ_WATCH0_CNTL,
VALID,
1);
WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_CNTL) +
(watch_id * SQ_WATCH_STRIDE)),
sq_watch_address_cntl);
return 0;
}
@@ -953,8 +992,14 @@ uint32_t kgd_gfx_v10_clear_address_watch(struct amdgpu_device *adev,
(watch_id * TCP_WATCH_STRIDE)),
watch_address_cntl);
WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_CNTL) +
(watch_id * SQ_WATCH_STRIDE)),
watch_address_cntl);
return 0;
}
#undef TCP_WATCH_STRIDE
#undef SQ_WATCH_STRIDE
/* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values

View File

@@ -34,6 +34,7 @@ union firmware_info {
struct atom_firmware_info_v3_2 v32;
struct atom_firmware_info_v3_3 v33;
struct atom_firmware_info_v3_4 v34;
struct atom_firmware_info_v3_5 v35;
};
/*
@@ -872,6 +873,10 @@ int amdgpu_atomfirmware_get_fw_reserved_fb_size(struct amdgpu_device *adev)
fw_reserved_fb_size =
(firmware_info->v34.fw_reserved_size_in_kb << 10);
break;
case 5:
fw_reserved_fb_size =
(firmware_info->v35.fw_reserved_size_in_kb << 10);
break;
default:
fw_reserved_fb_size = 0;
break;

View File

@@ -39,7 +39,7 @@ static int amdgpu_benchmark_do_move(struct amdgpu_device *adev, unsigned size,
for (i = 0; i < n; i++) {
struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
r = amdgpu_copy_buffer(ring, saddr, daddr, size, NULL, &fence,
false, false, false);
false, false, 0);
if (r)
goto exit_do_move;
r = dma_fence_wait(fence, false);

View File

@@ -2065,12 +2065,13 @@ static ssize_t amdgpu_reset_dump_register_list_write(struct file *f,
struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
char reg_offset[11];
uint32_t *new = NULL, *tmp = NULL;
int ret, i = 0, len = 0;
unsigned int len = 0;
int ret, i = 0;
do {
memset(reg_offset, 0, 11);
if (copy_from_user(reg_offset, buf + len,
min(10, ((int)size-len)))) {
min(10, (size-len)))) {
ret = -EFAULT;
goto error_free;
}

View File

@@ -0,0 +1,360 @@
// SPDX-License-Identifier: MIT
/*
* Copyright 2024 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include <generated/utsrelease.h>
#include <linux/devcoredump.h>
#include "amdgpu_dev_coredump.h"
#include "atom.h"
#ifndef CONFIG_DEV_COREDUMP
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
struct amdgpu_reset_context *reset_context)
{
}
#else
const char *hw_ip_names[MAX_HWIP] = {
[GC_HWIP] = "GC",
[HDP_HWIP] = "HDP",
[SDMA0_HWIP] = "SDMA0",
[SDMA1_HWIP] = "SDMA1",
[SDMA2_HWIP] = "SDMA2",
[SDMA3_HWIP] = "SDMA3",
[SDMA4_HWIP] = "SDMA4",
[SDMA5_HWIP] = "SDMA5",
[SDMA6_HWIP] = "SDMA6",
[SDMA7_HWIP] = "SDMA7",
[LSDMA_HWIP] = "LSDMA",
[MMHUB_HWIP] = "MMHUB",
[ATHUB_HWIP] = "ATHUB",
[NBIO_HWIP] = "NBIO",
[MP0_HWIP] = "MP0",
[MP1_HWIP] = "MP1",
[UVD_HWIP] = "UVD/JPEG/VCN",
[VCN1_HWIP] = "VCN1",
[VCE_HWIP] = "VCE",
[VPE_HWIP] = "VPE",
[DF_HWIP] = "DF",
[DCE_HWIP] = "DCE",
[OSSSYS_HWIP] = "OSSSYS",
[SMUIO_HWIP] = "SMUIO",
[PWR_HWIP] = "PWR",
[NBIF_HWIP] = "NBIF",
[THM_HWIP] = "THM",
[CLK_HWIP] = "CLK",
[UMC_HWIP] = "UMC",
[RSMU_HWIP] = "RSMU",
[XGMI_HWIP] = "XGMI",
[DCI_HWIP] = "DCI",
[PCIE_HWIP] = "PCIE",
};
static void amdgpu_devcoredump_fw_info(struct amdgpu_device *adev,
struct drm_printer *p)
{
uint32_t version;
uint32_t feature;
uint8_t smu_program, smu_major, smu_minor, smu_debug;
struct atom_context *ctx = adev->mode_info.atom_context;
drm_printf(p, "VCE feature version: %u, fw version: 0x%08x\n",
adev->vce.fb_version, adev->vce.fw_version);
drm_printf(p, "UVD feature version: %u, fw version: 0x%08x\n", 0,
adev->uvd.fw_version);
drm_printf(p, "GMC feature version: %u, fw version: 0x%08x\n", 0,
adev->gmc.fw_version);
drm_printf(p, "ME feature version: %u, fw version: 0x%08x\n",
adev->gfx.me_feature_version, adev->gfx.me_fw_version);
drm_printf(p, "PFP feature version: %u, fw version: 0x%08x\n",
adev->gfx.pfp_feature_version, adev->gfx.pfp_fw_version);
drm_printf(p, "CE feature version: %u, fw version: 0x%08x\n",
adev->gfx.ce_feature_version, adev->gfx.ce_fw_version);
drm_printf(p, "RLC feature version: %u, fw version: 0x%08x\n",
adev->gfx.rlc_feature_version, adev->gfx.rlc_fw_version);
drm_printf(p, "RLC SRLC feature version: %u, fw version: 0x%08x\n",
adev->gfx.rlc_srlc_feature_version,
adev->gfx.rlc_srlc_fw_version);
drm_printf(p, "RLC SRLG feature version: %u, fw version: 0x%08x\n",
adev->gfx.rlc_srlg_feature_version,
adev->gfx.rlc_srlg_fw_version);
drm_printf(p, "RLC SRLS feature version: %u, fw version: 0x%08x\n",
adev->gfx.rlc_srls_feature_version,
adev->gfx.rlc_srls_fw_version);
drm_printf(p, "RLCP feature version: %u, fw version: 0x%08x\n",
adev->gfx.rlcp_ucode_feature_version,
adev->gfx.rlcp_ucode_version);
drm_printf(p, "RLCV feature version: %u, fw version: 0x%08x\n",
adev->gfx.rlcv_ucode_feature_version,
adev->gfx.rlcv_ucode_version);
drm_printf(p, "MEC feature version: %u, fw version: 0x%08x\n",
adev->gfx.mec_feature_version, adev->gfx.mec_fw_version);
if (adev->gfx.mec2_fw)
drm_printf(p, "MEC2 feature version: %u, fw version: 0x%08x\n",
adev->gfx.mec2_feature_version,
adev->gfx.mec2_fw_version);
drm_printf(p, "IMU feature version: %u, fw version: 0x%08x\n", 0,
adev->gfx.imu_fw_version);
drm_printf(p, "PSP SOS feature version: %u, fw version: 0x%08x\n",
adev->psp.sos.feature_version, adev->psp.sos.fw_version);
drm_printf(p, "PSP ASD feature version: %u, fw version: 0x%08x\n",
adev->psp.asd_context.bin_desc.feature_version,
adev->psp.asd_context.bin_desc.fw_version);
drm_printf(p, "TA XGMI feature version: 0x%08x, fw version: 0x%08x\n",
adev->psp.xgmi_context.context.bin_desc.feature_version,
adev->psp.xgmi_context.context.bin_desc.fw_version);
drm_printf(p, "TA RAS feature version: 0x%08x, fw version: 0x%08x\n",
adev->psp.ras_context.context.bin_desc.feature_version,
adev->psp.ras_context.context.bin_desc.fw_version);
drm_printf(p, "TA HDCP feature version: 0x%08x, fw version: 0x%08x\n",
adev->psp.hdcp_context.context.bin_desc.feature_version,
adev->psp.hdcp_context.context.bin_desc.fw_version);
drm_printf(p, "TA DTM feature version: 0x%08x, fw version: 0x%08x\n",
adev->psp.dtm_context.context.bin_desc.feature_version,
adev->psp.dtm_context.context.bin_desc.fw_version);
drm_printf(p, "TA RAP feature version: 0x%08x, fw version: 0x%08x\n",
adev->psp.rap_context.context.bin_desc.feature_version,
adev->psp.rap_context.context.bin_desc.fw_version);
drm_printf(p,
"TA SECURE DISPLAY feature version: 0x%08x, fw version: 0x%08x\n",
adev->psp.securedisplay_context.context.bin_desc.feature_version,
adev->psp.securedisplay_context.context.bin_desc.fw_version);
/* SMC firmware */
version = adev->pm.fw_version;
smu_program = (version >> 24) & 0xff;
smu_major = (version >> 16) & 0xff;
smu_minor = (version >> 8) & 0xff;
smu_debug = (version >> 0) & 0xff;
drm_printf(p,
"SMC feature version: %u, program: %d, fw version: 0x%08x (%d.%d.%d)\n",
0, smu_program, version, smu_major, smu_minor, smu_debug);
/* SDMA firmware */
for (int i = 0; i < adev->sdma.num_instances; i++) {
drm_printf(p,
"SDMA%d feature version: %u, firmware version: 0x%08x\n",
i, adev->sdma.instance[i].feature_version,
adev->sdma.instance[i].fw_version);
}
drm_printf(p, "VCN feature version: %u, fw version: 0x%08x\n", 0,
adev->vcn.fw_version);
drm_printf(p, "DMCU feature version: %u, fw version: 0x%08x\n", 0,
adev->dm.dmcu_fw_version);
drm_printf(p, "DMCUB feature version: %u, fw version: 0x%08x\n", 0,
adev->dm.dmcub_fw_version);
drm_printf(p, "PSP TOC feature version: %u, fw version: 0x%08x\n",
adev->psp.toc.feature_version, adev->psp.toc.fw_version);
version = adev->mes.kiq_version & AMDGPU_MES_VERSION_MASK;
feature = (adev->mes.kiq_version & AMDGPU_MES_FEAT_VERSION_MASK) >>
AMDGPU_MES_FEAT_VERSION_SHIFT;
drm_printf(p, "MES_KIQ feature version: %u, fw version: 0x%08x\n",
feature, version);
version = adev->mes.sched_version & AMDGPU_MES_VERSION_MASK;
feature = (adev->mes.sched_version & AMDGPU_MES_FEAT_VERSION_MASK) >>
AMDGPU_MES_FEAT_VERSION_SHIFT;
drm_printf(p, "MES feature version: %u, fw version: 0x%08x\n", feature,
version);
drm_printf(p, "VPE feature version: %u, fw version: 0x%08x\n",
adev->vpe.feature_version, adev->vpe.fw_version);
drm_printf(p, "\nVBIOS Information\n");
drm_printf(p, "vbios name : %s\n", ctx->name);
drm_printf(p, "vbios pn : %s\n", ctx->vbios_pn);
drm_printf(p, "vbios version : %d\n", ctx->version);
drm_printf(p, "vbios ver_str : %s\n", ctx->vbios_ver_str);
drm_printf(p, "vbios date : %s\n", ctx->date);
}
static ssize_t
amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
void *data, size_t datalen)
{
struct drm_printer p;
struct amdgpu_coredump_info *coredump = data;
struct drm_print_iterator iter;
struct amdgpu_vm_fault_info *fault_info;
int i, ver;
iter.data = buffer;
iter.offset = 0;
iter.start = offset;
iter.remain = count;
p = drm_coredump_printer(&iter);
drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n");
drm_printf(&p, "kernel: " UTS_RELEASE "\n");
drm_printf(&p, "module: " KBUILD_MODNAME "\n");
drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec,
coredump->reset_time.tv_nsec);
if (coredump->reset_task_info.pid)
drm_printf(&p, "process_name: %s PID: %d\n",
coredump->reset_task_info.process_name,
coredump->reset_task_info.pid);
/* GPU IP's information of the SOC */
drm_printf(&p, "\nIP Information\n");
drm_printf(&p, "SOC Family: %d\n", coredump->adev->family);
drm_printf(&p, "SOC Revision id: %d\n", coredump->adev->rev_id);
drm_printf(&p, "SOC External Revision id: %d\n", coredump->adev->external_rev_id);
for (int i = 1; i < MAX_HWIP; i++) {
for (int j = 0; j < HWIP_MAX_INSTANCE; j++) {
ver = coredump->adev->ip_versions[i][j];
if (ver)
drm_printf(&p, "HWIP: %s[%d][%d]: v%d.%d.%d.%d.%d\n",
hw_ip_names[i], i, j,
IP_VERSION_MAJ(ver),
IP_VERSION_MIN(ver),
IP_VERSION_REV(ver),
IP_VERSION_VARIANT(ver),
IP_VERSION_SUBREV(ver));
}
}
/* IP firmware information */
drm_printf(&p, "\nIP Firmwares\n");
amdgpu_devcoredump_fw_info(coredump->adev, &p);
if (coredump->ring) {
drm_printf(&p, "\nRing timed out details\n");
drm_printf(&p, "IP Type: %d Ring Name: %s\n",
coredump->ring->funcs->type,
coredump->ring->name);
}
/* Add page fault information */
fault_info = &coredump->adev->vm_manager.fault_info;
drm_printf(&p, "\n[%s] Page fault observed\n",
fault_info->vmhub ? "mmhub" : "gfxhub");
drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr);
drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status);
/* dump the ip state for each ip */
drm_printf(&p, "IP Dump\n");
for (int i = 0; i < coredump->adev->num_ip_blocks; i++) {
if (coredump->adev->ip_blocks[i].version->funcs->print_ip_state) {
drm_printf(&p, "IP: %s\n",
coredump->adev->ip_blocks[i]
.version->funcs->name);
coredump->adev->ip_blocks[i]
.version->funcs->print_ip_state(
(void *)coredump->adev, &p);
drm_printf(&p, "\n");
}
}
/* Add ring buffer information */
drm_printf(&p, "Ring buffer information\n");
for (int i = 0; i < coredump->adev->num_rings; i++) {
int j = 0;
struct amdgpu_ring *ring = coredump->adev->rings[i];
drm_printf(&p, "ring name: %s\n", ring->name);
drm_printf(&p, "Rptr: 0x%llx Wptr: 0x%llx RB mask: %x\n",
amdgpu_ring_get_rptr(ring),
amdgpu_ring_get_wptr(ring),
ring->buf_mask);
drm_printf(&p, "Ring size in dwords: %d\n",
ring->ring_size / 4);
drm_printf(&p, "Ring contents\n");
drm_printf(&p, "Offset \t Value\n");
while (j < ring->ring_size) {
drm_printf(&p, "0x%x \t 0x%x\n", j, ring->ring[j / 4]);
j += 4;
}
}
if (coredump->reset_vram_lost)
drm_printf(&p, "VRAM is lost due to GPU reset!\n");
if (coredump->adev->reset_info.num_regs) {
drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
for (i = 0; i < coredump->adev->reset_info.num_regs; i++)
drm_printf(&p, "0x%08x: 0x%08x\n",
coredump->adev->reset_info.reset_dump_reg_list[i],
coredump->adev->reset_info.reset_dump_reg_value[i]);
}
return count - iter.remain;
}
static void amdgpu_devcoredump_free(void *data)
{
kfree(data);
}
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
struct amdgpu_reset_context *reset_context)
{
struct amdgpu_coredump_info *coredump;
struct drm_device *dev = adev_to_drm(adev);
struct amdgpu_job *job = reset_context->job;
struct drm_sched_job *s_job;
coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
if (!coredump) {
DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
return;
}
coredump->reset_vram_lost = vram_lost;
if (reset_context->job && reset_context->job->vm) {
struct amdgpu_task_info *ti;
struct amdgpu_vm *vm = reset_context->job->vm;
ti = amdgpu_vm_get_task_info_vm(vm);
if (ti) {
coredump->reset_task_info = *ti;
amdgpu_vm_put_task_info(ti);
}
}
if (job) {
s_job = &job->base;
coredump->ring = to_amdgpu_ring(s_job->sched);
}
coredump->adev = adev;
ktime_get_ts64(&coredump->reset_time);
dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
amdgpu_devcoredump_read, amdgpu_devcoredump_free);
}
#endif

View File

@@ -0,0 +1,47 @@
/* SPDX-License-Identifier: MIT */
/*
* Copyright 2024 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef __AMDGPU_DEV_COREDUMP_H__
#define __AMDGPU_DEV_COREDUMP_H__
#include "amdgpu.h"
#include "amdgpu_reset.h"
#ifdef CONFIG_DEV_COREDUMP
#define AMDGPU_COREDUMP_VERSION "1"
struct amdgpu_coredump_info {
struct amdgpu_device *adev;
struct amdgpu_task_info reset_task_info;
struct timespec64 reset_time;
bool reset_vram_lost;
struct amdgpu_ring *ring;
};
#endif
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
struct amdgpu_reset_context *reset_context);
#endif

View File

@@ -74,6 +74,7 @@
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"
#include <linux/suspend.h>
#include <drm/task_barrier.h>
@@ -143,6 +144,8 @@ const char *amdgpu_asic_name[] = {
"LAST",
};
static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
/**
* DOC: pcie_replay_count
*
@@ -335,16 +338,93 @@ bool amdgpu_device_supports_boco(struct drm_device *dev)
*
* @dev: drm_device pointer
*
* Returns true if the device supporte BACO,
* otherwise return false.
* Return:
* 1 if the device supporte BACO;
* 3 if the device support MACO (only works if BACO is supported)
* otherwise return 0.
*/
bool amdgpu_device_supports_baco(struct drm_device *dev)
int amdgpu_device_supports_baco(struct drm_device *dev)
{
struct amdgpu_device *adev = drm_to_adev(dev);
return amdgpu_asic_supports_baco(adev);
}
void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
struct drm_device *dev;
int bamaco_support;
dev = adev_to_drm(adev);
adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
bamaco_support = amdgpu_device_supports_baco(dev);
switch (amdgpu_runtime_pm) {
case 2:
if (bamaco_support & MACO_SUPPORT) {
adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
} else if (bamaco_support == BACO_SUPPORT) {
adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
dev_info(adev->dev, "Requested mode BAMACO not available,fallback to use BACO\n");
}
break;
case 1:
if (bamaco_support & BACO_SUPPORT) {
adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
dev_info(adev->dev, "Forcing BACO for runtime pm\n");
}
break;
case -1:
case -2:
if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
dev_info(adev->dev, "Using ATPX for runtime pm\n");
} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
dev_info(adev->dev, "Using BOCO for runtime pm\n");
} else {
if (!bamaco_support)
goto no_runtime_pm;
switch (adev->asic_type) {
case CHIP_VEGA20:
case CHIP_ARCTURUS:
/* BACO are not supported on vega20 and arctrus */
break;
case CHIP_VEGA10:
/* enable BACO as runpm mode if noretry=0 */
if (!adev->gmc.noretry)
adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
break;
default:
/* enable BACO as runpm mode on CI+ */
adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
break;
}
if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
if (bamaco_support & MACO_SUPPORT) {
adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
dev_info(adev->dev, "Using BAMACO for runtime pm\n");
} else {
dev_info(adev->dev, "Using BACO for runtime pm\n");
}
}
}
break;
case 0:
dev_info(adev->dev, "runtime pm is manually disabled\n");
break;
default:
break;
}
no_runtime_pm:
if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
dev_info(adev->dev, "Runtime PM not available\n");
}
/**
* amdgpu_device_supports_smart_shift - Is the device dGPU with
* smart shift support
@@ -1402,13 +1482,17 @@ static int amdgpu_device_wb_init(struct amdgpu_device *adev)
*/
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
unsigned long flags, offset;
spin_lock_irqsave(&adev->wb.lock, flags);
offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
if (offset < adev->wb.num_wb) {
__set_bit(offset, adev->wb.used);
spin_unlock_irqrestore(&adev->wb.lock, flags);
*wb = offset << 3; /* convert to dw offset */
return 0;
} else {
spin_unlock_irqrestore(&adev->wb.lock, flags);
return -EINVAL;
}
}
@@ -1423,9 +1507,13 @@ int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
*/
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
unsigned long flags;
wb >>= 3;
spin_lock_irqsave(&adev->wb.lock, flags);
if (wb < adev->wb.num_wb)
__clear_bit(wb, adev->wb.used);
spin_unlock_irqrestore(&adev->wb.lock, flags);
}
/**
@@ -1455,7 +1543,7 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
DRM_WARN("System can't access extended configuration space,please check!!\n");
DRM_WARN("System can't access extended configuration space, please check!!\n");
/* skip if the bios has already enabled large BAR */
if (adev->gmc.real_vram_size &&
@@ -3981,6 +4069,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
spin_lock_init(&adev->se_cac_idx_lock);
spin_lock_init(&adev->audio_endpt_idx_lock);
spin_lock_init(&adev->mm_stats.lock);
spin_lock_init(&adev->wb.lock);
INIT_LIST_HEAD(&adev->shadow_list);
mutex_init(&adev->shadow_list_lock);
@@ -4069,6 +4158,13 @@ int amdgpu_device_init(struct amdgpu_device *adev,
/* Enable TMZ based on IP_VERSION */
amdgpu_gmc_tmz_set(adev);
if (amdgpu_sriov_vf(adev) &&
amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
/* VF MMIO access (except mailbox range) from CPU
* will be blocked during sriov runtime
*/
adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
amdgpu_gmc_noretry_set(adev);
/* Need to get xgmi info early to decide the reset behavior*/
if (adev->gmc.xgmi.supported) {
@@ -4974,12 +5070,15 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
retry:
amdgpu_amdkfd_pre_reset(adev);
amdgpu_device_stop_pending_resets(adev);
if (from_hypervisor)
r = amdgpu_virt_request_full_gpu(adev, true);
else
r = amdgpu_virt_reset_gpu(adev);
if (r)
return r;
amdgpu_ras_set_fed(adev, false);
amdgpu_irq_gpu_reset_resume_helper(adev);
/* some sw clean up VF needs to do before recover */
@@ -5263,11 +5362,21 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
struct amdgpu_device *tmp_adev = NULL;
bool need_full_reset, skip_hw_reset, vram_lost = false;
int r = 0;
uint32_t i;
/* Try reset handler method first */
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
reset_list);
amdgpu_reset_reg_dumps(tmp_adev);
if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
amdgpu_reset_reg_dumps(tmp_adev);
/* Trigger ip dump before we reset the asic */
for (i = 0; i < tmp_adev->num_ip_blocks; i++)
if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
tmp_adev->ip_blocks[i].version->funcs
->dump_ip_state((void *)tmp_adev);
}
reset_context->reset_device_list = device_list_handle;
r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
@@ -5340,7 +5449,8 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
amdgpu_coredump(tmp_adev, vram_lost, reset_context);
if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
amdgpu_coredump(tmp_adev, vram_lost, reset_context);
if (vram_lost) {
DRM_INFO("VRAM is lost due to GPU reset!\n");
@@ -5538,6 +5648,23 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
}
static int amdgpu_device_health_check(struct list_head *device_list_handle)
{
struct amdgpu_device *tmp_adev;
int ret = 0;
u32 status;
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
if (PCI_POSSIBLE_ERROR(status)) {
dev_err(tmp_adev->dev, "device lost from bus!");
ret = -ENODEV;
}
}
return ret;
}
/**
* amdgpu_device_gpu_recover - reset the asic and recover scheduler
*
@@ -5609,6 +5736,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
device_list_handle = &device_list;
}
if (!amdgpu_sriov_vf(adev)) {
r = amdgpu_device_health_check(device_list_handle);
if (r)
goto end_reset;
}
/* We need to lock reset domain only once both for XGMI and single device */
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
reset_list);
@@ -5691,11 +5824,12 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
tmp_adev->asic_reset_res = r;
}
/*
* Drop all pending non scheduler resets. Scheduler resets
* were already dropped during drm_sched_stop
*/
amdgpu_device_stop_pending_resets(tmp_adev);
if (!amdgpu_sriov_vf(tmp_adev))
/*
* Drop all pending non scheduler resets. Scheduler resets
* were already dropped during drm_sched_stop
*/
amdgpu_device_stop_pending_resets(tmp_adev);
}
/* Actual ASIC resets if needed.*/
@@ -5774,6 +5908,7 @@ skip_sched_resume:
reset_list);
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
end_reset:
if (hive) {
mutex_unlock(&hive->hive_lock);
amdgpu_put_xgmi_hive(hive);

View File

@@ -97,6 +97,7 @@
#include "smuio_v13_0.h"
#include "smuio_v13_0_3.h"
#include "smuio_v13_0_6.h"
#include "smuio_v14_0_2.h"
#include "vcn_v5_0_0.h"
#include "jpeg_v5_0_0.h"
@@ -245,6 +246,9 @@ static int amdgpu_discovery_read_binary_from_sysmem(struct amdgpu_device *adev,
return -ENOENT;
}
#define IP_DISCOVERY_V2 2
#define IP_DISCOVERY_V4 4
static int amdgpu_discovery_read_binary_from_mem(struct amdgpu_device *adev,
uint8_t *binary)
{
@@ -259,14 +263,14 @@ static int amdgpu_discovery_read_binary_from_mem(struct amdgpu_device *adev,
* wait for this to complete. Once the C2PMSG is updated, we can
* continue.
*/
if (dev_is_removable(&adev->pdev->dev)) {
for (i = 0; i < 1000; i++) {
msg = RREG32(mmMP0_SMN_C2PMSG_33);
if (msg & 0x80000000)
break;
msleep(1);
}
for (i = 0; i < 1000; i++) {
msg = RREG32(mmMP0_SMN_C2PMSG_33);
if (msg & 0x80000000)
break;
usleep_range(1000, 1100);
}
vram_size = (uint64_t)RREG32(mmRCC_CONFIG_MEMSIZE) << 20;
if (vram_size) {
@@ -1897,6 +1901,8 @@ static int amdgpu_discovery_set_smu_ip_blocks(struct amdgpu_device *adev)
break;
case IP_VERSION(14, 0, 0):
case IP_VERSION(14, 0, 1):
case IP_VERSION(14, 0, 2):
case IP_VERSION(14, 0, 3):
amdgpu_device_ip_block_add(adev, &smu_v14_0_ip_block);
break;
default:
@@ -2678,6 +2684,9 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev)
case IP_VERSION(14, 0, 1):
adev->smuio.funcs = &smuio_v13_0_6_funcs;
break;
case IP_VERSION(14, 0, 2):
adev->smuio.funcs = &smuio_v14_0_2_funcs;
break;
default:
break;
}

View File

@@ -925,7 +925,7 @@ module_param_named(freesync_video, amdgpu_freesync_vid_mode, uint, 0444);
* GPU reset method (-1 = auto (default), 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco)
*/
MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco/bamaco)");
module_param_named(reset_method, amdgpu_reset_method, int, 0444);
module_param_named(reset_method, amdgpu_reset_method, int, 0644);
/**
* DOC: bad_page_threshold (int) Bad page threshold is specifies the
@@ -2481,6 +2481,7 @@ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work)
/* Use a common context, just need to make sure full reset is done */
set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
r = amdgpu_do_asic_reset(&device_list, &reset_context);
if (r) {
@@ -2744,7 +2745,8 @@ static int amdgpu_pmops_runtime_suspend(struct device *dev)
drm_dev->switch_power_state = DRM_SWITCH_POWER_DYNAMIC_OFF;
} else if (adev->pm.rpm_mode == AMDGPU_RUNPM_BOCO) {
/* nothing to do */
} else if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
} else if ((adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) ||
(adev->pm.rpm_mode == AMDGPU_RUNPM_BAMACO)) {
amdgpu_device_baco_enter(drm_dev);
}
@@ -2784,7 +2786,8 @@ static int amdgpu_pmops_runtime_resume(struct device *dev)
* PCI core handles it for _PR3.
*/
pci_set_master(pdev);
} else if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
} else if ((adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) ||
(adev->pm.rpm_mode == AMDGPU_RUNPM_BAMACO)) {
amdgpu_device_baco_exit(drm_dev);
}
ret = amdgpu_device_resume(drm_dev, false);

View File

@@ -1206,7 +1206,8 @@ void amdgpu_gfx_cp_init_microcode(struct amdgpu_device *adev,
fw_size = le32_to_cpu(cp_hdr_v2_0->data_size_bytes);
break;
default:
break;
dev_err(adev->dev, "Invalid ucode id %u\n", ucode_id);
return;
}
if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) {

View File

@@ -259,7 +259,6 @@ struct amdgpu_cu_info {
struct amdgpu_gfx_ras {
struct amdgpu_ras_block_object ras_block;
void (*enable_watchdog_timer)(struct amdgpu_device *adev);
bool (*query_utcl2_poison_status)(struct amdgpu_device *adev);
int (*rlc_gc_fed_irq)(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry);
@@ -434,6 +433,10 @@ struct amdgpu_gfx {
uint32_t num_xcc_per_xcp;
struct mutex partition_mutex;
bool mcbp; /* mid command buffer preemption */
/* IP reg dump */
uint32_t *ip_dump;
uint32_t reg_count;
};
struct amdgpu_gfx_ras_reg_entry {

View File

@@ -38,6 +38,8 @@ struct amdgpu_gfxhub_funcs {
void (*mode2_save_regs)(struct amdgpu_device *adev);
void (*mode2_restore_regs)(struct amdgpu_device *adev);
void (*halt)(struct amdgpu_device *adev);
bool (*query_utcl2_poison_status)(struct amdgpu_device *adev,
int xcc_id);
};
struct amdgpu_gfxhub {

View File

@@ -279,7 +279,7 @@ amdgpu_i2c_lookup(struct amdgpu_device *adev,
return NULL;
}
static void amdgpu_i2c_get_byte(struct amdgpu_i2c_chan *i2c_bus,
static int amdgpu_i2c_get_byte(struct amdgpu_i2c_chan *i2c_bus,
u8 slave_addr,
u8 addr,
u8 *val)
@@ -304,16 +304,18 @@ static void amdgpu_i2c_get_byte(struct amdgpu_i2c_chan *i2c_bus,
out_buf[0] = addr;
out_buf[1] = 0;
if (i2c_transfer(&i2c_bus->adapter, msgs, 2) == 2) {
*val = in_buf[0];
DRM_DEBUG("val = 0x%02x\n", *val);
} else {
DRM_DEBUG("i2c 0x%02x 0x%02x read failed\n",
addr, *val);
if (i2c_transfer(&i2c_bus->adapter, msgs, 2) != 2) {
DRM_DEBUG("i2c 0x%02x read failed\n", addr);
return -EIO;
}
*val = in_buf[0];
DRM_DEBUG("val = 0x%02x\n", *val);
return 0;
}
static void amdgpu_i2c_put_byte(struct amdgpu_i2c_chan *i2c_bus,
static int amdgpu_i2c_put_byte(struct amdgpu_i2c_chan *i2c_bus,
u8 slave_addr,
u8 addr,
u8 val)
@@ -329,9 +331,12 @@ static void amdgpu_i2c_put_byte(struct amdgpu_i2c_chan *i2c_bus,
out_buf[0] = addr;
out_buf[1] = val;
if (i2c_transfer(&i2c_bus->adapter, &msg, 1) != 1)
DRM_DEBUG("i2c 0x%02x 0x%02x write failed\n",
addr, val);
if (i2c_transfer(&i2c_bus->adapter, &msg, 1) != 1) {
DRM_DEBUG("i2c 0x%02x 0x%02x write failed\n", addr, val);
return -EIO;
}
return 0;
}
/* ddc router switching */
@@ -346,16 +351,18 @@ amdgpu_i2c_router_select_ddc_port(const struct amdgpu_connector *amdgpu_connecto
if (!amdgpu_connector->router_bus)
return;
amdgpu_i2c_get_byte(amdgpu_connector->router_bus,
if (amdgpu_i2c_get_byte(amdgpu_connector->router_bus,
amdgpu_connector->router.i2c_addr,
0x3, &val);
0x3, &val))
return;
val &= ~amdgpu_connector->router.ddc_mux_control_pin;
amdgpu_i2c_put_byte(amdgpu_connector->router_bus,
amdgpu_connector->router.i2c_addr,
0x3, val);
amdgpu_i2c_get_byte(amdgpu_connector->router_bus,
if (amdgpu_i2c_get_byte(amdgpu_connector->router_bus,
amdgpu_connector->router.i2c_addr,
0x1, &val);
0x1, &val))
return;
val &= ~amdgpu_connector->router.ddc_mux_control_pin;
val |= amdgpu_connector->router.ddc_mux_state;
amdgpu_i2c_put_byte(amdgpu_connector->router_bus,
@@ -375,16 +382,18 @@ amdgpu_i2c_router_select_cd_port(const struct amdgpu_connector *amdgpu_connector
if (!amdgpu_connector->router_bus)
return;
amdgpu_i2c_get_byte(amdgpu_connector->router_bus,
if (amdgpu_i2c_get_byte(amdgpu_connector->router_bus,
amdgpu_connector->router.i2c_addr,
0x3, &val);
0x3, &val))
return;
val &= ~amdgpu_connector->router.cd_mux_control_pin;
amdgpu_i2c_put_byte(amdgpu_connector->router_bus,
amdgpu_connector->router.i2c_addr,
0x3, val);
amdgpu_i2c_get_byte(amdgpu_connector->router_bus,
if (amdgpu_i2c_get_byte(amdgpu_connector->router_bus,
amdgpu_connector->router.i2c_addr,
0x1, &val);
0x1, &val))
return;
val &= ~amdgpu_connector->router.cd_mux_control_pin;
val |= amdgpu_connector->router.cd_mux_state;
amdgpu_i2c_put_byte(amdgpu_connector->router_bus,

View File

@@ -445,6 +445,14 @@ void amdgpu_irq_dispatch(struct amdgpu_device *adev,
entry.ih = ih;
entry.iv_entry = (const uint32_t *)&ih->ring[ring_index];
/*
* timestamp is not supported on some legacy SOCs (cik, cz, iceland,
* si and tonga), so initialize timestamp and timestamp_src to 0
*/
entry.timestamp = 0;
entry.timestamp_src = 0;
amdgpu_ih_decode_iv(adev, &entry);
trace_amdgpu_iv(ih - &adev->irq.ih, &entry);

View File

@@ -149,38 +149,7 @@ int amdgpu_driver_load_kms(struct amdgpu_device *adev, unsigned long flags)
goto out;
}
adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
if (amdgpu_device_supports_px(dev) &&
(amdgpu_runtime_pm != 0)) { /* enable PX as runtime mode */
adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
dev_info(adev->dev, "Using ATPX for runtime pm\n");
} else if (amdgpu_device_supports_boco(dev) &&
(amdgpu_runtime_pm != 0)) { /* enable boco as runtime mode */
adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
dev_info(adev->dev, "Using BOCO for runtime pm\n");
} else if (amdgpu_device_supports_baco(dev) &&
(amdgpu_runtime_pm != 0)) {
switch (adev->asic_type) {
case CHIP_VEGA20:
case CHIP_ARCTURUS:
/* enable BACO as runpm mode if runpm=1 */
if (amdgpu_runtime_pm > 0)
adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
break;
case CHIP_VEGA10:
/* enable BACO as runpm mode if noretry=0 */
if (!adev->gmc.noretry)
adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
break;
default:
/* enable BACO as runpm mode on CI+ */
adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
break;
}
if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO)
dev_info(adev->dev, "Using BACO for runtime pm\n");
}
amdgpu_device_detect_runtime_pm_mode(adev);
/* Call ACPI methods: require modeset init
* but failure is not fatal

View File

@@ -210,22 +210,26 @@ int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
return -EOPNOTSUPP;
}
static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry)
static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry,
struct ras_query_context *qctx)
{
dev_info(adev->dev, HW_ERR "Accelerator Check Architecture events logged\n");
dev_info(adev->dev, HW_ERR "aca entry[%02d].STATUS=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_STATUS]);
dev_info(adev->dev, HW_ERR "aca entry[%02d].ADDR=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_ADDR]);
dev_info(adev->dev, HW_ERR "aca entry[%02d].MISC0=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_MISC0]);
dev_info(adev->dev, HW_ERR "aca entry[%02d].IPID=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_IPID]);
dev_info(adev->dev, HW_ERR "aca entry[%02d].SYND=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_SYND]);
u64 event_id = qctx->event_id;
RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n");
RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].STATUS=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_STATUS]);
RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].ADDR=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_ADDR]);
RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].MISC0=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_MISC0]);
RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].IPID=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_IPID]);
RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].SYND=0x%016llx\n",
idx, entry->regs[MCA_REG_IDX_SYND]);
}
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data)
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
struct ras_err_data *err_data, struct ras_query_context *qctx)
{
struct amdgpu_smuio_mcm_config_info mcm_info;
struct ras_err_addr err_addr = {0};
@@ -244,7 +248,7 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo
list_for_each_entry(node, &mca_set.list, node) {
entry = &node->entry;
amdgpu_mca_smu_mca_bank_dump(adev, i++, entry);
amdgpu_mca_smu_mca_bank_dump(adev, i++, entry, qctx);
count = 0;
ret = amdgpu_mca_smu_parse_mca_error_count(adev, blk, type, entry, &count);

View File

@@ -169,6 +169,7 @@ void amdgpu_mca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root
void amdgpu_mca_bank_set_init(struct mca_bank_set *mca_set);
int amdgpu_mca_bank_set_add_entry(struct mca_bank_set *mca_set, struct mca_bank_entry *entry);
void amdgpu_mca_bank_set_release(struct mca_bank_set *mca_set);
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data);
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
struct ras_err_data *err_data, struct ras_query_context *qctx);
#endif

View File

@@ -32,6 +32,18 @@
#define AMDGPU_MES_MAX_NUM_OF_QUEUES_PER_PROCESS 1024
#define AMDGPU_ONE_DOORBELL_SIZE 8
signed long amdgpu_mes_fence_wait_polling(u64 *fence,
u64 wait_seq,
signed long timeout)
{
while ((s64)(wait_seq - *fence) > 0 && timeout > 0) {
udelay(2);
timeout -= 2;
}
return timeout > 0 ? timeout : 0;
}
int amdgpu_mes_doorbell_process_slice(struct amdgpu_device *adev)
{
return roundup(AMDGPU_ONE_DOORBELL_SIZE *
@@ -40,7 +52,6 @@ int amdgpu_mes_doorbell_process_slice(struct amdgpu_device *adev)
}
static int amdgpu_mes_kernel_doorbell_get(struct amdgpu_device *adev,
struct amdgpu_mes_process *process,
int ip_type, uint64_t *doorbell_index)
{
unsigned int offset, found;
@@ -65,7 +76,6 @@ static int amdgpu_mes_kernel_doorbell_get(struct amdgpu_device *adev,
}
static void amdgpu_mes_kernel_doorbell_free(struct amdgpu_device *adev,
struct amdgpu_mes_process *process,
uint32_t doorbell_index)
{
unsigned int old, rel_index;
@@ -656,7 +666,7 @@ int amdgpu_mes_add_hw_queue(struct amdgpu_device *adev, int gang_id,
*queue_id = queue->queue_id = r;
/* allocate a doorbell index for the queue */
r = amdgpu_mes_kernel_doorbell_get(adev, gang->process,
r = amdgpu_mes_kernel_doorbell_get(adev,
qprops->queue_type,
&qprops->doorbell_off);
if (r)
@@ -714,8 +724,7 @@ int amdgpu_mes_add_hw_queue(struct amdgpu_device *adev, int gang_id,
return 0;
clean_up_doorbell:
amdgpu_mes_kernel_doorbell_free(adev, gang->process,
qprops->doorbell_off);
amdgpu_mes_kernel_doorbell_free(adev, qprops->doorbell_off);
clean_up_queue_id:
spin_lock_irqsave(&adev->mes.queue_id_lock, flags);
idr_remove(&adev->mes.queue_id_idr, queue->queue_id);
@@ -769,8 +778,7 @@ int amdgpu_mes_remove_hw_queue(struct amdgpu_device *adev, int queue_id)
queue_id);
list_del(&queue->list);
amdgpu_mes_kernel_doorbell_free(adev, gang->process,
queue->doorbell_off);
amdgpu_mes_kernel_doorbell_free(adev, queue->doorbell_off);
amdgpu_mes_unlock(&adev->mes);
amdgpu_mes_queue_free_mqd(queue);
@@ -778,6 +786,28 @@ int amdgpu_mes_remove_hw_queue(struct amdgpu_device *adev, int queue_id)
return 0;
}
int amdgpu_mes_map_legacy_queue(struct amdgpu_device *adev,
struct amdgpu_ring *ring)
{
struct mes_map_legacy_queue_input queue_input;
int r;
memset(&queue_input, 0, sizeof(queue_input));
queue_input.queue_type = ring->funcs->type;
queue_input.doorbell_offset = ring->doorbell_index;
queue_input.pipe_id = ring->pipe;
queue_input.queue_id = ring->queue;
queue_input.mqd_addr = amdgpu_bo_gpu_offset(ring->mqd_obj);
queue_input.wptr_addr = ring->wptr_gpu_addr;
r = adev->mes.funcs->map_legacy_queue(&adev->mes, &queue_input);
if (r)
DRM_ERROR("failed to map legacy queue\n");
return r;
}
int amdgpu_mes_unmap_legacy_queue(struct amdgpu_device *adev,
struct amdgpu_ring *ring,
enum amdgpu_unmap_queues_action action,
@@ -1475,7 +1505,7 @@ int amdgpu_mes_init_microcode(struct amdgpu_device *adev, int pipe)
const struct mes_firmware_header_v1_0 *mes_hdr;
struct amdgpu_firmware_info *info;
char ucode_prefix[30];
char fw_name[40];
char fw_name[50];
bool need_retry = false;
int r;

View File

@@ -141,6 +141,12 @@ struct amdgpu_mes {
/* ip specific functions */
const struct amdgpu_mes_funcs *funcs;
/* mes resource_1 bo*/
struct amdgpu_bo *resource_1;
uint64_t resource_1_gpu_addr;
void *resource_1_addr;
};
struct amdgpu_mes_process {
@@ -242,6 +248,15 @@ struct mes_remove_queue_input {
uint64_t gang_context_addr;
};
struct mes_map_legacy_queue_input {
uint32_t queue_type;
uint32_t doorbell_offset;
uint32_t pipe_id;
uint32_t queue_id;
uint64_t mqd_addr;
uint64_t wptr_addr;
};
struct mes_unmap_legacy_queue_input {
enum amdgpu_unmap_queues_action action;
uint32_t queue_type;
@@ -318,6 +333,9 @@ struct amdgpu_mes_funcs {
int (*remove_hw_queue)(struct amdgpu_mes *mes,
struct mes_remove_queue_input *input);
int (*map_legacy_queue)(struct amdgpu_mes *mes,
struct mes_map_legacy_queue_input *input);
int (*unmap_legacy_queue)(struct amdgpu_mes *mes,
struct mes_unmap_legacy_queue_input *input);
@@ -334,6 +352,10 @@ struct amdgpu_mes_funcs {
#define amdgpu_mes_kiq_hw_init(adev) (adev)->mes.kiq_hw_init((adev))
#define amdgpu_mes_kiq_hw_fini(adev) (adev)->mes.kiq_hw_fini((adev))
signed long amdgpu_mes_fence_wait_polling(u64 *fence,
u64 wait_seq,
signed long timeout);
int amdgpu_mes_ctx_get_offs(struct amdgpu_ring *ring, unsigned int id_offs);
int amdgpu_mes_init_microcode(struct amdgpu_device *adev, int pipe);
@@ -357,6 +379,8 @@ int amdgpu_mes_add_hw_queue(struct amdgpu_device *adev, int gang_id,
int *queue_id);
int amdgpu_mes_remove_hw_queue(struct amdgpu_device *adev, int queue_id);
int amdgpu_mes_map_legacy_queue(struct amdgpu_device *adev,
struct amdgpu_ring *ring);
int amdgpu_mes_unmap_legacy_queue(struct amdgpu_device *adev,
struct amdgpu_ring *ring,
enum amdgpu_unmap_queues_action action,

View File

@@ -63,6 +63,8 @@ struct amdgpu_mmhub_funcs {
uint64_t page_table_base);
void (*update_power_gating)(struct amdgpu_device *adev,
bool enable);
bool (*query_utcl2_poison_status)(struct amdgpu_device *adev,
int hub_inst);
};
struct amdgpu_mmhub {

View File

@@ -39,6 +39,7 @@
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_amdkfd.h"
#include "amdgpu_vram_mgr.h"
/**
* DOC: amdgpu_object
@@ -153,8 +154,10 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)
else
places[c].flags |= TTM_PL_FLAG_TOPDOWN;
if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
if (abo->tbo.type == ttm_bo_type_kernel &&
flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
places[c].flags |= TTM_PL_FLAG_CONTIGUOUS;
c++;
}
@@ -173,6 +176,12 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)
abo->flags & AMDGPU_GEM_CREATE_PREEMPTIBLE ?
AMDGPU_PL_PREEMPT : TTM_PL_TT;
places[c].flags = 0;
/*
* When GTT is just an alternative to VRAM make sure that we
* only use it as fallback and still try to fill up VRAM first.
*/
if (domain & abo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM)
places[c].flags |= TTM_PL_FLAG_FALLBACK;
c++;
}
@@ -595,8 +604,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
if (!amdgpu_bo_support_uswc(bo->flags))
bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC;
if (adev->ras_enabled)
bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
bo->tbo.bdev = &adev->mman.bdev;
if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA |
@@ -629,7 +637,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
bo->tbo.resource->mem_type == TTM_PL_VRAM) {
struct dma_fence *fence;
r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, &fence, true);
r = amdgpu_ttm_clear_buffer(bo, bo->tbo.base.resv, &fence);
if (unlikely(r))
goto fail_unreserve;
@@ -759,7 +767,7 @@ int amdgpu_bo_restore_shadow(struct amdgpu_bo *shadow, struct dma_fence **fence)
return amdgpu_copy_buffer(ring, shadow_addr, parent_addr,
amdgpu_bo_size(shadow), NULL, fence,
true, false, false);
true, false, 0);
}
/**
@@ -961,6 +969,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
if (!bo->placements[i].lpfn ||
(lpfn && lpfn < bo->placements[i].lpfn))
bo->placements[i].lpfn = lpfn;
if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
bo->placements[i].mem_type == TTM_PL_VRAM)
bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
}
r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
@@ -1366,8 +1378,9 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo)
if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv)))
return;
r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, &fence, true);
r = amdgpu_fill_buffer(abo, 0, bo->base.resv, &fence, true);
if (!WARN_ON(r)) {
amdgpu_vram_mgr_set_cleared(bo->resource);
amdgpu_bo_fence(abo, fence, false);
dma_fence_put(fence);
}

View File

@@ -1053,6 +1053,11 @@ static int psp_asd_initialize(struct psp_context *psp)
if (amdgpu_sriov_vf(psp->adev) || !psp->asd_context.bin_desc.size_bytes)
return 0;
/* bypass asd if display hardware is not available */
if (!amdgpu_device_has_display_hardware(psp->adev) &&
amdgpu_ip_version(psp->adev, MP0_HWIP, 0) >= IP_VERSION(13, 0, 10))
return 0;
psp->asd_context.mem_context.shared_mc_addr = 0;
psp->asd_context.mem_context.shared_mem_size = PSP_ASD_SHARED_MEM_SIZE;
psp->asd_context.ta_load_type = GFX_CMD_ID_LOAD_ASD;
@@ -2260,6 +2265,15 @@ static int psp_hw_start(struct psp_context *psp)
}
}
if ((is_psp_fw_valid(psp->ipkeymgr_drv)) &&
(psp->funcs->bootloader_load_ipkeymgr_drv != NULL)) {
ret = psp_bootloader_load_ipkeymgr_drv(psp);
if (ret) {
dev_err(adev->dev, "PSP load ipkeymgr_drv failed!\n");
return ret;
}
}
if ((is_psp_fw_valid(psp->sos)) &&
(psp->funcs->bootloader_load_sos != NULL)) {
ret = psp_bootloader_load_sos(psp);
@@ -2617,7 +2631,8 @@ static int psp_load_p2s_table(struct psp_context *psp)
struct amdgpu_firmware_info *ucode =
&adev->firmware.ucode[AMDGPU_UCODE_ID_P2S_TABLE];
if (adev->in_runpm && (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO))
if (adev->in_runpm && ((adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) ||
(adev->pm.rpm_mode == AMDGPU_RUNPM_BAMACO)))
return 0;
if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) {
@@ -2647,7 +2662,8 @@ static int psp_load_smu_fw(struct psp_context *psp)
* Skip SMU FW reloading in case of using BACO for runpm only,
* as SMU is always alive.
*/
if (adev->in_runpm && (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO))
if (adev->in_runpm && ((adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) ||
(adev->pm.rpm_mode == AMDGPU_RUNPM_BAMACO)))
return 0;
if (!ucode->fw || amdgpu_sriov_vf(psp->adev))
@@ -3273,6 +3289,12 @@ static int parse_sos_bin_descriptor(struct psp_context *psp,
psp->ras_drv.size_bytes = le32_to_cpu(desc->size_bytes);
psp->ras_drv.start_addr = ucode_start_addr;
break;
case PSP_FW_TYPE_PSP_IPKEYMGR_DRV:
psp->ipkeymgr_drv.fw_version = le32_to_cpu(desc->fw_version);
psp->ipkeymgr_drv.feature_version = le32_to_cpu(desc->fw_version);
psp->ipkeymgr_drv.size_bytes = le32_to_cpu(desc->size_bytes);
psp->ipkeymgr_drv.start_addr = ucode_start_addr;
break;
default:
dev_warn(psp->adev->dev, "Unsupported PSP FW type: %d\n", desc->fw_type);
break;

View File

@@ -73,8 +73,10 @@ enum psp_bootloader_cmd {
PSP_BL__LOAD_KEY_DATABASE = 0x80000,
PSP_BL__LOAD_SOCDRV = 0xB0000,
PSP_BL__LOAD_DBGDRV = 0xC0000,
PSP_BL__LOAD_HADDRV = PSP_BL__LOAD_DBGDRV,
PSP_BL__LOAD_INTFDRV = 0xD0000,
PSP_BL__LOAD_RASDRV = 0xE0000,
PSP_BL__LOAD_RASDRV = 0xE0000,
PSP_BL__LOAD_IPKEYMGRDRV = 0xF0000,
PSP_BL__DRAM_LONG_TRAIN = 0x100000,
PSP_BL__DRAM_SHORT_TRAIN = 0x200000,
PSP_BL__LOAD_TOS_SPL_TABLE = 0x10000000,
@@ -117,6 +119,7 @@ struct psp_funcs {
int (*bootloader_load_intf_drv)(struct psp_context *psp);
int (*bootloader_load_dbg_drv)(struct psp_context *psp);
int (*bootloader_load_ras_drv)(struct psp_context *psp);
int (*bootloader_load_ipkeymgr_drv)(struct psp_context *psp);
int (*bootloader_load_sos)(struct psp_context *psp);
int (*ring_create)(struct psp_context *psp,
enum psp_ring_type ring_type);
@@ -336,6 +339,7 @@ struct psp_context {
struct psp_bin_desc intf_drv;
struct psp_bin_desc dbg_drv;
struct psp_bin_desc ras_drv;
struct psp_bin_desc ipkeymgr_drv;
/* tmr buffer */
struct amdgpu_bo *tmr_bo;
@@ -424,6 +428,9 @@ struct amdgpu_psp_funcs {
#define psp_bootloader_load_ras_drv(psp) \
((psp)->funcs->bootloader_load_ras_drv ? \
(psp)->funcs->bootloader_load_ras_drv((psp)) : 0)
#define psp_bootloader_load_ipkeymgr_drv(psp) \
((psp)->funcs->bootloader_load_ipkeymgr_drv ? \
(psp)->funcs->bootloader_load_ipkeymgr_drv((psp)) : 0)
#define psp_bootloader_load_sos(psp) \
((psp)->funcs->bootloader_load_sos ? (psp)->funcs->bootloader_load_sos((psp)) : 0)
#define psp_smu_reload_quirk(psp) \

View File

@@ -122,6 +122,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
#define MAX_UMC_POISON_POLLING_TIME_ASYNC 100 //ms
#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms
enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -1045,6 +1047,7 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d
static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
struct ras_manager *ras_mgr,
struct ras_err_data *err_data,
struct ras_query_context *qctx,
const char *blk_name,
bool is_ue,
bool is_de)
@@ -1052,27 +1055,28 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
struct amdgpu_smuio_mcm_config_info *mcm_info;
struct ras_err_node *err_node;
struct ras_err_info *err_info;
u64 event_id = qctx->event_id;
if (is_ue) {
for_each_ras_error(err_node, err_data) {
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
if (err_info->ue_count) {
dev_info(adev->dev, "socket: %d, die: %d, "
"%lld new uncorrectable hardware errors detected in %s block\n",
mcm_info->socket_id,
mcm_info->die_id,
err_info->ue_count,
blk_name);
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
"%lld new uncorrectable hardware errors detected in %s block\n",
mcm_info->socket_id,
mcm_info->die_id,
err_info->ue_count,
blk_name);
}
}
for_each_ras_error(err_node, &ras_mgr->err_data) {
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
dev_info(adev->dev, "socket: %d, die: %d, "
"%lld uncorrectable hardware errors detected in total in %s block\n",
mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name);
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
"%lld uncorrectable hardware errors detected in total in %s block\n",
mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name);
}
} else {
@@ -1081,44 +1085,44 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
if (err_info->de_count) {
dev_info(adev->dev, "socket: %d, die: %d, "
"%lld new deferred hardware errors detected in %s block\n",
mcm_info->socket_id,
mcm_info->die_id,
err_info->de_count,
blk_name);
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
"%lld new deferred hardware errors detected in %s block\n",
mcm_info->socket_id,
mcm_info->die_id,
err_info->de_count,
blk_name);
}
}
for_each_ras_error(err_node, &ras_mgr->err_data) {
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
dev_info(adev->dev, "socket: %d, die: %d, "
"%lld deferred hardware errors detected in total in %s block\n",
mcm_info->socket_id, mcm_info->die_id,
err_info->de_count, blk_name);
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
"%lld deferred hardware errors detected in total in %s block\n",
mcm_info->socket_id, mcm_info->die_id,
err_info->de_count, blk_name);
}
} else {
for_each_ras_error(err_node, err_data) {
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
if (err_info->ce_count) {
dev_info(adev->dev, "socket: %d, die: %d, "
"%lld new correctable hardware errors detected in %s block\n",
mcm_info->socket_id,
mcm_info->die_id,
err_info->ce_count,
blk_name);
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
"%lld new correctable hardware errors detected in %s block\n",
mcm_info->socket_id,
mcm_info->die_id,
err_info->ce_count,
blk_name);
}
}
for_each_ras_error(err_node, &ras_mgr->err_data) {
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
dev_info(adev->dev, "socket: %d, die: %d, "
"%lld correctable hardware errors detected in total in %s block\n",
mcm_info->socket_id, mcm_info->die_id,
err_info->ce_count, blk_name);
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
"%lld correctable hardware errors detected in total in %s block\n",
mcm_info->socket_id, mcm_info->die_id,
err_info->ce_count, blk_name);
}
}
}
@@ -1131,77 +1135,79 @@ static inline bool err_data_has_source_info(struct ras_err_data *data)
static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
struct ras_query_if *query_if,
struct ras_err_data *err_data)
struct ras_err_data *err_data,
struct ras_query_context *qctx)
{
struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head);
const char *blk_name = get_ras_block_str(&query_if->head);
u64 event_id = qctx->event_id;
if (err_data->ce_count) {
if (err_data_has_source_info(err_data)) {
amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
blk_name, false, false);
} else if (!adev->aid_mask &&
adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id &&
adev->smuio.funcs->get_die_id) {
dev_info(adev->dev, "socket: %d, die: %d "
"%ld correctable hardware errors "
"detected in %s block\n",
adev->smuio.funcs->get_socket_id(adev),
adev->smuio.funcs->get_die_id(adev),
ras_mgr->err_data.ce_count,
blk_name);
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
"%ld correctable hardware errors "
"detected in %s block\n",
adev->smuio.funcs->get_socket_id(adev),
adev->smuio.funcs->get_die_id(adev),
ras_mgr->err_data.ce_count,
blk_name);
} else {
dev_info(adev->dev, "%ld correctable hardware errors "
"detected in %s block\n",
ras_mgr->err_data.ce_count,
blk_name);
RAS_EVENT_LOG(adev, event_id, "%ld correctable hardware errors "
"detected in %s block\n",
ras_mgr->err_data.ce_count,
blk_name);
}
}
if (err_data->ue_count) {
if (err_data_has_source_info(err_data)) {
amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
blk_name, true, false);
} else if (!adev->aid_mask &&
adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id &&
adev->smuio.funcs->get_die_id) {
dev_info(adev->dev, "socket: %d, die: %d "
"%ld uncorrectable hardware errors "
"detected in %s block\n",
adev->smuio.funcs->get_socket_id(adev),
adev->smuio.funcs->get_die_id(adev),
ras_mgr->err_data.ue_count,
blk_name);
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
"%ld uncorrectable hardware errors "
"detected in %s block\n",
adev->smuio.funcs->get_socket_id(adev),
adev->smuio.funcs->get_die_id(adev),
ras_mgr->err_data.ue_count,
blk_name);
} else {
dev_info(adev->dev, "%ld uncorrectable hardware errors "
"detected in %s block\n",
ras_mgr->err_data.ue_count,
blk_name);
RAS_EVENT_LOG(adev, event_id, "%ld uncorrectable hardware errors "
"detected in %s block\n",
ras_mgr->err_data.ue_count,
blk_name);
}
}
if (err_data->de_count) {
if (err_data_has_source_info(err_data)) {
amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data,
amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,
blk_name, false, true);
} else if (!adev->aid_mask &&
adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id &&
adev->smuio.funcs->get_die_id) {
dev_info(adev->dev, "socket: %d, die: %d "
"%ld deferred hardware errors "
"detected in %s block\n",
adev->smuio.funcs->get_socket_id(adev),
adev->smuio.funcs->get_die_id(adev),
ras_mgr->err_data.de_count,
blk_name);
RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
"%ld deferred hardware errors "
"detected in %s block\n",
adev->smuio.funcs->get_socket_id(adev),
adev->smuio.funcs->get_die_id(adev),
ras_mgr->err_data.de_count,
blk_name);
} else {
dev_info(adev->dev, "%ld deferred hardware errors "
"detected in %s block\n",
ras_mgr->err_data.de_count,
blk_name);
RAS_EVENT_LOG(adev, event_id, "%ld deferred hardware errors "
"detected in %s block\n",
ras_mgr->err_data.de_count,
blk_name);
}
}
}
@@ -1244,6 +1250,10 @@ int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
{
struct ras_manager *obj;
/* in resume phase, no need to create aca fs node */
if (adev->in_suspend || amdgpu_in_reset(adev))
return 0;
obj = get_ras_manager(adev, blk);
if (!obj)
return -EINVAL;
@@ -1265,7 +1275,8 @@ int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
}
static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
enum aca_error_type type, struct ras_err_data *err_data)
enum aca_error_type type, struct ras_err_data *err_data,
struct ras_query_context *qctx)
{
struct ras_manager *obj;
@@ -1273,7 +1284,7 @@ static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu
if (!obj)
return -EINVAL;
return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data);
return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data, qctx);
}
ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr,
@@ -1287,13 +1298,14 @@ ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *a
if (amdgpu_ras_query_error_status(obj->adev, &info))
return -EINVAL;
return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
"ce", info.ce_count);
return sysfs_emit(buf, "%s: %lu\n%s: %lu\n%s: %lu\n", "ue", info.ue_count,
"ce", info.ce_count, "de", info.ue_count);
}
static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
struct ras_query_if *info,
struct ras_err_data *err_data,
struct ras_query_context *qctx,
unsigned int error_query_mode)
{
enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
@@ -1329,17 +1341,21 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
}
} else {
if (amdgpu_aca_is_enabled(adev)) {
ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data);
ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data, qctx);
if (ret)
return ret;
ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data);
ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data, qctx);
if (ret)
return ret;
ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_DEFERRED, err_data, qctx);
if (ret)
return ret;
} else {
/* FIXME: add code to check return value later */
amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data);
amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data);
amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data, qctx);
amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data, qctx);
}
}
@@ -1351,6 +1367,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i
{
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
struct ras_err_data err_data;
struct ras_query_context qctx;
unsigned int error_query_mode;
int ret;
@@ -1364,8 +1381,12 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i
if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode))
return -EINVAL;
memset(&qctx, 0, sizeof(qctx));
qctx.event_id = amdgpu_ras_acquire_event_id(adev, amdgpu_ras_intr_triggered() ?
RAS_EVENT_TYPE_ISR : RAS_EVENT_TYPE_INVALID);
ret = amdgpu_ras_query_error_status_helper(adev, info,
&err_data,
&qctx,
error_query_mode);
if (ret)
goto out_fini_err_data;
@@ -1376,7 +1397,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i
info->ce_count = obj->err_data.ce_count;
info->de_count = obj->err_data.de_count;
amdgpu_ras_error_generate_report(adev, info, &err_data);
amdgpu_ras_error_generate_report(adev, info, &err_data, &qctx);
out_fini_err_data:
amdgpu_ras_error_data_fini(&err_data);
@@ -2041,7 +2062,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
}
}
amdgpu_umc_poison_handler(adev, obj->head.block, false);
amdgpu_umc_poison_handler(adev, obj->head.block, 0);
if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
@@ -2061,6 +2082,17 @@ static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj
{
dev_info(obj->adev->dev,
"Poison is created\n");
if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) {
struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev);
amdgpu_ras_put_poison_req(obj->adev,
AMDGPU_RAS_BLOCK__UMC, 0, NULL, NULL, false);
atomic_inc(&con->page_retirement_req_cnt);
wake_up(&con->page_retirement_wq);
}
}
static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
@@ -2371,7 +2403,7 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
};
status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
data->bps[i].retired_page);
data->bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT);
if (status == -EBUSY)
(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
else if (status == -ENOENT)
@@ -2384,6 +2416,19 @@ out:
return ret;
}
static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev,
struct amdgpu_hive_info *hive, bool status)
{
struct amdgpu_device *tmp_adev;
if (hive) {
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
amdgpu_ras_set_fed(tmp_adev, status);
} else {
amdgpu_ras_set_fed(adev, status);
}
}
static void amdgpu_ras_do_recovery(struct work_struct *work)
{
struct amdgpu_ras *ras =
@@ -2393,8 +2438,21 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
struct list_head device_list, *device_list_handle = NULL;
struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
if (hive)
if (hive) {
atomic_set(&hive->ras_recovery, 1);
/* If any device which is part of the hive received RAS fatal
* error interrupt, set fatal error status on all. This
* condition will need a recovery, and flag will be cleared
* as part of recovery.
*/
list_for_each_entry(remote_adev, &hive->device_list,
gmc.xgmi.head)
if (amdgpu_ras_get_fed_status(remote_adev)) {
amdgpu_ras_set_fed_all(adev, hive, true);
break;
}
}
if (!ras->disable_ras_err_cnt_harvest) {
/* Build list of devices to query RAS related errors */
@@ -2439,18 +2497,6 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
/* For any RAS error that needs a full reset to
* recover, set the fatal error status
*/
if (hive) {
list_for_each_entry(remote_adev,
&hive->device_list,
gmc.xgmi.head)
amdgpu_ras_set_fed(remote_adev,
true);
} else {
amdgpu_ras_set_fed(adev, true);
}
psp_fatal_error_recovery_quirk(&adev->psp);
}
}
@@ -2516,9 +2562,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
goto out;
}
amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
AMDGPU_GPU_PAGE_SIZE);
amdgpu_ras_reserve_page(adev, bps[i].retired_page);
memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
data->count++;
@@ -2674,10 +2718,167 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
}
}
int amdgpu_ras_put_poison_req(struct amdgpu_device *adev,
enum amdgpu_ras_block block, uint16_t pasid,
pasid_notify pasid_fn, void *data, uint32_t reset)
{
int ret = 0;
struct ras_poison_msg poison_msg;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
memset(&poison_msg, 0, sizeof(poison_msg));
poison_msg.block = block;
poison_msg.pasid = pasid;
poison_msg.reset = reset;
poison_msg.pasid_fn = pasid_fn;
poison_msg.data = data;
ret = kfifo_put(&con->poison_fifo, poison_msg);
if (!ret) {
dev_err(adev->dev, "Poison message fifo is full!\n");
return -ENOSPC;
}
return 0;
}
static int amdgpu_ras_get_poison_req(struct amdgpu_device *adev,
struct ras_poison_msg *poison_msg)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
return kfifo_get(&con->poison_fifo, poison_msg);
}
static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)
{
mutex_init(&ecc_log->lock);
/* Set any value as siphash key */
memset(&ecc_log->ecc_key, 0xad, sizeof(ecc_log->ecc_key));
INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
ecc_log->de_updated = false;
}
static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
{
struct radix_tree_iter iter;
void __rcu **slot;
struct ras_ecc_err *ecc_err;
mutex_lock(&ecc_log->lock);
radix_tree_for_each_slot(slot, &ecc_log->de_page_tree, &iter, 0) {
ecc_err = radix_tree_deref_slot(slot);
kfree(ecc_err->err_pages.pfn);
kfree(ecc_err);
radix_tree_iter_delete(&ecc_log->de_page_tree, &iter, slot);
}
mutex_unlock(&ecc_log->lock);
mutex_destroy(&ecc_log->lock);
ecc_log->de_updated = false;
}
static void amdgpu_ras_do_page_retirement(struct work_struct *work)
{
struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
page_retirement_dwork.work);
struct amdgpu_device *adev = con->adev;
struct ras_err_data err_data;
if (amdgpu_in_reset(adev) || atomic_read(&con->in_recovery))
return;
amdgpu_ras_error_data_init(&err_data);
amdgpu_umc_handle_bad_pages(adev, &err_data);
amdgpu_ras_error_data_fini(&err_data);
mutex_lock(&con->umc_ecc_log.lock);
if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
UMC_ECC_NEW_DETECTED_TAG))
schedule_delayed_work(&con->page_retirement_dwork,
msecs_to_jiffies(AMDGPU_RAS_RETIRE_PAGE_INTERVAL));
mutex_unlock(&con->umc_ecc_log.lock);
}
static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
enum amdgpu_ras_block ras_block, uint32_t timeout_ms)
{
int ret = 0;
struct ras_ecc_log_info *ecc_log;
struct ras_query_if info;
uint32_t timeout = timeout_ms;
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
memset(&info, 0, sizeof(info));
info.head.block = ras_block;
ecc_log = &ras->umc_ecc_log;
ecc_log->de_updated = false;
do {
ret = amdgpu_ras_query_error_status(adev, &info);
if (ret) {
dev_err(adev->dev, "Failed to query ras error! ret:%d\n", ret);
return ret;
}
if (timeout && !ecc_log->de_updated) {
msleep(1);
timeout--;
}
} while (timeout && !ecc_log->de_updated);
if (timeout_ms && !timeout) {
dev_warn(adev->dev, "Can't find deferred error\n");
return -ETIMEDOUT;
}
return 0;
}
static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
uint32_t timeout)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int ret;
ret = amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout);
if (!ret)
schedule_delayed_work(&con->page_retirement_dwork, 0);
}
static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
struct ras_poison_msg *poison_msg)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
uint32_t reset = poison_msg->reset;
uint16_t pasid = poison_msg->pasid;
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
if (poison_msg->pasid_fn)
poison_msg->pasid_fn(adev, pasid, poison_msg->data);
if (reset) {
flush_delayed_work(&con->page_retirement_dwork);
con->gpu_reset_flags |= reset;
amdgpu_ras_reset_gpu(adev);
}
return 0;
}
static int amdgpu_ras_page_retirement_thread(void *param)
{
struct amdgpu_device *adev = (struct amdgpu_device *)param;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_poison_msg poison_msg;
enum amdgpu_ras_block ras_block;
bool poison_creation_is_handled = false;
while (!kthread_should_stop()) {
@@ -2688,13 +2889,34 @@ static int amdgpu_ras_page_retirement_thread(void *param)
if (kthread_should_stop())
break;
dev_info(adev->dev, "Start processing page retirement. request:%d\n",
atomic_read(&con->page_retirement_req_cnt));
atomic_dec(&con->page_retirement_req_cnt);
amdgpu_umc_bad_page_polling_timeout(adev,
false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
if (!amdgpu_ras_get_poison_req(adev, &poison_msg))
continue;
ras_block = poison_msg.block;
dev_info(adev->dev, "Start processing ras block %s(%d)\n",
ras_block_str(ras_block), ras_block);
if (ras_block == AMDGPU_RAS_BLOCK__UMC) {
amdgpu_ras_poison_creation_handler(adev,
MAX_UMC_POISON_POLLING_TIME_ASYNC);
poison_creation_is_handled = true;
} else {
/* poison_creation_is_handled:
* false: no poison creation interrupt, but it has poison
* consumption interrupt.
* true: It has poison creation interrupt at the beginning,
* but it has no poison creation interrupt later.
*/
amdgpu_ras_poison_creation_handler(adev,
poison_creation_is_handled ?
0 : MAX_UMC_POISON_POLLING_TIME_ASYNC);
amdgpu_ras_poison_consumption_handler(adev, &poison_msg);
poison_creation_is_handled = false;
}
}
return 0;
@@ -2763,6 +2985,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
}
}
mutex_init(&con->page_rsv_lock);
INIT_KFIFO(con->poison_fifo);
mutex_init(&con->page_retirement_lock);
init_waitqueue_head(&con->page_retirement_wq);
atomic_set(&con->page_retirement_req_cnt, 0);
@@ -2773,6 +2997,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
dev_warn(adev->dev, "Failed to create umc_page_retirement thread!!!\n");
}
INIT_DELAYED_WORK(&con->page_retirement_dwork, amdgpu_ras_do_page_retirement);
amdgpu_ras_ecc_log_init(&con->umc_ecc_log);
#ifdef CONFIG_X86_MCE_AMD
if ((adev->asic_type == CHIP_ALDEBARAN) &&
(adev->gmc.xgmi.connected_to_cpu))
@@ -2813,8 +3039,14 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
atomic_set(&con->page_retirement_req_cnt, 0);
mutex_destroy(&con->page_rsv_lock);
cancel_work_sync(&con->recovery_work);
cancel_delayed_work_sync(&con->page_retirement_dwork);
amdgpu_ras_ecc_log_fini(&con->umc_ecc_log);
mutex_lock(&con->recovery_lock);
con->eh_data = NULL;
kfree(data->bps);
@@ -3036,6 +3268,35 @@ static int amdgpu_get_ras_schema(struct amdgpu_device *adev)
AMDGPU_RAS_ERROR__PARITY;
}
static void ras_event_mgr_init(struct ras_event_manager *mgr)
{
int i;
for (i = 0; i < ARRAY_SIZE(mgr->seqnos); i++)
atomic64_set(&mgr->seqnos[i], 0);
}
static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
struct amdgpu_hive_info *hive;
if (!ras)
return;
hive = amdgpu_get_xgmi_hive(adev);
ras->event_mgr = hive ? &hive->event_mgr : &ras->__event_mgr;
/* init event manager with node 0 on xgmi system */
if (!amdgpu_in_reset(adev)) {
if (!hive || adev->gmc.xgmi.node_id == 0)
ras_event_mgr_init(ras->event_mgr);
}
if (hive)
amdgpu_put_xgmi_hive(hive);
}
int amdgpu_ras_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -3356,6 +3617,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
if (amdgpu_sriov_vf(adev))
return 0;
amdgpu_ras_event_mgr_init(adev);
if (amdgpu_aca_is_enabled(adev)) {
if (amdgpu_in_reset(adev))
r = amdgpu_aca_reset(adev);
@@ -3472,14 +3735,39 @@ void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status)
atomic_set(&ras->fed, !!status);
}
bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id)
{
return !(id & BIT_ULL(63));
}
u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type)
{
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
u64 id;
switch (type) {
case RAS_EVENT_TYPE_ISR:
id = (u64)atomic64_read(&ras->event_mgr->seqnos[type]);
break;
case RAS_EVENT_TYPE_INVALID:
default:
id = BIT_ULL(63) | 0ULL;
break;
}
return id;
}
void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
{
if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
u64 event_id = (u64)atomic64_inc_return(&ras->event_mgr->seqnos[RAS_EVENT_TYPE_ISR]);
dev_info(adev->dev, "uncorrectable hardware error"
"(ERREVENT_ATHUB_INTERRUPT) detected!\n");
RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error"
"(ERREVENT_ATHUB_INTERRUPT) detected!\n");
amdgpu_ras_set_fed(adev, true);
ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
amdgpu_ras_reset_gpu(adev);
}
@@ -3998,6 +4286,8 @@ void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info, struct ras_err_a
{
struct ras_err_addr *mca_err_addr;
/* This function will be retired. */
return;
mca_err_addr = kzalloc(sizeof(*mca_err_addr), GFP_KERNEL);
if (!mca_err_addr)
return;
@@ -4195,3 +4485,19 @@ void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances)
amdgpu_ras_boot_time_error_reporting(adev, i, boot_error);
}
}
int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr;
uint64_t start = pfn << AMDGPU_GPU_PAGE_SHIFT;
int ret = 0;
mutex_lock(&con->page_rsv_lock);
ret = amdgpu_vram_mgr_query_page_status(mgr, start);
if (ret == -ENOENT)
ret = amdgpu_vram_mgr_reserve_range(mgr, start, AMDGPU_GPU_PAGE_SIZE);
mutex_unlock(&con->page_rsv_lock);
return ret;
}

View File

@@ -26,6 +26,9 @@
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/kfifo.h>
#include <linux/radix-tree.h>
#include <linux/siphash.h>
#include "ta_ras_if.h"
#include "amdgpu_ras_eeprom.h"
#include "amdgpu_smuio.h"
@@ -64,6 +67,14 @@ struct amdgpu_iv_entry;
/* The high three bits indicates socketid */
#define AMDGPU_RAS_GET_FEATURES(val) ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK)
#define RAS_EVENT_LOG(_adev, _id, _fmt, ...) \
do { \
if (amdgpu_ras_event_id_is_valid((_adev), (_id))) \
dev_info((_adev)->dev, "{%llu}" _fmt, (_id), ##__VA_ARGS__); \
else \
dev_info((_adev)->dev, _fmt, ##__VA_ARGS__); \
} while (0)
enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__UMC = 0,
AMDGPU_RAS_BLOCK__SDMA,
@@ -419,6 +430,52 @@ struct umc_ecc_info {
int record_ce_addr_supported;
};
enum ras_event_type {
RAS_EVENT_TYPE_INVALID = -1,
RAS_EVENT_TYPE_ISR = 0,
RAS_EVENT_TYPE_COUNT,
};
struct ras_event_manager {
atomic64_t seqnos[RAS_EVENT_TYPE_COUNT];
};
struct ras_query_context {
enum ras_event_type type;
u64 event_id;
};
typedef int (*pasid_notify)(struct amdgpu_device *adev,
uint16_t pasid, void *data);
struct ras_poison_msg {
enum amdgpu_ras_block block;
uint16_t pasid;
uint32_t reset;
pasid_notify pasid_fn;
void *data;
};
struct ras_err_pages {
uint32_t count;
uint64_t *pfn;
};
struct ras_ecc_err {
u64 hash_index;
uint64_t status;
uint64_t ipid;
uint64_t addr;
struct ras_err_pages err_pages;
};
struct ras_ecc_log_info {
struct mutex lock;
siphash_key_t ecc_key;
struct radix_tree_root de_page_tree;
bool de_updated;
};
struct amdgpu_ras {
/* ras infrastructure */
/* for ras itself. */
@@ -477,8 +534,18 @@ struct amdgpu_ras {
wait_queue_head_t page_retirement_wq;
struct mutex page_retirement_lock;
atomic_t page_retirement_req_cnt;
struct mutex page_rsv_lock;
DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
struct ras_ecc_log_info umc_ecc_log;
struct delayed_work page_retirement_dwork;
/* Fatal error detected flag */
atomic_t fed;
/* RAS event manager */
struct ras_event_manager __event_mgr;
struct ras_event_manager *event_mgr;
};
struct ras_fs_data {
@@ -512,6 +579,7 @@ struct ras_err_data {
unsigned long de_count;
unsigned long err_addr_cnt;
struct eeprom_table_record *err_addr;
unsigned long err_addr_len;
u32 err_list_count;
struct list_head err_node_list;
};
@@ -879,4 +947,13 @@ void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info,
void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status);
bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev);
bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id);
u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type);
int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn);
int amdgpu_ras_put_poison_req(struct amdgpu_device *adev,
enum amdgpu_ras_block block, uint16_t pasid,
pasid_notify pasid_fn, void *data, uint32_t reset);
#endif

View File

@@ -404,6 +404,22 @@ static int amdgpu_ras_eeprom_correct_header_tag(
return res;
}
static void amdgpu_ras_set_eeprom_table_version(struct amdgpu_ras_eeprom_control *control)
{
struct amdgpu_device *adev = to_amdgpu_device(control);
struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
case IP_VERSION(8, 10, 0):
case IP_VERSION(12, 0, 0):
hdr->version = RAS_TABLE_VER_V2_1;
return;
default:
hdr->version = RAS_TABLE_VER_V1;
return;
}
}
/**
* amdgpu_ras_eeprom_reset_table -- Reset the RAS EEPROM table
* @control: pointer to control structure
@@ -423,11 +439,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
mutex_lock(&control->ras_tbl_mutex);
hdr->header = RAS_TABLE_HDR_VAL;
if (adev->umc.ras &&
adev->umc.ras->set_eeprom_table_version)
adev->umc.ras->set_eeprom_table_version(hdr);
else
hdr->version = RAS_TABLE_VER_V1;
amdgpu_ras_set_eeprom_table_version(control);
if (hdr->version == RAS_TABLE_VER_V2_1) {
hdr->first_rec_offset = RAS_RECORD_START_V2_1;

View File

@@ -164,4 +164,29 @@ static inline void amdgpu_res_next(struct amdgpu_res_cursor *cur, uint64_t size)
}
}
/**
* amdgpu_res_cleared - check if blocks are cleared
*
* @cur: the cursor to extract the block
*
* Check if the @cur block is cleared
*/
static inline bool amdgpu_res_cleared(struct amdgpu_res_cursor *cur)
{
struct drm_buddy_block *block;
switch (cur->mem_type) {
case TTM_PL_VRAM:
block = cur->node;
if (!amdgpu_vram_mgr_is_cleared(block))
return false;
break;
default:
return false;
}
return true;
}
#endif

View File

@@ -21,9 +21,6 @@
*
*/
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>
#include "amdgpu_reset.h"
#include "aldebaran.h"
#include "sienna_cichlid.h"
@@ -161,105 +158,3 @@ void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain)
atomic_set(&reset_domain->in_gpu_reset, 0);
up_write(&reset_domain->sem);
}
#ifndef CONFIG_DEV_COREDUMP
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
struct amdgpu_reset_context *reset_context)
{
}
#else
static ssize_t
amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
void *data, size_t datalen)
{
struct drm_printer p;
struct amdgpu_coredump_info *coredump = data;
struct drm_print_iterator iter;
int i;
iter.data = buffer;
iter.offset = 0;
iter.start = offset;
iter.remain = count;
p = drm_coredump_printer(&iter);
drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n");
drm_printf(&p, "kernel: " UTS_RELEASE "\n");
drm_printf(&p, "module: " KBUILD_MODNAME "\n");
drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec,
coredump->reset_time.tv_nsec);
if (coredump->reset_task_info.pid)
drm_printf(&p, "process_name: %s PID: %d\n",
coredump->reset_task_info.process_name,
coredump->reset_task_info.pid);
if (coredump->ring) {
drm_printf(&p, "\nRing timed out details\n");
drm_printf(&p, "IP Type: %d Ring Name: %s\n",
coredump->ring->funcs->type,
coredump->ring->name);
}
if (coredump->reset_vram_lost)
drm_printf(&p, "VRAM is lost due to GPU reset!\n");
if (coredump->adev->reset_info.num_regs) {
drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
for (i = 0; i < coredump->adev->reset_info.num_regs; i++)
drm_printf(&p, "0x%08x: 0x%08x\n",
coredump->adev->reset_info.reset_dump_reg_list[i],
coredump->adev->reset_info.reset_dump_reg_value[i]);
}
return count - iter.remain;
}
static void amdgpu_devcoredump_free(void *data)
{
kfree(data);
}
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
struct amdgpu_reset_context *reset_context)
{
struct amdgpu_coredump_info *coredump;
struct drm_device *dev = adev_to_drm(adev);
struct amdgpu_job *job = reset_context->job;
struct drm_sched_job *s_job;
coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
if (!coredump) {
DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
return;
}
coredump->reset_vram_lost = vram_lost;
if (reset_context->job && reset_context->job->vm) {
struct amdgpu_task_info *ti;
struct amdgpu_vm *vm = reset_context->job->vm;
ti = amdgpu_vm_get_task_info_vm(vm);
if (ti) {
coredump->reset_task_info = *ti;
amdgpu_vm_put_task_info(ti);
}
}
if (job) {
s_job = &job->base;
coredump->ring = to_amdgpu_ring(s_job->sched);
}
coredump->adev = adev;
ktime_get_ts64(&coredump->reset_time);
dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
amdgpu_devcoredump_read, amdgpu_devcoredump_free);
}
#endif

View File

@@ -32,6 +32,7 @@ enum AMDGPU_RESET_FLAGS {
AMDGPU_NEED_FULL_RESET = 0,
AMDGPU_SKIP_HW_RESET = 1,
AMDGPU_SKIP_COREDUMP = 2,
};
struct amdgpu_reset_context {
@@ -88,19 +89,6 @@ struct amdgpu_reset_domain {
atomic_t reset_res;
};
#ifdef CONFIG_DEV_COREDUMP
#define AMDGPU_COREDUMP_VERSION "1"
struct amdgpu_coredump_info {
struct amdgpu_device *adev;
struct amdgpu_task_info reset_task_info;
struct timespec64 reset_time;
bool reset_vram_lost;
struct amdgpu_ring *ring;
};
#endif
int amdgpu_reset_init(struct amdgpu_device *adev);
int amdgpu_reset_fini(struct amdgpu_device *adev);
@@ -141,9 +129,6 @@ void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain);
void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain);
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
struct amdgpu_reset_context *reset_context);
#define for_each_handler(i, handler, reset_ctl) \
for (i = 0; (i < AMDGPU_RESET_MAX_HANDLERS) && \
(handler = (*reset_ctl->reset_handlers)[i]); \

View File

@@ -132,7 +132,7 @@ struct amdgpu_buffer_funcs {
uint64_t dst_offset,
/* number of byte to transfer */
uint32_t byte_count,
bool tmz);
uint32_t copy_flags);
/* maximum bytes in a single operation */
uint32_t fill_max_bytes;

View File

@@ -44,6 +44,7 @@ struct amdgpu_smuio_funcs {
u32 (*get_socket_id)(struct amdgpu_device *adev);
enum amdgpu_pkg_type (*get_pkg_type)(struct amdgpu_device *adev);
bool (*is_host_gpu_xgmi_supported)(struct amdgpu_device *adev);
u64 (*get_gpu_clock_counter)(struct amdgpu_device *adev);
};
struct amdgpu_smuio {

View File

@@ -236,7 +236,7 @@ static int amdgpu_ttm_map_buffer(struct ttm_buffer_object *bo,
dst_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
dst_addr += window * AMDGPU_GTT_MAX_TRANSFER_SIZE * 8;
amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr,
dst_addr, num_bytes, false);
dst_addr, num_bytes, 0);
amdgpu_ring_pad_ib(ring, &job->ibs[0]);
WARN_ON(job->ibs[0].length_dw > num_dw);
@@ -296,6 +296,8 @@ int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev,
struct dma_fence *fence = NULL;
int r = 0;
uint32_t copy_flags = 0;
if (!adev->mman.buffer_funcs_enabled) {
DRM_ERROR("Trying to move memory with ring turned off.\n");
return -EINVAL;
@@ -323,8 +325,11 @@ int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev,
if (r)
goto error;
r = amdgpu_copy_buffer(ring, from, to, cur_size,
resv, &next, false, true, tmz);
if (tmz)
copy_flags |= AMDGPU_COPY_FLAGS_TMZ;
r = amdgpu_copy_buffer(ring, from, to, cur_size, resv,
&next, false, true, copy_flags);
if (r)
goto error;
@@ -378,11 +383,12 @@ static int amdgpu_move_blit(struct ttm_buffer_object *bo,
(abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE)) {
struct dma_fence *wipe_fence = NULL;
r = amdgpu_fill_buffer(abo, AMDGPU_POISON, NULL, &wipe_fence,
false);
r = amdgpu_fill_buffer(abo, 0, NULL, &wipe_fence,
false);
if (r) {
goto error;
} else if (wipe_fence) {
amdgpu_vram_mgr_set_cleared(bo->resource);
dma_fence_put(fence);
fence = wipe_fence;
}
@@ -1492,7 +1498,7 @@ static int amdgpu_ttm_access_memory_sdma(struct ttm_buffer_object *bo,
swap(src_addr, dst_addr);
amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr, dst_addr,
PAGE_SIZE, false);
PAGE_SIZE, 0);
amdgpu_ring_pad_ib(adev->mman.buffer_funcs_ring, &job->ibs[0]);
WARN_ON(job->ibs[0].length_dw > num_dw);
@@ -2143,7 +2149,7 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
uint64_t dst_offset, uint32_t byte_count,
struct dma_resv *resv,
struct dma_fence **fence, bool direct_submit,
bool vm_needs_flush, bool tmz)
bool vm_needs_flush, uint32_t copy_flags)
{
struct amdgpu_device *adev = ring->adev;
unsigned int num_loops, num_dw;
@@ -2169,8 +2175,7 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
uint32_t cur_size_in_bytes = min(byte_count, max_bytes);
amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_offset,
dst_offset, cur_size_in_bytes, tmz);
dst_offset, cur_size_in_bytes, copy_flags);
src_offset += cur_size_in_bytes;
dst_offset += cur_size_in_bytes;
byte_count -= cur_size_in_bytes;
@@ -2230,6 +2235,71 @@ static int amdgpu_ttm_fill_mem(struct amdgpu_ring *ring, uint32_t src_data,
return 0;
}
/**
* amdgpu_ttm_clear_buffer - clear memory buffers
* @bo: amdgpu buffer object
* @resv: reservation object
* @fence: dma_fence associated with the operation
*
* Clear the memory buffer resource.
*
* Returns:
* 0 for success or a negative error code on failure.
*/
int amdgpu_ttm_clear_buffer(struct amdgpu_bo *bo,
struct dma_resv *resv,
struct dma_fence **fence)
{
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
struct amdgpu_res_cursor cursor;
u64 addr;
int r;
if (!adev->mman.buffer_funcs_enabled)
return -EINVAL;
if (!fence)
return -EINVAL;
*fence = dma_fence_get_stub();
amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &cursor);
mutex_lock(&adev->mman.gtt_window_lock);
while (cursor.remaining) {
struct dma_fence *next = NULL;
u64 size;
if (amdgpu_res_cleared(&cursor)) {
amdgpu_res_next(&cursor, cursor.size);
continue;
}
/* Never clear more than 256MiB at once to avoid timeouts */
size = min(cursor.size, 256ULL << 20);
r = amdgpu_ttm_map_buffer(&bo->tbo, bo->tbo.resource, &cursor,
1, ring, false, &size, &addr);
if (r)
goto err;
r = amdgpu_ttm_fill_mem(ring, 0, addr, size, resv,
&next, true, true);
if (r)
goto err;
dma_fence_put(*fence);
*fence = next;
amdgpu_res_next(&cursor, size);
}
err:
mutex_unlock(&adev->mman.gtt_window_lock);
return r;
}
int amdgpu_fill_buffer(struct amdgpu_bo *bo,
uint32_t src_data,
struct dma_resv *resv,

View File

@@ -38,8 +38,6 @@
#define AMDGPU_GTT_MAX_TRANSFER_SIZE 512
#define AMDGPU_GTT_NUM_TRANSFER_WINDOWS 2
#define AMDGPU_POISON 0xd0bed0be
extern const struct attribute_group amdgpu_vram_mgr_attr_group;
extern const struct attribute_group amdgpu_gtt_mgr_attr_group;
@@ -111,6 +109,8 @@ struct amdgpu_copy_mem {
unsigned long offset;
};
#define AMDGPU_COPY_FLAGS_TMZ (1 << 0)
int amdgpu_gtt_mgr_init(struct amdgpu_device *adev, uint64_t gtt_size);
void amdgpu_gtt_mgr_fini(struct amdgpu_device *adev);
int amdgpu_preempt_mgr_init(struct amdgpu_device *adev);
@@ -151,13 +151,16 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
uint64_t dst_offset, uint32_t byte_count,
struct dma_resv *resv,
struct dma_fence **fence, bool direct_submit,
bool vm_needs_flush, bool tmz);
bool vm_needs_flush, uint32_t copy_flags);
int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev,
const struct amdgpu_copy_mem *src,
const struct amdgpu_copy_mem *dst,
uint64_t size, bool tmz,
struct dma_resv *resv,
struct dma_fence **f);
int amdgpu_ttm_clear_buffer(struct amdgpu_bo *bo,
struct dma_resv *resv,
struct dma_fence **fence);
int amdgpu_fill_buffer(struct amdgpu_bo *bo,
uint32_t src_data,
struct dma_resv *resv,

View File

@@ -125,6 +125,7 @@ enum psp_fw_type {
PSP_FW_TYPE_PSP_INTF_DRV,
PSP_FW_TYPE_PSP_DBG_DRV,
PSP_FW_TYPE_PSP_RAS_DRV,
PSP_FW_TYPE_PSP_IPKEYMGR_DRV,
PSP_FW_TYPE_MAX_INDEX,
};

View File

@@ -21,10 +21,13 @@
*
*/
#include <linux/sort.h>
#include "amdgpu.h"
#include "umc_v6_7.h"
#define MAX_UMC_POISON_POLLING_TIME_SYNC 20 //ms
#define MAX_UMC_HASH_STRING_SIZE 256
static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint64_t err_addr,
uint32_t ch_inst, uint32_t umc_inst)
@@ -63,6 +66,8 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
goto out_fini_err_data;
}
err_data.err_addr_len = adev->umc.max_ras_err_cnt_per_query;
/*
* Translate UMC channel address to Physical address
*/
@@ -86,7 +91,7 @@ out_fini_err_data:
return ret;
}
static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
void *ras_error_status)
{
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
@@ -118,6 +123,8 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
if(!err_data->err_addr)
dev_warn(adev->dev, "Failed to alloc memory for "
"umc error address record!\n");
else
err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;
/* umc query_ras_error_address is also responsible for clearing
* error status
@@ -143,6 +150,8 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
if(!err_data->err_addr)
dev_warn(adev->dev, "Failed to alloc memory for "
"umc error address record!\n");
else
err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;
/* umc query_ras_error_address is also responsible for clearing
* error status
@@ -170,6 +179,7 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
}
kfree(err_data->err_addr);
err_data->err_addr = NULL;
mutex_unlock(&con->page_retirement_lock);
}
@@ -177,7 +187,7 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
void *ras_error_status,
struct amdgpu_iv_entry *entry,
bool reset)
uint32_t reset)
{
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -186,9 +196,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
amdgpu_umc_handle_bad_pages(adev, ras_error_status);
if (err_data->ue_count && reset) {
/* use mode-2 reset for poison consumption */
if (!entry)
con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
con->gpu_reset_flags |= reset;
amdgpu_ras_reset_gpu(adev);
}
@@ -196,7 +204,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
}
int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
bool reset, uint32_t timeout_ms)
uint32_t reset, uint32_t timeout_ms)
{
struct ras_err_data err_data;
struct ras_common_if head = {
@@ -238,16 +246,16 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
if (reset) {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
/* use mode-2 reset for poison consumption */
con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
con->gpu_reset_flags |= reset;
amdgpu_ras_reset_gpu(adev);
}
return 0;
}
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, bool reset)
int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, uint16_t pasid,
pasid_notify pasid_fn, void *data, uint32_t reset)
{
int ret = AMDGPU_RAS_SUCCESS;
@@ -285,16 +293,14 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
amdgpu_ras_error_data_fini(&err_data);
} else {
if (reset) {
amdgpu_umc_bad_page_polling_timeout(adev,
reset, MAX_UMC_POISON_POLLING_TIME_SYNC);
} else {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
amdgpu_ras_put_poison_req(adev,
block, pasid, pasid_fn, data, reset);
atomic_inc(&con->page_retirement_req_cnt);
wake_up(&con->page_retirement_wq);
}
}
} else {
if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
@@ -307,11 +313,19 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
return ret;
}
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, uint32_t reset)
{
return amdgpu_umc_pasid_poison_handler(adev,
block, 0, NULL, NULL, reset);
}
int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
void *ras_error_status,
struct amdgpu_iv_entry *entry)
{
return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry,
AMDGPU_RAS_GPU_RESET_MODE1_RESET);
}
int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev)
@@ -388,14 +402,20 @@ int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
return 0;
}
void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
int amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
uint64_t err_addr,
uint64_t retired_page,
uint32_t channel_index,
uint32_t umc_inst)
{
struct eeprom_table_record *err_rec =
&err_data->err_addr[err_data->err_addr_cnt];
struct eeprom_table_record *err_rec;
if (!err_data ||
!err_data->err_addr ||
(err_data->err_addr_cnt >= err_data->err_addr_len))
return -EINVAL;
err_rec = &err_data->err_addr[err_data->err_addr_cnt];
err_rec->address = err_addr;
/* page frame address is saved */
@@ -407,6 +427,8 @@ void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
err_rec->mcumc_id = umc_inst;
err_data->err_addr_cnt++;
return 0;
}
int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
@@ -439,3 +461,76 @@ int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
return 0;
}
int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
uint64_t status, uint64_t ipid, uint64_t addr)
{
if (adev->umc.ras->update_ecc_status)
return adev->umc.ras->update_ecc_status(adev,
status, ipid, addr);
return 0;
}
static int amdgpu_umc_uint64_cmp(const void *a, const void *b)
{
uint64_t *addr_a = (uint64_t *)a;
uint64_t *addr_b = (uint64_t *)b;
if (*addr_a > *addr_b)
return 1;
else if (*addr_a < *addr_b)
return -1;
else
return 0;
}
/* Use string hash to avoid logging the same bad pages repeatedly */
int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev,
uint64_t *pfns, int len, uint64_t *val)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
char buf[MAX_UMC_HASH_STRING_SIZE] = {0};
int offset = 0, i = 0;
uint64_t hash_val;
if (!pfns || !len)
return -EINVAL;
sort(pfns, len, sizeof(uint64_t), amdgpu_umc_uint64_cmp, NULL);
for (i = 0; i < len; i++)
offset += snprintf(&buf[offset], sizeof(buf) - offset, "%llx", pfns[i]);
hash_val = siphash(buf, offset, &con->umc_ecc_log.ecc_key);
*val = hash_val;
return 0;
}
int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_ecc_log_info *ecc_log;
int ret;
ecc_log = &con->umc_ecc_log;
mutex_lock(&ecc_log->lock);
ret = radix_tree_insert(ecc_tree, ecc_err->hash_index, ecc_err);
if (!ret) {
struct ras_err_pages *err_pages = &ecc_err->err_pages;
int i;
/* Reserve memory */
for (i = 0; i < err_pages->count; i++)
amdgpu_ras_reserve_page(adev, err_pages->pfn[i]);
radix_tree_tag_set(ecc_tree,
ecc_err->hash_index, UMC_ECC_NEW_DETECTED_TAG);
}
mutex_unlock(&ecc_log->lock);
return ret;
}

View File

@@ -52,6 +52,8 @@
#define LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) \
LOOP_UMC_NODE_INST((node_inst)) LOOP_UMC_INST_AND_CH((umc_inst), (ch_inst))
/* Page retirement tag */
#define UMC_ECC_NEW_DETECTED_TAG 0x1
typedef int (*umc_func)(struct amdgpu_device *adev, uint32_t node_inst,
uint32_t umc_inst, uint32_t ch_inst, void *data);
@@ -66,8 +68,8 @@ struct amdgpu_umc_ras {
void *ras_error_status);
bool (*check_ecc_err_status)(struct amdgpu_device *adev,
enum amdgpu_mca_error_type type, void *ras_error_status);
/* support different eeprom table version for different asic */
void (*set_eeprom_table_version)(struct amdgpu_ras_eeprom_table_header *hdr);
int (*update_ecc_status)(struct amdgpu_device *adev,
uint64_t status, uint64_t ipid, uint64_t addr);
};
struct amdgpu_umc_funcs {
@@ -103,11 +105,14 @@ struct amdgpu_umc {
int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev);
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block);
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, bool reset);
enum amdgpu_ras_block block, uint32_t reset);
int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, uint16_t pasid,
pasid_notify pasid_fn, void *data, uint32_t reset);
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry);
void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
int amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
uint64_t err_addr,
uint64_t retired_page,
uint32_t channel_index,
@@ -123,5 +128,15 @@ int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
umc_func func, void *data);
int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
bool reset, uint32_t timeout_ms);
uint32_t reset, uint32_t timeout_ms);
int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
uint64_t status, uint64_t ipid, uint64_t addr);
int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev,
uint64_t *pfns, int len, uint64_t *val);
int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err);
void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
void *ras_error_status);
#endif

View File

@@ -878,6 +878,8 @@ static const struct amd_ip_funcs umsch_mm_v4_0_ip_funcs = {
.hw_fini = umsch_mm_hw_fini,
.suspend = umsch_mm_suspend,
.resume = umsch_mm_resume,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
const struct amdgpu_ip_block_version umsch_mm_v4_0_ip_block = {

View File

@@ -743,7 +743,8 @@ int amdgpu_vce_ring_parse_cs(struct amdgpu_cs_parser *p,
uint32_t created = 0;
uint32_t allocated = 0;
uint32_t tmp, handle = 0;
uint32_t *size = &tmp;
uint32_t dummy = 0xffffffff;
uint32_t *size = &dummy;
unsigned int idx;
int i, r = 0;

View File

@@ -93,7 +93,7 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work);
int amdgpu_vcn_early_init(struct amdgpu_device *adev)
{
char ucode_prefix[30];
char ucode_prefix[25];
char fw_name[40];
int r, i;
@@ -185,7 +185,10 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev)
if (adev->firmware.load_type != AMDGPU_FW_LOAD_PSP)
bo_size += AMDGPU_GPU_PAGE_ALIGN(le32_to_cpu(hdr->ucode_size_bytes) + 8);
if (amdgpu_ip_version(adev, UVD_HWIP, 0) >= IP_VERSION(4, 0, 0)) {
if (amdgpu_ip_version(adev, UVD_HWIP, 0) >= IP_VERSION(5, 0, 0)) {
fw_shared_size = AMDGPU_GPU_PAGE_ALIGN(sizeof(struct amdgpu_vcn5_fw_shared));
log_offset = offsetof(struct amdgpu_vcn5_fw_shared, fw_log);
} else if (amdgpu_ip_version(adev, UVD_HWIP, 0) >= IP_VERSION(4, 0, 0)) {
fw_shared_size = AMDGPU_GPU_PAGE_ALIGN(sizeof(struct amdgpu_vcn4_fw_shared));
log_offset = offsetof(struct amdgpu_vcn4_fw_shared, fw_log);
} else {

View File

@@ -454,6 +454,16 @@ struct amdgpu_vcn_rb_metadata {
uint8_t pad[26];
};
struct amdgpu_vcn5_fw_shared {
uint32_t present_flag_0;
uint8_t pad[12];
struct amdgpu_fw_shared_unified_queue_struct sq;
uint8_t pad1[8];
struct amdgpu_fw_shared_fw_logging fw_log;
struct amdgpu_fw_shared_rb_setup rb_setup;
uint8_t pad2[4];
};
#define VCN_BLOCK_ENCODE_DISABLE_MASK 0x80
#define VCN_BLOCK_DECODE_DISABLE_MASK 0x40
#define VCN_BLOCK_QUEUE_DISABLE_MASK 0xC0

View File

@@ -32,6 +32,7 @@
#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_reset.h"
#include "vi.h"
#include "soc15.h"
#include "nv.h"
@@ -424,7 +425,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
return -EINVAL;
if (pf2vf_info->size > 1024) {
DRM_ERROR("invalid pf2vf message size\n");
dev_err(adev->dev, "invalid pf2vf message size: 0x%x\n", pf2vf_info->size);
return -EINVAL;
}
@@ -435,7 +436,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size,
adev->virt.fw_reserve.checksum_key, checksum);
if (checksum != checkval) {
DRM_ERROR("invalid pf2vf message\n");
dev_err(adev->dev,
"invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n",
checksum, checkval);
return -EINVAL;
}
@@ -449,7 +452,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size,
0, checksum);
if (checksum != checkval) {
DRM_ERROR("invalid pf2vf message\n");
dev_err(adev->dev,
"invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n",
checksum, checkval);
return -EINVAL;
}
@@ -485,7 +490,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->uuid;
break;
default:
DRM_ERROR("invalid pf2vf version\n");
dev_err(adev->dev, "invalid pf2vf version: 0x%x\n", pf2vf_info->version);
return -EINVAL;
}
@@ -571,6 +576,11 @@ static int amdgpu_virt_write_vf2pf_data(struct amdgpu_device *adev)
vf2pf_info->decode_usage = 0;
vf2pf_info->dummy_page_addr = (uint64_t)adev->dummy_page_addr;
vf2pf_info->mes_info_addr = (uint64_t)adev->mes.resource_1_gpu_addr;
if (adev->mes.resource_1) {
vf2pf_info->mes_info_size = adev->mes.resource_1->tbo.base.size;
}
vf2pf_info->checksum =
amd_sriov_msg_checksum(
vf2pf_info, vf2pf_info->header.size, 0, 0);
@@ -584,8 +594,22 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work)
int ret;
ret = amdgpu_virt_read_pf2vf_data(adev);
if (ret)
if (ret) {
adev->virt.vf2pf_update_retry_cnt++;
if ((adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) &&
amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) {
amdgpu_ras_set_fed(adev, true);
if (amdgpu_reset_domain_schedule(adev->reset_domain,
&adev->virt.flr_work))
return;
else
dev_err(adev->dev, "Failed to queue work! at %s", __func__);
}
goto out;
}
adev->virt.vf2pf_update_retry_cnt = 0;
amdgpu_virt_write_vf2pf_data(adev);
out:
@@ -606,6 +630,7 @@ void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev)
adev->virt.fw_reserve.p_pf2vf = NULL;
adev->virt.fw_reserve.p_vf2pf = NULL;
adev->virt.vf2pf_update_interval_ms = 0;
adev->virt.vf2pf_update_retry_cnt = 0;
if (adev->mman.fw_vram_usage_va && adev->mman.drv_vram_usage_va) {
DRM_WARN("Currently fw_vram and drv_vram should not have values at the same time!");
@@ -705,12 +730,6 @@ void amdgpu_detect_virtualization(struct amdgpu_device *adev)
adev->virt.caps |= AMDGPU_PASSTHROUGH_MODE;
}
if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
/* VF MMIO access (except mailbox range) from CPU
* will be blocked during sriov runtime
*/
adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
/* we have the ability to check now */
if (amdgpu_sriov_vf(adev)) {
switch (adev->asic_type) {

View File

@@ -52,6 +52,8 @@
/* tonga/fiji use this offset */
#define mmBIF_IOV_FUNC_IDENTIFIER 0x1503
#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 5
enum amdgpu_sriov_vf_mode {
SRIOV_VF_MODE_BARE_METAL = 0,
SRIOV_VF_MODE_ONE_VF,
@@ -130,6 +132,8 @@ enum AMDGIM_FEATURE_FLAG {
AMDGIM_FEATURE_AV1_SUPPORT = (1 << 6),
/* VCN RB decouple */
AMDGIM_FEATURE_VCN_RB_DECOUPLE = (1 << 7),
/* MES info */
AMDGIM_FEATURE_MES_INFO_ENABLE = (1 << 8),
};
enum AMDGIM_REG_ACCESS_FLAG {
@@ -257,6 +261,7 @@ struct amdgpu_virt {
/* vf2pf message */
struct delayed_work vf2pf_work;
uint32_t vf2pf_update_interval_ms;
int vf2pf_update_retry_cnt;
/* multimedia bandwidth config */
bool is_mm_bw_enabled;
@@ -332,6 +337,8 @@ static inline bool is_virtual_machine(void)
((adev)->virt.gim_feature & AMDGIM_FEATURE_AV1_SUPPORT)
#define amdgpu_sriov_is_vcn_rb_decouple(adev) \
((adev)->virt.gim_feature & AMDGIM_FEATURE_VCN_RB_DECOUPLE)
#define amdgpu_sriov_is_mes_info_enable(adev) \
((adev)->virt.gim_feature & AMDGIM_FEATURE_MES_INFO_ENABLE)
bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev);
void amdgpu_virt_init_setting(struct amdgpu_device *adev);
int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);

View File

@@ -658,6 +658,8 @@ static const struct amd_ip_funcs amdgpu_vkms_ip_funcs = {
.soft_reset = amdgpu_vkms_soft_reset,
.set_clockgating_state = amdgpu_vkms_set_clockgating_state,
.set_powergating_state = amdgpu_vkms_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
const struct amdgpu_ip_block_version amdgpu_vkms_ip_block = {

View File

@@ -885,6 +885,44 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence,
kfree(tlb_cb);
}
/**
* amdgpu_vm_tlb_flush - prepare TLB flush
*
* @params: parameters for update
* @fence: input fence to sync TLB flush with
* @tlb_cb: the callback structure
*
* Increments the tlb sequence to make sure that future CS execute a VM flush.
*/
static void
amdgpu_vm_tlb_flush(struct amdgpu_vm_update_params *params,
struct dma_fence **fence,
struct amdgpu_vm_tlb_seq_struct *tlb_cb)
{
struct amdgpu_vm *vm = params->vm;
if (!fence || !*fence)
return;
tlb_cb->vm = vm;
if (!dma_fence_add_callback(*fence, &tlb_cb->cb,
amdgpu_vm_tlb_seq_cb)) {
dma_fence_put(vm->last_tlb_flush);
vm->last_tlb_flush = dma_fence_get(*fence);
} else {
amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
}
/* Prepare a TLB flush fence to be attached to PTs */
if (!params->unlocked && vm->is_compute_context) {
amdgpu_vm_tlb_fence_create(params->adev, vm, fence);
/* Makes sure no PD/PT is freed before the flush */
dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence,
DMA_RESV_USAGE_BOOKKEEP);
}
}
/**
* amdgpu_vm_update_range - update a range in the vm page table
*
@@ -916,8 +954,8 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
struct ttm_resource *res, dma_addr_t *pages_addr,
struct dma_fence **fence)
{
struct amdgpu_vm_update_params params;
struct amdgpu_vm_tlb_seq_struct *tlb_cb;
struct amdgpu_vm_update_params params;
struct amdgpu_res_cursor cursor;
enum amdgpu_sync_mode sync_mode;
int r, idx;
@@ -927,8 +965,8 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
tlb_cb = kmalloc(sizeof(*tlb_cb), GFP_KERNEL);
if (!tlb_cb) {
r = -ENOMEM;
goto error_unlock;
drm_dev_exit(idx);
return -ENOMEM;
}
/* Vega20+XGMI where PTEs get inadvertently cached in L2 texture cache,
@@ -948,7 +986,9 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
params.immediate = immediate;
params.pages_addr = pages_addr;
params.unlocked = unlocked;
params.needs_flush = flush_tlb;
params.allow_override = allow_override;
INIT_LIST_HEAD(&params.tlb_flush_waitlist);
/* Implicitly sync to command submissions in the same VM before
* unmapping. Sync to moving fences before mapping.
@@ -1031,24 +1071,18 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
}
r = vm->update_funcs->commit(&params, fence);
if (r)
goto error_free;
if (flush_tlb || params.table_freed) {
tlb_cb->vm = vm;
if (fence && *fence &&
!dma_fence_add_callback(*fence, &tlb_cb->cb,
amdgpu_vm_tlb_seq_cb)) {
dma_fence_put(vm->last_tlb_flush);
vm->last_tlb_flush = dma_fence_get(*fence);
} else {
amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
}
if (params.needs_flush) {
amdgpu_vm_tlb_flush(&params, fence, tlb_cb);
tlb_cb = NULL;
}
amdgpu_vm_pt_free_list(adev, &params);
error_free:
kfree(tlb_cb);
error_unlock:
amdgpu_vm_eviction_unlock(vm);
drm_dev_exit(idx);
return r;
@@ -2411,6 +2445,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
mutex_init(&vm->eviction_lock);
vm->evicting = false;
vm->tlb_fence_context = dma_fence_context_alloc(1);
r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level,
false, &root, xcp_id);
@@ -2944,6 +2979,14 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
if (vm && status) {
vm->fault_info.addr = addr;
vm->fault_info.status = status;
/*
* Update the fault information globally for later usage
* when vm could be stale or freed.
*/
adev->vm_manager.fault_info.addr = addr;
adev->vm_manager.fault_info.vmhub = vmhub;
adev->vm_manager.fault_info.status = status;
if (AMDGPU_IS_GFXHUB(vmhub)) {
vm->fault_info.vmhub = AMDGPU_VMHUB_TYPE_GFX;
vm->fault_info.vmhub |=

View File

@@ -257,15 +257,20 @@ struct amdgpu_vm_update_params {
unsigned int num_dw_left;
/**
* @table_freed: return true if page table is freed when updating
* @needs_flush: true whenever we need to invalidate the TLB
*/
bool table_freed;
bool needs_flush;
/**
* @allow_override: true for memory that is not uncached: allows MTYPE
* to be overridden for NUMA local memory.
*/
bool allow_override;
/**
* @tlb_flush_waitlist: temporary storage for BOs until tlb_flush
*/
struct list_head tlb_flush_waitlist;
};
struct amdgpu_vm_update_funcs {
@@ -342,6 +347,7 @@ struct amdgpu_vm {
atomic64_t tlb_seq;
struct dma_fence *last_tlb_flush;
atomic64_t kfd_last_flushed_seq;
uint64_t tlb_fence_context;
/* How many times we had to re-generate the page tables */
uint64_t generation;
@@ -422,6 +428,8 @@ struct amdgpu_vm_manager {
* look up VM of a page fault
*/
struct xarray pasids;
/* Global registration of recent page fault information */
struct amdgpu_vm_fault_info fault_info;
};
struct amdgpu_bo_va_mapping;
@@ -544,6 +552,8 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
uint64_t start, uint64_t end,
uint64_t dst, uint64_t flags);
void amdgpu_vm_pt_free_work(struct work_struct *work);
void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
struct amdgpu_vm_update_params *params);
#if defined(CONFIG_DEBUG_FS)
void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m);
@@ -609,5 +619,8 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
uint64_t addr,
uint32_t status,
unsigned int vmhub);
void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev,
struct amdgpu_vm *vm,
struct dma_fence **fence);
#endif

View File

@@ -108,7 +108,9 @@ static int amdgpu_vm_cpu_update(struct amdgpu_vm_update_params *p,
static int amdgpu_vm_cpu_commit(struct amdgpu_vm_update_params *p,
struct dma_fence **fence)
{
/* Flush HDP */
if (p->needs_flush)
atomic64_inc(&p->vm->tlb_seq);
mb();
amdgpu_device_flush_hdp(p->adev, NULL);
return 0;

View File

@@ -622,40 +622,58 @@ void amdgpu_vm_pt_free_work(struct work_struct *work)
}
/**
* amdgpu_vm_pt_free_dfs - free PD/PT levels
* amdgpu_vm_pt_free_list - free PD/PT levels
*
* @adev: amdgpu device structure
* @vm: amdgpu vm structure
* @start: optional cursor where to start freeing PDs/PTs
* @unlocked: vm resv unlock status
* @params: see amdgpu_vm_update_params definition
*
* Free the page directory or page table level and all sub levels.
* Free the page directory objects saved in the flush list
*/
static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev,
struct amdgpu_vm *vm,
struct amdgpu_vm_pt_cursor *start,
bool unlocked)
void amdgpu_vm_pt_free_list(struct amdgpu_device *adev,
struct amdgpu_vm_update_params *params)
{
struct amdgpu_vm_pt_cursor cursor;
struct amdgpu_vm_bo_base *entry;
struct amdgpu_vm_bo_base *entry, *next;
struct amdgpu_vm *vm = params->vm;
bool unlocked = params->unlocked;
if (list_empty(&params->tlb_flush_waitlist))
return;
if (unlocked) {
spin_lock(&vm->status_lock);
for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
list_move(&entry->vm_status, &vm->pt_freed);
if (start)
list_move(&start->entry->vm_status, &vm->pt_freed);
list_splice_init(&params->tlb_flush_waitlist, &vm->pt_freed);
spin_unlock(&vm->status_lock);
schedule_work(&vm->pt_free_work);
return;
}
for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry)
list_for_each_entry_safe(entry, next, &params->tlb_flush_waitlist, vm_status)
amdgpu_vm_pt_free(entry);
}
if (start)
amdgpu_vm_pt_free(start->entry);
/**
* amdgpu_vm_pt_add_list - add PD/PT level to the flush list
*
* @params: parameters for the update
* @cursor: first PT entry to start DF search from, non NULL
*
* This list will be freed after TLB flush.
*/
static void amdgpu_vm_pt_add_list(struct amdgpu_vm_update_params *params,
struct amdgpu_vm_pt_cursor *cursor)
{
struct amdgpu_vm_pt_cursor seek;
struct amdgpu_vm_bo_base *entry;
spin_lock(&params->vm->status_lock);
for_each_amdgpu_vm_pt_dfs_safe(params->adev, params->vm, cursor, seek, entry) {
if (entry && entry->bo)
list_move(&entry->vm_status, &params->tlb_flush_waitlist);
}
/* enter start node now */
list_move(&cursor->entry->vm_status, &params->tlb_flush_waitlist);
spin_unlock(&params->vm->status_lock);
}
/**
@@ -667,7 +685,13 @@ static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev,
*/
void amdgpu_vm_pt_free_root(struct amdgpu_device *adev, struct amdgpu_vm *vm)
{
amdgpu_vm_pt_free_dfs(adev, vm, NULL, false);
struct amdgpu_vm_pt_cursor cursor;
struct amdgpu_vm_bo_base *entry;
for_each_amdgpu_vm_pt_dfs_safe(adev, vm, NULL, cursor, entry) {
if (entry)
amdgpu_vm_pt_free(entry);
}
}
/**
@@ -972,10 +996,8 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,
while (cursor.pfn < frag_start) {
/* Make sure previous mapping is freed */
if (cursor.entry->bo) {
params->table_freed = true;
amdgpu_vm_pt_free_dfs(adev, params->vm,
&cursor,
params->unlocked);
params->needs_flush = true;
amdgpu_vm_pt_add_list(params, &cursor);
}
amdgpu_vm_pt_next(adev, &cursor);
}

View File

@@ -126,6 +126,10 @@ static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,
WARN_ON(ib->length_dw == 0);
amdgpu_ring_pad_ib(ring, ib);
if (p->needs_flush)
atomic64_inc(&p->vm->tlb_seq);
WARN_ON(ib->length_dw > p->num_dw_left);
f = amdgpu_job_submit(p->job);

View File

@@ -0,0 +1,112 @@
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
* Copyright 2023 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include <linux/dma-fence.h>
#include <linux/workqueue.h>
#include "amdgpu.h"
#include "amdgpu_vm.h"
#include "amdgpu_gmc.h"
struct amdgpu_tlb_fence {
struct dma_fence base;
struct amdgpu_device *adev;
struct dma_fence *dependency;
struct work_struct work;
spinlock_t lock;
uint16_t pasid;
};
static const char *amdgpu_tlb_fence_get_driver_name(struct dma_fence *fence)
{
return "amdgpu tlb fence";
}
static const char *amdgpu_tlb_fence_get_timeline_name(struct dma_fence *f)
{
return "amdgpu tlb timeline";
}
static void amdgpu_tlb_fence_work(struct work_struct *work)
{
struct amdgpu_tlb_fence *f = container_of(work, typeof(*f), work);
int r;
if (f->dependency) {
dma_fence_wait(f->dependency, false);
dma_fence_put(f->dependency);
f->dependency = NULL;
}
r = amdgpu_gmc_flush_gpu_tlb_pasid(f->adev, f->pasid, 2, true, 0);
if (r) {
dev_err(f->adev->dev, "TLB flush failed for PASID %d.\n",
f->pasid);
dma_fence_set_error(&f->base, r);
}
dma_fence_signal(&f->base);
dma_fence_put(&f->base);
}
static const struct dma_fence_ops amdgpu_tlb_fence_ops = {
.use_64bit_seqno = true,
.get_driver_name = amdgpu_tlb_fence_get_driver_name,
.get_timeline_name = amdgpu_tlb_fence_get_timeline_name
};
void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, struct amdgpu_vm *vm,
struct dma_fence **fence)
{
struct amdgpu_tlb_fence *f;
f = kmalloc(sizeof(*f), GFP_KERNEL);
if (!f) {
/*
* We can't fail since the PDEs and PTEs are already updated, so
* just block for the dependency and execute the TLB flush
*/
if (*fence)
dma_fence_wait(*fence, false);
amdgpu_gmc_flush_gpu_tlb_pasid(adev, vm->pasid, 2, true, 0);
*fence = dma_fence_get_stub();
return;
}
f->adev = adev;
f->dependency = *fence;
f->pasid = vm->pasid;
INIT_WORK(&f->work, amdgpu_tlb_fence_work);
spin_lock_init(&f->lock);
dma_fence_init(&f->base, &amdgpu_tlb_fence_ops, &f->lock,
vm->tlb_fence_context, atomic64_read(&vm->tlb_seq));
/* TODO: We probably need a separate wq here */
dma_fence_get(&f->base);
schedule_work(&f->work);
*fence = &f->base;
}

View File

@@ -450,6 +450,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
{
struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
struct amdgpu_device *adev = to_amdgpu_device(mgr);
struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);
u64 vis_usage = 0, max_bytes, min_block_size;
struct amdgpu_vram_mgr_resource *vres;
u64 size, remaining_size, lpfn, fpfn;
@@ -468,7 +469,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
if (tbo->type != ttm_bo_type_kernel)
max_bytes -= AMDGPU_VM_RESERVED_VRAM;
if (place->flags & TTM_PL_FLAG_CONTIGUOUS) {
if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
pages_per_block = ~0ul;
} else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -477,7 +478,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
/* default to 2MB */
pages_per_block = 2UL << (20UL - PAGE_SHIFT);
#endif
pages_per_block = max_t(uint32_t, pages_per_block,
pages_per_block = max_t(u32, pages_per_block,
tbo->page_alignment);
}
@@ -498,9 +499,12 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
if (place->flags & TTM_PL_FLAG_TOPDOWN)
vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION;
if (place->flags & TTM_PL_FLAG_CONTIGUOUS)
if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)
vres->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION;
if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED)
vres->flags |= DRM_BUDDY_CLEAR_ALLOCATION;
if (fpfn || lpfn != mgr->mm.size)
/* Allocate blocks in desired range */
vres->flags |= DRM_BUDDY_RANGE_ALLOCATION;
@@ -514,21 +518,31 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
else
min_block_size = mgr->default_page_size;
BUG_ON(min_block_size < mm->chunk_size);
/* Limit maximum size to 2GiB due to SG table limitations */
size = min(remaining_size, 2ULL << 30);
if ((size >= (u64)pages_per_block << PAGE_SHIFT) &&
!(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
!(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))
min_block_size = (u64)pages_per_block << PAGE_SHIFT;
BUG_ON(min_block_size < mm->chunk_size);
r = drm_buddy_alloc_blocks(mm, fpfn,
lpfn,
size,
min_block_size,
&vres->blocks,
vres->flags);
if (unlikely(r == -ENOSPC) && pages_per_block == ~0ul &&
!(place->flags & TTM_PL_FLAG_CONTIGUOUS)) {
vres->flags &= ~DRM_BUDDY_CONTIGUOUS_ALLOCATION;
pages_per_block = max_t(u32, 2UL << (20UL - PAGE_SHIFT),
tbo->page_alignment);
continue;
}
if (unlikely(r))
goto error_free_blocks;
@@ -571,7 +585,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
return 0;
error_free_blocks:
drm_buddy_free_list(mm, &vres->blocks);
drm_buddy_free_list(mm, &vres->blocks, 0);
mutex_unlock(&mgr->lock);
error_fini:
ttm_resource_fini(man, &vres->base);
@@ -604,7 +618,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man,
amdgpu_vram_mgr_do_reserve(man);
drm_buddy_free_list(mm, &vres->blocks);
drm_buddy_free_list(mm, &vres->blocks, vres->flags);
mutex_unlock(&mgr->lock);
atomic64_sub(vis_usage, &mgr->vis_usage);
@@ -912,7 +926,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev)
kfree(rsv);
list_for_each_entry_safe(rsv, temp, &mgr->reserved_pages, blocks) {
drm_buddy_free_list(&mgr->mm, &rsv->allocated);
drm_buddy_free_list(&mgr->mm, &rsv->allocated, 0);
kfree(rsv);
}
if (!adev->gmc.is_app_apu)

View File

@@ -53,10 +53,20 @@ static inline u64 amdgpu_vram_mgr_block_size(struct drm_buddy_block *block)
return (u64)PAGE_SIZE << drm_buddy_block_order(block);
}
static inline bool amdgpu_vram_mgr_is_cleared(struct drm_buddy_block *block)
{
return drm_buddy_block_is_clear(block);
}
static inline struct amdgpu_vram_mgr_resource *
to_amdgpu_vram_mgr_resource(struct ttm_resource *res)
{
return container_of(res, struct amdgpu_vram_mgr_resource, base);
}
static inline void amdgpu_vram_mgr_set_cleared(struct ttm_resource *res)
{
to_amdgpu_vram_mgr_resource(res)->flags |= DRM_BUDDY_CLEARED;
}
#endif

View File

@@ -1035,15 +1035,16 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
return 0;
}
static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type,
struct aca_bank_report *report, void *data)
static int xgmi_v6_4_0_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank,
enum aca_smu_type type, void *data)
{
struct amdgpu_device *adev = handle->adev;
struct aca_bank_info info;
const char *error_str;
u64 status;
u64 status, count;
int ret, ext_error_code;
ret = aca_bank_info_decode(bank, &report->info);
ret = aca_bank_info_decode(bank, &info);
if (ret)
return ret;
@@ -1055,15 +1056,28 @@ static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, struc
if (error_str)
dev_info(adev->dev, "%s detected\n", error_str);
if ((type == ACA_ERROR_TYPE_UE && ext_error_code == 0) ||
(type == ACA_ERROR_TYPE_CE && ext_error_code == 6))
report->count[type] = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]);
count = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]);
return 0;
switch (type) {
case ACA_SMU_TYPE_UE:
if (ext_error_code != 0 && ext_error_code != 9)
count = 0ULL;
ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE, count);
break;
case ACA_SMU_TYPE_CE:
count = ext_error_code == 6 ? count : 0ULL;
ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_CE, count);
break;
default:
return -EINVAL;
}
return ret;
}
static const struct aca_bank_ops xgmi_v6_4_0_aca_bank_ops = {
.aca_bank_generate_report = xgmi_v6_4_0_aca_bank_generate_report,
.aca_bank_parser = xgmi_v6_4_0_aca_bank_parser,
};
static const struct aca_info xgmi_v6_4_0_aca_info = {

View File

@@ -44,6 +44,7 @@ struct amdgpu_hive_info {
struct amdgpu_reset_domain *reset_domain;
atomic_t ras_recovery;
struct ras_event_manager event_mgr;
};
struct amdgpu_pcs_ras_field {

View File

@@ -94,7 +94,8 @@ union amd_sriov_msg_feature_flags {
uint32_t reg_indirect_acc : 1;
uint32_t av1_support : 1;
uint32_t vcn_rb_decouple : 1;
uint32_t reserved : 24;
uint32_t mes_info_enable : 1;
uint32_t reserved : 23;
} flags;
uint32_t all;
};
@@ -157,7 +158,7 @@ struct amd_sriov_msg_pf2vf_info_header {
uint32_t reserved[2];
};
#define AMD_SRIOV_MSG_PF2VF_INFO_FILLED_SIZE (48)
#define AMD_SRIOV_MSG_PF2VF_INFO_FILLED_SIZE (49)
struct amd_sriov_msg_pf2vf_info {
/* header contains size and version */
struct amd_sriov_msg_pf2vf_info_header header;
@@ -208,6 +209,8 @@ struct amd_sriov_msg_pf2vf_info {
struct amd_sriov_msg_uuid_info uuid_info;
/* PCIE atomic ops support flag */
uint32_t pcie_atomic_ops_support_flags;
/* Portion of GPU memory occupied by VF. MAX value is 65535, but set to uint32_t to maintain alignment with reserved size */
uint32_t gpu_capacity;
/* reserved */
uint32_t reserved[256 - AMD_SRIOV_MSG_PF2VF_INFO_FILLED_SIZE];
};
@@ -221,7 +224,7 @@ struct amd_sriov_msg_vf2pf_info_header {
uint32_t reserved[2];
};
#define AMD_SRIOV_MSG_VF2PF_INFO_FILLED_SIZE (70)
#define AMD_SRIOV_MSG_VF2PF_INFO_FILLED_SIZE (73)
struct amd_sriov_msg_vf2pf_info {
/* header contains size and version */
struct amd_sriov_msg_vf2pf_info_header header;
@@ -265,7 +268,9 @@ struct amd_sriov_msg_vf2pf_info {
uint32_t version;
} ucode_info[AMD_SRIOV_MSG_RESERVE_UCODE];
uint64_t dummy_page_addr;
/* FB allocated for guest MES to record UQ info */
uint64_t mes_info_addr;
uint32_t mes_info_size;
/* reserved */
uint32_t reserved[256 - AMD_SRIOV_MSG_VF2PF_INFO_FILLED_SIZE];
};

View File

@@ -630,7 +630,7 @@ static int aqua_vanjaram_xcp_mgr_init(struct amdgpu_device *adev)
int aqua_vanjaram_init_soc_config(struct amdgpu_device *adev)
{
u32 mask, inst_mask = adev->sdma.sdma_mask;
u32 mask, avail_inst, inst_mask = adev->sdma.sdma_mask;
int ret, i;
/* generally 1 AID supports 4 instances */
@@ -642,7 +642,9 @@ int aqua_vanjaram_init_soc_config(struct amdgpu_device *adev)
for (mask = (1 << adev->sdma.num_inst_per_aid) - 1; inst_mask;
inst_mask >>= adev->sdma.num_inst_per_aid, ++i) {
if ((inst_mask & mask) == mask)
avail_inst = inst_mask & mask;
if (avail_inst == mask || avail_inst == 0x3 ||
avail_inst == 0xc)
adev->aid_mask |= (1 << i);
}

View File

@@ -1243,6 +1243,7 @@ static int amdgpu_atom_execute_table_locked(struct atom_context *ctx, int index,
ectx.ps_size = params_size;
ectx.abort = false;
ectx.last_jump = 0;
ectx.last_jump_jiffies = 0;
if (ws) {
ectx.ws = kcalloc(4, ws, GFP_KERNEL);
ectx.ws_size = ws;

View File

@@ -1375,14 +1375,14 @@ static int cik_asic_pci_config_reset(struct amdgpu_device *adev)
return r;
}
static bool cik_asic_supports_baco(struct amdgpu_device *adev)
static int cik_asic_supports_baco(struct amdgpu_device *adev)
{
switch (adev->asic_type) {
case CHIP_BONAIRE:
case CHIP_HAWAII:
return amdgpu_dpm_is_baco_supported(adev);
default:
return false;
return 0;
}
}
@@ -2210,6 +2210,8 @@ static const struct amd_ip_funcs cik_common_ip_funcs = {
.soft_reset = cik_common_soft_reset,
.set_clockgating_state = cik_common_set_clockgating_state,
.set_powergating_state = cik_common_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_ip_block_version cik_common_ip_block =

View File

@@ -435,6 +435,8 @@ static const struct amd_ip_funcs cik_ih_ip_funcs = {
.soft_reset = cik_ih_soft_reset,
.set_clockgating_state = cik_ih_set_clockgating_state,
.set_powergating_state = cik_ih_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_ih_funcs cik_ih_funcs = {

View File

@@ -1228,6 +1228,8 @@ static const struct amd_ip_funcs cik_sdma_ip_funcs = {
.soft_reset = cik_sdma_soft_reset,
.set_clockgating_state = cik_sdma_set_clockgating_state,
.set_powergating_state = cik_sdma_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_ring_funcs cik_sdma_ring_funcs = {
@@ -1290,7 +1292,7 @@ static void cik_sdma_set_irq_funcs(struct amdgpu_device *adev)
* @src_offset: src GPU address
* @dst_offset: dst GPU address
* @byte_count: number of bytes to xfer
* @tmz: is this a secure operation
* @copy_flags: unused
*
* Copy GPU buffers using the DMA engine (CIK).
* Used by the amdgpu ttm implementation to move pages if
@@ -1300,7 +1302,7 @@ static void cik_sdma_emit_copy_buffer(struct amdgpu_ib *ib,
uint64_t src_offset,
uint64_t dst_offset,
uint32_t byte_count,
bool tmz)
uint32_t copy_flags)
{
ib->ptr[ib->length_dw++] = SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, 0);
ib->ptr[ib->length_dw++] = byte_count;

View File

@@ -433,6 +433,8 @@ static const struct amd_ip_funcs cz_ih_ip_funcs = {
.soft_reset = cz_ih_soft_reset,
.set_clockgating_state = cz_ih_set_clockgating_state,
.set_powergating_state = cz_ih_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_ih_funcs cz_ih_funcs = {

View File

@@ -3333,6 +3333,8 @@ static const struct amd_ip_funcs dce_v10_0_ip_funcs = {
.soft_reset = dce_v10_0_soft_reset,
.set_clockgating_state = dce_v10_0_set_clockgating_state,
.set_powergating_state = dce_v10_0_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static void

View File

@@ -3464,6 +3464,8 @@ static const struct amd_ip_funcs dce_v11_0_ip_funcs = {
.soft_reset = dce_v11_0_soft_reset,
.set_clockgating_state = dce_v11_0_set_clockgating_state,
.set_powergating_state = dce_v11_0_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static void

View File

@@ -3154,6 +3154,8 @@ static const struct amd_ip_funcs dce_v6_0_ip_funcs = {
.soft_reset = dce_v6_0_soft_reset,
.set_clockgating_state = dce_v6_0_set_clockgating_state,
.set_powergating_state = dce_v6_0_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static void

View File

@@ -3242,6 +3242,8 @@ static const struct amd_ip_funcs dce_v8_0_ip_funcs = {
.soft_reset = dce_v8_0_soft_reset,
.set_clockgating_state = dce_v8_0_set_clockgating_state,
.set_powergating_state = dce_v8_0_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static void

View File

@@ -276,6 +276,99 @@ MODULE_FIRMWARE("amdgpu/gc_10_3_7_mec.bin");
MODULE_FIRMWARE("amdgpu/gc_10_3_7_mec2.bin");
MODULE_FIRMWARE("amdgpu/gc_10_3_7_rlc.bin");
static const struct amdgpu_hwip_reg_entry gc_reg_list_10_1[] = {
SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS),
SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS2),
SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS3),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_STALLED_STAT1),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_STALLED_STAT2),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_STALLED_STAT1),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_STALLED_STAT1),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_BUSY_STAT),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_BUSY_STAT),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_BUSY_STAT),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_BUSY_STAT2),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_BUSY_STAT2),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_STATUS),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_GFX_ERROR),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_GFX_HPD_STATUS0),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB_BASE),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB_RPTR),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB_WPTR),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB0_BASE),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB0_RPTR),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB0_WPTR),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB1_BASE),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB1_RPTR),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB1_WPTR),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB2_BASE),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB2_WPTR),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB2_WPTR),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_CMD_BUFSZ),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_CMD_BUFSZ),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_CMD_BUFSZ),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_CMD_BUFSZ),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_BASE_LO),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_BASE_HI),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_BUFSZ),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_BASE_LO),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_BASE_HI),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_BUFSZ),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_BASE_LO),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_BASE_HI),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_BUFSZ),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_BASE_LO),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_BASE_HI),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_BUFSZ),
SOC15_REG_ENTRY_STR(GC, 0, mmCPF_UTCL1_STATUS),
SOC15_REG_ENTRY_STR(GC, 0, mmCPC_UTCL1_STATUS),
SOC15_REG_ENTRY_STR(GC, 0, mmCPG_UTCL1_STATUS),
SOC15_REG_ENTRY_STR(GC, 0, mmGDS_PROTECTION_FAULT),
SOC15_REG_ENTRY_STR(GC, 0, mmGDS_VM_PROTECTION_FAULT),
SOC15_REG_ENTRY_STR(GC, 0, mmIA_UTCL1_STATUS),
SOC15_REG_ENTRY_STR(GC, 0, mmIA_UTCL1_STATUS_2),
SOC15_REG_ENTRY_STR(GC, 0, mmPA_CL_CNTL_STATUS),
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_UTCL1_STATUS),
SOC15_REG_ENTRY_STR(GC, 0, mmRMI_UTCL1_STATUS),
SOC15_REG_ENTRY_STR(GC, 0, mmSQC_DCACHE_UTCL0_STATUS),
SOC15_REG_ENTRY_STR(GC, 0, mmSQC_ICACHE_UTCL0_STATUS),
SOC15_REG_ENTRY_STR(GC, 0, mmSQG_UTCL0_STATUS),
SOC15_REG_ENTRY_STR(GC, 0, mmTCP_UTCL0_STATUS),
SOC15_REG_ENTRY_STR(GC, 0, mmWD_UTCL1_STATUS),
SOC15_REG_ENTRY_STR(GC, 0, mmGCVM_L2_PROTECTION_FAULT_CNTL),
SOC15_REG_ENTRY_STR(GC, 0, mmGCVM_L2_PROTECTION_FAULT_STATUS),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_DEBUG),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC_CNTL),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_MES_CNTL),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_INSTR_PNTR),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC1_INSTR_PNTR),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC2_INSTR_PNTR),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_MES_DEBUG_INTERRUPT_INSTR_PNTR),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_MES_INSTR_PNTR),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_ME_INSTR_PNTR),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_PFP_INSTR_PNTR),
SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_STATUS),
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_STAT),
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_COMMAND),
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_MESSAGE),
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_1),
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_2),
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_3),
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_4),
SOC15_REG_ENTRY_STR(GC, 0, mmSMU_RLC_RESPONSE),
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SAFE_MODE),
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_SAFE_MODE),
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_RLCS_GPM_STAT_2),
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SPP_STATUS),
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_RLCS_BOOTLOAD_STATUS),
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_INT_STAT),
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_GPM_GENERAL_6),
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_GPM_DEBUG_INST_A),
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_GPM_DEBUG_INST_B),
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_GPM_DEBUG_INST_ADDR),
SOC15_REG_ENTRY_STR(GC, 0, mmRLC_LX6_CORE_PDEBUG_INST)
};
static const struct soc15_reg_golden golden_settings_gc_10_1[] = {
SOC15_REG_GOLDEN_VALUE(GC, 0, mmCB_HW_CONTROL_4, 0xffffffff, 0x00400014),
SOC15_REG_GOLDEN_VALUE(GC, 0, mmCGTT_CPF_CLK_CTRL, 0xfcff8fff, 0xf8000100),
@@ -3964,7 +4057,7 @@ static void gfx_v10_0_check_gfxoff_flag(struct amdgpu_device *adev)
static int gfx_v10_0_init_microcode(struct amdgpu_device *adev)
{
char fw_name[40];
char fw_name[53];
char ucode_prefix[30];
const char *wks = "";
int err;
@@ -4490,6 +4583,22 @@ static int gfx_v10_0_compute_ring_init(struct amdgpu_device *adev, int ring_id,
hw_prio, NULL);
}
static void gfx_v10_0_alloc_dump_mem(struct amdgpu_device *adev)
{
uint32_t reg_count = ARRAY_SIZE(gc_reg_list_10_1);
uint32_t *ptr;
ptr = kcalloc(reg_count, sizeof(uint32_t), GFP_KERNEL);
if (ptr == NULL) {
DRM_ERROR("Failed to allocate memory for IP Dump\n");
adev->gfx.ip_dump = NULL;
adev->gfx.reg_count = 0;
} else {
adev->gfx.ip_dump = ptr;
adev->gfx.reg_count = reg_count;
}
}
static int gfx_v10_0_sw_init(void *handle)
{
int i, j, k, r, ring_id = 0;
@@ -4518,7 +4627,7 @@ static int gfx_v10_0_sw_init(void *handle)
case IP_VERSION(10, 3, 3):
case IP_VERSION(10, 3, 7):
adev->gfx.me.num_me = 1;
adev->gfx.me.num_pipe_per_me = 1;
adev->gfx.me.num_pipe_per_me = 2;
adev->gfx.me.num_queue_per_pipe = 1;
adev->gfx.mec.num_mec = 2;
adev->gfx.mec.num_pipe_per_mec = 4;
@@ -4642,6 +4751,8 @@ static int gfx_v10_0_sw_init(void *handle)
gfx_v10_0_gpu_early_init(adev);
gfx_v10_0_alloc_dump_mem(adev);
return 0;
}
@@ -4694,6 +4805,8 @@ static int gfx_v10_0_sw_fini(void *handle)
gfx_v10_0_free_microcode(adev);
kfree(adev->gfx.ip_dump);
return 0;
}
@@ -8317,7 +8430,7 @@ static void gfx_v10_0_ring_emit_hdp_flush(struct amdgpu_ring *ring)
}
reg_mem_engine = 0;
} else {
ref_and_mask = nbio_hf_reg->ref_and_mask_cp0;
ref_and_mask = nbio_hf_reg->ref_and_mask_cp0 << ring->pipe;
reg_mem_engine = 1; /* pfp */
}
@@ -9154,6 +9267,36 @@ static void gfx_v10_0_emit_mem_sync(struct amdgpu_ring *ring)
amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */
}
static void gfx_v10_ip_print(void *handle, struct drm_printer *p)
{
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
uint32_t i;
uint32_t reg_count = ARRAY_SIZE(gc_reg_list_10_1);
if (!adev->gfx.ip_dump)
return;
for (i = 0; i < reg_count; i++)
drm_printf(p, "%-50s \t 0x%08x\n",
gc_reg_list_10_1[i].reg_name,
adev->gfx.ip_dump[i]);
}
static void gfx_v10_ip_dump(void *handle)
{
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
uint32_t i;
uint32_t reg_count = ARRAY_SIZE(gc_reg_list_10_1);
if (!adev->gfx.ip_dump)
return;
amdgpu_gfx_off_ctrl(adev, false);
for (i = 0; i < reg_count; i++)
adev->gfx.ip_dump[i] = RREG32(SOC15_REG_ENTRY_OFFSET(gc_reg_list_10_1[i]));
amdgpu_gfx_off_ctrl(adev, true);
}
static const struct amd_ip_funcs gfx_v10_0_ip_funcs = {
.name = "gfx_v10_0",
.early_init = gfx_v10_0_early_init,
@@ -9170,6 +9313,8 @@ static const struct amd_ip_funcs gfx_v10_0_ip_funcs = {
.set_clockgating_state = gfx_v10_0_set_clockgating_state,
.set_powergating_state = gfx_v10_0_set_powergating_state,
.get_clockgating_state = gfx_v10_0_get_clockgating_state,
.dump_ip_state = gfx_v10_ip_dump,
.print_ip_state = gfx_v10_ip_print,
};
static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {

View File

@@ -510,7 +510,7 @@ static void gfx_v11_0_check_fw_cp_gfx_shadow(struct amdgpu_device *adev)
static int gfx_v11_0_init_microcode(struct amdgpu_device *adev)
{
char fw_name[40];
char ucode_prefix[30];
char ucode_prefix[25];
int err;
const struct rlc_firmware_header_v2_0 *rlc_hdr;
uint16_t version_major;
@@ -4506,14 +4506,11 @@ static int gfx_v11_0_soft_reset(void *handle)
gfx_v11_0_set_safe_mode(adev, 0);
mutex_lock(&adev->srbm_mutex);
for (i = 0; i < adev->gfx.mec.num_mec; ++i) {
for (j = 0; j < adev->gfx.mec.num_queue_per_pipe; j++) {
for (k = 0; k < adev->gfx.mec.num_pipe_per_mec; k++) {
tmp = RREG32_SOC15(GC, 0, regGRBM_GFX_CNTL);
tmp = REG_SET_FIELD(tmp, GRBM_GFX_CNTL, MEID, i);
tmp = REG_SET_FIELD(tmp, GRBM_GFX_CNTL, QUEUEID, j);
tmp = REG_SET_FIELD(tmp, GRBM_GFX_CNTL, PIPEID, k);
WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, tmp);
soc21_grbm_select(adev, i, k, j, 0);
WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 0x2);
WREG32_SOC15(GC, 0, regSPI_COMPUTE_QUEUE_RESET, 0x1);
@@ -4523,16 +4520,14 @@ static int gfx_v11_0_soft_reset(void *handle)
for (i = 0; i < adev->gfx.me.num_me; ++i) {
for (j = 0; j < adev->gfx.me.num_queue_per_pipe; j++) {
for (k = 0; k < adev->gfx.me.num_pipe_per_me; k++) {
tmp = RREG32_SOC15(GC, 0, regGRBM_GFX_CNTL);
tmp = REG_SET_FIELD(tmp, GRBM_GFX_CNTL, MEID, i);
tmp = REG_SET_FIELD(tmp, GRBM_GFX_CNTL, QUEUEID, j);
tmp = REG_SET_FIELD(tmp, GRBM_GFX_CNTL, PIPEID, k);
WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, tmp);
soc21_grbm_select(adev, i, k, j, 0);
WREG32_SOC15(GC, 0, regCP_GFX_HQD_DEQUEUE_REQUEST, 0x1);
}
}
}
soc21_grbm_select(adev, 0, 0, 0, 0);
mutex_unlock(&adev->srbm_mutex);
/* Try to acquire the gfx mutex before access to CP_VMID_RESET */
r = gfx_v11_0_request_gfx_index_mutex(adev, 1);
@@ -6174,6 +6169,8 @@ static const struct amd_ip_funcs gfx_v11_0_ip_funcs = {
.set_clockgating_state = gfx_v11_0_set_clockgating_state,
.set_powergating_state = gfx_v11_0_set_powergating_state,
.get_clockgating_state = gfx_v11_0_get_clockgating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_ring_funcs gfx_v11_0_ring_funcs_gfx = {

View File

@@ -3457,6 +3457,8 @@ static const struct amd_ip_funcs gfx_v6_0_ip_funcs = {
.soft_reset = gfx_v6_0_soft_reset,
.set_clockgating_state = gfx_v6_0_set_clockgating_state,
.set_powergating_state = gfx_v6_0_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_ring_funcs gfx_v6_0_ring_funcs_gfx = {

View File

@@ -4977,6 +4977,8 @@ static const struct amd_ip_funcs gfx_v7_0_ip_funcs = {
.soft_reset = gfx_v7_0_soft_reset,
.set_clockgating_state = gfx_v7_0_set_clockgating_state,
.set_powergating_state = gfx_v7_0_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {

View File

@@ -6878,6 +6878,8 @@ static const struct amd_ip_funcs gfx_v8_0_ip_funcs = {
.set_clockgating_state = gfx_v8_0_set_clockgating_state,
.set_powergating_state = gfx_v8_0_set_powergating_state,
.get_clockgating_state = gfx_v8_0_get_clockgating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {

View File

@@ -1249,7 +1249,7 @@ static void gfx_v9_0_check_if_need_gfxoff(struct amdgpu_device *adev)
static int gfx_v9_0_init_cp_gfx_microcode(struct amdgpu_device *adev,
char *chip_name)
{
char fw_name[30];
char fw_name[50];
int err;
snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_pfp.bin", chip_name);
@@ -1282,7 +1282,7 @@ out:
static int gfx_v9_0_init_rlc_microcode(struct amdgpu_device *adev,
char *chip_name)
{
char fw_name[30];
char fw_name[53];
int err;
const struct rlc_firmware_header_v2_0 *rlc_hdr;
uint16_t version_major;
@@ -1337,7 +1337,7 @@ static bool gfx_v9_0_load_mec2_fw_bin_support(struct amdgpu_device *adev)
static int gfx_v9_0_init_cp_compute_microcode(struct amdgpu_device *adev,
char *chip_name)
{
char fw_name[30];
char fw_name[50];
int err;
if (amdgpu_sriov_vf(adev) && (adev->asic_type == CHIP_ALDEBARAN))
@@ -6856,6 +6856,8 @@ static const struct amd_ip_funcs gfx_v9_0_ip_funcs = {
.set_clockgating_state = gfx_v9_0_set_clockgating_state,
.set_powergating_state = gfx_v9_0_set_powergating_state,
.get_clockgating_state = gfx_v9_0_get_clockgating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {

View File

@@ -1909,18 +1909,7 @@ static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev)
mutex_unlock(&adev->grbm_idx_mutex);
}
static bool gfx_v9_4_2_query_uctl2_poison_status(struct amdgpu_device *adev)
{
u32 status = 0;
struct amdgpu_vmhub *hub;
hub = &adev->vmhub[AMDGPU_GFXHUB(0)];
status = RREG32(hub->vm_l2_pro_fault_status);
/* reset page fault status */
WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
}
struct amdgpu_ras_block_hw_ops gfx_v9_4_2_ras_ops = {
.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
@@ -1934,5 +1923,4 @@ struct amdgpu_gfx_ras gfx_v9_4_2_ras = {
.hw_ops = &gfx_v9_4_2_ras_ops,
},
.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
.query_utcl2_poison_status = gfx_v9_4_2_query_uctl2_poison_status,
};

View File

@@ -431,16 +431,16 @@ out:
static int gfx_v9_4_3_init_microcode(struct amdgpu_device *adev)
{
const char *chip_name;
char ucode_prefix[15];
int r;
chip_name = "gc_9_4_3";
amdgpu_ucode_ip_version_decode(adev, GC_HWIP, ucode_prefix, sizeof(ucode_prefix));
r = gfx_v9_4_3_init_rlc_microcode(adev, chip_name);
r = gfx_v9_4_3_init_rlc_microcode(adev, ucode_prefix);
if (r)
return r;
r = gfx_v9_4_3_init_cp_compute_microcode(adev, chip_name);
r = gfx_v9_4_3_init_cp_compute_microcode(adev, ucode_prefix);
if (r)
return r;
@@ -680,38 +680,44 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_3_gfx_funcs = {
.ih_node_to_logical_xcc = &gfx_v9_4_3_ih_to_xcc_inst,
};
static int gfx_v9_4_3_aca_bank_generate_report(struct aca_handle *handle,
struct aca_bank *bank, enum aca_error_type type,
struct aca_bank_report *report, void *data)
static int gfx_v9_4_3_aca_bank_parser(struct aca_handle *handle,
struct aca_bank *bank, enum aca_smu_type type,
void *data)
{
u64 status, misc0;
struct aca_bank_info info;
u64 misc0;
u32 instlo;
int ret;
status = bank->regs[ACA_REG_IDX_STATUS];
if ((type == ACA_ERROR_TYPE_UE &&
ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_FAULT) ||
(type == ACA_ERROR_TYPE_CE &&
ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_CE)) {
ret = aca_bank_info_decode(bank, &info);
if (ret)
return ret;
ret = aca_bank_info_decode(bank, &report->info);
if (ret)
return ret;
/* NOTE: overwrite info.die_id with xcd id for gfx */
instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]);
instlo &= GENMASK(31, 1);
info.die_id = instlo == mmSMNAID_XCD0_MCA_SMU ? 0 : 1;
/* NOTE: overwrite info.die_id with xcd id for gfx */
instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]);
instlo &= GENMASK(31, 1);
report->info.die_id = instlo == mmSMNAID_XCD0_MCA_SMU ? 0 : 1;
misc0 = bank->regs[ACA_REG_IDX_MISC0];
misc0 = bank->regs[ACA_REG_IDX_MISC0];
report->count[type] = ACA_REG__MISC0__ERRCNT(misc0);
switch (type) {
case ACA_SMU_TYPE_UE:
ret = aca_error_cache_log_bank_error(handle, &info,
ACA_ERROR_TYPE_UE, 1ULL);
break;
case ACA_SMU_TYPE_CE:
ret = aca_error_cache_log_bank_error(handle, &info,
ACA_ERROR_TYPE_CE, ACA_REG__MISC0__ERRCNT(misc0));
break;
default:
return -EINVAL;
}
return 0;
return ret;
}
static bool gfx_v9_4_3_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
enum aca_error_type type, void *data)
enum aca_smu_type type, void *data)
{
u32 instlo;
@@ -730,7 +736,7 @@ static bool gfx_v9_4_3_aca_bank_is_valid(struct aca_handle *handle, struct aca_b
}
static const struct aca_bank_ops gfx_v9_4_3_aca_bank_ops = {
.aca_bank_generate_report = gfx_v9_4_3_aca_bank_generate_report,
.aca_bank_parser = gfx_v9_4_3_aca_bank_parser,
.aca_bank_is_valid = gfx_v9_4_3_aca_bank_is_valid,
};
@@ -2398,10 +2404,10 @@ gfx_v9_4_3_xcc_update_coarse_grain_clock_gating(struct amdgpu_device *adev,
if (def != data)
WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGTT_MGCG_OVERRIDE, data);
/* enable cgcg FSM(0x0000363F) */
/* CGCG Hysteresis: 400us */
def = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGCG_CGLS_CTRL);
data = (0x36
data = (0x2710
<< RLC_CGCG_CGLS_CTRL__CGCG_GFX_IDLE_THRESHOLD__SHIFT) |
RLC_CGCG_CGLS_CTRL__CGCG_EN_MASK;
if (adev->cg_flags & AMD_CG_SUPPORT_GFX_CGLS)
@@ -2410,10 +2416,10 @@ gfx_v9_4_3_xcc_update_coarse_grain_clock_gating(struct amdgpu_device *adev,
if (def != data)
WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGCG_CGLS_CTRL, data);
/* set IDLE_POLL_COUNT(0x00900100) */
/* set IDLE_POLL_COUNT(0x33450100)*/
def = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_RB_WPTR_POLL_CNTL);
data = (0x0100 << CP_RB_WPTR_POLL_CNTL__POLL_FREQUENCY__SHIFT) |
(0x0090 << CP_RB_WPTR_POLL_CNTL__IDLE_POLL_COUNT__SHIFT);
(0x3345 << CP_RB_WPTR_POLL_CNTL__IDLE_POLL_COUNT__SHIFT);
if (def != data)
WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_RB_WPTR_POLL_CNTL, data);
} else {
@@ -4010,6 +4016,8 @@ static const struct amd_ip_funcs gfx_v9_4_3_ip_funcs = {
.set_clockgating_state = gfx_v9_4_3_set_clockgating_state,
.set_powergating_state = gfx_v9_4_3_set_powergating_state,
.get_clockgating_state = gfx_v9_4_3_get_clockgating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_ring_funcs gfx_v9_4_3_ring_funcs_compute = {

View File

@@ -443,6 +443,22 @@ static void gfxhub_v1_0_init(struct amdgpu_device *adev)
mmVM_INVALIDATE_ENG0_ADDR_RANGE_LO32;
}
static bool gfxhub_v1_0_query_utcl2_poison_status(struct amdgpu_device *adev,
int xcc_id)
{
u32 status = 0;
struct amdgpu_vmhub *hub;
if (amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(9, 4, 2))
return false;
hub = &adev->vmhub[AMDGPU_GFXHUB(0)];
status = RREG32(hub->vm_l2_pro_fault_status);
/* reset page fault status */
WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
}
const struct amdgpu_gfxhub_funcs gfxhub_v1_0_funcs = {
.get_mc_fb_offset = gfxhub_v1_0_get_mc_fb_offset,
@@ -452,4 +468,5 @@ const struct amdgpu_gfxhub_funcs gfxhub_v1_0_funcs = {
.set_fault_enable_default = gfxhub_v1_0_set_fault_enable_default,
.init = gfxhub_v1_0_init,
.get_xgmi_info = gfxhub_v1_1_get_xgmi_info,
.query_utcl2_poison_status = gfxhub_v1_0_query_utcl2_poison_status,
};

View File

@@ -620,6 +620,20 @@ static int gfxhub_v1_2_get_xgmi_info(struct amdgpu_device *adev)
return 0;
}
static bool gfxhub_v1_2_query_utcl2_poison_status(struct amdgpu_device *adev,
int xcc_id)
{
u32 fed, status;
status = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regVM_L2_PROTECTION_FAULT_STATUS);
fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
/* reset page fault status */
WREG32_P(SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id),
regVM_L2_PROTECTION_FAULT_STATUS), 1, ~1);
return fed;
}
const struct amdgpu_gfxhub_funcs gfxhub_v1_2_funcs = {
.get_mc_fb_offset = gfxhub_v1_2_get_mc_fb_offset,
.setup_vm_pt_regs = gfxhub_v1_2_setup_vm_pt_regs,
@@ -628,6 +642,7 @@ const struct amdgpu_gfxhub_funcs gfxhub_v1_2_funcs = {
.set_fault_enable_default = gfxhub_v1_2_set_fault_enable_default,
.init = gfxhub_v1_2_init,
.get_xgmi_info = gfxhub_v1_2_get_xgmi_info,
.query_utcl2_poison_status = gfxhub_v1_2_query_utcl2_poison_status,
};
static int gfxhub_v1_2_xcp_resume(void *handle, uint32_t inst_mask)

View File

@@ -1115,6 +1115,8 @@ static const struct amd_ip_funcs gmc_v6_0_ip_funcs = {
.soft_reset = gmc_v6_0_soft_reset,
.set_clockgating_state = gmc_v6_0_set_clockgating_state,
.set_powergating_state = gmc_v6_0_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_gmc_funcs gmc_v6_0_gmc_funcs = {

View File

@@ -1354,6 +1354,8 @@ static const struct amd_ip_funcs gmc_v7_0_ip_funcs = {
.soft_reset = gmc_v7_0_soft_reset,
.set_clockgating_state = gmc_v7_0_set_clockgating_state,
.set_powergating_state = gmc_v7_0_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_gmc_funcs gmc_v7_0_gmc_funcs = {

View File

@@ -1717,6 +1717,8 @@ static const struct amd_ip_funcs gmc_v8_0_ip_funcs = {
.set_clockgating_state = gmc_v8_0_set_clockgating_state,
.set_powergating_state = gmc_v8_0_set_powergating_state,
.get_clockgating_state = gmc_v8_0_get_clockgating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_gmc_funcs gmc_v8_0_gmc_funcs = {

View File

@@ -548,7 +548,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
{
bool retry_fault = !!(entry->src_data[1] & 0x80);
bool write_fault = !!(entry->src_data[1] & 0x20);
uint32_t status = 0, cid = 0, rw = 0;
uint32_t status = 0, cid = 0, rw = 0, fed = 0;
struct amdgpu_task_info *task_info;
struct amdgpu_vmhub *hub;
const char *mmhub_cid;
@@ -664,6 +664,13 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
status = RREG32(hub->vm_l2_pro_fault_status);
cid = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, CID);
rw = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, RW);
fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
/* for fed error, kfd will handle it, return directly */
if (fed && amdgpu_ras_is_poison_mode_supported(adev) &&
(amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2)))
return 0;
WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, vmhub);
@@ -1450,7 +1457,6 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device *adev)
adev->umc.channel_offs = UMC_V12_0_PER_CHANNEL_OFFSET;
adev->umc.active_mask = adev->aid_mask;
adev->umc.retire_unit = UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL;
adev->umc.channel_idx_tbl = &umc_v12_0_channel_idx_tbl[0][0][0];
if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu)
adev->umc.ras = &umc_v12_0_ras;
break;

View File

@@ -425,6 +425,8 @@ static const struct amd_ip_funcs iceland_ih_ip_funcs = {
.soft_reset = iceland_ih_soft_reset,
.set_clockgating_state = iceland_ih_set_clockgating_state,
.set_powergating_state = iceland_ih_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_ih_funcs iceland_ih_funcs = {

View File

@@ -346,6 +346,21 @@ static int ih_v6_0_irq_init(struct amdgpu_device *adev)
DELAY, 3);
WREG32_SOC15(OSSSYS, 0, regIH_MSI_STORM_CTRL, tmp);
/* Redirect the interrupts to IH RB1 for dGPU */
if (adev->irq.ih1.ring_size) {
tmp = RREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_INDEX);
tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_INDEX, INDEX, 0);
WREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_INDEX, tmp);
tmp = RREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_DATA);
tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_DATA, CLIENT_ID, 0xa);
tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_DATA, SOURCE_ID, 0x0);
tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_DATA,
SOURCE_ID_MATCH_ENABLE, 0x1);
WREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_DATA, tmp);
}
pci_set_master(adev->pdev);
/* enable interrupts */
@@ -549,8 +564,15 @@ static int ih_v6_0_sw_init(void *handle)
adev->irq.ih.use_doorbell = true;
adev->irq.ih.doorbell_index = adev->doorbell_index.ih << 1;
adev->irq.ih1.ring_size = 0;
adev->irq.ih2.ring_size = 0;
if (!(adev->flags & AMD_IS_APU)) {
r = amdgpu_ih_ring_init(adev, &adev->irq.ih1, IH_RING_SIZE,
use_bus_addr);
if (r)
return r;
adev->irq.ih1.use_doorbell = true;
adev->irq.ih1.doorbell_index = (adev->doorbell_index.ih + 1) << 1;
}
/* initialize ih control register offset */
ih_v6_0_init_register_offset(adev);
@@ -748,6 +770,8 @@ static const struct amd_ip_funcs ih_v6_0_ip_funcs = {
.set_clockgating_state = ih_v6_0_set_clockgating_state,
.set_powergating_state = ih_v6_0_set_powergating_state,
.get_clockgating_state = ih_v6_0_get_clockgating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_ih_funcs ih_v6_0_funcs = {

View File

@@ -346,6 +346,21 @@ static int ih_v6_1_irq_init(struct amdgpu_device *adev)
DELAY, 3);
WREG32_SOC15(OSSSYS, 0, regIH_MSI_STORM_CTRL, tmp);
/* Redirect the interrupts to IH RB1 for dGPU */
if (adev->irq.ih1.ring_size) {
tmp = RREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_INDEX);
tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_INDEX, INDEX, 0);
WREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_INDEX, tmp);
tmp = RREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_DATA);
tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_DATA, CLIENT_ID, 0xa);
tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_DATA, SOURCE_ID, 0x0);
tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_DATA,
SOURCE_ID_MATCH_ENABLE, 0x1);
WREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_DATA, tmp);
}
pci_set_master(adev->pdev);
/* enable interrupts */
@@ -550,8 +565,15 @@ static int ih_v6_1_sw_init(void *handle)
adev->irq.ih.use_doorbell = true;
adev->irq.ih.doorbell_index = adev->doorbell_index.ih << 1;
adev->irq.ih1.ring_size = 0;
adev->irq.ih2.ring_size = 0;
if (!(adev->flags & AMD_IS_APU)) {
r = amdgpu_ih_ring_init(adev, &adev->irq.ih1, IH_RING_SIZE,
use_bus_addr);
if (r)
return r;
adev->irq.ih1.use_doorbell = true;
adev->irq.ih1.doorbell_index = (adev->doorbell_index.ih + 1) << 1;
}
/* initialize ih control register offset */
ih_v6_1_init_register_offset(adev);
@@ -753,6 +775,8 @@ static const struct amd_ip_funcs ih_v6_1_ip_funcs = {
.set_clockgating_state = ih_v6_1_set_clockgating_state,
.set_powergating_state = ih_v6_1_set_powergating_state,
.get_clockgating_state = ih_v6_1_get_clockgating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_ih_funcs ih_v6_1_funcs = {

View File

@@ -749,6 +749,8 @@ static const struct amd_ip_funcs ih_v7_0_ip_funcs = {
.set_clockgating_state = ih_v7_0_set_clockgating_state,
.set_powergating_state = ih_v7_0_set_powergating_state,
.get_clockgating_state = ih_v7_0_get_clockgating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_ih_funcs ih_v7_0_funcs = {

View File

@@ -759,6 +759,8 @@ static const struct amd_ip_funcs jpeg_v2_0_ip_funcs = {
.post_soft_reset = NULL,
.set_clockgating_state = jpeg_v2_0_set_clockgating_state,
.set_powergating_state = jpeg_v2_0_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_ring_funcs jpeg_v2_0_dec_ring_vm_funcs = {

View File

@@ -632,6 +632,8 @@ static const struct amd_ip_funcs jpeg_v2_5_ip_funcs = {
.post_soft_reset = NULL,
.set_clockgating_state = jpeg_v2_5_set_clockgating_state,
.set_powergating_state = jpeg_v2_5_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amd_ip_funcs jpeg_v2_6_ip_funcs = {
@@ -652,6 +654,8 @@ static const struct amd_ip_funcs jpeg_v2_6_ip_funcs = {
.post_soft_reset = NULL,
.set_clockgating_state = jpeg_v2_5_set_clockgating_state,
.set_powergating_state = jpeg_v2_5_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_ring_funcs jpeg_v2_5_dec_ring_vm_funcs = {

View File

@@ -557,6 +557,8 @@ static const struct amd_ip_funcs jpeg_v3_0_ip_funcs = {
.post_soft_reset = NULL,
.set_clockgating_state = jpeg_v3_0_set_clockgating_state,
.set_powergating_state = jpeg_v3_0_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_ring_funcs jpeg_v3_0_dec_ring_vm_funcs = {

View File

@@ -719,6 +719,8 @@ static const struct amd_ip_funcs jpeg_v4_0_ip_funcs = {
.post_soft_reset = NULL,
.set_clockgating_state = jpeg_v4_0_set_clockgating_state,
.set_powergating_state = jpeg_v4_0_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_ring_funcs jpeg_v4_0_dec_ring_vm_funcs = {

View File

@@ -1053,6 +1053,8 @@ static const struct amd_ip_funcs jpeg_v4_0_3_ip_funcs = {
.post_soft_reset = NULL,
.set_clockgating_state = jpeg_v4_0_3_set_clockgating_state,
.set_powergating_state = jpeg_v4_0_3_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_ring_funcs jpeg_v4_0_3_dec_ring_vm_funcs = {

View File

@@ -762,6 +762,8 @@ static const struct amd_ip_funcs jpeg_v4_0_5_ip_funcs = {
.post_soft_reset = NULL,
.set_clockgating_state = jpeg_v4_0_5_set_clockgating_state,
.set_powergating_state = jpeg_v4_0_5_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_ring_funcs jpeg_v4_0_5_dec_ring_vm_funcs = {

View File

@@ -513,6 +513,8 @@ static const struct amd_ip_funcs jpeg_v5_0_0_ip_funcs = {
.post_soft_reset = NULL,
.set_clockgating_state = jpeg_v5_0_0_set_clockgating_state,
.set_powergating_state = jpeg_v5_0_0_set_powergating_state,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
static const struct amdgpu_ring_funcs jpeg_v5_0_0_dec_ring_vm_funcs = {

View File

@@ -1176,6 +1176,8 @@ static const struct amd_ip_funcs mes_v10_1_ip_funcs = {
.hw_fini = mes_v10_1_hw_fini,
.suspend = mes_v10_1_suspend,
.resume = mes_v10_1_resume,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
const struct amdgpu_ip_block_version mes_v10_1_ip_block = {

View File

@@ -100,18 +100,76 @@ static const struct amdgpu_ring_funcs mes_v11_0_ring_funcs = {
.insert_nop = amdgpu_ring_insert_nop,
};
static const char *mes_v11_0_opcodes[] = {
"SET_HW_RSRC",
"SET_SCHEDULING_CONFIG",
"ADD_QUEUE",
"REMOVE_QUEUE",
"PERFORM_YIELD",
"SET_GANG_PRIORITY_LEVEL",
"SUSPEND",
"RESUME",
"RESET",
"SET_LOG_BUFFER",
"CHANGE_GANG_PRORITY",
"QUERY_SCHEDULER_STATUS",
"PROGRAM_GDS",
"SET_DEBUG_VMID",
"MISC",
"UPDATE_ROOT_PAGE_TABLE",
"AMD_LOG",
};
static const char *mes_v11_0_misc_opcodes[] = {
"WRITE_REG",
"INV_GART",
"QUERY_STATUS",
"READ_REG",
"WAIT_REG_MEM",
"SET_SHADER_DEBUGGER",
};
static const char *mes_v11_0_get_op_string(union MESAPI__MISC *x_pkt)
{
const char *op_str = NULL;
if (x_pkt->header.opcode < ARRAY_SIZE(mes_v11_0_opcodes))
op_str = mes_v11_0_opcodes[x_pkt->header.opcode];
return op_str;
}
static const char *mes_v11_0_get_misc_op_string(union MESAPI__MISC *x_pkt)
{
const char *op_str = NULL;
if ((x_pkt->header.opcode == MES_SCH_API_MISC) &&
(x_pkt->opcode < ARRAY_SIZE(mes_v11_0_misc_opcodes)))
op_str = mes_v11_0_misc_opcodes[x_pkt->opcode];
return op_str;
}
static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
void *pkt, int size,
int api_status_off)
{
int ndw = size / 4;
signed long r;
union MESAPI__ADD_QUEUE *x_pkt = pkt;
union MESAPI__MISC *x_pkt = pkt;
struct MES_API_STATUS *api_status;
struct amdgpu_device *adev = mes->adev;
struct amdgpu_ring *ring = &mes->ring;
unsigned long flags;
signed long timeout = adev->usec_timeout;
signed long timeout = 3000000; /* 3000 ms */
const char *op_str, *misc_op_str;
u32 fence_offset;
u64 fence_gpu_addr;
u64 *fence_ptr;
int ret;
if (x_pkt->header.opcode >= MES_SCH_API_MAX)
return -EINVAL;
if (amdgpu_emu_mode) {
timeout *= 100;
@@ -121,27 +179,52 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
}
BUG_ON(size % 4 != 0);
ret = amdgpu_device_wb_get(adev, &fence_offset);
if (ret)
return ret;
fence_gpu_addr =
adev->wb.gpu_addr + (fence_offset * 4);
fence_ptr = (u64 *)&adev->wb.wb[fence_offset];
*fence_ptr = 0;
spin_lock_irqsave(&mes->ring_lock, flags);
if (amdgpu_ring_alloc(ring, ndw)) {
spin_unlock_irqrestore(&mes->ring_lock, flags);
amdgpu_device_wb_free(adev, fence_offset);
return -ENOMEM;
}
api_status = (struct MES_API_STATUS *)((char *)pkt + api_status_off);
api_status->api_completion_fence_addr = mes->ring.fence_drv.gpu_addr;
api_status->api_completion_fence_value = ++mes->ring.fence_drv.sync_seq;
api_status->api_completion_fence_addr = fence_gpu_addr;
api_status->api_completion_fence_value = 1;
amdgpu_ring_write_multiple(ring, pkt, ndw);
amdgpu_ring_commit(ring);
spin_unlock_irqrestore(&mes->ring_lock, flags);
DRM_DEBUG("MES msg=%d was emitted\n", x_pkt->header.opcode);
op_str = mes_v11_0_get_op_string(x_pkt);
misc_op_str = mes_v11_0_get_misc_op_string(x_pkt);
r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
timeout);
if (misc_op_str)
dev_dbg(adev->dev, "MES msg=%s (%s) was emitted\n", op_str, misc_op_str);
else if (op_str)
dev_dbg(adev->dev, "MES msg=%s was emitted\n", op_str);
else
dev_dbg(adev->dev, "MES msg=%d was emitted\n", x_pkt->header.opcode);
r = amdgpu_mes_fence_wait_polling(fence_ptr, (u64)1, timeout);
amdgpu_device_wb_free(adev, fence_offset);
if (r < 1) {
DRM_ERROR("MES failed to response msg=%d\n",
x_pkt->header.opcode);
if (misc_op_str)
dev_err(adev->dev, "MES failed to respond to msg=%s (%s)\n",
op_str, misc_op_str);
else if (op_str)
dev_err(adev->dev, "MES failed to respond to msg=%s\n",
op_str);
else
dev_err(adev->dev, "MES failed to respond to msg=%d\n",
x_pkt->header.opcode);
while (halt_if_hws_hang)
schedule();
@@ -422,6 +505,36 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes)
offsetof(union MESAPI_SET_HW_RESOURCES, api_status));
}
static int mes_v11_0_set_hw_resources_1(struct amdgpu_mes *mes)
{
int size = 128 * PAGE_SIZE;
int ret = 0;
struct amdgpu_device *adev = mes->adev;
union MESAPI_SET_HW_RESOURCES_1 mes_set_hw_res_pkt;
memset(&mes_set_hw_res_pkt, 0, sizeof(mes_set_hw_res_pkt));
mes_set_hw_res_pkt.header.type = MES_API_TYPE_SCHEDULER;
mes_set_hw_res_pkt.header.opcode = MES_SCH_API_SET_HW_RSRC_1;
mes_set_hw_res_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
mes_set_hw_res_pkt.enable_mes_info_ctx = 1;
ret = amdgpu_bo_create_kernel(adev, size, PAGE_SIZE,
AMDGPU_GEM_DOMAIN_VRAM,
&mes->resource_1,
&mes->resource_1_gpu_addr,
&mes->resource_1_addr);
if (ret) {
dev_err(adev->dev, "(%d) failed to create mes resource_1 bo\n", ret);
return ret;
}
mes_set_hw_res_pkt.mes_info_ctx_mc_addr = mes->resource_1_gpu_addr;
mes_set_hw_res_pkt.mes_info_ctx_size = mes->resource_1->tbo.base.size;
return mes_v11_0_submit_pkt_and_poll_completion(mes,
&mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt),
offsetof(union MESAPI_SET_HW_RESOURCES_1, api_status));
}
static const struct amdgpu_mes_funcs mes_v11_0_funcs = {
.add_hw_queue = mes_v11_0_add_hw_queue,
.remove_hw_queue = mes_v11_0_remove_hw_queue,
@@ -1203,6 +1316,14 @@ static int mes_v11_0_hw_init(void *handle)
if (r)
goto failure;
if (amdgpu_sriov_is_mes_info_enable(adev)) {
r = mes_v11_0_set_hw_resources_1(&adev->mes);
if (r) {
DRM_ERROR("failed mes_v11_0_set_hw_resources_1, r=%d\n", r);
goto failure;
}
}
r = mes_v11_0_query_sched_status(&adev->mes);
if (r) {
DRM_ERROR("MES is busy\n");
@@ -1226,6 +1347,11 @@ failure:
static int mes_v11_0_hw_fini(void *handle)
{
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
if (amdgpu_sriov_is_mes_info_enable(adev)) {
amdgpu_bo_free_kernel(&adev->mes.resource_1, &adev->mes.resource_1_gpu_addr,
&adev->mes.resource_1_addr);
}
return 0;
}
@@ -1291,6 +1417,8 @@ static const struct amd_ip_funcs mes_v11_0_ip_funcs = {
.hw_fini = mes_v11_0_hw_fini,
.suspend = mes_v11_0_suspend,
.resume = mes_v11_0_resume,
.dump_ip_state = NULL,
.print_ip_state = NULL,
};
const struct amdgpu_ip_block_version mes_v11_0_ip_block = {

View File

@@ -559,6 +559,20 @@ static void mmhub_v1_8_get_clockgating(struct amdgpu_device *adev, u64 *flags)
}
static bool mmhub_v1_8_query_utcl2_poison_status(struct amdgpu_device *adev,
int hub_inst)
{
u32 fed, status;
status = RREG32_SOC15(MMHUB, hub_inst, regVM_L2_PROTECTION_FAULT_STATUS);
fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
/* reset page fault status */
WREG32_P(SOC15_REG_OFFSET(MMHUB, hub_inst,
regVM_L2_PROTECTION_FAULT_STATUS), 1, ~1);
return fed;
}
const struct amdgpu_mmhub_funcs mmhub_v1_8_funcs = {
.get_fb_location = mmhub_v1_8_get_fb_location,
.init = mmhub_v1_8_init,
@@ -568,6 +582,7 @@ const struct amdgpu_mmhub_funcs mmhub_v1_8_funcs = {
.setup_vm_pt_regs = mmhub_v1_8_setup_vm_pt_regs,
.set_clockgating = mmhub_v1_8_set_clockgating,
.get_clockgating = mmhub_v1_8_get_clockgating,
.query_utcl2_poison_status = mmhub_v1_8_query_utcl2_poison_status,
};
static const struct amdgpu_ras_err_status_reg_entry mmhub_v1_8_ce_reg_list[] = {
@@ -706,28 +721,32 @@ static const struct amdgpu_ras_block_hw_ops mmhub_v1_8_ras_hw_ops = {
.reset_ras_error_count = mmhub_v1_8_reset_ras_error_count,
};
static int mmhub_v1_8_aca_bank_generate_report(struct aca_handle *handle,
struct aca_bank *bank, enum aca_error_type type,
struct aca_bank_report *report, void *data)
static int mmhub_v1_8_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank,
enum aca_smu_type type, void *data)
{
u64 status, misc0;
struct aca_bank_info info;
u64 misc0;
int ret;
status = bank->regs[ACA_REG_IDX_STATUS];
if ((type == ACA_ERROR_TYPE_UE &&
ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_FAULT) ||
(type == ACA_ERROR_TYPE_CE &&
ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_CE)) {
ret = aca_bank_info_decode(bank, &info);
if (ret)
return ret;
ret = aca_bank_info_decode(bank, &report->info);
if (ret)
return ret;
misc0 = bank->regs[ACA_REG_IDX_MISC0];
report->count[type] = ACA_REG__MISC0__ERRCNT(misc0);
misc0 = bank->regs[ACA_REG_IDX_MISC0];
switch (type) {
case ACA_SMU_TYPE_UE:
ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE,
1ULL);
break;
case ACA_SMU_TYPE_CE:
ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_CE,
ACA_REG__MISC0__ERRCNT(misc0));
break;
default:
return -EINVAL;
}
return 0;
return ret;
}
/* reference to smu driver if header file */
@@ -741,7 +760,7 @@ static int mmhub_v1_8_err_codes[] = {
};
static bool mmhub_v1_8_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank,
enum aca_error_type type, void *data)
enum aca_smu_type type, void *data)
{
u32 instlo;
@@ -760,7 +779,7 @@ static bool mmhub_v1_8_aca_bank_is_valid(struct aca_handle *handle, struct aca_b
}
static const struct aca_bank_ops mmhub_v1_8_aca_bank_ops = {
.aca_bank_generate_report = mmhub_v1_8_aca_bank_generate_report,
.aca_bank_parser = mmhub_v1_8_aca_bank_parser,
.aca_bank_is_valid = mmhub_v1_8_aca_bank_is_valid,
};

Some files were not shown because too many files have changed in this diff Show More