Commit b5cddeb0 authored by Sharley Calzolari, committed by Koby Elbaz
Browse files

accel/habanalabs/gaudi2: add support for logging register accesses from debugfs



Add infrastructure for logging the last configuration register accesses
that occur via debugfs read/write operations. At interrupt time, these
log entries can be dumped to dmesg, which helps in diagnosing the cause
of RAZWI and ADDR_DEC interrupts.

The logging is implemented as a ring buffer of access entries, with each
entry recording timestamp and access details. To ensure correctness
under concurrent access, operations are now protected using spinlocks.
Entries are copied under lock and then printed after releasing it, which
minimizes time spent in the critical section.

Signed-off-by: Sharley Calzolari <sharley.calzolari@intel.com>
Reviewed-by: Koby Elbaz <koby.elbaz@intel.com>
Signed-off-by: Koby Elbaz <koby.elbaz@intel.com>
parent 214e26a4
Loading
Loading
Loading
Loading
+111 −0
Original line number Diff line number Diff line
@@ -788,6 +788,113 @@ static void hl_access_host_mem(struct hl_device *hdev, u64 addr, u64 *val,
	}
}

static void dump_cfg_access_entry(struct hl_device *hdev,
				  struct hl_debugfs_cfg_access_entry *entry)
{
	char *access_type = "";
	struct tm tm;

	switch (entry->debugfs_type) {
	case DEBUGFS_READ32:
		access_type = "READ32 from";
		break;
	case DEBUGFS_WRITE32:
		access_type = "WRITE32 to";
		break;
	case DEBUGFS_READ64:
		access_type = "READ64 from";
		break;
	case DEBUGFS_WRITE64:
		access_type = "WRITE64 to";
		break;
	default:
		dev_err(hdev->dev, "Invalid DEBUGFS access type (%u)\n", entry->debugfs_type);
		return;
	}

	time64_to_tm(entry->seconds_since_epoch, 0, &tm);
	dev_info(hdev->dev,
		"%ld-%02d-%02d %02d:%02d:%02d (UTC): %s %#llx\n", tm.tm_year + 1900, tm.tm_mon + 1,
		tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, access_type, entry->addr);
}

void hl_debugfs_cfg_access_history_dump(struct hl_device *hdev)
{
	struct hl_debugfs_cfg_access *dbgfs = &hdev->debugfs_cfg_accesses;
	u32 i, head, count = 0;
	time64_t entry_time, now;
	unsigned long flags;

	now = ktime_get_real_seconds();

	spin_lock_irqsave(&dbgfs->lock, flags);
	head = dbgfs->head;
	if (head == 0)
		i = HL_DBGFS_CFG_ACCESS_HIST_LEN - 1;
	else
		i = head - 1;

	/* Walk back until timeout or invalid entry */
	while (dbgfs->cfg_access_list[i].valid) {
		entry_time = dbgfs->cfg_access_list[i].seconds_since_epoch;
		/* Stop when entry is older than timeout */
		if (now - entry_time > HL_DBGFS_CFG_ACCESS_HIST_TIMEOUT_SEC)
			break;

		/* print single entry under lock */
		{
			struct hl_debugfs_cfg_access_entry entry = dbgfs->cfg_access_list[i];
			/*
			 * We copy the entry out under lock and then print after
			 * releasing the lock to minimize time under lock.
			 */
			spin_unlock_irqrestore(&dbgfs->lock, flags);
			dump_cfg_access_entry(hdev, &entry);
			spin_lock_irqsave(&dbgfs->lock, flags);
		}

		/* mark consumed */
		dbgfs->cfg_access_list[i].valid = false;

		if (i == 0)
			i = HL_DBGFS_CFG_ACCESS_HIST_LEN - 1;
		else
			i--;
		count++;
		if (count >= HL_DBGFS_CFG_ACCESS_HIST_LEN)
			break;
	}
	spin_unlock_irqrestore(&dbgfs->lock, flags);
}

/*
 * Record a debugfs access in the config access history, but only when the
 * whole access [addr, addr + access_size) lies inside the PCI_REGION_CFG
 * memory region. Accesses outside that region are ignored.
 *
 * The entry is written at the current head of the ring buffer, stamped with
 * the current wall-clock time in seconds, and the head is advanced with
 * wrap-around. The update is done under the history spinlock.
 */
static void check_if_cfg_access_and_log(struct hl_device *hdev, u64 addr, size_t access_size,
					enum debugfs_access_type access_type)
{
	struct pci_mem_region *cfg_region = &hdev->pci_mem_region[PCI_REGION_CFG];
	struct hl_debugfs_cfg_access *hist = &hdev->debugfs_cfg_accesses;
	struct hl_debugfs_cfg_access_entry *slot;
	unsigned long irq_flags;

	/* Below the start of the config region */
	if (addr < cfg_region->region_base)
		return;

	/* Region smaller than the access; also keeps the subtraction below from wrapping */
	if (cfg_region->region_size < access_size)
		return;

	/* Access would run past the end of the config region */
	if (addr > cfg_region->region_base + cfg_region->region_size - access_size)
		return;

	spin_lock_irqsave(&hist->lock, irq_flags);

	slot = &hist->cfg_access_list[hist->head];
	slot->seconds_since_epoch = ktime_get_real_seconds();
	slot->addr = addr;
	slot->debugfs_type = access_type;
	slot->valid = true;

	hist->head = (hist->head + 1) % HL_DBGFS_CFG_ACCESS_HIST_LEN;

	spin_unlock_irqrestore(&hist->lock, irq_flags);
}

static int hl_access_mem(struct hl_device *hdev, u64 addr, u64 *val,
				enum debugfs_access_type acc_type)
{
@@ -805,6 +912,7 @@ static int hl_access_mem(struct hl_device *hdev, u64 addr, u64 *val,
			return rc;
	}

	check_if_cfg_access_and_log(hdev, addr, acc_size, acc_type);
	rc = hl_access_dev_mem_by_region(hdev, addr, val, acc_type, &found);
	if (rc) {
		dev_err(hdev->dev,
@@ -1762,6 +1870,9 @@ int hl_debugfs_device_init(struct hl_device *hdev)
	spin_lock_init(&dev_entry->userptr_spinlock);
	mutex_init(&dev_entry->ctx_mem_hash_mutex);

	spin_lock_init(&hdev->debugfs_cfg_accesses.lock);
	hdev->debugfs_cfg_accesses.head = 0; /* already zero by alloc but explicit init is fine */

	return 0;
}

+36 −1
Original line number Diff line number Diff line
@@ -91,6 +91,8 @@ struct hl_fpriv;
#define HL_COMMON_DEC_INTERRUPT_ID	0xFFE

#define HL_STATE_DUMP_HIST_LEN			5
#define HL_DBGFS_CFG_ACCESS_HIST_LEN		20
#define HL_DBGFS_CFG_ACCESS_HIST_TIMEOUT_SEC	2 /* 2s */

/* Default value for device reset trigger , an invalid value */
#define HL_RESET_TRIGGER_DEFAULT	0xFF
@@ -2436,6 +2438,32 @@ struct hl_dbg_device_entry {
	u8				i2c_len;
};

/**
 * struct hl_debugfs_cfg_access_entry - single debugfs config access object, member of
 * hl_debugfs_cfg_access.
 * @seconds_since_epoch: seconds since January 1, 1970, used for time comparisons.
 * @debugfs_type: the debugfs operation requested, can be READ32, WRITE32, READ64 or WRITE64.
 * @addr: the requested address to access.
 * @valid: if set, this entry has valid data for dumping at interrupt time.
 */
struct hl_debugfs_cfg_access_entry {
	ktime_t				seconds_since_epoch;
	enum debugfs_access_type	debugfs_type;
	u64				addr;
	bool				valid;
};

/**
 * struct hl_debugfs_cfg_access - saves debugfs config region access requests history.
 * @cfg_access_list: ring buffer of objects describing config region access requests,
 *                   holding up to HL_DBGFS_CFG_ACCESS_HIST_LEN entries.
 * @head: index in cfg_access_list where the next new entry is written; wraps
 *        around to 0 after the last slot.
 * @lock: protects head and the entries of cfg_access_list.
 */
struct hl_debugfs_cfg_access {
	struct hl_debugfs_cfg_access_entry	cfg_access_list[HL_DBGFS_CFG_ACCESS_HIST_LEN];
	u32					head;
	spinlock_t			lock; /* protects head and entries */
};

/**
 * struct hl_hw_obj_name_entry - single hw object name, member of
 * hl_state_dump_specs
@@ -3281,6 +3309,7 @@ struct eq_heartbeat_debug_info {
 * @hl_chip_info: ASIC's sensors information.
 * @device_status_description: device status description.
 * @hl_debugfs: device's debugfs manager.
 * @debugfs_cfg_accesses: list of last debugfs config region accesses.
 * @cb_pool: list of pre allocated CBs.
 * @cb_pool_lock: protects the CB pool.
 * @internal_cb_pool_virt_addr: internal command buffer pool virtual address.
@@ -3461,6 +3490,7 @@ struct hl_device {
	struct hwmon_chip_info		*hl_chip_info;

	struct hl_dbg_device_entry	hl_debugfs;
	struct hl_debugfs_cfg_access	debugfs_cfg_accesses;

	struct list_head		cb_pool;
	spinlock_t			cb_pool_lock;
@@ -4110,6 +4140,7 @@ void hl_debugfs_add_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx);
void hl_debugfs_remove_ctx_mem_hash(struct hl_device *hdev, struct hl_ctx *ctx);
void hl_debugfs_set_state_dump(struct hl_device *hdev, char *data,
					unsigned long length);
void hl_debugfs_cfg_access_history_dump(struct hl_device *hdev);

#else

@@ -4185,6 +4216,10 @@ static inline void hl_debugfs_set_state_dump(struct hl_device *hdev,
{
}

/*
 * No-op variant of hl_debugfs_cfg_access_history_dump(), selected by the #else
 * branch of the surrounding preprocessor conditional so callers need no #ifdef.
 * NOTE(review): presumably this is the build without debugfs support - confirm
 * against the enclosing #if, which is outside this hunk.
 */
static inline void hl_debugfs_cfg_access_history_dump(struct hl_device *hdev)
{
}

#endif

/* Security */
+1 −0
Original line number Diff line number Diff line
@@ -10610,6 +10610,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
	if (event_mask & HL_NOTIFIER_EVENT_GENERAL_HW_ERR)
		hl_handle_critical_hw_err(hdev, event_type, &event_mask);

	hl_debugfs_cfg_access_history_dump(hdev);
	event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
	hl_device_cond_reset(hdev, reset_flags, event_mask);
}