Merge tag 'misc-habanalabs-next-2021-04-10' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into char-misc-next

Oded writes:

This tag contains habanalabs driver changes for v5.13:

- Add support to reset device after the user closes the file descriptor.
  Because we support a single user, we can reset the device (if needed)
  after a user closes its file descriptor to make sure the device is in
  an idle and clean state for the next user.

- Add a new feature to allow the user to wait on interrupt. This is needed
  for future ASICs.

- Replace GFP_ATOMIC with GFP_KERNEL wherever possible and add code to
  support failure of allocating with GFP_ATOMIC.

- Update code to support the latest firmware image:
  - More security features are done in the firmware
  - Remove hard-coded assumptions and replace them with values that are
    sent to the firmware on loading.
  - Print device unusable error
  - Reset device in case the communication between driver and firmware
    gets out of sync.
  - Support new PCI device ids for secured GAUDI.

- Expose current power draw through the INFO IOCTL.

- Support resetting the device upon a request from the BMC (through F/W).

- Always use only a single MSI in GAUDI, due to H/W limitation.

- Improve data-path code by taking out code from spinlock protection.

- Allow user to specify custom timeout per Command Submission.

- Some enhancements to debugfs.

- Various minor changes and improvements.

* tag 'misc-habanalabs-next-2021-04-10' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux: (41 commits)
  habanalabs: print f/w boot unknown error
  habanalabs: update to latest F/W communication header
  habanalabs/gaudi: skip iATU if F/W security is enabled
  habanalabs/gaudi: derive security status from pci id
  habanalabs: move dram scrub to free sequence
  habanalabs: send dynamic msi-x indexes to f/w
  habanalabs/gaudi: clear QM errors only if not in stop_on_err mode
  habanalabs: support DEVICE_UNUSABLE error indication from FW
  habanalabs: use strscpy instead of sprintf and strlcpy
  habanalabs: remove the store jobs array from CS IOCTL
  habanalabs/gaudi: add debugfs to DMA from the device
  habanalabs/gaudi: sync stream add protection to SOB reset flow
  habanalabs: add custom timeout flag per cs
  habanalabs: improve utilization calculation
  habanalabs: support legacy and new pll indexes
  habanalabs: move relevant datapath work outside cs lock
  habanalabs: avoid soft lockup bug upon mapping error
  habanalabs/gaudi: Update async events header
  habanalabs/gaudi: unsecure TPC cfg status registers
  habanalabs/gaudi: always use single-msi mode
  ...
This commit is contained in:
Greg Kroah-Hartman
2021-04-11 08:52:09 +02:00
31 changed files with 2203 additions and 527 deletions

View File

@@ -297,6 +297,7 @@ enum hl_device_status {
#define HL_INFO_SYNC_MANAGER 14
#define HL_INFO_TOTAL_ENERGY 15
#define HL_INFO_PLL_FREQUENCY 16
#define HL_INFO_POWER 17
#define HL_INFO_VERSION_MAX_LEN 128
#define HL_INFO_CARD_NAME_MAX_LEN 16
@@ -410,6 +411,14 @@ struct hl_pll_frequency_info {
__u16 output[HL_PLL_NUM_OUTPUTS];
};
/**
* struct hl_power_info - power information
* @power: power consumption value. Presumably the payload reported for the
*         HL_INFO_POWER opcode; the units are not stated here - TODO confirm.
*/
struct hl_power_info {
__u64 power;
};
/**
* struct hl_info_sync_manager - sync manager information
* @first_available_sync_object: first available sob
@@ -621,6 +630,7 @@ struct hl_cs_chunk {
#define HL_CS_FLAGS_STAGED_SUBMISSION 0x40
#define HL_CS_FLAGS_STAGED_SUBMISSION_FIRST 0x80
#define HL_CS_FLAGS_STAGED_SUBMISSION_LAST 0x100
#define HL_CS_FLAGS_CUSTOM_TIMEOUT 0x200
#define HL_CS_STATUS_SUCCESS 0
@@ -634,17 +644,10 @@ struct hl_cs_in {
/* holds address of array of hl_cs_chunk for execution phase */
__u64 chunks_execute;
union {
/* this holds address of array of hl_cs_chunk for store phase -
* Currently not in use
*/
__u64 chunks_store;
/* Sequence number of a staged submission CS
* valid only if HL_CS_FLAGS_STAGED_SUBMISSION is set
*/
__u64 seq;
};
/* Sequence number of a staged submission CS
* valid only if HL_CS_FLAGS_STAGED_SUBMISSION is set
*/
__u64 seq;
/* Number of chunks in restore phase array. Maximum number is
* HL_MAX_JOBS_PER_CS
@@ -656,8 +659,10 @@ struct hl_cs_in {
*/
__u32 num_chunks_execute;
/* Number of chunks in store phase array - Currently not in use */
__u32 num_chunks_store;
/* timeout in seconds - valid only if HL_CS_FLAGS_CUSTOM_TIMEOUT
* is set
*/
__u32 timeout;
/* HL_CS_FLAGS_* */
__u32 cs_flags;
@@ -682,14 +687,46 @@ union hl_cs_args {
struct hl_cs_out out;
};
#define HL_WAIT_CS_FLAGS_INTERRUPT 0x2
#define HL_WAIT_CS_FLAGS_INTERRUPT_MASK 0xFFF00000
struct hl_wait_cs_in {
/* Command submission sequence number */
__u64 seq;
/* Absolute timeout to wait in microseconds */
__u64 timeout_us;
union {
struct {
/* Command submission sequence number */
__u64 seq;
/* Absolute timeout to wait for command submission
* in microseconds
*/
__u64 timeout_us;
};
struct {
/* User address for completion comparison.
* upon interrupt, driver will compare the value pointed
* by this address with the supplied target value.
* in order not to perform any comparison, set address
* to all 1s.
* Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT is set
*/
__u64 addr;
/* Target value for completion comparison */
__u32 target;
/* Absolute timeout to wait for interrupt
* in microseconds
*/
__u32 interrupt_timeout_us;
};
};
/* Context ID - Currently not in use */
__u32 ctx_id;
__u32 pad;
/* HL_WAIT_CS_FLAGS_*
* If HL_WAIT_CS_FLAGS_INTERRUPT is set, this field should include
* interrupt id according to HL_WAIT_CS_FLAGS_INTERRUPT_MASK. In order
* not to specify an interrupt id, set mask to all 1s.
*/
__u32 flags;
};
#define HL_WAIT_CS_STATUS_COMPLETED 0
@@ -999,8 +1036,8 @@ struct hl_debug_args {
* Each JOB will be enqueued on a specific queue, according to the user's input.
* There can be more than one JOB per queue.
*
* The CS IOCTL will receive three sets of JOBS. One set is for "restore" phase,
* a second set is for "execution" phase and a third set is for "store" phase.
* The CS IOCTL will receive two sets of JOBS. One set is for "restore" phase
* and a second set is for "execution" phase.
* The JOBS on the "restore" phase are enqueued only after context-switch
* (or if it's the first CS for this context). The user can also order the
* driver to run the "restore" phase explicitly