linux/drivers/s390/scsi/zfcp_dbf.h

429 lines
11 KiB
C
Raw Normal View History

/*
* zfcp device driver
* debug feature declarations
*
* Copyright IBM Corp. 2008, 2016
*/
#ifndef ZFCP_DBF_H
#define ZFCP_DBF_H
#include <scsi/fc/fc_fcp.h>
#include "zfcp_ext.h"
#include "zfcp_fsf.h"
#include "zfcp_def.h"
#define ZFCP_DBF_TAG_LEN 7
#define ZFCP_DBF_INVALID_LUN 0xFFFFFFFFFFFFFFFFull
zfcp: close window with unblocked rport during rport gone On a successful end of reopen port forced, zfcp_erp_strategy_followup_success() re-uses the port erp_action and the subsequent zfcp_erp_action_cleanup() now sees ZFCP_ERP_SUCCEEDED with erp_action->action==ZFCP_ERP_ACTION_REOPEN_PORT instead of ZFCP_ERP_ACTION_REOPEN_PORT_FORCED but must not perform zfcp_scsi_schedule_rport_register(). We can detect this because the fresh port reopen erp_action is in its very first step ZFCP_ERP_STEP_UNINITIALIZED. Otherwise this opens a time window with unblocked rport (until the followup port reopen recovery would block it again). If a scsi_cmnd timeout occurs during this time window fc_timed_out() cannot work as desired and such command would indeed time out and trigger scsi_eh. This prevents a clean and timely path failover. This should not happen if the path issue can be recovered on FC transport layer such as path issues involving RSCNs. Also, unnecessary and repeated DID_IMM_RETRY for pending and undesired new requests occur because internally zfcp still has its zfcp_port blocked. As follow-on errors with scsi_eh, it can cause, in the worst case, permanently lost paths due to one of: sd <scsidev>: [<scsidisk>] Medium access timeout failure. Offlining disk! sd <scsidev>: Device offlined - not ready after error recovery For fix validation and to aid future debugging with other recoveries we now also trace (un)blocking of rports. Signed-off-by: Steffen Maier <maier@linux.vnet.ibm.com> Fixes: 5767620c383a ("[SCSI] zfcp: Do not unblock rport from REOPEN_PORT_FORCED") Fixes: a2fa0aede07c ("[SCSI] zfcp: Block FC transport rports early on errors") Fixes: 5f852be9e11d ("[SCSI] zfcp: Fix deadlock between zfcp ERP and SCSI") Fixes: 338151e06608 ("[SCSI] zfcp: make use of fc_remote_port_delete when target port is unavailable") Fixes: 3859f6a248cb ("[PATCH] zfcp: add rports to enable scsi_add_device to work again") Cc: <stable@vger.kernel.org> #2.6.32+ Reviewed-by: Benjamin Block <bblock@linux.vnet.ibm.com> Reviewed-by: Hannes Reinecke <hare@suse.com> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
2016-08-10 16:30:46 +00:00
enum zfcp_dbf_pseudo_erp_act_type {
ZFCP_PSEUDO_ERP_ACTION_RPORT_ADD = 0xff,
ZFCP_PSEUDO_ERP_ACTION_RPORT_DEL = 0xfe,
};
/**
* struct zfcp_dbf_rec_trigger - trace record for triggered recovery action
* @ready: number of ready recovery actions
* @running: number of running recovery actions
* @want: wanted recovery action
* @need: needed recovery action
*/
struct zfcp_dbf_rec_trigger {
u32 ready;
u32 running;
u8 want;
u8 need;
} __packed;
/**
* struct zfcp_dbf_rec_running - trace record for running recovery
* @fsf_req_id: request id for fsf requests
* @rec_status: status of the fsf request
* @rec_step: current step of the recovery action
* rec_count: recovery counter
*/
struct zfcp_dbf_rec_running {
u64 fsf_req_id;
u32 rec_status;
u16 rec_step;
u8 rec_action;
u8 rec_count;
} __packed;
/**
* enum zfcp_dbf_rec_id - recovery trace record id
* @ZFCP_DBF_REC_TRIG: triggered recovery identifier
* @ZFCP_DBF_REC_RUN: running recovery identifier
*/
enum zfcp_dbf_rec_id {
ZFCP_DBF_REC_TRIG = 1,
ZFCP_DBF_REC_RUN = 2,
};
/**
* struct zfcp_dbf_rec - trace record for error recovery actions
* @id: unique number of recovery record type
* @tag: identifier string specifying the location of initiation
* @lun: logical unit number
* @wwpn: word wide port number
* @d_id: destination ID
* @adapter_status: current status of the adapter
* @port_status: current status of the port
* @lun_status: current status of the lun
* @u.trig: structure zfcp_dbf_rec_trigger
* @u.run: structure zfcp_dbf_rec_running
*/
struct zfcp_dbf_rec {
u8 id;
char tag[ZFCP_DBF_TAG_LEN];
u64 lun;
u64 wwpn;
u32 d_id;
u32 adapter_status;
u32 port_status;
u32 lun_status;
union {
struct zfcp_dbf_rec_trigger trig;
struct zfcp_dbf_rec_running run;
} u;
} __packed;
/**
* enum zfcp_dbf_san_id - SAN trace record identifier
* @ZFCP_DBF_SAN_REQ: request trace record id
* @ZFCP_DBF_SAN_RES: response trace record id
* @ZFCP_DBF_SAN_ELS: extended link service record id
*/
enum zfcp_dbf_san_id {
ZFCP_DBF_SAN_REQ = 1,
ZFCP_DBF_SAN_RES = 2,
ZFCP_DBF_SAN_ELS = 3,
};
/** struct zfcp_dbf_san - trace record for SAN requests and responses
* @id: unique number of recovery record type
* @tag: identifier string specifying the location of initiation
* @fsf_req_id: request id for fsf requests
* @payload: unformatted information related to request/response
* @d_id: destination id
*/
struct zfcp_dbf_san {
u8 id;
char tag[ZFCP_DBF_TAG_LEN];
u64 fsf_req_id;
u32 d_id;
#define ZFCP_DBF_SAN_MAX_PAYLOAD (FC_CT_HDR_LEN + 32)
char payload[ZFCP_DBF_SAN_MAX_PAYLOAD];
u16 pl_len;
} __packed;
/**
* struct zfcp_dbf_hba_res - trace record for hba responses
* @req_issued: timestamp when request was issued
* @prot_status: protocol status
* @prot_status_qual: protocol status qualifier
* @fsf_status: fsf status
* @fsf_status_qual: fsf status qualifier
*/
struct zfcp_dbf_hba_res {
u64 req_issued;
u32 prot_status;
u8 prot_status_qual[FSF_PROT_STATUS_QUAL_SIZE];
u32 fsf_status;
u8 fsf_status_qual[FSF_STATUS_QUALIFIER_SIZE];
u32 port_handle;
u32 lun_handle;
} __packed;
/**
* struct zfcp_dbf_hba_uss - trace record for unsolicited status
* @status_type: type of unsolicited status
* @status_subtype: subtype of unsolicited status
* @d_id: destination ID
* @lun: logical unit number
* @queue_designator: queue designator
*/
struct zfcp_dbf_hba_uss {
u32 status_type;
u32 status_subtype;
u32 d_id;
u64 lun;
u64 queue_designator;
} __packed;
/**
* enum zfcp_dbf_hba_id - HBA trace record identifier
* @ZFCP_DBF_HBA_RES: response trace record
* @ZFCP_DBF_HBA_USS: unsolicited status trace record
* @ZFCP_DBF_HBA_BIT: bit error trace record
*/
enum zfcp_dbf_hba_id {
ZFCP_DBF_HBA_RES = 1,
ZFCP_DBF_HBA_USS = 2,
ZFCP_DBF_HBA_BIT = 3,
[SCSI] zfcp: Do not wakeup while suspended If the mapping of FCP device bus ID and corresponding subchannel is modified while the Linux image is suspended, the resume of FCP devices can fail. During resume, zfcp gets callbacks from cio regarding the modified subchannels but they can be arbitrarily mixed with the restore/resume callback. Since the cio callbacks would trigger adapter recovery, zfcp could wakeup before the resume callback. Therefore, ignore the cio callbacks regarding subchannels while being suspended. We can safely do so, since zfcp does not deal itself with subchannels. For problem determination purposes, we still trace the ignored callback events. The following kernel messages could be seen on resume: kernel: <WWPN>: parent <FCP device bus ID> should not be sleeping As part of adapter reopen recovery, zfcp performs auto port scanning which can erroneously try to register new remote ports with scsi_transport_fc and the device core code complains about the parent (adapter) still sleeping. kernel: zfcp.3dff9c: <FCP device bus ID>:\ Setting up the QDIO connection to the FCP adapter failed <last kernel message repeated 3 more times> kernel: zfcp.574d43: <FCP device bus ID>:\ ERP cannot recover an error on the FCP device In such cases, the adapter gave up recovery and remained blocked along with its child objects: remote ports and LUNs/scsi devices. Even the adapter shutdown as part of giving up recovery failed because the ccw device state remained disconnected. Later, the corresponding remote ports ran into dev_loss_tmo. As a result, the LUNs were erroneously not available again after resume. Even a manually triggered adapter recovery (e.g. sysfs attribute failed, or device offline/online via sysfs) could not recover the adapter due to the remaining disconnected state of the corresponding ccw device. Signed-off-by: Steffen Maier <maier@linux.vnet.ibm.com> Cc: <stable@vger.kernel.org> #2.6.32+ Signed-off-by: James Bottomley <JBottomley@Parallels.com>
2012-09-04 13:23:32 +00:00
ZFCP_DBF_HBA_BASIC = 4,
};
/**
* struct zfcp_dbf_hba - common trace record for HBA records
* @id: unique number of recovery record type
* @tag: identifier string specifying the location of initiation
* @fsf_req_id: request id for fsf requests
* @fsf_req_status: status of fsf request
* @fsf_cmd: fsf command
* @fsf_seq_no: fsf sequence number
* @pl_len: length of payload stored as zfcp_dbf_pay
* @u: record type specific data
*/
struct zfcp_dbf_hba {
u8 id;
char tag[ZFCP_DBF_TAG_LEN];
u64 fsf_req_id;
u32 fsf_req_status;
u32 fsf_cmd;
u32 fsf_seq_no;
u16 pl_len;
union {
struct zfcp_dbf_hba_res res;
struct zfcp_dbf_hba_uss uss;
struct fsf_bit_error_payload be;
} u;
} __packed;
/**
* enum zfcp_dbf_scsi_id - scsi trace record identifier
* @ZFCP_DBF_SCSI_CMND: scsi command trace record
*/
enum zfcp_dbf_scsi_id {
ZFCP_DBF_SCSI_CMND = 1,
};
/**
* struct zfcp_dbf_scsi - common trace record for SCSI records
* @id: unique number of recovery record type
* @tag: identifier string specifying the location of initiation
* @scsi_id: scsi device id
* @scsi_lun: scsi device logical unit number
* @scsi_result: scsi result
* @scsi_retries: current retry number of scsi request
* @scsi_allowed: allowed retries
* @fcp_rsp_info: FCP response info
* @scsi_opcode: scsi opcode
* @fsf_req_id: request id of fsf request
* @host_scribble: LLD specific data attached to SCSI request
* @pl_len: length of paload stored as zfcp_dbf_pay
* @fsf_rsp: response for fsf request
*/
struct zfcp_dbf_scsi {
u8 id;
char tag[ZFCP_DBF_TAG_LEN];
u32 scsi_id;
u32 scsi_lun;
u32 scsi_result;
u8 scsi_retries;
u8 scsi_allowed;
u8 fcp_rsp_info;
#define ZFCP_DBF_SCSI_OPCODE 16
u8 scsi_opcode[ZFCP_DBF_SCSI_OPCODE];
u64 fsf_req_id;
u64 host_scribble;
u16 pl_len;
struct fcp_resp_with_ext fcp_rsp;
} __packed;
/**
* struct zfcp_dbf_pay - trace record for unformatted payload information
* @area: area this record is originated from
* @counter: ascending record number
* @fsf_req_id: request id of fsf request
* @data: unformatted data
*/
struct zfcp_dbf_pay {
u8 counter;
char area[ZFCP_DBF_TAG_LEN];
u64 fsf_req_id;
#define ZFCP_DBF_PAY_MAX_REC 0x100
char data[ZFCP_DBF_PAY_MAX_REC];
} __packed;
/**
* struct zfcp_dbf - main dbf trace structure
* @pay: reference to payload trace area
* @rec: reference to recovery trace area
* @hba: reference to hba trace area
* @san: reference to san trace area
* @scsi: reference to scsi trace area
* @pay_lock: lock protecting payload trace buffer
* @rec_lock: lock protecting recovery trace buffer
* @hba_lock: lock protecting hba trace buffer
* @san_lock: lock protecting san trace buffer
* @scsi_lock: lock protecting scsi trace buffer
* @pay_buf: pre-allocated buffer for payload
* @rec_buf: pre-allocated buffer for recovery
* @hba_buf: pre-allocated buffer for hba
* @san_buf: pre-allocated buffer for san
* @scsi_buf: pre-allocated buffer for scsi
*/
struct zfcp_dbf {
debug_info_t *pay;
debug_info_t *rec;
debug_info_t *hba;
debug_info_t *san;
debug_info_t *scsi;
spinlock_t pay_lock;
spinlock_t rec_lock;
spinlock_t hba_lock;
spinlock_t san_lock;
spinlock_t scsi_lock;
struct zfcp_dbf_pay pay_buf;
struct zfcp_dbf_rec rec_buf;
struct zfcp_dbf_hba hba_buf;
struct zfcp_dbf_san san_buf;
struct zfcp_dbf_scsi scsi_buf;
};
/**
* zfcp_dbf_hba_fsf_resp_suppress - true if we should not trace by default
* @req: request that has been completed
*
* Returns true if FCP response with only benign residual under count.
*/
static inline
bool zfcp_dbf_hba_fsf_resp_suppress(struct zfcp_fsf_req *req)
{
struct fsf_qtcb *qtcb = req->qtcb;
u32 fsf_stat = qtcb->header.fsf_status;
struct fcp_resp *fcp_rsp;
u8 rsp_flags, fr_status;
if (qtcb->prefix.qtcb_type != FSF_IO_COMMAND)
return false; /* not an FCP response */
fcp_rsp = (struct fcp_resp *)&qtcb->bottom.io.fcp_rsp;
rsp_flags = fcp_rsp->fr_flags;
fr_status = fcp_rsp->fr_status;
return (fsf_stat == FSF_FCP_RSP_AVAILABLE) &&
(rsp_flags == FCP_RESID_UNDER) &&
(fr_status == SAM_STAT_GOOD);
}
static inline
void zfcp_dbf_hba_fsf_resp(char *tag, int level, struct zfcp_fsf_req *req)
{
if (debug_level_enabled(req->adapter->dbf->hba, level))
zfcp_dbf_hba_fsf_res(tag, level, req);
}
/**
* zfcp_dbf_hba_fsf_response - trace event for request completion
* @req: request that has been completed
*/
static inline
void zfcp_dbf_hba_fsf_response(struct zfcp_fsf_req *req)
{
struct fsf_qtcb *qtcb = req->qtcb;
if ((qtcb->prefix.prot_status != FSF_PROT_GOOD) &&
(qtcb->prefix.prot_status != FSF_PROT_FSF_STATUS_PRESENTED)) {
zfcp_dbf_hba_fsf_resp("fs_perr", 1, req);
} else if (qtcb->header.fsf_status != FSF_GOOD) {
zfcp_dbf_hba_fsf_resp("fs_ferr",
zfcp_dbf_hba_fsf_resp_suppress(req)
? 5 : 1, req);
} else if ((req->fsf_command == FSF_QTCB_OPEN_PORT_WITH_DID) ||
(req->fsf_command == FSF_QTCB_OPEN_LUN)) {
zfcp_dbf_hba_fsf_resp("fs_open", 4, req);
} else if (qtcb->header.log_length) {
zfcp_dbf_hba_fsf_resp("fs_qtcb", 5, req);
} else {
zfcp_dbf_hba_fsf_resp("fs_norm", 6, req);
}
}
static inline
void _zfcp_dbf_scsi(char *tag, int level, struct scsi_cmnd *scmd,
struct zfcp_fsf_req *req)
{
struct zfcp_adapter *adapter = (struct zfcp_adapter *)
scmd->device->host->hostdata[0];
if (debug_level_enabled(adapter->dbf->scsi, level))
zfcp_dbf_scsi(tag, level, scmd, req);
}
/**
* zfcp_dbf_scsi_result - trace event for SCSI command completion
* @scmd: SCSI command pointer
* @req: FSF request used to issue SCSI command
*/
static inline
void zfcp_dbf_scsi_result(struct scsi_cmnd *scmd, struct zfcp_fsf_req *req)
{
if (scmd->result != 0)
_zfcp_dbf_scsi("rsl_err", 3, scmd, req);
else if (scmd->retries > 0)
_zfcp_dbf_scsi("rsl_ret", 4, scmd, req);
else
_zfcp_dbf_scsi("rsl_nor", 6, scmd, req);
}
/**
* zfcp_dbf_scsi_fail_send - trace event for failure to send SCSI command
* @scmd: SCSI command pointer
*/
static inline
void zfcp_dbf_scsi_fail_send(struct scsi_cmnd *scmd)
{
_zfcp_dbf_scsi("rsl_fai", 4, scmd, NULL);
}
/**
* zfcp_dbf_scsi_abort - trace event for SCSI command abort
* @tag: tag indicating success or failure of abort operation
* @scmd: SCSI command to be aborted
* @fsf_req: request containing abort (might be NULL)
*/
static inline
void zfcp_dbf_scsi_abort(char *tag, struct scsi_cmnd *scmd,
struct zfcp_fsf_req *fsf_req)
{
_zfcp_dbf_scsi(tag, 1, scmd, fsf_req);
}
/**
* zfcp_dbf_scsi_devreset - trace event for Logical Unit or Target Reset
* @tag: tag indicating success or failure of reset operation
* @scmnd: SCSI command which caused this error recovery
* @flag: indicates type of reset (Target Reset, Logical Unit Reset)
*/
static inline
void zfcp_dbf_scsi_devreset(char *tag, struct scsi_cmnd *scmnd, u8 flag)
{
char tmp_tag[ZFCP_DBF_TAG_LEN];
if (flag == FCP_TMF_TGT_RESET)
memcpy(tmp_tag, "tr_", 3);
else
memcpy(tmp_tag, "lr_", 3);
memcpy(&tmp_tag[3], tag, 4);
_zfcp_dbf_scsi(tmp_tag, 1, scmnd, NULL);
}
scsi: zfcp: fix use-after-"free" in FC ingress path after TMF When SCSI EH invokes zFCP's callbacks for eh_device_reset_handler() and eh_target_reset_handler(), it expects us to relent the ownership over the given scsi_cmnd and all other scsi_cmnds within the same scope - LUN or target - when returning with SUCCESS from the callback ('release' them). SCSI EH can then reuse those commands. We did not follow this rule to release commands upon SUCCESS; and if later a reply arrived for one of those supposed to be released commands, we would still make use of the scsi_cmnd in our ingress tasklet. This will at least result in undefined behavior or a kernel panic because of a wrong kernel pointer dereference. To fix this, we NULLify all pointers to scsi_cmnds (struct zfcp_fsf_req *)->data in the matching scope if a TMF was successful. This is done under the locks (struct zfcp_adapter *)->abort_lock and (struct zfcp_reqlist *)->lock to prevent the requests from being removed from the request-hashtable, and the ingress tasklet from making use of the scsi_cmnd-pointer in zfcp_fsf_fcp_cmnd_handler(). For cases where a reply arrives during SCSI EH, but before we get a chance to NULLify the pointer - but before we return from the callback -, we assume that the code is protected from races via the CAS operation in blk_complete_request() that is called in scsi_done(). The following stacktrace shows an example for a crash resulting from the previous behavior: Unable to handle kernel pointer dereference at virtual kernel address fffffee17a672000 Oops: 0038 [#1] SMP CPU: 2 PID: 0 Comm: swapper/2 Not tainted task: 00000003f7ff5be0 ti: 00000003f3d38000 task.ti: 00000003f3d38000 Krnl PSW : 0404d00180000000 00000000001156b0 (smp_vcpu_scheduled+0x18/0x40) R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 EA:3 Krnl GPRS: 000000200000007e 0000000000000000 fffffee17a671fd8 0000000300000015 ffffffff80000000 00000000005dfde8 07000003f7f80e00 000000004fa4e800 000000036ce8d8f8 000000036ce8d9c0 00000003ece8fe00 ffffffff969c9e93 00000003fffffffd 000000036ce8da10 00000000003bf134 00000003f3b07918 Krnl Code: 00000000001156a2: a7190000 lghi %r1,0 00000000001156a6: a7380015 lhi %r3,21 #00000000001156aa: e32050000008 ag %r2,0(%r5) >00000000001156b0: 482022b0 lh %r2,688(%r2) 00000000001156b4: ae123000 sigp %r1,%r2,0(%r3) 00000000001156b8: b2220020 ipm %r2 00000000001156bc: 8820001c srl %r2,28 00000000001156c0: c02700000001 xilf %r2,1 Call Trace: ([<0000000000000000>] 0x0) [<000003ff807bdb8e>] zfcp_fsf_fcp_cmnd_handler+0x3de/0x490 [zfcp] [<000003ff807be30a>] zfcp_fsf_req_complete+0x252/0x800 [zfcp] [<000003ff807c0a48>] zfcp_fsf_reqid_check+0xe8/0x190 [zfcp] [<000003ff807c194e>] zfcp_qdio_int_resp+0x66/0x188 [zfcp] [<000003ff80440c64>] qdio_kick_handler+0xdc/0x310 [qdio] [<000003ff804463d0>] __tiqdio_inbound_processing+0xf8/0xcd8 [qdio] [<0000000000141fd4>] tasklet_action+0x9c/0x170 [<0000000000141550>] __do_softirq+0xe8/0x258 [<000000000010ce0a>] do_softirq+0xba/0xc0 [<000000000014187c>] irq_exit+0xc4/0xe8 [<000000000046b526>] do_IRQ+0x146/0x1d8 [<00000000005d6a3c>] io_return+0x0/0x8 [<00000000005d6422>] vtime_stop_cpu+0x4a/0xa0 ([<0000000000000000>] 0x0) [<0000000000103d8a>] arch_cpu_idle+0xa2/0xb0 [<0000000000197f94>] cpu_startup_entry+0x13c/0x1f8 [<0000000000114782>] smp_start_secondary+0xda/0xe8 [<00000000005d6efe>] restart_int_handler+0x56/0x6c [<0000000000000000>] 0x0 Last Breaking-Event-Address: [<00000000003bf12e>] arch_spin_lock_wait+0x56/0xb0 Suggested-by: Steffen Maier <maier@linux.vnet.ibm.com> Signed-off-by: Benjamin Block <bblock@linux.vnet.ibm.com> Fixes: ea127f9754 ("[PATCH] s390 (7/7): zfcp host adapter.") (tglx/history.git) Cc: <stable@vger.kernel.org> #2.6.32+ Signed-off-by: Steffen Maier <maier@linux.vnet.ibm.com> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
2016-12-09 16:16:31 +00:00
/**
* zfcp_dbf_scsi_nullcmnd() - trace NULLify of SCSI command in dev/tgt-reset.
* @scmnd: SCSI command that was NULLified.
* @fsf_req: request that owned @scmnd.
*/
static inline void zfcp_dbf_scsi_nullcmnd(struct scsi_cmnd *scmnd,
struct zfcp_fsf_req *fsf_req)
{
_zfcp_dbf_scsi("scfc__1", 3, scmnd, fsf_req);
}
#endif /* ZFCP_DBF_H */