Commit fe245c8f authored by David Howells
Browse files

afs: Add comments on abort handling



Add some comments on AFS abort code handling in the rotation algorithm and
adjust the errors produced to match.

Reported-by: Jeffrey E Altman <jaltman@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeffrey Altman <jaltman@auristor.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: linux-afs@lists.infradead.org
parent bad1a11c
Loading
Loading
Loading
Loading
+90 −11
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@
#include <linux/sched/signal.h>
#include "internal.h"
#include "afs_fs.h"
#include "protocol_uae.h"

/*
 * Begin iteration through a server list, starting with the vnode's last used
@@ -143,6 +144,11 @@ bool afs_select_fileserver(struct afs_operation *op)
	case -ECONNABORTED:
		/* The far side rejected the operation on some grounds.  This
		 * might involve the server being busy or the volume having been moved.
		 *
		 * Note that various V* errors should not be sent to a cache manager
		 * by a fileserver as they should be translated to more modern UAE*
		 * errors instead.  IBM AFS and OpenAFS fileservers, however, do leak
		 * these abort codes.
		 */
		switch (op->ac.abort_code) {
		case VNOVOL:
@@ -150,6 +156,11 @@ bool afs_select_fileserver(struct afs_operation *op)
			 * - May indicate that the VL is wrong - retry once and compare
			 *   the results.
			 * - May indicate that the fileserver couldn't attach to the vol.
			 * - The volume might have been temporarily removed so that it can
			 *   be replaced by a volume restore.  "vos" might have ended one
			 *   transaction and has yet to create the next.
			 * - The volume might not be blessed or might not be in-service
			 *   (administrative action).
			 */
			if (op->flags & AFS_OPERATION_VNOVOL) {
				op->error = -EREMOTEIO;
@@ -183,16 +194,56 @@ bool afs_select_fileserver(struct afs_operation *op)
			_leave(" = t [vnovol]");
			return true;

		case VSALVAGE: /* TODO: Should this return an error or iterate? */
		case VVOLEXISTS:
		case VNOSERVICE:
		case VONLINE:
		case VDISKFULL:
		case VOVERQUOTA:
			op->error = afs_abort_to_error(op->ac.abort_code);
			/* These should not be returned from the fileserver. */
			pr_warn("Fileserver returned unexpected abort %d\n",
				op->ac.abort_code);
			op->error = -EREMOTEIO;
			goto next_server;

		case VNOSERVICE:
			/* Prior to AFS 3.2 VNOSERVICE was returned from the fileserver
			 * if the volume was neither in-service nor administratively
			 * blessed.  All usage was replaced by VNOVOL because AFS 3.1 and
			 * earlier cache managers did not handle VNOSERVICE and assumed
			 * it was the client OS's errno 105.
			 *
			 * Starting with OpenAFS 1.4.8 VNOSERVICE was repurposed as the
			 * fileserver idle dead time error which was sent in place of
			 * RX_CALL_TIMEOUT (-3).  The error was intended to be sent if the
			 * fileserver took too long to send a reply to the client.
			 * RX_CALL_TIMEOUT would have caused the cache manager to mark the
			 * server down whereas VNOSERVICE since AFS 3.2 would cause the
			 * cache manager to temporarily (up to 15 minutes) mark the volume
			 * instance as unusable.
			 *
			 * The idle dead logic resulted in cache inconsistency since a
			 * state changing call that the cache manager assumed was dead
			 * could still be processed to completion by the fileserver.  This
			 * logic was removed in OpenAFS 1.8.0 and VNOSERVICE is no longer
			 * returned.  However, many 1.4.8 through 1.6.24 fileservers are
			 * still in existence.
			 *
			 * AuriStorFS fileservers have never returned VNOSERVICE.
			 *
			 * VNOSERVICE should be treated as an alias for RX_CALL_TIMEOUT.
			 */
		case RX_CALL_TIMEOUT:
			op->error = -ETIMEDOUT;
			goto next_server;

		case VSALVAGING: /* This error should not be leaked to cache managers
				  * but is from OpenAFS demand attach fileservers.
				  * It should be treated as an alias for VOFFLINE.
				  */
		case VSALVAGE: /* VSALVAGE should be treated as a synonym of VOFFLINE */
		case VOFFLINE:
			/* The volume is in use by the volserver or another volume utility
			 * for an operation that might alter the contents.  The volume is
			 * expected to come back but it might take a long time (could be
			 * days).
			 */
			if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &op->volume->flags)) {
				afs_busy(op->volume, op->ac.abort_code);
				clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
@@ -207,11 +258,20 @@ bool afs_select_fileserver(struct afs_operation *op)
			}
			goto busy;

		case VSALVAGING:
		case VRESTARTING:
		case VRESTARTING: /* The fileserver is either shutting down or starting up. */
		case VBUSY:
			/* Retry after going round all the servers unless we
			 * have a file lock we need to maintain.
			/* The volume is in use by the volserver or another volume
			 * utility for an operation that is not expected to alter the
			 * contents of the volume.  VBUSY does not need to be returned
			 * for a ROVOL or BACKVOL bound to an ITBusy volserver
			 * transaction.  The fileserver is permitted to continue serving
			 * content from ROVOLs and BACKVOLs during an ITBusy transaction
			 * because the content will not change.  However, many fileserver
			 * releases do return VBUSY for ROVOL and BACKVOL instances under
			 * many circumstances.
			 *
			 * Retry after going round all the servers unless we have a file
			 * lock we need to maintain.
			 */
			if (op->flags & AFS_OPERATION_NO_VSLEEP) {
				op->error = -EBUSY;
@@ -270,10 +330,29 @@ bool afs_select_fileserver(struct afs_operation *op)

			goto restart_from_beginning;

		case VDISKFULL:
		case UAENOSPC:
			/* The partition is full.  Only applies to RWVOLs.
			 * Translate locally and return ENOSPC.
			 * No replicas to failover to.
			 */
			op->error = -ENOSPC;
			goto failed_but_online;

		case VOVERQUOTA:
		case UAEDQUOT:
			/* Volume is full.  Only applies to RWVOLs.
			 * Translate locally and return EDQUOT.
			 * No replicas to failover to.
			 */
			op->error = -EDQUOT;
			goto failed_but_online;

		default:
			op->error = afs_abort_to_error(op->ac.abort_code);
		failed_but_online:
			clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
			clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
			op->error = afs_abort_to_error(op->ac.abort_code);
			goto failed;
		}