From 59fe400d433137c48de81650026922a88e167177 Mon Sep 17 00:00:00 2001 From: Jan Safranek Date: Wed, 19 Aug 2020 16:56:24 +0200 Subject: [PATCH] : Umount volumes with force In 4.5, NFS driver pod used 'clientaddr=' as a mount option. When such pod restarts, it gets a different IP address and volumes mounted by the old pod are basically broken - all operations (stat, umount) will just hang forever. Therefore: 1. Do not use IsLikelyNotMountPoint - it runs stat() on the volume, which could hang forever. Read /etc/mounts instead. 2. Use "umount -f" to forcefully remove NFS mount. Carry at least for whole OCP 4.6, where we need to support move from 4.5. --- pkg/nfs/nodeserver.go | 89 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 77 insertions(+), 12 deletions(-) diff --git a/pkg/nfs/nodeserver.go b/pkg/nfs/nodeserver.go index 5360f6fa..c712c994 100644 --- a/pkg/nfs/nodeserver.go +++ b/pkg/nfs/nodeserver.go @@ -19,7 +19,9 @@ package nfs import ( "fmt" "os" + "os/exec" "strings" + "time" "github.com/golang/glog" @@ -35,6 +37,11 @@ type nodeServer struct { mounter mount.Interface } +const ( + // Deadline for unmount. After this time, umount -f is performed. + unmountTimeout = time.Minute +) + func (ns *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (*csi.NodePublishVolumeResponse, error) { targetPath := req.GetTargetPath() notMnt, err := ns.mounter.IsLikelyNotMountPoint(targetPath) @@ -82,29 +89,87 @@ func (ns *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis return &csi.NodePublishVolumeResponse{}, nil } -func (ns *nodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpublishVolumeRequest) (*csi.NodeUnpublishVolumeResponse, error) { - targetPath := req.GetTargetPath() - notMnt, err := ns.mounter.IsLikelyNotMountPoint(targetPath) - +func (ns *nodeServer) IsNotMountPoint(path string) (bool, error) { + mtab, err := ns.mounter.List() if err != nil { - if os.IsNotExist(err) { - return nil, status.Error(codes.NotFound, "Targetpath not found") - } else { - return nil, status.Error(codes.Internal, err.Error()) + return false, err + } + + for _, mnt := range mtab { + // This is how a directory deleted on the NFS server looks like + deletedDir := fmt.Sprintf("%s\\040(deleted)", mnt.Path) + + if mnt.Path == path || mnt.Path == deletedDir { + return false, nil } } - if notMnt { - return nil, status.Error(codes.NotFound, "Volume not mounted") - } + return true, nil +} - err = mount.CleanupMountPoint(req.GetTargetPath(), ns.mounter, false) +func (ns *nodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpublishVolumeRequest) (*csi.NodeUnpublishVolumeResponse, error) { + targetPath := req.GetTargetPath() + glog.V(6).Infof("NodeUnpublishVolume started for %s", targetPath) + + notMnt, err := ns.IsNotMountPoint(targetPath) if err != nil { return nil, status.Error(codes.Internal, err.Error()) } + glog.V(4).Infof("NodeUnpublishVolume: path %s is *not* a mount point: %t", targetPath, notMnt) + if !notMnt { + + err := ns.tryUnmount(targetPath) + if err != nil { + if err == context.DeadlineExceeded { + glog.V(2).Infof("Timed out waiting for unmount of %s, trying with -f", targetPath) + err = ns.forceUnmount(targetPath) + } + } + if err != nil { + return nil, status.Error(codes.Internal, err.Error()) + } + glog.V(2).Infof("Unmounted %s", targetPath) + } + + if err := os.Remove(targetPath); err != nil { + if !os.IsNotExist(err) { + return nil, status.Error(codes.Internal, err.Error()) + } + } + glog.V(4).Infof("Cleaned %s", targetPath) + return &csi.NodeUnpublishVolumeResponse{}, nil } +// tryUnmount calls plain "umount" and waits for unmountTimeout for it to finish. +func (ns *nodeServer) tryUnmount(path string) error { + ctx, cancel := context.WithTimeout(context.Background(), unmountTimeout) + defer cancel() + + cmd := exec.CommandContext(ctx, "umount", path) + out, cmderr := cmd.CombinedOutput() + + // CombinedOutput() does not return DeadlineExceeded, make sure it's + // propagated on timeout. + if ctx.Err() != nil { + return ctx.Err() + } + + if cmderr != nil { + return fmt.Errorf("failed to unmount volume: %s: %s", cmderr, string(out)) + } + return nil +} + +func (ns *nodeServer) forceUnmount(path string) error { + cmd := exec.Command("umount", "-f", path) + out, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("failed to force-unmount volume: %s: %s", err, string(out)) + } + return nil +} + func (ns *nodeServer) NodeGetInfo(ctx context.Context, req *csi.NodeGetInfoRequest) (*csi.NodeGetInfoResponse, error) { glog.V(5).Infof("Using default NodeGetInfo")