diff options
author | Benjamin Coddington <bcodding@redhat.com> | 2015-01-16 14:23:41 -0500 |
---|---|---|
committer | Lukas Nykryn <lnykryn@redhat.com> | 2015-02-27 10:57:23 +0100 |
commit | 1e58eb682bd10c14d5b807130f18528545725a40 (patch) | |
tree | 97dbcabe9e07f1cf633914aeb468146b4ad61a90 | |
parent | 502d080dae0a3d604a0ff77fc4b0338102b847bc (diff) | |
download | initscripts-1e58eb682bd10c14d5b807130f18528545725a40.tar initscripts-1e58eb682bd10c14d5b807130f18528545725a40.tar.gz initscripts-1e58eb682bd10c14d5b807130f18528545725a40.tar.bz2 initscripts-1e58eb682bd10c14d5b807130f18528545725a40.tar.xz initscripts-1e58eb682bd10c14d5b807130f18528545725a40.zip |
netfs: don't race NFS umount to network shutdown
RHEL6 customers have been reporting hangs when restarting due to IO
for NFS filesystems being unable to flush after network shutdown.
The current __umount_loop allows newly created processes to continue
to open files to NFS filesystems, which can create this problem.
Change the umount logic to perform a MNT_DETACH, then search for
processes that have open file descriptors on the detached filesystems.
The detach prevents newly created processes from opening new files
during our search. Finally, after making every attempt to clean up
processes with open files, perform a sync to flush NFS filesystems
before continuing onto network shutdown.
-rwxr-xr-x | rc.d/init.d/netfs | 62 |
1 files changed, 57 insertions, 5 deletions
diff --git a/rc.d/init.d/netfs b/rc.d/init.d/netfs index 26b8e351..acd8f970 100755 --- a/rc.d/init.d/netfs +++ b/rc.d/init.d/netfs @@ -121,11 +121,63 @@ case "$1" in $"Unmounting GLUSTERFS filesystems (retry): " fi if [ -n "$NFSMTAB" ]; then - __umount_loop '$3 ~ /^nfs/ && $3 != "nfsd" && $2 != "/" {print $2}' \ - /proc/mounts \ - $"Unmounting NFS filesystems: " \ - $"Unmounting NFS filesystems (retry): " \ - "-f -l" + STRING=$"Unmounting NFS filesystems:" + echo -n $STRING + nfs_fs=$(LC_ALL=C awk '/^#/ {next} $3 ~ /^nfs/ && $3 != "nfsd" && $2 != "/" {print $2}' /proc/mounts | sort -r) + if [ -n "$nfs_fs" ]; then + # create a device id reference + devs=$(stat -c "%d" $nfs_fs) + + # the lazy umount + for fs in $nfs_fs ; do + umount -l $fs + done + + # find fds that don't start with /, are not sockets or pipes or other. + # these are potentially detached fds + detached_fds=$(find /proc/ -regex '/proc/[0-9]+/fd/.*' -printf "%p %l\n" 2>/dev/null |\ + grep -Ev '/proc/[0-9]+/fd/[0-9]+ (/.*|inotify|\[.+\]|(socket|pipe):\[[0-9]+\])') + + # check each detached fd to see if it has the same device + # as one of our lazy umounted filesystems + kill_list= + [ -n "$detached_fds" ] && while read fdline; do + fd=${fdline%% *} + pid=$(echo $fdline | sed -r 's/\/proc\/([0-9]+).+/\1/') + fd_dev=$(stat -L -c "%d" $fd) + for dev in $devs ; do + [ "$dev" = "$fd_dev" ] && kill_list+="$pid " + done + done <<< "$detached_fds" + + [ -n "$kill_list" ] && kill $kill_list + + # run a little wait/check loop for procs to exit + count=4 + while [ "$count" -gt 0 ] ; do + [ -z "$kill_list" ] && break + count=$(($count-1)) + usleep 500000 + remaining= + for pid in $kill_list ; do + [ -d "/proc/$pid" ] && remaining+="$pid " + done + kill_list=$remaining + done + + # try to finish the job: + if [ -n "$kill_list" ] ; then + kill -9 $kill_list + usleep 500000 + # last check + remaining= + for pid in $kill_list ; do + [ -d "/proc/$pid" ] && remaining+="$pid " + done + fi + [ -z "$remaining" ] && success 
"$STRING" || failure "$STRING" + echo + fi fi [ -n "$CIFSMTAB" ] && action $"Unmounting CIFS filesystems: " umount -a -t cifs [ -n "$NCPMTAB" ] && action $"Unmounting NCP filesystems: " umount -a -t ncp,ncpfs |