From 73d4f7aea9a7d30da20f7930d6d6fd46f5ebc336 Mon Sep 17 00:00:00 2001 From: Jim Meyering Date: Thu, 12 Oct 2006 10:32:32 +0000 Subject: [PATCH] Big performance improvement for fts-based tools that use FTS_NOSTAT. Avoid spurious inode-mismatch problems on non-POSIX file systems. Details: http://article.gmane.org/gmane.comp.lib.gnulib.bugs/7416 * lib/fts_.h (FTS_DEFER_STAT): Define new flag. (FTS_OPTIONMASK): Extend the mask to reflect this addition. * lib/fts.c (DT_IS_KNOWN, DT_MUST_BE): Define. (FTS_NO_STAT_REQUIRED, FTS_STAT_REQUIRED): Define. (fts_set_stat_required): New function. (fts_open): Defer the calls to fts_stat, if possible or requested. Move the code that maps a command-line fts_info value FTS_DOT to FTS_D into fts_stat itself. (fts_read): Perform any required (deferred) fts_stat call. (fts_build): Likewise, for the directory we're about to open and read. In the readdir loop, carefully decide whether each entry will require an eventual call to fts_stat, using dirent.d_type info if available. (fts_stat): Move the test for whether to honor FTS_COMFOLLOW on a command line argument into this function. Update all callers. Map a return value of FTS_DOT to FTS_D for a command line argument. * modules/fts (Depends-on): Add d-type. Alphabetize. Thanks to Miklos Szeredi for his tenacity and for the initial bug report about "find" failing on a FUSE-based file system. --- ChangeLog | 24 +++++++++++++ lib/fts.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++++++------ lib/fts_.h | 14 +++++++- modules/fts | 3 +- 4 files changed, 146 insertions(+), 13 deletions(-) diff --git a/ChangeLog b/ChangeLog index 1a6da7ac8..edf3fd67f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,27 @@ +2006-10-12 Jim Meyering + + Big performance improvement for fts-based tools that use FTS_NOSTAT. + Avoid spurious inode-mismatch problems on non-POSIX file systems. + Details: http://article.gmane.org/gmane.comp.lib.gnulib.bugs/7416 + * lib/fts_.h (FTS_DEFER_STAT): Define new flag. + (FTS_OPTIONMASK): Extend the mask to reflect this addition. + * lib/fts.c (DT_IS_KNOWN, DT_MUST_BE): Define. + (FTS_NO_STAT_REQUIRED, FTS_STAT_REQUIRED): Define. + (fts_set_stat_required): New function. + (fts_open): Defer the calls to fts_stat, if possible or requested. + Move the code that maps a command-line fts_info value FTS_DOT to FTS_D + into fts_stat itself. + (fts_read): Perform any required (deferred) fts_stat call. + (fts_build): Likewise, for the directory we're about to open and read. + In the readdir loop, carefully decide whether each entry will require + an eventual call to fts_stat, using dirent.d_type info if available. + (fts_stat): Move the test for whether to honor FTS_COMFOLLOW on + a command line argument into this function. Update all callers. + Map a return value of FTS_DOT to FTS_D for a command line argument. + * modules/fts (Depends-on): Add d-type. Alphabetize. + Thanks to Miklos Szeredi for his tenacity and for the initial + bug report about "find" failing on a FUSE-based file system. + 2006-10-12 Paul Eggert * m4/extensions.m4 (AC_USE_SYSTEM_EXTENSIONS): Renamed from diff --git a/lib/fts.c b/lib/fts.c index 07fe170f2..6eca31089 100644 --- a/lib/fts.c +++ b/lib/fts.c @@ -81,6 +81,22 @@ static char sccsid[] = "@(#)fts.c 8.6 (Berkeley) 8/14/94"; # define _D_EXACT_NAMLEN(dirent) strlen ((dirent)->d_name) #endif +#if HAVE_STRUCT_DIRENT_D_TYPE +/* True if the type of the directory entry D is known. */ +# define DT_IS_KNOWN(d) ((d)->d_type != DT_UNKNOWN) +/* True if the type of the directory entry D must be T. */ +# define DT_MUST_BE(d, t) ((d)->d_type == (t)) +#else +# define DT_IS_KNOWN(d) false +# define DT_MUST_BE(d, t) false +#endif + +enum +{ + FTS_NO_STAT_REQUIRED = 1, + FTS_STAT_REQUIRED = 2 +}; + #ifdef _LIBC # undef close # define close __close @@ -195,6 +211,19 @@ bool fts_debug = false; } \ while (false) +/* Overload the fts_statp->st_size member (otherwise unused, when + fts_info is FTS_NSOK) to indicate whether fts_read should stat + this entry or not. */ +static void +fts_set_stat_required (FTSENT *p, bool required) +{ + if (p->fts_info != FTS_NSOK) + abort (); + p->fts_statp->st_size = (required + ? FTS_STAT_REQUIRED + : FTS_NO_STAT_REQUIRED); +} + /* file-descriptor-relative opendir. */ /* FIXME: if others need this function, move it into lib/openat.c */ static inline DIR * @@ -258,6 +287,7 @@ fts_open (char * const *argv, FTSENT *parent = NULL; FTSENT *tmp = NULL; /* pacify gcc */ size_t len; + bool defer_stat; /* Options check. */ if (options & ~FTS_OPTIONMASK) { @@ -340,6 +370,19 @@ fts_open (char * const *argv, parent->fts_level = FTS_ROOTPARENTLEVEL; } + /* The classic fts implementation would call fts_stat with + a new entry for each iteration of the loop below. + If the comparison function is not specified or if the + FTS_DEFER_STAT option is in effect, don't stat any entry + in this loop. This is an attempt to minimize the interval + between the initial stat/lstat/fstatat and the point at which + a directory argument is first opened. This matters for any + directory command line argument that resides on a file system + without genuine i-nodes. If you specify FTS_DEFER_STAT along + with a comparison function, that function must not access any + data via the fts_statp pointer. */ + defer_stat = (compar == NULL || ISSET(FTS_DEFER_STAT)); + /* Allocate/initialize root(s). */ for (root = NULL, nitems = 0; *argv != NULL; ++argv, ++nitems) { /* Don't allow zero-length file names. */ @@ -353,11 +396,12 @@ fts_open (char * const *argv, p->fts_level = FTS_ROOTLEVEL; p->fts_parent = parent; p->fts_accpath = p->fts_name; - p->fts_info = fts_stat(sp, p, ISSET(FTS_COMFOLLOW) != 0); - - /* Command-line "." and ".." are real directories. */ - if (p->fts_info == FTS_DOT) - p->fts_info = FTS_D; + if (defer_stat) { + p->fts_info = FTS_NSOK; + fts_set_stat_required(p, true); + } else { + p->fts_info = fts_stat(sp, p, false); + } /* * If comparison routine supplied, traverse in sorted @@ -645,6 +689,19 @@ name: t = sp->fts_path + NAPPEND(p->fts_parent); *t++ = '/'; memmove(t, p->fts_name, p->fts_namelen + 1); check_for_dir: + if (p->fts_info == FTS_NSOK) + { + switch (p->fts_statp->st_size) + { + case FTS_STAT_REQUIRED: + p->fts_info = fts_stat(sp, p, false); + break; + case FTS_NO_STAT_REQUIRED: + break; + default: + abort (); + } + } sp->fts_cur = p; if (p->fts_info == FTS_D) { @@ -672,6 +729,9 @@ check_for_dir: return (sp->fts_cur = NULL); } + if (p->fts_info == FTS_NSOK) + abort (); + /* NUL terminate the file name. */ sp->fts_path[p->fts_pathlen] = '\0'; @@ -862,6 +922,11 @@ fts_build (register FTS *sp, int type) } return (NULL); } + /* Rather than calling fts_stat for each and every entry encountered + in the readdir loop (below), stat each directory only right after + opening it. */ + if (cur->fts_info == FTS_NSOK) + cur->fts_info = fts_stat(sp, cur, false); /* * Nlinks is the number of possible entries of type directory in the @@ -1004,12 +1069,38 @@ mem1: saved_errno = errno; memmove(cp, p->fts_name, p->fts_namelen + 1); } else p->fts_accpath = p->fts_name; - /* Stat it. */ - p->fts_info = fts_stat(sp, p, false); + + bool is_dir; + if (sp->fts_compar == NULL || ISSET(FTS_DEFER_STAT)) { + /* Record what fts_read will have to do with this + entry. In many cases, it will simply fts_stat it, + but we can take advantage of any d_type information + to optimize away the unnecessary stat calls. I.e., + if FTS_NOSTAT is in effect and we're not following + symlinks (FTS_PHYSICAL) and d_type indicates this + is *not* a directory, then we won't have to stat it + at all. If it *is* a directory, then (currently) + we stat it regardless, in order to get device and + inode numbers. Some day we might optimize that + away, too, for directories where d_ino is known to + be valid. */ + bool skip_stat = (ISSET(FTS_PHYSICAL) + && ISSET(FTS_NOSTAT) + && DT_IS_KNOWN(dp) + && ! DT_MUST_BE(dp, DT_DIR)); + p->fts_info = FTS_NSOK; + fts_set_stat_required(p, !skip_stat); + is_dir = (ISSET(FTS_PHYSICAL) && ISSET(FTS_NOSTAT) + && DT_MUST_BE(dp, DT_DIR)); + } else { + p->fts_info = fts_stat(sp, p, false); + is_dir = (p->fts_info == FTS_D + || p->fts_info == FTS_DC + || p->fts_info == FTS_DOT); + } /* Decrement link count if applicable. */ - if (nlinks > 0 && (p->fts_info == FTS_D || - p->fts_info == FTS_DC || p->fts_info == FTS_DOT)) + if (nlinks > 0 && is_dir) nlinks -= nostat; /* We walk in directory order so "ls -f" doesn't get upset. */ @@ -1143,6 +1234,9 @@ fts_stat(FTS *sp, register FTSENT *p, bool follow) struct stat *sbp = p->fts_statp; int saved_errno; + if (p->fts_level == FTS_ROOTLEVEL && ISSET(FTS_COMFOLLOW)) + follow = true; + #if defined FTS_WHITEOUT && 0 /* check for whiteout */ if (p->fts_flags & FTS_ISW) { @@ -1176,8 +1270,10 @@ err: memset(sbp, 0, sizeof(struct stat)); } if (S_ISDIR(sbp->st_mode)) { - if (ISDOT(p->fts_name)) - return (FTS_DOT); + if (ISDOT(p->fts_name)) { + /* Command-line "." and ".." are real directories. */ + return (p->fts_level == FTS_ROOTLEVEL ? FTS_D : FTS_DOT); + } #if _LGPL_PACKAGE { diff --git a/lib/fts_.h b/lib/fts_.h index 2843107ea..7e6a404ed 100644 --- a/lib/fts_.h +++ b/lib/fts_.h @@ -123,7 +123,19 @@ typedef struct { through the file descriptor member, fts_cwd_fd. */ # define FTS_CWDFD 0x0200 -# define FTS_OPTIONMASK 0x03ff /* valid user option mask */ + /* Historically, for each directory that fts initially encounters, it would + open it, read all entries, and stat each entry, storing the results, and + then it would process the first entry. But that behavior is bad for + locality of reference, and also causes trouble with inode-simulating + file systems like FAT, CIFS, FUSE-based ones, etc., when entries from + their name/inode cache are flushed too early. + Use this flag to make fts_open and fts_read defer the stat/lstat/fststat + of each entry until it actually processed. However, note that if you use + this option and also specify a comparison function, that function may not + examine any data via fts_statp. */ +# define FTS_DEFER_STAT 0x0400 + +# define FTS_OPTIONMASK 0x07ff /* valid user option mask */ # define FTS_NAMEONLY 0x1000 /* (private) child names only */ # define FTS_STOP 0x2000 /* (private) unrecoverable error */ diff --git a/modules/fts b/modules/fts index b100fca8c..b0e37d119 100644 --- a/modules/fts +++ b/modules/fts @@ -9,13 +9,14 @@ m4/fts.m4 Depends-on: cycle-check +d-type dirfd fcntl +fcntl-safer hash lstat openat stdbool -fcntl-safer unistd-safer configure.ac: -- 2.11.0