November 10, 2022
On Thu, Nov 10, 2022 at 07:07:33PM +0000, Imperatorn via Digitalmars-d-learn wrote:
> On Thursday, 10 November 2022 at 16:34:53 UTC, Ali Çehreli wrote:
> > On 11/9/22 11:30, Vladimir Panteleev wrote:
> > > On Wednesday, 9 November 2022 at 19:05:58 UTC, Ali Çehreli wrote:
> > >> Running the program shows no output; 'a' is not visited as a directory entry.
> > >
> > > That's not what happens for me:
> > 
> > Does not happen for me today either. (?) I must have confused myself both with my actual program and with a trivial isolated program that I had written to test it.
> > 
> > Unless others have seen the same behavior yesterday there is no bug here today. :p
> > 
> > Ali
> > "walks away with a confused look on his face"
> 
> Oh, did you run the program on Wednesday? Fool!

I think it was because yesterday MSFT stock dipped, but today it rose by 15, so Windows is working properly again.

:-P


T

-- 
"You are a very disagreeable person." "NO."
November 10, 2022
On 11/9/22 12:06, Ali Çehreli wrote:

> I am using its sibling 'ftw'

Now that we know that dirEntries works properly, I decided not to use ftw.

However, ftw performs about twice as fast as dirEntries (despite some common code in the implementation below). I am leaving it here in case somebody finds it useful. (Why don't I put it on github then; ok, some day I will.)

import core.sys.posix.sys.stat;
import std.algorithm;
import std.exception;
import std.file;
import std.path;
import std.range;
import std.string;

// The Posix "file tree walker" function
extern (C)
int ftw(const char *dirpath,
        int function (const char *fpath, const stat_t *sb, int typeflag) fn,
        int nopenfd);

enum TypeFlag {
    FTW_F,   // regular file
    FTW_D,   // directory
    // See 'man nftw' or /usr/include/ftw.h for the other values
}

struct DirectoryEntry {
    string name;
    ulong size;
}

struct WalkResult {
    DirectoryEntry[] entries;
    string[] emptyDirs;
}

WalkResult directoryWalk_ftw(string root) {
    WalkResult impl_() {
        // These have to be 'static' because ftw() does not allow us to pass a
        // context. And that's why this function must only be called from a
        // synchronized block.
        static DirectoryEntry[] entries;
        static string[] dirs;

        entries.length = 0;
        entries.assumeSafeAppend();

        dirs.length = 0;
        dirs.assumeSafeAppend();

        // This is the callback that ftw() uses.
        extern (C)
        int handler(const char *fpath, const stat_t *sb, int typeflag) {
            const path = fpath.fromStringz.idup;

            switch (typeflag) {
            case TypeFlag.FTW_F:
                entries ~= DirectoryEntry(path, sb.st_size);
                break;

            case TypeFlag.FTW_D:
                dirs ~= path;
                break;

            default:
                import std.stdio;
                writefln!"Ignoring type %s file: %s\n(See 'man nftw')b"(
                    path, typeflag);
                break;
            }

            return 0;
        }

        // The tree walk will be faster up-to this "search depth" (See 'man nftw')
        enum nopenfd = 32;

        const ret = ftw(root.toStringz, &handler, nopenfd);
        enforce(ret == 0,
                format!"Failed walking the directory tree at %s; error: %s"(
                    root, ret));

        string[] nonEmptyDirs = chain(entries.map!(e => e.name),
                                      dirs)
                                .map!dirName
                                .array
                                .sort
                                .uniq
                                .array;
        sort(dirs);

        string[] emptyDirs = setDifference(dirs, nonEmptyDirs)
                             .array;

        return WalkResult(entries.dup, emptyDirs);
    }

    synchronized {
        return impl_();
    }
}

WalkResult directoryWalk_dirEntries(string root) {
    DirectoryEntry[] entries;
    string[] dirs;

    foreach (entry; dirEntries(root, SpanMode.depth)) {
        if (entry.isDir) {
            dirs ~= entry;

        } else {
            entries ~= DirectoryEntry(entry, entry.getSize);
        }
    }

    string[] nonEmptyDirs = chain(entries.map!(e => e.name),
                                  dirs)
                            .map!dirName
                            .array
                            .sort
                            .uniq
                            .array;
    sort(dirs);

    string[] emptyDirs = setDifference(dirs, nonEmptyDirs)
                         .array;

    return WalkResult(entries.dup, emptyDirs);
}

int main(string[] args) {
    import std.datetime.stopwatch;
    import std.stdio;
    import std.path;

    if (args.length != 2) {
        stderr.writefln!"Please provide the directory to walk:\n\n  %s <directory>\n"
            (args[0].baseName);
        return 1;
    }

    const dir = buildNormalizedPath("/home/ali/dlang");

    auto timings = benchmark!({ directoryWalk_ftw(dir); },
                              { directoryWalk_dirEntries(dir); })(10);

    writefln!("ftw       : %s\n" ~
              "dirEntries: %s")(timings[0], timings[1]);

    return 0;
}

Ali

November 11, 2022
On Thursday, 10 November 2022 at 21:27:28 UTC, Ali Çehreli wrote:
> On 11/9/22 12:06, Ali Çehreli wrote:
>
> > I am using its sibling 'ftw'
>
> Now that we know that dirEntries works properly, I decided not to use ftw.
>
> However, ftw performs about twice as fast as dirEntries (despite some common code in the implementation below).

dmd -O compiled patched (see below!) version applied to /usr/bin on my desktop
yields:

ftw       : 363 ms, 750 ÎŒs, and 5 [*]
dirEntries: 18 secs, 831 ms, 738 ÎŒs, and 3 [*]

(* = offending units removed)

> [...]
>     foreach (entry; dirEntries(root, SpanMode.depth)) {
>         if (entry.isDir) {
>             dirs ~= entry;
>
>         } else {
>             entries ~= DirectoryEntry(entry, entry.getSize);
>         }

strace reports that entry.getSize invokes stat on the file a second time. Isn't
the stat buf saved in the entry?

This also gives rise for a complication with symlinks pointing to the directory
which contain them:

   $ pwd
   /tmp/k/sub
   $ ln -s . foo
   $ ../direntrybenchmark .
   std.file.FileException@8[...]/linux/bin64/../../src/phobos/std/file.d(1150): ./foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo: Too many levels of symbolic links
   [...]

> [...]
>     if (args.length != 2) {
>         stderr.writefln!"Please provide the directory to walk:\n\n  %s <directory>\n"
>             (args[0].baseName);
>         return 1;
>     }
>
>     const dir = buildNormalizedPath("/home/ali/dlang");

diff --git a/direntrybenchmark.d b/direntrybenchmark.d
index 661df51..a9a5616 100644
--- a/direntrybenchmark.d
+++ b/direntrybenchmark.d
@@ -102,8 +102,9 @@ WalkResult directoryWalk_dirEntries(string root) {
         if (entry.isDir) {
             dirs ~= entry;

-        } else {
-            entries ~= DirectoryEntry(entry, entry.getSize);
+        }
+        else {
+            entries ~= DirectoryEntry(entry, 0);
         }
     }

@@ -133,7 +134,7 @@ int main(string[] args) {
         return 1;
     }

-    const dir = buildNormalizedPath("/home/ali/dlang");
+    const dir = buildNormalizedPath(args[1]);

     auto timings = benchmark!({ directoryWalk_ftw(dir); },
                               { directoryWalk_dirEntries(dir); })(10);



November 11, 2022
On 11/11/22 05:13, kdevel wrote:

> dmd -O compiled patched (see below!) version applied to /usr/bin on my
> desktop
> yields:
>
> ftw       : 363 ms, 750 ÎŒs, and 5 [*]
> dirEntries: 18 secs, 831 ms, 738 ÎŒs, and 3 [*]

Great. I did not use -O with my test. It may have to do something with the performance of the hard disk.

ftw wins big time. Being just a D binding of a C library function, its compilation should be quick too.

>>             entries ~= DirectoryEntry(entry, entry.getSize);
>>         }
>
> strace reports that entry.getSize invokes stat on the file a second
> time. Isn't
> the stat buf saved in the entry?

That's my bad. entry.size is the cached version of the file size.

> This also gives rise for a complication with symlinks pointing to the
> directory
> which contain them:
>
>     $ pwd
>     /tmp/k/sub
>     $ ln -s . foo
>     $ ../direntrybenchmark .
> std.file.FileException@8[...]/linux/bin64/../../src/phobos/std/file.d(1150): ./foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo/foo: Too many levels of symbolic links

So, ftw does not have that problem? Perhaps because of its default symlink behavior? There is also the more capable nftw, where the caller can specify some flags. And yes, there it is:

 FTW_PHYS
        If set, do not follow symbolic links.  (This is what you  want.)
        If not set, symbolic links are followed, but no file is reported
        twice.

        If FTW_PHYS is not set, but FTW_DEPTH is set, then the  function
        fn()  is never called for a directory that would be a descendant
        of itself.

> -    const dir = buildNormalizedPath("/home/ali/dlang");
> +    const dir = buildNormalizedPath(args[1]);

That one, and I had switched the arguments on the following call. One more example where string interpolation would be useful:

        writefln!"Ignoring type %s file: %s\n(See 'man nftw')b"(
            path, typeflag);

I meant the arguments in the reverse order there.

OT: And there is a 'b' character at the end of that format string which almost certainly appeared when I botched a Ctrl-b command in my editor. :)

Ali

November 11, 2022
On 11/11/22 08:00, Ali Çehreli wrote:

> It may have to do something with the performance of the hard disk.

I meant "the reason you got a much better improvement" may have to do something with the performance differences of your hard disk and mine.

Ali

November 14, 2022
On Friday, 11 November 2022 at 16:00:12 UTC, Ali Çehreli wrote:
> On 11/11/22 05:13, kdevel wrote:
>
> > dmd -O compiled patched (see below!) version applied to
> /usr/bin on my
> > desktop
> > yields:
> >
> > ftw       : 363 ms, 750 ÎŒs, and 5 [*]
> > dirEntries: 18 secs, 831 ms, 738 ÎŒs, and 3 [*]
>
> Great. I did not use -O with my test. It may have to do something with the performance of the hard disk.

It has to do with the large number of symlinks. When I use

   dirEntries(root, SpanMode.depth, false)

the runtime is dramatically reduced and with

   entries ~= DirectoryEntry(entry, entry.size);

the runtimes are

   ftw       : 98 ms, 470 ÎŒs, and 2 *beeep*
   dirEntries: 170 ms, 515 ÎŒs, and 2 *beeep*

(to be continued)

November 14, 2022
On Monday, 14 November 2022 at 21:05:01 UTC, kdevel wrote:
> [...]
> the runtimes are
>
>    ftw       : 98 ms, 470 ÎŒs, and 2 *beeep*
>    dirEntries: 170 ms, 515 ÎŒs, and 2 *beeep*
>
> (to be continued)

When I examine the process with strace it appears that the ftw version gets the whole information from readdir alone. The dirEntries version seems to call lstat on every file (in order to check that it is not a symlink)

Breakpoint 1, 0xf7cc59d4 in lstat64 () from [...]gcc-12.1/lib/libgphobos.so.3
(gdb) bt
#0  0xf7cc59d4 in lstat64 () from [...]gcc-12.1/lib/libgphobos.so.3
#1  0xf7a5269b in std.file.DirEntry._ensureLStatDone() () from [...]gcc-12.1/lib/libgphobos.so.3
#2  0xf7a5276a in std.file.DirEntry.linkAttributes() () from [...]gcc-12.1/lib/libgphobos.so.3
#3  0xf7a528c9 in std.file.DirIteratorImpl.mayStepIn() () from [...]gcc-12.1/lib/libgphobos.so.3
#4  0xf7a545ae in std.file.DirIteratorImpl() () from [...]gcc-12.1/lib/libgphobos.so.3
#5  0xf7a5466e in core.internal.lifetime() () from [...]gcc-12.1/lib/libgphobos.so.3
#6  0xf7a546ef in core.internal.lifetime() () from [...]gcc-12.1/lib/libgphobos.so.3
#7  0xf7a54726 in core.lifetime() () from [...]gcc-12.1/lib/libgphobos.so.3
#8  0xf7a54762 in std.typecons() () from [...]gcc-12.1/lib/libgphobos.so.3
#9  0xf7a547d6 in std.typecons() () from [...]gcc-12.1/lib/libgphobos.so.3
#10 0xf7a54811 in std.file.DirIterator.__ctor() () from [...]gcc-12.1/lib/libgphobos.so.3
#11 0xf7a54882 in std.file.dirEntries() () from [...]gcc-12.1/lib/libgphobos.so.3
#12 0x08088e25 in direntrybenchmark.directoryWalk_dirEntries() (root=..., dump=false) at direntrybenchmark.d:111

and after that an additional stat on the same file in order to check if it is a directory:

Breakpoint 2, 0xf7cc5954 in stat64 () from [...]gcc-12.1/lib/libgphobos.so.3
(gdb) bt
#0  0xf7cc5954 in stat64 () from [...]gcc-12.1/lib/libgphobos.so.3
#1  0xf7a527e1 in std.file.DirEntry._ensureStatOrLStatDone() () from [...]gcc-12.1/lib/libgphobos.so.3
#2  0xf7a5287a in std.file.DirEntry.isDir() () from [...]gcc-12.1/lib/libgphobos.so.3
#3  0x08088e6c in direntrybenchmark.directoryWalk_dirEntries() (root=..., dump=<optimized out>) at direntrybenchmark.d:112
#4  0x08089105 in __lambda7 (__capture=0xf7411000) at direntrybenchmark.d:158
#5  0x0809f8aa in benchmark (n=1, __capture=<optimized out>)
    at /md11/sda2-usr2l/gcc-12.1/lib/gcc/x86_64-pc-linux-gnu/12.1.0/include/d/std/datetime/stopwatch.d:421
#6  __foreachbody9 (__capture=0xf7411000, __applyArg0=..., __applyArg1=...) at direntrybenchmark.d:162
#7  0xf7c98bd9 in _aaApply2 () from [...]gcc-12.1/lib/libgphobos.so.3
#8  0x0808dee1 in direntrybenchmark.main_() (args=...) at direntrybenchmark.d:161


November 26, 2022
On 11/14/22 14:41, kdevel wrote:

> the ftw version gets the whole information from readdir alone.

Created an enhancement request:

  https://issues.dlang.org/show_bug.cgi?id=23512

Ali

November 30, 2022

On Thursday, 10 November 2022 at 21:27:28 UTC, Ali Çehreli wrote:

>

However, ftw performs about twice as fast as dirEntries

Yes, dirEntries isn't as fast as it could be.

Here is a directory iterator which tries to strictly not do more work than what it must:

https://github.com/CyberShadow/ae/blob/86b016fd258ebc26f0da3239a6332c4ebecd3215/sys/file.d#L178

1 2
Next ›   Last »