Fast directory walker

classic Classic list List threaded Threaded
21 messages Options
12
Reply | Threaded
Open this post in threaded view
|

Fast directory walker

Frank Muller
Hi

I would like to improve the speed of my directory walker.

%% Recursively walk the directory tree rooted at Dir, printing the path of
%% every entry that is not a directory. Returns `ok' once every entry has
%% been visited.
walk(Dir) ->
    %% prim_file is an internal, undocumented module. The `{ok, Files}'
    %% match fails (crashing the caller) if the listing returns anything else.
    {ok, Files} = prim_file:list_dir(Dir),
    walk(Dir, Files).

%% Visit the entry names of Dir one at a time, descending into directories.
walk(Dir, [ Basename | Rest ]) ->
    Path = filename:join([ Dir, Basename ]),
    %% filelib:is_dir/1 and filelib:file_size/1 each query the file system,
    %% so a plain file is inspected twice.
    case filelib:is_dir(Path) of
        true  ->
            walk(Path);
        false ->
            io:format("~s~n", [Path]),
            %% The size is looked up but the value is not used.
            filelib:file_size(Path)
    end,
    walk(Dir, Rest);
walk(_, []) ->
    ok.


Compared to almost anything i found on the web, it’s still very slow:
> timer:tc(fun() -> dir:walk("/usr/share") end).
{4662361,ok}

The idea behind it is to build something similar to The Platinum Searcher (in Go, extremely fast):

Any advice on how to improve its speed would be very much appreciated.

/Frank

_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions
Reply | Threaded
Open this post in threaded view
|

Re: Fast directory walker

Stanislaw Klekot
On Fri, Dec 09, 2016 at 11:15:58PM +0000, Frank Muller wrote:
> I would like to improve the speed of my directory walker.
>
> walk(Dir) ->
>     {ok, Files} = prim_file:list_dir(Dir),
>     walk(Dir, Files).

Why prim_file:list_dir() instead of file:list_dir()? The former is
an undocumented internal function.

[...]
> Compared to almost anything i found on the web, it’s still very slow:
> > timer:tc(fun() -> dir:walk("/usr/share") end).
> {4662361,ok}

What is this "anything you found on the web"? And how did you run
your comparisons? There's a large difference between the first and second
consecutive runs caused by the OS's directory cache, and there's a large
difference between simply walking through the directory and walking with
printing something to the screen for every file.

Then there's also your using filelib:is_dir() and then
filelib:file_size(), which means two stat(2) calls, while you only need
to do it once per file (file:read_file_info()).

--
Stanislaw Klekot
_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions
Reply | Threaded
Open this post in threaded view
|

Re: Fast directory walker

Michael Truog
In reply to this post by Frank Muller
On 12/09/2016 03:15 PM, Frank Muller wrote:
Hi

I would like to improve the speed of my directory walker.

walk(Dir) ->
    {ok, Files} = prim_file:list_dir(Dir),
    walk(Dir, Files).

walk(Dir, [ Basename | Rest ]) ->
    Path = filename:join([ Dir, Basename ]),
    case filelib:is_dir(Path) of
        true  ->
            walk(Path);
        false ->
            io:format("~s~n", [Path]),
            filelib:file_size(Path)
    end,
    walk(Dir, Rest);
walk(_, []) ->
    ok.


Compared to almost anything i found on the web, it’s still very slow:
> timer:tc(fun() -> dir:walk("/usr/share") end).
{4662361,ok}

Have you tried filelib:fold_files/5 (http://erlang.org/doc/man/filelib.html#fold_files-5) ?


_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions
Reply | Threaded
Open this post in threaded view
|

Re: Fast directory walker

Frank Muller
In reply to this post by Stanislaw Klekot
Hi Stanislaw

First, I don't care if I've to use documented/undocumented calls as long as I can achieve my goal: faster dir walking.

And you're right, here is a detailed comparison with other scripting languages:

In my /usr/share, there’s:
2580 directories
28953 files

1. Erlang (no io:format/1, just recurse):

%% Benchmark variant of the walker: recurse through the tree rooted at Dir
%% without printing anything. Returns `ok'.
walk(Dir) ->
    %% The `{ok, Files}' match fails if Dir cannot be listed.
    {ok, Files} = file:list_dir(Dir),
    walk(Dir, Files).

%% Visit the entry names of Dir one at a time, descending into directories.
walk(Dir, [ Basename | Rest ]) ->
    Path = filename:join([ Dir, Basename ]),
    %% filelib:is_dir/1 and filelib:file_size/1 each query the file system,
    %% so a plain file is inspected twice.
    case filelib:is_dir(Path) of
        true  ->
            walk(Path);
        false ->
          %% Printing is disabled for this measurement:
          %%  io:format("~s~n", [Path]),
            %% The size is looked up but the value is not used.
            filelib:file_size(Path)
    end,
    walk(Dir, Rest);
walk(_, []) ->
    ok.

timer:tc(fun() -> directoy:walker("/usr/share") end).
{4662361,ok}

2. Python (this code even count the size of dir):

import os
def get_size(start_path = '.'):
    """Return the combined size in bytes of every file below start_path.

    The tree is traversed with os.walk; each file name it reports is joined
    to its directory and measured with os.path.getsize. Directory entries
    themselves do not contribute to the total.
    """
    return sum(
        os.path.getsize(os.path.join(folder, name))
        for folder, _subdirs, names in os.walk(start_path)
        for name in names
    )

print get_size()

$ cd /usr/share
$ time dir_walker.py
432034130
0.25 real         0.13 user         0.10 sys

2. Perl (same, count dir size)

# Sum the sizes of all plain files below /usr/share.
use File::Find;
my $size = 0;
# find() calls the sub once per entry with $_ set to the current entry's
# name; `-f $_` keeps plain files only and the bare `-s` reads the size of $_.
find(sub { $size += -s if -f $_ }, "/usr/share");
# NOTE(review): nothing here prints $size, although the transcript that
# follows shows a total - presumably a print statement was left out of the post.

$ time perl dir_walker.pl
432034130
0.13 real         0.05 user         0.08 sys

3. Ruby (same, count dir size):

# Returns the total size in bytes of the non-empty regular files found
# below +path+.
#
# Raises RuntimeError when +path+ is not a directory. The caller's string
# is never modified: the trailing slash is added to a copy, so frozen
# strings and literals are accepted.
#
# Entries are enumerated with Dir[] and a "**/*" pattern, which does not
# match names beginning with a dot.
def directory_size(path)
  root = path.end_with?('/') ? path : "#{path}/"
  raise RuntimeError, "#{root} is not a directory" unless File.directory?(root)
  Dir["#{root}**/*"].inject(0) do |total, f|
    # File.size? is nil for empty or vanished files, so those add nothing.
    File.file?(f) && File.size?(f) ? total + File.size(f) : total
  end
end
# Print the total for /usr/share (the closing quote must be a plain ASCII ').
puts directory_size '/usr/share'

$ time walker.rb
432028422
0.21 real         0.09 user         0.11 sys

4. Lua:

require "lfs"

-- Returns an iterator over the tree rooted at dir. Each step produces the
-- path of one entry together with the value lfs.attributes returned for it;
-- entries reported as directories are descended into right after being
-- produced. The "." and ".." entries are skipped.
function dirtree(dir)
  assert(dir and dir ~= "", "directory parameter is missing or empty")
  -- Drop one trailing slash so joined paths do not contain "//".
  if string.sub(dir, -1) == "/" then
    dir = string.sub(dir, 1, -2)
  end

  -- Yields every entry below parent, depth first.
  local function yieldtree(parent)
    for name in lfs.dir(parent) do
      if name ~= "." and name ~= ".." then
        local path = parent .. "/" .. name
        local attr = lfs.attributes(path)
        coroutine.yield(path, attr)
        if attr.mode == "directory" then
          yieldtree(path)
        end
      end
    end
  end

  return coroutine.wrap(function() yieldtree(dir) end)
end

for filename, attr in dirtree("/usr/share") do
      print(attr.mode, filename)
end

$ luarocks install luafilesystem
$ time lua walker.lua > /dev/null
0.30 real         0.16 user         0.14 sys

Do you need more?

Thanks for your help.
/Frank

Le sam. 10 déc. 2016 à 00:51, Stanislaw Klekot <[hidden email]> a écrit :
On Fri, Dec 09, 2016 at 11:15:58PM +0000, Frank Muller wrote:

> I would like to improve the speed of my directory walker.

>

> walk(Dir) ->

>     {ok, Files} = prim_file:list_dir(Dir),

>     walk(Dir, Files).



Why prim_file:list_dir() instead of file:list_dir()? The former is

undocumented internal function.



[...]

> Compared to almost anything i found on the web, it’s still very slow:

> > timer:tc(fun() -> dir:walk("/usr/share") end).

> {4662361,ok}



What is it this "anything you found on the web"? And how did you run

your comparisons? There's a large difference between first and second

consequent run caused by OS' directory cache, and there's large

difference between simply walking through the directory and walking with

printing something to the screen for every file.



Then there's also your using filelib:is_dir() and then

filelib:file_size(), which means two stat(2) calls, while you only need

to do it once per file (file:read_file_info()).



--

Stanislaw Klekot


_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions
Reply | Threaded
Open this post in threaded view
|

Re: Fast directory walker

Benoit Chesneau-2
did you try with the wildcard function? something like:
https://github.com/benoitc/esync/blob/master/src/esync.erl#L151

On Sat, 10 Dec 2016 at 09:29, Frank Muller <[hidden email]> wrote:
Hi Stanislaw

First, I don't care if I've to use documented/undocumented calls as long as I can achieve my goal: faster dir walking.

And you're right, here is a detailed comparison with other scripting languages:

In my /usr/share, there’s:
2580 directories
28953 files

1. Erlang (no io:format/1, just recurse):

walk(Dir) ->
    {ok, Files} = file:list_dir(Dir),
    walk(Dir, Files).

walk(Dir, [ Basename | Rest ]) ->
    Path = filename:join([ Dir, Basename ]),
    case filelib:is_dir(Path) of
        true  ->
            walk(Path);
        false ->
          %%  io:format("~s~n", [Path]),
            filelib:file_size(Path)
    end,
    walk(Dir, Rest);
walk(_, []) ->
    ok.

timer:tc(fun() -> directoy:walker("/usr/share") end).
{4662361,ok}

2. Python (this code even count the size of dir):

import os
def get_size(start_path = '.'):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

print get_size()

$ cd /usr/share
$ time dir_walker.py
432034130
0.25 real         0.13 user         0.10 sys

2. Perl (same, count dir size)

use File::Find;           
my $size = 0;             
find(sub { $size += -s if -f $_ }, "/usr/share");

$ time perl dir_walker.pl
432034130
0.13 real         0.05 user         0.08 sys

3. Ruby (same, count dir size):

def directory_size(path)
  path << '/' unless path.end_with?('/')
  raise RuntimeError, "#{path} is not a directory" unless File.directory?(path)
  total_size = 0
  Dir["#{path}**/*"].each do |f|
    total_size += File.size(f) if File.file?(f) && File.size?(f)
  end
  total_size
end
puts directory_size '/usr/share’

$ time walker.rb
432028422
0.21 real         0.09 user         0.11 sys

4. Lua:

require "lfs"

function dirtree(dir)
  assert(dir and dir ~= "", "directory parameter is missing or empty")
  if string.sub(dir, -1) == "/" then
    dir=string.sub(dir, 1, -2)
  end

  local function yieldtree(dir)
    for entry in lfs.dir(dir) do
      if entry ~= "." and entry ~= ".." then
        entry=dir.."/"..entry
local attr=lfs.attributes(entry)
coroutine.yield(entry,attr)
if attr.mode == "directory" then
  yieldtree(entry)
end
      end
    end
  end

  return coroutine.wrap(function() yieldtree(dir) end)
end

for filename, attr in dirtree("/usr/share") do
      print(attr.mode, filename)
end

$ luarocks install luafilesystem
$ time lua walker.lua > /dev/null
0.30 real         0.16 user         0.14 sys

Do you need more?

Thanks for you help.
/Frank

Le sam. 10 déc. 2016 à 00:51, Stanislaw Klekot <[hidden email]> a écrit :
On Fri, Dec 09, 2016 at 11:15:58PM +0000, Frank Muller wrote:

> I would like to improve the speed of my directory walker.

>

> walk(Dir) ->

>     {ok, Files} = prim_file:list_dir(Dir),

>     walk(Dir, Files).



Why prim_file:list_dir() instead of file:list_dir()? The former is

undocumented internal function.



[...]

> Compared to almost anything i found on the web, it’s still very slow:

> > timer:tc(fun() -> dir:walk("/usr/share") end).

> {4662361,ok}



What is it this "anything you found on the web"? And how did you run

your comparisons? There's a large difference between first and second

consequent run caused by OS' directory cache, and there's large

difference between simply walking through the directory and walking with

printing something to the screen for every file.



Then there's also your using filelib:is_dir() and then

filelib:file_size(), which means two stat(2) calls, while you only need

to do it once per file (file:read_file_info()).



--

Stanislaw Klekot

_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions

_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions
Reply | Threaded
Open this post in threaded view
|

Re: Fast directory walker

Sergej Jurečko
In reply to this post by Frank Muller
Stop using filelib functions. Use file:read_file_info and file:list_dir.

Sergej

On Dec 10, 2016 9:29 AM, "Frank Muller" <[hidden email]> wrote:
Hi Stanislaw

First, I don't care if I've to use documented/undocumented calls as long as I can achieve my goal: faster dir walking.

And you're right, here is a detailed comparison with other scripting languages:

In my /usr/share, there’s:
2580 directories
28953 files

1. Erlang (no io:format/1, just recurse):

walk(Dir) ->
    {ok, Files} = file:list_dir(Dir),
    walk(Dir, Files).

walk(Dir, [ Basename | Rest ]) ->
    Path = filename:join([ Dir, Basename ]),
    case filelib:is_dir(Path) of
        true  ->
            walk(Path);
        false ->
          %%  io:format("~s~n", [Path]),
            filelib:file_size(Path)
    end,
    walk(Dir, Rest);
walk(_, []) ->
    ok.

timer:tc(fun() -> directoy:walker("/usr/share") end).
{4662361,ok}

2. Python (this code even count the size of dir):

import os
def get_size(start_path = '.'):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

print get_size()

$ cd /usr/share
$ time dir_walker.py
432034130
0.25 real         0.13 user         0.10 sys

2. Perl (same, count dir size)

use File::Find;           
my $size = 0;             
find(sub { $size += -s if -f $_ }, "/usr/share");

$ time perl dir_walker.pl
432034130
0.13 real         0.05 user         0.08 sys

3. Ruby (same, count dir size):

def directory_size(path)
  path << '/' unless path.end_with?('/')
  raise RuntimeError, "#{path} is not a directory" unless File.directory?(path)
  total_size = 0
  Dir["#{path}**/*"].each do |f|
    total_size += File.size(f) if File.file?(f) && File.size?(f)
  end
  total_size
end
puts directory_size '/usr/share’

$ time walker.rb
432028422
0.21 real         0.09 user         0.11 sys

4. Lua:

require "lfs"

function dirtree(dir)
  assert(dir and dir ~= "", "directory parameter is missing or empty")
  if string.sub(dir, -1) == "/" then
    dir=string.sub(dir, 1, -2)
  end

  local function yieldtree(dir)
    for entry in lfs.dir(dir) do
      if entry ~= "." and entry ~= ".." then
        entry=dir.."/"..entry
local attr=lfs.attributes(entry)
coroutine.yield(entry,attr)
if attr.mode == "directory" then
  yieldtree(entry)
end
      end
    end
  end

  return coroutine.wrap(function() yieldtree(dir) end)
end

for filename, attr in dirtree("/usr/share") do
      print(attr.mode, filename)
end

$ luarocks install luafilesystem
$ time lua walker.lua > /dev/null
0.30 real         0.16 user         0.14 sys

Do you need more?

Thanks for you help.
/Frank

Le sam. 10 déc. 2016 à 00:51, Stanislaw Klekot <[hidden email]> a écrit :
On Fri, Dec 09, 2016 at 11:15:58PM +0000, Frank Muller wrote:

> I would like to improve the speed of my directory walker.

>

> walk(Dir) ->

>     {ok, Files} = prim_file:list_dir(Dir),

>     walk(Dir, Files).



Why prim_file:list_dir() instead of file:list_dir()? The former is

undocumented internal function.



[...]

> Compared to almost anything i found on the web, it’s still very slow:

> > timer:tc(fun() -> dir:walk("/usr/share") end).

> {4662361,ok}



What is it this "anything you found on the web"? And how did you run

your comparisons? There's a large difference between first and second

consequent run caused by OS' directory cache, and there's large

difference between simply walking through the directory and walking with

printing something to the screen for every file.



Then there's also your using filelib:is_dir() and then

filelib:file_size(), which means two stat(2) calls, while you only need

to do it once per file (file:read_file_info()).



--

Stanislaw Klekot


_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions


_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions
Reply | Threaded
Open this post in threaded view
|

Re: Fast directory walker

Benoit Chesneau-2
this is kind of bullshit (sorry ;).... in the end this is what the helpers in filelib do:
https://github.com/erlang/otp/blob/maint/lib/stdlib/src/filelib.erl#L257

Unless you have a better algorithm in mind, I don't see the point of rewriting something that already exists ...

On Sat, 10 Dec 2016 at 09:36, Sergej Jurečko <[hidden email]> wrote:
Stop using filelib functions. Use file:read_file_info and file:list_dir.

Sergej

On Dec 10, 2016 9:29 AM, "Frank Muller" <[hidden email]> wrote:
Hi Stanislaw

First, I don't care if I've to use documented/undocumented calls as long as I can achieve my goal: faster dir walking.

And you're right, here is a detailed comparison with other scripting languages:

In my /usr/share, there’s:
2580 directories
28953 files

1. Erlang (no io:format/1, just recurse):

walk(Dir) ->
    {ok, Files} = file:list_dir(Dir),
    walk(Dir, Files).

walk(Dir, [ Basename | Rest ]) ->
    Path = filename:join([ Dir, Basename ]),
    case filelib:is_dir(Path) of
        true  ->
            walk(Path);
        false ->
          %%  io:format("~s~n", [Path]),
            filelib:file_size(Path)
    end,
    walk(Dir, Rest);
walk(_, []) ->
    ok.

timer:tc(fun() -> directoy:walker("/usr/share") end).
{4662361,ok}

2. Python (this code even count the size of dir):

import os
def get_size(start_path = '.'):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

print get_size()

$ cd /usr/share
$ time dir_walker.py
432034130
0.25 real         0.13 user         0.10 sys

2. Perl (same, count dir size)

use File::Find;           
my $size = 0;             
find(sub { $size += -s if -f $_ }, "/usr/share");

$ time perl dir_walker.pl
432034130
0.13 real         0.05 user         0.08 sys

3. Ruby (same, count dir size):

def directory_size(path)
  path << '/' unless path.end_with?('/')
  raise RuntimeError, "#{path} is not a directory" unless File.directory?(path)
  total_size = 0
  Dir["#{path}**/*"].each do |f|
    total_size += File.size(f) if File.file?(f) && File.size?(f)
  end
  total_size
end
puts directory_size '/usr/share’

$ time walker.rb
432028422
0.21 real         0.09 user         0.11 sys

4. Lua:

require "lfs"

function dirtree(dir)
  assert(dir and dir ~= "", "directory parameter is missing or empty")
  if string.sub(dir, -1) == "/" then
    dir=string.sub(dir, 1, -2)
  end

  local function yieldtree(dir)
    for entry in lfs.dir(dir) do
      if entry ~= "." and entry ~= ".." then
        entry=dir.."/"..entry
local attr=lfs.attributes(entry)
coroutine.yield(entry,attr)
if attr.mode == "directory" then
  yieldtree(entry)
end
      end
    end
  end

  return coroutine.wrap(function() yieldtree(dir) end)
end

for filename, attr in dirtree("/usr/share") do
      print(attr.mode, filename)
end

$ luarocks install luafilesystem
$ time lua walker.lua > /dev/null
0.30 real         0.16 user         0.14 sys

Do you need more?

Thanks for you help.
/Frank

Le sam. 10 déc. 2016 à 00:51, Stanislaw Klekot <[hidden email]> a écrit :
On Fri, Dec 09, 2016 at 11:15:58PM +0000, Frank Muller wrote:

> I would like to improve the speed of my directory walker.

>

> walk(Dir) ->

>     {ok, Files} = prim_file:list_dir(Dir),

>     walk(Dir, Files).



Why prim_file:list_dir() instead of file:list_dir()? The former is

undocumented internal function.



[...]

> Compared to almost anything i found on the web, it’s still very slow:

> > timer:tc(fun() -> dir:walk("/usr/share") end).

> {4662361,ok}



What is it this "anything you found on the web"? And how did you run

your comparisons? There's a large difference between first and second

consequent run caused by OS' directory cache, and there's large

difference between simply walking through the directory and walking with

printing something to the screen for every file.



Then there's also your using filelib:is_dir() and then

filelib:file_size(), which means two stat(2) calls, while you only need

to do it once per file (file:read_file_info()).



--

Stanislaw Klekot


_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions

_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions

_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions
Reply | Threaded
Open this post in threaded view
|

Re: Fast directory walker

Frank Muller
In reply to this post by Michael Truog
Hi Michael,

Better, but still ~2 seconds:

> timer:tc(fun() -> filelib:fold_files("/usr/share", ".*", true, fun(F, N) -> N + 1 end, 0) end).
{1993074,28953}

If I get it correctly the call matches only on files, not on dirs.

/Frank

Le sam. 10 déc. 2016 à 03:12, Michael Truog <[hidden email]> a écrit :


On 12/09/2016 03:15 PM, Frank Muller

wrote:








Hi







I would like to improve the speed of my directory

walker.







walk(Dir) ->


    {ok, Files} = prim_file:list_dir(Dir),


    walk(Dir, Files).







walk(Dir, [ Basename | Rest ]) ->


    Path = filename:join([ Dir, Basename ]),


    case filelib:is_dir(Path) of


        true  ->


            walk(Path);


        false ->


            io:format("~s~n", [Path]),


            filelib:file_size(Path)


    end,


    walk(Dir, Rest);


walk(_, []) ->


    ok.














Compared

to almost anything i found on the web, it’s still very slow:




> timer:tc(fun() -> dir:walk("/usr/share")

end).




{4662361,ok}







Have you tried filelib:fold_files/5

(http://erlang.org/doc/man/filelib.html#fold_files-5) ?










_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions
Reply | Threaded
Open this post in threaded view
|

Re: Fast directory walker

Sergej Jurečko
In reply to this post by Benoit Chesneau-2
read_file_info does the job of is_dir and file_size in a single call. That was the intention.

Also use file:read_file_info(name,[raw])

Sergej

On 10 Dec 2016, at 09:42, Benoit Chesneau <[hidden email]> wrote:

this is kind of bullshit (sorry ;).... at the end this is what does the helpers in filelib:
https://github.com/erlang/otp/blob/maint/lib/stdlib/src/filelib.erl#L257

except if you have a better algorithm in mind i don't se the point of rewriting something that is aleaready existing ...

On Sat, 10 Dec 2016 at 09:36, Sergej Jurečko <[hidden email]> wrote:
Stop using filelib functions. Use file:read_file_info and file:list_dir.

Sergej

On Dec 10, 2016 9:29 AM, "Frank Muller" <[hidden email]> wrote:
Hi Stanislaw

First, I don't care if I've to use documented/undocumented calls as long as I can achieve my goal: faster dir walking.

And you're right, here is a detailed comparison with other scripting languages:

In my /usr/share, there’s:
2580 directories
28953 files

1. Erlang (no io:format/1, just recurse):

walk(Dir) ->
    {ok, Files} = file:list_dir(Dir),
    walk(Dir, Files).

walk(Dir, [ Basename | Rest ]) ->
    Path = filename:join([ Dir, Basename ]),
    case filelib:is_dir(Path) of
        true  ->
            walk(Path);
        false ->
          %%  io:format("~s~n", [Path]),
            filelib:file_size(Path)
    end,
    walk(Dir, Rest);
walk(_, []) ->
    ok.

timer:tc(fun() -> directoy:walker("/usr/share") end).
{4662361,ok}

2. Python (this code even count the size of dir):

import os
def get_size(start_path = '.'):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

print get_size()

$ cd /usr/share
$ time dir_walker.py
432034130
0.25 real         0.13 user         0.10 sys

2. Perl (same, count dir size)

use File::Find;           
my $size = 0;             
find(sub { $size += -s if -f $_ }, "/usr/share");

$ time perl dir_walker.pl
432034130
0.13 real         0.05 user         0.08 sys

3. Ruby (same, count dir size):

def directory_size(path)
  path << '/' unless path.end_with?('/')
  raise RuntimeError, "#{path} is not a directory" unless File.directory?(path)
  total_size = 0
  Dir["#{path}**/*"].each do |f|
    total_size += File.size(f) if File.file?(f) && File.size?(f)
  end
  total_size
end
puts directory_size '/usr/share’

$ time walker.rb
432028422
0.21 real         0.09 user         0.11 sys

4. Lua:

require "lfs"

function dirtree(dir)
  assert(dir and dir ~= "", "directory parameter is missing or empty")
  if string.sub(dir, -1) == "/" then
    dir=string.sub(dir, 1, -2)
  end

  local function yieldtree(dir)
    for entry in lfs.dir(dir) do
      if entry ~= "." and entry ~= ".." then
        entry=dir.."/"..entry
local attr=lfs.attributes(entry)
coroutine.yield(entry,attr)
if attr.mode == "directory" then
  yieldtree(entry)
end
      end
    end
  end

  return coroutine.wrap(function() yieldtree(dir) end)
end

for filename, attr in dirtree("/usr/share") do
      print(attr.mode, filename)
end

$ luarocks install luafilesystem
$ time lua walker.lua > /dev/null
0.30 real         0.16 user         0.14 sys

Do you need more?

Thanks for you help.
/Frank

Le sam. 10 déc. 2016 à 00:51, Stanislaw Klekot <[hidden email]> a écrit :
On Fri, Dec 09, 2016 at 11:15:58PM +0000, Frank Muller wrote:

> I would like to improve the speed of my directory walker.

>

> walk(Dir) ->

>     {ok, Files} = prim_file:list_dir(Dir),

>     walk(Dir, Files).



Why prim_file:list_dir() instead of file:list_dir()? The former is

undocumented internal function.



[...]

> Compared to almost anything i found on the web, it’s still very slow:

> > timer:tc(fun() -> dir:walk("/usr/share") end).

> {4662361,ok}



What is it this "anything you found on the web"? And how did you run

your comparisons? There's a large difference between first and second

consequent run caused by OS' directory cache, and there's large

difference between simply walking through the directory and walking with

printing something to the screen for every file.



Then there's also your using filelib:is_dir() and then

filelib:file_size(), which means two stat(2) calls, while you only need

to do it once per file (file:read_file_info()).



--

Stanislaw Klekot


_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions

_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions


_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions
Reply | Threaded
Open this post in threaded view
|

Re: Fast directory walker

Frank Muller
Combining previous hints (Benoit, Sergej):

-module(directory).
-include_lib("kernel/include/file.hrl").
-export([walker/1]).

%% @doc Count the regular files found at or below `Path'.
%%
%% Returns 1 when `Path' is a regular file, the recursive total when it is a
%% directory, and 0 for anything else (devices, sockets, paths that cannot
%% be inspected), so an unreadable entry does not abort the whole count.
%%
%% `file:read_file_info/2' follows symbolic links, so a link is counted as
%% whatever it points to; cyclic directory links are not detected.
walker(Path) ->
    case file:read_file_info(Path, [raw]) of
        {ok, #file_info{type = regular}} ->
            1;
        {ok, #file_info{type = directory}} ->
            count_children(Path);
        _ ->
            %% Neither a regular file nor a directory, or not inspectable.
            0
    end.

%% Sum walker/1 over the entries of directory Dir; a directory that cannot
%% be listed contributes 0.
%%
%% Entries come from file:list_dir/1 rather than
%% filelib:wildcard(Dir ++ "/*"): wildcard treats characters such as `[',
%% `{', `*' and `?' inside Dir itself as pattern syntax, so directories
%% whose names contain them would not be listed correctly.
count_children(Dir) ->
    case file:list_dir(Dir) of
        {ok, Names} ->
            lists:foldl(
                fun(Name, Count) -> Count + walker(filename:join(Dir, Name)) end,
                0,
                Names
            );
        {error, _Reason} ->
            0
    end.

> timer:tc(fun() -> directory:walker("/usr/share") end).
{1611688,28953}

I'm only counting number of files in this case.

/Frank

Le sam. 10 déc. 2016 à 10:05, Sergej Jurečko <[hidden email]> a écrit :
read_file_info does the job of is_dir and file_size in a single call. That was the intention.

Also use file:read_file_info(name,[raw])


Sergej

On 10 Dec 2016, at 09:42, Benoit Chesneau <[hidden email]> wrote:

this is kind of bullshit (sorry ;).... at the end this is what does the helpers in filelib:
https://github.com/erlang/otp/blob/maint/lib/stdlib/src/filelib.erl#L257

except if you have a better algorithm in mind i don't se the point of rewriting something that is aleaready existing ...

On Sat, 10 Dec 2016 at 09:36, Sergej Jurečko <[hidden email]> wrote:
Stop using filelib functions. Use file:read_file_info and file:list_dir.

Sergej

On Dec 10, 2016 9:29 AM, "Frank Muller" <[hidden email]> wrote:
Hi Stanislaw

First, I don't care if I've to use documented/undocumented calls as long as I can achieve my goal: faster dir walking.

And you're right, here is a detailed comparison with other scripting languages:

In my /usr/share, there’s:
2580 directories
28953 files

1. Erlang (no io:format/1, just recurse):

walk(Dir) ->
    {ok, Files} = file:list_dir(Dir),
    walk(Dir, Files).

walk(Dir, [ Basename | Rest ]) ->
    Path = filename:join([ Dir, Basename ]),
    case filelib:is_dir(Path) of
        true  ->
            walk(Path);
        false ->
          %%  io:format("~s~n", [Path]),
            filelib:file_size(Path)
    end,
    walk(Dir, Rest);
walk(_, []) ->
    ok.

timer:tc(fun() -> directoy:walker("/usr/share") end).
{4662361,ok}

2. Python (this code even count the size of dir):

import os
def get_size(start_path = '.'):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

print get_size()

$ cd /usr/share
$ time dir_walker.py
432034130
0.25 real         0.13 user         0.10 sys

2. Perl (same, count dir size)

use File::Find;           
my $size = 0;             
find(sub { $size += -s if -f $_ }, "/usr/share");

$ time perl dir_walker.pl
432034130
0.13 real         0.05 user         0.08 sys

3. Ruby (same, count dir size):

def directory_size(path)
  path << '/' unless path.end_with?('/')
  raise RuntimeError, "#{path} is not a directory" unless File.directory?(path)
  total_size = 0
  Dir["#{path}**/*"].each do |f|
    total_size += File.size(f) if File.file?(f) && File.size?(f)
  end
  total_size
end
puts directory_size '/usr/share’

$ time walker.rb
432028422
0.21 real         0.09 user         0.11 sys

4. Lua:

require "lfs"

-- Returns an iterator over every entry below `dir` (depth-first).
-- Each step yields the entry's path and the table returned by
-- lfs.attributes for it; the second value is nil when the attributes
-- could not be read (for example a dangling symlink).
function dirtree(dir)
  assert(dir and dir ~= "", "directory parameter is missing or empty")
  -- Drop one trailing slash so joined paths do not contain "//".
  if string.sub(dir, -1) == "/" then
    dir=string.sub(dir, 1, -2)
  end

  local function yieldtree(dir)
    for entry in lfs.dir(dir) do
      if entry ~= "." and entry ~= ".." then
        entry=dir.."/"..entry
        local attr=lfs.attributes(entry)
        coroutine.yield(entry,attr)
        -- lfs.attributes returns nil on failure; indexing it used to
        -- raise an error, so only recurse when attributes are available.
        if attr and attr.mode == "directory" then
          yieldtree(entry)
        end
      end
    end
  end

  return coroutine.wrap(function() yieldtree(dir) end)
end

for filename, attr in dirtree("/usr/share") do
      print(attr.mode, filename)
end

$ luarocks install luafilesystem
$ time lua walker.lua > /dev/null
0.30 real         0.16 user         0.14 sys

Do you need more?

Thanks for your help.
/Frank

Le sam. 10 déc. 2016 à 00:51, Stanislaw Klekot <[hidden email]> a écrit :
On Fri, Dec 09, 2016 at 11:15:58PM +0000, Frank Muller wrote:

> I would like to improve the speed of my directory walker.

>

> walk(Dir) ->

>     {ok, Files} = prim_file:list_dir(Dir),

>     walk(Dir, Files).



Why prim_file:list_dir() instead of file:list_dir()? The former is

undocumented internal function.



[...]

> Compared to almost anything i found on the web, it’s still very slow:

> > timer:tc(fun() -> dir:walk("/usr/share") end).

> {4662361,ok}



What is it this "anything you found on the web"? And how did you run

your comparisons? There's a large difference between first and second

consequent run caused by OS' directory cache, and there's large

difference between simply walking through the directory and walking with

printing something to the screen for every file.



Then there's also your using filelib:is_dir() and then

filelib:file_size(), which means two stat(2) calls, while you only need

to do it once per file (file:read_file_info()).



--

Stanislaw Klekot




_______________________________________________


erlang-questions mailing list


[hidden email]


http://erlang.org/mailman/listinfo/erlang-questions





_______________________________________________


erlang-questions mailing list


[hidden email]


http://erlang.org/mailman/listinfo/erlang-questions






_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions
Reply | Threaded
Open this post in threaded view
|

Re: Fast directory walker

Michael Truog
In reply to this post by Frank Muller
On 12/10/2016 12:43 AM, Frank Muller wrote:
Hi Michael,

Better, but still ~2 seconds:

> timer:tc(fun() -> filelib:fold_files("/usr/share", ".*", true, fun(F, N) -> N + 1 end, 0) end).
{1993074,28953}

If I get it correctly the call matches only on files, not on dirs.
Yes, the regex only matches on filenames, not directory names.  Currently, the filelib:fold_files/5 function requires a regex so you are likely paying a penalty for it matching on a wildcard regex on each filename.  So, it would probably be nice to have a filelib:fold_files/4 that doesn't require a regex.

Best Regards,
Michael


/Frank

Le sam. 10 déc. 2016 à 03:12, Michael Truog <[hidden email]> a écrit :


On 12/09/2016 03:15 PM, Frank Muller

wrote:








Hi







I would like to improve the speed of my directory

walker.







walk(Dir) ->


    {ok, Files} = prim_file:list_dir(Dir),


    walk(Dir, Files).







walk(Dir, [ Basename | Rest ]) ->


    Path = filename:join([ Dir, Basename ]),


    case filelib:is_dir(Path) of


        true  ->


            walk(Path);


        false ->


            io:format("~s~n", [Path]),


            filelib:file_size(Path)


    end,


    walk(Dir, Rest);


walk(_, []) ->


    ok.














Compared

to almost anything i found on the web, it’s still very slow:




> timer:tc(fun() -> dir:walk("/usr/share")

end).




{4662361,ok}







Have you tried filelib:fold_files/5

(http://erlang.org/doc/man/filelib.html#fold_files-5) ?











_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions
Reply | Threaded
Open this post in threaded view
|

Re: Fast directory walker

Fred Youhanaie-2
In reply to this post by Frank Muller

Out of interest, what do you get when you run the same benchmark twice in succession?

The buffer cache would play a role on the second and subsequent attempts. For your code below, running on my oldish laptop, I get {35667060,158949} and {8606920,158949}.

You can use "blockdev --flushbufs <device>" to ensure the buffers are clean before running the benchmarks, especially when comparing different versions and languages.

Having said that, the erlang version does look slow, especially when compared with the shell equivalent "time find /usr/share -print | wc -l"

$ sudo blockdev --flushbufs /dev/...

$ time find /usr/share -print | wc -l
186911

real 0m32.446s
user 0m0.796s
sys 0m1.808s

$ time find /usr/share -print | wc -l
186911

real 0m0.336s
user 0m0.152s
sys 0m0.200s


Perhaps there is room for improvement within the library itself!

Cheers,
f.


On 10/12/16 09:20, Frank Muller wrote:

> Combining previous hints (Benoit, Sergej):
>
> -module(directory).
> -include_lib("kernel/include/file.hrl").
> -export([walker/1]).
>
> walker(Path) ->
>     case file:read_file_info(Path, [raw]) of
>         {ok, #file_info{type = regular}} ->
>             1;
>         _ -> %% not care about symlink for now, assume a directory
>             Children = filelib:wildcard(Path ++ "/*"),
>             lists:foldl(fun(P, N) -> N + walker(P) end, 0, Children)
> end.
>
>> timer:tc(fun() -> directory:walker("/usr/share") end).
> {1611688,28953}
>
> I'm only counting number of files in this case.
>
> /Frank
>
> Le sam. 10 déc. 2016 à 10:05, Sergej Jurečko <[hidden email] <mailto:[hidden email]>> a écrit :
>
>     read_file_info does the job of is_dir and file_size in a single call. That was the intention.
>
>     Also use file:read_file_info(name,[raw])
>
>
>     Sergej
>
>>     On 10 Dec 2016, at 09:42, Benoit Chesneau <[hidden email] <mailto:[hidden email]>> wrote:
>>
>>     this is kind of bullshit (sorry ;).... at the end this is what does the helpers in filelib:
>>     https://github.com/erlang/otp/blob/maint/lib/stdlib/src/filelib.erl#L257
>>
>>     except if you have a better algorithm in mind i don't se the point of rewriting something that is aleaready existing ...
>>
>>     On Sat, 10 Dec 2016 at 09:36, Sergej Jurečko <[hidden email] <mailto:[hidden email]>> wrote:
>>
>>         Stop using filelib functions. Use file:read_file_info and file:list_dir.
>>
>>         Sergej
>>
>>         On Dec 10, 2016 9:29 AM, "Frank Muller" <[hidden email] <mailto:[hidden email]>> wrote:
>>
>>             Hi Stanislaw
>>
>>             First, I don't care if I've to use documented/undocumented calls as long as I can achieve my goal: faster dir walking.
>>
>>             And you're right, here is a detailed comparison with other scripting languages:
>>
>>             In my /usr/share, there’s:
>>             2580 directories
>>             28953 files
>>
>>             1. Erlang (no io:format/1, just recurse):
>>
>>             walk(Dir) ->
>>                 {ok, Files} = file:list_dir(Dir),
>>                 walk(Dir, Files).
>>
>>             walk(Dir, [ Basename | Rest ]) ->
>>                 Path = filename:join([ Dir, Basename ]),
>>                 case filelib:is_dir(Path) of
>>                     true  ->
>>                         walk(Path);
>>                     false ->
>>                       %%  io:format("~s~n", [Path]),
>>                         filelib:file_size(Path)
>>                 end,
>>                 walk(Dir, Rest);
>>             walk(_, []) ->
>>                 ok.
>>
>>             timer:tc(fun() -> directoy:walker("/usr/share") end).
>>             {4662361,ok}
>>
>>             2. Python (this code even count the size of dir):
>>             From: http://stackoverflow.com/questions/1392413/calculating-a-directory-size-using-python
>>
>>             import os
>>             def get_size(start_path = '.'):
>>                 total_size = 0
>>                 for dirpath, dirnames, filenames in os.walk(start_path):
>>                     for f in filenames:
>>                         fp = os.path.join(dirpath, f)
>>                         total_size += os.path.getsize(fp)
>>                 return total_size
>>
>>             print get_size()
>>
>>             $ cd /usr/share
>>             $ time dir_walker.py
>>             432034130
>>             0.25 real         0.13 user         0.10 sys
>>
>>             2. Perl (same, count dir size)
>>             http://www.perlmonks.org/?node_id=168974
>>
>>             use File::Find;
>>             my $size = 0;
>>             find(sub { $size += -s if -f $_ }, "/usr/share");
>>
>>             $ time perl dir_walker.pl
>>             432034130
>>             0.13 real         0.05 user         0.08 sys
>>
>>             3. Ruby (same, count dir size):
>>
>>             def directory_size(path)
>>               path << '/' unless path.end_with?('/')
>>               raise RuntimeError, "#{path} is not a directory" unless File.directory?(path)
>>               total_size = 0
>>               Dir["#{path}**/*"].each do |f|
>>                 total_size += File.size(f) if File.file?(f) && File.size?(f)
>>               end
>>               total_size
>>             end
>>             puts directory_size '/usr/share’
>>
>>             $ time walker.rb
>>             432028422
>>             0.21 real         0.09 user         0.11 sys
>>
>>             4. Lua:
>>             From: http://lua-users.org/wiki/DirTreeIterator
>>
>>             require "lfs"
>>
>>             function dirtree(dir)
>>               assert(dir and dir ~= "", "directory parameter is missing or empty")
>>               if string.sub(dir, -1) == "/" then
>>                 dir=string.sub(dir, 1, -2)
>>               end
>>
>>               local function yieldtree(dir)
>>                 for entry in lfs.dir(dir) do
>>                   if entry ~= "." and entry ~= ".." then
>>                     entry=dir.."/"..entry
>>             local attr=lfs.attributes(entry)
>>             coroutine.yield(entry,attr)
>>             if attr.mode == "directory" then
>>               yieldtree(entry)
>>             end
>>                   end
>>                 end
>>               end
>>
>>               return coroutine.wrap(function() yieldtree(dir) end)
>>             end
>>
>>             for filename, attr in dirtree("/usr/share") do
>>                   print(attr.mode, filename)
>>             end
>>
>>             $ luarocks install luafilesystem
>>             $ time lua walker.lua > /dev/null
>>             0.30 real         0.16 user         0.14 sys
>>
>>             Do you need more?
>>
>>             Thanks for you help.
>>             /Frank
>>
>>             Le sam. 10 déc. 2016 à 00:51, Stanislaw Klekot <[hidden email] <mailto:[hidden email]>> a écrit :
>>
>>                 On Fri, Dec 09, 2016 at 11:15:58PM +0000, Frank Muller wrote:
>>
>>                 > I would like to improve the speed of my directory walker.
>>
>>                 >
>>
>>                 > walk(Dir) ->
>>
>>                 >     {ok, Files} = prim_file:list_dir(Dir),
>>
>>                 >     walk(Dir, Files).
>>
>>
>>
>>                 Why prim_file:list_dir() instead of file:list_dir()? The former is
>>
>>                 undocumented internal function.
>>
>>
>>
>>                 [...]
>>
>>                 > Compared to almost anything i found on the web, it’s still very slow:
>>
>>                 > > timer:tc(fun() -> dir:walk("/usr/share") end).
>>
>>                 > {4662361,ok}
>>
>>
>>
>>                 What is it this "anything you found on the web"? And how did you run
>>
>>                 your comparisons? There's a large difference between first and second
>>
>>                 consequent run caused by OS' directory cache, and there's large
>>
>>                 difference between simply walking through the directory and walking with
>>
>>                 printing something to the screen for every file.
>>
>>
>>
>>                 Then there's also your using filelib:is_dir() and then
>>
>>                 filelib:file_size(), which means two stat(2) calls, while you only need
>>
>>                 to do it once per file (file:read_file_info()).
>>
>>
>>
>>                 --
>>
>>                 Stanislaw Klekot
>>
>>
_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions
Reply | Threaded
Open this post in threaded view
|

Re: Fast directory walker

Sergej Jurečko
In reply to this post by Frank Muller
A faster version that returns size of folder.  Pth++"/"++H  is faster than filename:join, and directly calling prim_file is also faster. 

%% walker1/1: total size in bytes of the regular files found below Path.
%% Returns 0 when Path itself cannot be listed.
walker1(Path) ->
    case prim_file:list_dir(Path) of
        {ok, Entries} -> walker1(Path, Entries, 0);
        _ -> 0
    end.

%% walker1/3: fold over the entry names of Dir, adding to the running Total.
%% Regular files contribute their size, directories are descended into,
%% and anything that cannot be inspected or listed contributes nothing.
walker1(_, [], Total) ->
    Total;
walker1(Dir, [Name | Rest], Total) when Name =:= "."; Name =:= ".." ->
    walker1(Dir, Rest, Total);
walker1(Dir, [Name | Rest], Total) ->
    %% Plain concatenation rather than filename:join/2, as in the original.
    Full = Dir ++ "/" ++ Name,
    NewTotal =
        case prim_file:read_file_info(Full) of
            {ok, #file_info{type = regular, size = Bytes}} ->
                Total + Bytes;
            {ok, #file_info{type = directory}} ->
                case prim_file:list_dir(Full) of
                    {ok, Children} -> walker1(Full, Children, Total);
                    _ -> Total
                end;
            _ ->
                Total
        end,
    walker1(Dir, Rest, NewTotal).

On 10 Dec 2016, at 10:20, Frank Muller <[hidden email]> wrote:

Combining previous hints (Benoit, Sergej):

-module(directory).
-include_lib("kernel/include/file.hrl").
-export([walker/1]).

walker(Path) ->
    case file:read_file_info(Path, [raw]) of
        {ok, #file_info{type = regular}} ->
            1;
        _ -> %% not care about symlink for now, assume a directory
            Children = filelib:wildcard(Path ++ "/*"),
            lists:foldl(fun(P, N) -> N + walker(P) end, 0, Children)
end.

> timer:tc(fun() -> directory:walker("/usr/share") end).
{1611688,28953}

I'm only counting number of files in this case.

/Frank

Le sam. 10 déc. 2016 à 10:05, Sergej Jurečko <[hidden email]> a écrit :
read_file_info does the job of is_dir and file_size in a single call. That was the intention.

Also use file:read_file_info(name,[raw])


Sergej

On 10 Dec 2016, at 09:42, Benoit Chesneau <[hidden email]> wrote:

this is kind of bullshit (sorry ;).... at the end this is what does the helpers in filelib:
https://github.com/erlang/otp/blob/maint/lib/stdlib/src/filelib.erl#L257

except if you have a better algorithm in mind i don't see the point of rewriting something that is already existing ...

On Sat, 10 Dec 2016 at 09:36, Sergej Jurečko <[hidden email]> wrote:
Stop using filelib functions. Use file:read_file_info and file:list_dir.

Sergej

On Dec 10, 2016 9:29 AM, "Frank Muller" <[hidden email]> wrote:
Hi Stanislaw

First, I don't care if I've to use documented/undocumented calls as long as I can achieve my goal: faster dir walking.

And you're right, here is a detailed comparison with other scripting languages:

In my /usr/share, there’s:
2580 directories
28953 files

1. Erlang (no io:format/1, just recurse):

walk(Dir) ->
    {ok, Files} = file:list_dir(Dir),
    walk(Dir, Files).

walk(Dir, [ Basename | Rest ]) ->
    Path = filename:join([ Dir, Basename ]),
    case filelib:is_dir(Path) of
        true  ->
            walk(Path);
        false ->
          %%  io:format("~s~n", [Path]),
            filelib:file_size(Path)
    end,
    walk(Dir, Rest);
walk(_, []) ->
    ok.

timer:tc(fun() -> directoy:walker("/usr/share") end).
{4662361,ok}

2. Python (this code even count the size of dir):

import os
def get_size(start_path = '.'):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

print get_size()

$ cd /usr/share
$ time dir_walker.py
432034130
0.25 real         0.13 user         0.10 sys

2. Perl (same, count dir size)

use File::Find;           
my $size = 0;             
find(sub { $size += -s if -f $_ }, "/usr/share");

$ time perl dir_walker.pl
432034130
0.13 real         0.05 user         0.08 sys

3. Ruby (same, count dir size):

def directory_size(path)
  path << '/' unless path.end_with?('/')
  raise RuntimeError, "#{path} is not a directory" unless File.directory?(path)
  total_size = 0
  Dir["#{path}**/*"].each do |f|
    total_size += File.size(f) if File.file?(f) && File.size?(f)
  end
  total_size
end
puts directory_size '/usr/share’

$ time walker.rb
432028422
0.21 real         0.09 user         0.11 sys

4. Lua:

require "lfs"

function dirtree(dir)
  assert(dir and dir ~= "", "directory parameter is missing or empty")
  if string.sub(dir, -1) == "/" then
    dir=string.sub(dir, 1, -2)
  end

  local function yieldtree(dir)
    for entry in lfs.dir(dir) do
      if entry ~= "." and entry ~= ".." then
        entry=dir.."/"..entry
local attr=lfs.attributes(entry)
coroutine.yield(entry,attr)
if attr.mode == "directory" then
  yieldtree(entry)
end
      end
    end
  end

  return coroutine.wrap(function() yieldtree(dir) end)
end

for filename, attr in dirtree("/usr/share") do
      print(attr.mode, filename)
end

$ luarocks install luafilesystem
$ time lua walker.lua > /dev/null
0.30 real         0.16 user         0.14 sys

Do you need more?

Thanks for you help.
/Frank

Le sam. 10 déc. 2016 à 00:51, Stanislaw Klekot <[hidden email]> a écrit :
On Fri, Dec 09, 2016 at 11:15:58PM +0000, Frank Muller wrote:

> I would like to improve the speed of my directory walker.

>

> walk(Dir) ->

>     {ok, Files} = prim_file:list_dir(Dir),

>     walk(Dir, Files).



Why prim_file:list_dir() instead of file:list_dir()? The former is

undocumented internal function.



[...]

> Compared to almost anything i found on the web, it’s still very slow:

> > timer:tc(fun() -> dir:walk("/usr/share") end).

> {4662361,ok}



What is it this "anything you found on the web"? And how did you run

your comparisons? There's a large difference between first and second

consequent run caused by OS' directory cache, and there's large

difference between simply walking through the directory and walking with

printing something to the screen for every file.



Then there's also your using filelib:is_dir() and then

filelib:file_size(), which means two stat(2) calls, while you only need

to do it once per file (file:read_file_info()).



--

Stanislaw Klekot




_______________________________________________


erlang-questions mailing list


[hidden email]


http://erlang.org/mailman/listinfo/erlang-questions





_______________________________________________


erlang-questions mailing list


[hidden email]


http://erlang.org/mailman/listinfo/erlang-questions







_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions
Reply | Threaded
Open this post in threaded view
|

Re: Fast directory walker

Max Lapshin-2
All these discussions about faster or slower are not considering a very important thing:  all these operations are going through the singleton process file_server.  All this speed will become nothing on a single core.


This hackish way:

{ok, H} = prim_file:start(),
{ok, Entries} = prim_file:list_dir(H, Path)


works on multicore

_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions
Reply | Threaded
Open this post in threaded view
|

Re: Fast directory walker

Joe Armstrong-2
In reply to this post by Fred Youhanaie-2
This is very interesting - I've often wondered why directory traversal
is faster in C than
Erlang since all Erlang is doing is calling C primitives -

I think measuring the times for this is difficult - if for example you
run a python test immediately followed by an erlang test, I'd expect
you get a different result than if you run the erlang
test with a "cold" cache - running the python program will have the
side effect of loading
various memory buffers - I'd also expect different results on
different OS and different
results depending upon your physical memory sizes and the size of the
trees you are traversing.

I've often wondered if fast C programs achieve their speed by directly
poking around in the
underlying inode structures, but this would involve detailed knowledge
of the underlying
file system (ie is an ext3, HFS+, FAT32 etc.)

Programs like rsync and the Dropbox sync algorithm seem unreasonably
fast to me -
I wonder if they poke around in the underlying OS representation of
the file system and not
use a more portable and easier to use interface.

/Joe




On Sat, Dec 10, 2016 at 11:33 AM, Fred Youhanaie <[hidden email]> wrote:
>
> Out of interest, what do you get when you run the same benchmark twice in
> succession?



>
> The buffer cache would play a role on the second and subsequent attempts.
> For your code below, running on my oldish laptop, I get {35667060,158949}
> and {8606920,158949}.
>
> You can use "blockdev --flushbufs <device>" to ensure the buffers are clean
> before running the benchmarks, especially when comparing different versions
> and languages.
>
> Having said that, the erlang version does look slow, especially when
> compared with the shell equivalent "time find /usr/share -print | wc -l"
>
> $ sudo blockdev --flushbufs /dev/...
>
> $ time find /usr/share -print | wc -l
> 186911
>
> real    0m32.446s
> user    0m0.796s
> sys     0m1.808s
>
> $ time find /usr/share -print | wc -l
> 186911
>
> real    0m0.336s
> user    0m0.152s
> sys     0m0.200s
>
>
> Perhaps there is room for improvement within the library itself!
>
> Cheers,
> f.
>
>
> On 10/12/16 09:20, Frank Muller wrote:
>>
>> Combining previous hints (Benoit, Sergej):
>>
>> -module(directory).
>> -include_lib("kernel/include/file.hrl").
>> -export([walker/1]).
>>
>> walker(Path) ->
>>     case file:read_file_info(Path, [raw]) of
>>         {ok, #file_info{type = regular}} ->
>>             1;
>>         _ -> %% not care about symlink for now, assume a directory
>>             Children = filelib:wildcard(Path ++ "/*"),
>>             lists:foldl(fun(P, N) -> N + walker(P) end, 0, Children)
>> end.
>>
>>> timer:tc(fun() -> directory:walker("/usr/share") end).
>>
>> {1611688,28953}
>>
>> I'm only counting number of files in this case.
>>
>> /Frank
>>
>> Le sam. 10 déc. 2016 à 10:05, Sergej Jurečko <[hidden email]
>> <mailto:[hidden email]>> a écrit :
>>
>>     read_file_info does the job of is_dir and file_size in a single call.
>> That was the intention.
>>
>>     Also use file:read_file_info(name,[raw])
>>
>>
>>     Sergej
>>
>>>     On 10 Dec 2016, at 09:42, Benoit Chesneau <[hidden email]
>>> <mailto:[hidden email]>> wrote:
>>>
>>>     this is kind of bullshit (sorry ;).... at the end this is what does
>>> the helpers in filelib:
>>>
>>> https://github.com/erlang/otp/blob/maint/lib/stdlib/src/filelib.erl#L257
>>>
>>>     except if you have a better algorithm in mind i don't se the point of
>>> rewriting something that is aleaready existing ...
>>>
>>>     On Sat, 10 Dec 2016 at 09:36, Sergej Jurečko
>>> <[hidden email] <mailto:[hidden email]>> wrote:
>>>
>>>         Stop using filelib functions. Use file:read_file_info and
>>> file:list_dir.
>>>
>>>         Sergej
>>>
>>>         On Dec 10, 2016 9:29 AM, "Frank Muller"
>>> <[hidden email] <mailto:[hidden email]>> wrote:
>>>
>>>             Hi Stanislaw
>>>
>>>             First, I don't care if I've to use documented/undocumented
>>> calls as long as I can achieve my goal: faster dir walking.
>>>
>>>             And you're right, here is a detailed comparison with other
>>> scripting languages:
>>>
>>>             In my /usr/share, there’s:
>>>             2580 directories
>>>             28953 files
>>>
>>>             1. Erlang (no io:format/1, just recurse):
>>>
>>>             walk(Dir) ->
>>>                 {ok, Files} = file:list_dir(Dir),
>>>                 walk(Dir, Files).
>>>
>>>             walk(Dir, [ Basename | Rest ]) ->
>>>                 Path = filename:join([ Dir, Basename ]),
>>>                 case filelib:is_dir(Path) of
>>>                     true  ->
>>>                         walk(Path);
>>>                     false ->
>>>                       %%  io:format("~s~n", [Path]),
>>>                         filelib:file_size(Path)
>>>                 end,
>>>                 walk(Dir, Rest);
>>>             walk(_, []) ->
>>>                 ok.
>>>
>>>             timer:tc(fun() -> directoy:walker("/usr/share") end).
>>>             {4662361,ok}
>>>
>>>             2. Python (this code even count the size of dir):
>>>             From:
>>> http://stackoverflow.com/questions/1392413/calculating-a-directory-size-using-python
>>>
>>>             import os
>>>             def get_size(start_path = '.'):
>>>                 total_size = 0
>>>                 for dirpath, dirnames, filenames in os.walk(start_path):
>>>                     for f in filenames:
>>>                         fp = os.path.join(dirpath, f)
>>>                         total_size += os.path.getsize(fp)
>>>                 return total_size
>>>
>>>             print get_size()
>>>
>>>             $ cd /usr/share
>>>             $ time dir_walker.py
>>>             432034130
>>>             0.25 real         0.13 user         0.10 sys
>>>
>>>             2. Perl (same, count dir size)
>>>             http://www.perlmonks.org/?node_id=168974
>>>
>>>             use File::Find;
>>>             my $size = 0;
>>>             find(sub { $size += -s if -f $_ }, "/usr/share");
>>>
>>>             $ time perl dir_walker.pl
>>>             432034130
>>>             0.13 real         0.05 user         0.08 sys
>>>
>>>             3. Ruby (same, count dir size):
>>>
>>>             def directory_size(path)
>>>               path << '/' unless path.end_with?('/')
>>>               raise RuntimeError, "#{path} is not a directory" unless
>>> File.directory?(path)
>>>               total_size = 0
>>>               Dir["#{path}**/*"].each do |f|
>>>                 total_size += File.size(f) if File.file?(f) &&
>>> File.size?(f)
>>>               end
>>>               total_size
>>>             end
>>>             puts directory_size '/usr/share’
>>>
>>>             $ time walker.rb
>>>             432028422
>>>
>>>             0.21 real         0.09 user         0.11 sys
>>>
>>>             4. Lua:
>>>             From: http://lua-users.org/wiki/DirTreeIterator
>>>
>>>             require "lfs"
>>>
>>>             function dirtree(dir)
>>>               assert(dir and dir ~= "", "directory parameter is missing
>>> or empty")
>>>               if string.sub(dir, -1) == "/" then
>>>                 dir=string.sub(dir, 1, -2)
>>>               end
>>>
>>>               local function yieldtree(dir)
>>>                 for entry in lfs.dir(dir) do
>>>                   if entry ~= "." and entry ~= ".." then
>>>                     entry=dir.."/"..entry
>>>             local attr=lfs.attributes(entry)
>>>             coroutine.yield(entry,attr)
>>>             if attr.mode == "directory" then
>>>               yieldtree(entry)
>>>             end
>>>                   end
>>>                 end
>>>               end
>>>
>>>               return coroutine.wrap(function() yieldtree(dir) end)
>>>             end
>>>
>>>             for filename, attr in dirtree("/usr/share") do
>>>                   print(attr.mode, filename)
>>>             end
>>>
>>>             $ luarocks install luafilesystem
>>>             $ time lua walker.lua > /dev/null
>>>             0.30 real         0.16 user         0.14 sys
>>>
>>>             Do you need more?
>>>
>>>             Thanks for you help.
>>>             /Frank
>>>
>>>             Le sam. 10 déc. 2016 à 00:51, Stanislaw Klekot
>>> <[hidden email] <mailto:[hidden email]>> a écrit :
>>>
>>>                 On Fri, Dec 09, 2016 at 11:15:58PM +0000, Frank Muller
>>> wrote:
>>>
>>>                 > I would like to improve the speed of my directory
>>> walker.
>>>
>>>                 >
>>>
>>>                 > walk(Dir) ->
>>>
>>>                 >     {ok, Files} = prim_file:list_dir(Dir),
>>>
>>>                 >     walk(Dir, Files).
>>>
>>>
>>>
>>>                 Why prim_file:list_dir() instead of file:list_dir()? The
>>> former is
>>>
>>>                 undocumented internal function.
>>>
>>>
>>>
>>>                 [...]
>>>
>>>                 > Compared to almost anything i found on the web, it’s
>>> still very slow:
>>>
>>>                 > > timer:tc(fun() -> dir:walk("/usr/share") end).
>>>
>>>                 > {4662361,ok}
>>>
>>>
>>>
>>>                 What is it this "anything you found on the web"? And how
>>> did you run
>>>
>>>                 your comparisons? There's a large difference between
>>> first and second
>>>
>>>                 consequent run caused by OS' directory cache, and there's
>>> large
>>>
>>>                 difference between simply walking through the directory
>>> and walking with
>>>
>>>                 printing something to the screen for every file.
>>>
>>>
>>>
>>>                 Then there's also your using filelib:is_dir() and then
>>>
>>>                 filelib:file_size(), which means two stat(2) calls, while
>>> you only need
>>>
>>>                 to do it once per file (file:read_file_info()).
>>>
>>>
>>>
>>>                 --
>>>
>>>                 Stanislaw Klekot
>>>
>>>
> _______________________________________________
> erlang-questions mailing list
> [hidden email]
> http://erlang.org/mailman/listinfo/erlang-questions
_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions
Reply | Threaded
Open this post in threaded view
|

Re: Fast directory walker

Frank Muller
In reply to this post by Max Lapshin-2
Max,

Very interesting.

What's the benefit of starting this prim_file server?
Does it avoid the singleton process "file_server"?

I'll bench this hack with Sergej's version and post the result here.

/Frank

Le sam. 10 déc. 2016 à 12:02, Max Lapshin <[hidden email]> a écrit :
All these discussions about faster or slower are not considering very important thing:  all these operations are going through singleton process file_server.  All this speed will become nothing on single core.


This hackish way:

{ok, H} = prim_file:start(),
{ok, Entries} = prim_file:list_dir(H, Path)


works on multicore



_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions
Reply | Threaded
Open this post in threaded view
|

Re: Fast directory walker

Fred Youhanaie-2
In reply to this post by Joe Armstrong-2
I just had a look at the GNU find source, it uses fts, man fts(3). I didn't bother digging deeper into glibc to see how fts is handling the inodes.

Frank's earlier non-Erlang examples seem to be using dedicated file hierarchy walker modules, perhaps those are using the fts, or similar, C functions. In which case they are being compared with the
pure erlang implementation of the tree walker.

Staying with pure erlang version, a concurrent/multi-process tree walker could be an interesting project in itself :)

Cheers,
f.


On 10/12/16 11:25, Joe Armstrong wrote:

> This is very interesting - I've often wondered why directory traversal
> is faster in C than
> Erlang since all Erlang is doing is calling C primitives -
>
> I think measuring the times for this is difficult - if for example you
> run a python test immediately followed by an erlang test, I'd expect
> you get a different result than if you run the erlang
> test with a "cold" cache - running the python program will have the
> side effect of loading
> various memory buffers - I'd also expect different results on
> different OS and different
> results depending upon your physical memory sizes and the size of the
> trees you are traversing.
>
> I've often wondered if fast C programs achieve their speed by directly
> poking around in the
> underlying inode structures, but this would involve detailed knowledge
> of the underlying
> file system (ie is an ext3, HFS+, FAT32 etc.)
>
> Programs like rsync and the Dropbox sync algorithm seem unreasonably
> fast to me -
> I wonder if they poke around in the underlying OS representation of
> the file system and not
> use a more portable and easier to use interface.
>
> /Joe
>
>
>
>
> On Sat, Dec 10, 2016 at 11:33 AM, Fred Youhanaie <[hidden email]> wrote:
>>
>> Out of interest, what do you get when you run the same benchmark twice in
>> succession?
>
>
>
>>
>> The buffer cache would play a role on the second and subsequent attempts.
>> For your code below, running on my oldish laptop, I get {35667060,158949}
>> and {8606920,158949}.
>>
>> You can use "blockdev --flushbufs <device>" to ensure the buffers are clean
>> before running the benchmarks, especially when comparing different versions
>> and languages.
>>
>> Having said that, the erlang version does look slow, especially when
>> compared with the shell equivalent "time find /usr/share -print | wc -l"
>>
>> $ sudo blockdev --flushbufs /dev/...
>>
>> $ time find /usr/share -print | wc -l
>> 186911
>>
>> real    0m32.446s
>> user    0m0.796s
>> sys     0m1.808s
>>
>> $ time find /usr/share -print | wc -l
>> 186911
>>
>> real    0m0.336s
>> user    0m0.152s
>> sys     0m0.200s
>>
>>
>> Perhaps there is room for improvement within the library itself!
>>
>> Cheers,
>> f.
>>
>>
>> On 10/12/16 09:20, Frank Muller wrote:
>>>
>>> Combining previous hints (Benoit, Sergej):
>>>
>>> -module(directory).
>>> -include_lib("kernel/include/file.hrl").
>>> -export([walker/1]).
>>>
>>> walker(Path) ->
>>>     case file:read_file_info(Path, [raw]) of
>>>         {ok, #file_info{type = regular}} ->
>>>             1;
>>>         _ -> %% not care about symlink for now, assume a directory
>>>             Children = filelib:wildcard(Path ++ "/*"),
>>>             lists:foldl(fun(P, N) -> N + walker(P) end, 0, Children)
>>> end.
>>>
>>>> timer:tc(fun() -> directory:walker("/usr/share") end).
>>>
>>> {1611688,28953}
>>>
>>> I'm only counting number of files in this case.
>>>
>>> /Frank
>>>
>>> Le sam. 10 déc. 2016 à 10:05, Sergej Jurečko <[hidden email]
>>> <mailto:[hidden email]>> a écrit :
>>>
>>>     read_file_info does the job of is_dir and file_size in a single call.
>>> That was the intention.
>>>
>>>     Also use file:read_file_info(name,[raw])
>>>
>>>
>>>     Sergej
>>>
>>>>     On 10 Dec 2016, at 09:42, Benoit Chesneau <[hidden email]
>>>> <mailto:[hidden email]>> wrote:
>>>>
>>>>     this is kind of bullshit (sorry ;).... at the end this is what does
>>>> the helpers in filelib:
>>>>
>>>> https://github.com/erlang/otp/blob/maint/lib/stdlib/src/filelib.erl#L257
>>>>
>>>>     except if you have a better algorithm in mind i don't se the point of
>>>> rewriting something that is aleaready existing ...
>>>>
>>>>     On Sat, 10 Dec 2016 at 09:36, Sergej Jurečko
>>>> <[hidden email] <mailto:[hidden email]>> wrote:
>>>>
>>>>         Stop using filelib functions. Use file:read_file_info and
>>>> file:list_dir.
>>>>
>>>>         Sergej
>>>>
>>>>         On Dec 10, 2016 9:29 AM, "Frank Muller"
>>>> <[hidden email] <mailto:[hidden email]>> wrote:
>>>>
>>>>             Hi Stanislaw
>>>>
>>>>             First, I don't care if I've to use documented/undocumented
>>>> calls as long as I can achieve my goal: faster dir walking.
>>>>
>>>>             And you're right, here is a detailed comparison with other
>>>> scripting languages:
>>>>
>>>>             In my /usr/share, there’s:
>>>>             2580 directories
>>>>             28953 files
>>>>
>>>>             1. Erlang (no io:format/1, just recurse):
>>>>
>>>>             walk(Dir) ->
>>>>                 {ok, Files} = file:list_dir(Dir),
>>>>                 walk(Dir, Files).
>>>>
>>>>             walk(Dir, [ Basename | Rest ]) ->
>>>>                 Path = filename:join([ Dir, Basename ]),
>>>>                 case filelib:is_dir(Path) of
>>>>                     true  ->
>>>>                         walk(Path);
>>>>                     false ->
>>>>                       %%  io:format("~s~n", [Path]),
>>>>                         filelib:file_size(Path)
>>>>                 end,
>>>>                 walk(Dir, Rest);
>>>>             walk(_, []) ->
>>>>                 ok.
>>>>
>>>>             timer:tc(fun() -> directoy:walker("/usr/share") end).
>>>>             {4662361,ok}
>>>>
>>>>             2. Python (this code even count the size of dir):
>>>>             From:
>>>> http://stackoverflow.com/questions/1392413/calculating-a-directory-size-using-python
>>>>
>>>>             import os
>>>>             def get_size(start_path = '.'):
>>>>                 total_size = 0
>>>>                 for dirpath, dirnames, filenames in os.walk(start_path):
>>>>                     for f in filenames:
>>>>                         fp = os.path.join(dirpath, f)
>>>>                         total_size += os.path.getsize(fp)
>>>>                 return total_size
>>>>
>>>>             print get_size()
>>>>
>>>>             $ cd /usr/share
>>>>             $ time dir_walker.py
>>>>             432034130
>>>>             0.25 real         0.13 user         0.10 sys
>>>>
>>>>             2. Perl (same, count dir size)
>>>>             http://www.perlmonks.org/?node_id=168974
>>>>
>>>>             use File::Find;
>>>>             my $size = 0;
>>>>             find(sub { $size += -s if -f $_ }, "/usr/share");
>>>>
>>>>             $ time perl dir_walker.pl
>>>>             432034130
>>>>             0.13 real         0.05 user         0.08 sys
>>>>
>>>>             3. Ruby (same, count dir size):
>>>>
>>>>             def directory_size(path)
>>>>               path << '/' unless path.end_with?('/')
>>>>               raise RuntimeError, "#{path} is not a directory" unless
>>>> File.directory?(path)
>>>>               total_size = 0
>>>>               Dir["#{path}**/*"].each do |f|
>>>>                 total_size += File.size(f) if File.file?(f) &&
>>>> File.size?(f)
>>>>               end
>>>>               total_size
>>>>             end
>>>>             puts directory_size '/usr/share’
>>>>
>>>>             $ time walker.rb
>>>>             432028422
>>>>
>>>>             0.21 real         0.09 user         0.11 sys
>>>>
>>>>             4. Lua:
>>>>             From: http://lua-users.org/wiki/DirTreeIterator
>>>>
>>>>             require "lfs"
>>>>
>>>>             function dirtree(dir)
>>>>               assert(dir and dir ~= "", "directory parameter is missing
>>>> or empty")
>>>>               if string.sub(dir, -1) == "/" then
>>>>                 dir=string.sub(dir, 1, -2)
>>>>               end
>>>>
>>>>               local function yieldtree(dir)
>>>>                 for entry in lfs.dir(dir) do
>>>>                   if entry ~= "." and entry ~= ".." then
>>>>                     entry=dir.."/"..entry
>>>>             local attr=lfs.attributes(entry)
>>>>             coroutine.yield(entry,attr)
>>>>             if attr.mode == "directory" then
>>>>               yieldtree(entry)
>>>>             end
>>>>                   end
>>>>                 end
>>>>               end
>>>>
>>>>               return coroutine.wrap(function() yieldtree(dir) end)
>>>>             end
>>>>
>>>>             for filename, attr in dirtree("/usr/share") do
>>>>                   print(attr.mode, filename)
>>>>             end
>>>>
>>>>             $ luarocks install luafilesystem
>>>>             $ time lua walker.lua > /dev/null
>>>>             0.30 real         0.16 user         0.14 sys
>>>>
>>>>             Do you need more?
>>>>
>>>>             Thanks for you help.
>>>>             /Frank
>>>>
>>>>             Le sam. 10 déc. 2016 à 00:51, Stanislaw Klekot
>>>> <[hidden email] <mailto:[hidden email]>> a écrit :
>>>>
>>>>                 On Fri, Dec 09, 2016 at 11:15:58PM +0000, Frank Muller
>>>> wrote:
>>>>
>>>>                 > I would like to improve the speed of my directory
>>>> walker.
>>>>
>>>>                 >
>>>>
>>>>                 > walk(Dir) ->
>>>>
>>>>                 >     {ok, Files} = prim_file:list_dir(Dir),
>>>>
>>>>                 >     walk(Dir, Files).
>>>>
>>>>
>>>>
>>>>                 Why prim_file:list_dir() instead of file:list_dir()? The
>>>> former is
>>>>
>>>>                 undocumented internal function.
>>>>
>>>>
>>>>
>>>>                 [...]
>>>>
>>>>                 > Compared to almost anything i found on the web, it’s
>>>> still very slow:
>>>>
>>>>                 > > timer:tc(fun() -> dir:walk("/usr/share") end).
>>>>
>>>>                 > {4662361,ok}
>>>>
>>>>
>>>>
>>>>                 What is it this "anything you found on the web"? And how
>>>> did you run
>>>>
>>>>                 your comparisons? There's a large difference between
>>>> first and second
>>>>
>>>>                 consequent run caused by OS' directory cache, and there's
>>>> large
>>>>
>>>>                 difference between simply walking through the directory
>>>> and walking with
>>>>
>>>>                 printing something to the screen for every file.
>>>>
>>>>
>>>>
>>>>                 Then there's also your using filelib:is_dir() and then
>>>>
>>>>                 filelib:file_size(), which means two stat(2) calls, while
>>>> you only need
>>>>
>>>>                 to do it once per file (file:read_file_info()).
>>>>
>>>>
>>>>
>>>>                 --
>>>>
>>>>                 Stanislaw Klekot
>>>>
>>>>
>> _______________________________________________
>> erlang-questions mailing list
>> [hidden email]
>> http://erlang.org/mailman/listinfo/erlang-questions
> _______________________________________________
> erlang-questions mailing list
> [hidden email]
> http://erlang.org/mailman/listinfo/erlang-questions
>
_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions
Reply | Threaded
Open this post in threaded view
|

Re: Fast directory walker

Max Lapshin-2
When you just call file:list_dir,  you make gen_server:call to a singleton process.

When you start driver and make prim_file calls to it, you are working on your own thread.


Of course you must avoid lists if you do micro benchmarks:



%% Recursively lists Dir (a binary path) through the caller's own prim_file
%% port, so no call goes through the singleton file_server process.
%% Returns a nested list mirroring the directory tree; an entry that cannot
%% be listed (e.g. a regular file) contributes an empty list.
list_dir(Port, Dir) ->
  case prim_file:list_dir(Port, Dir) of
    {ok, Entries} ->
      [list_dir(Port, <<Dir/binary, "/", (list_to_binary(E))/binary>>) || E <- Entries];
    {error, _} ->
      %% Not a directory (or unreadable): nothing to descend into.
      []
  end.

and of course, there should not be any magical performance.

Tests on your laptop are usually void and useless.  Take a loaded server and run the tests on it, to see how your software behaves when the HDD takes 30 seconds to respond.



_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions
Reply | Threaded
Open this post in threaded view
|

Re: Fast directory walker

Mikael Pettersson-5
In reply to this post by Stanislaw Klekot
Stanislaw Klekot writes:
 > On Fri, Dec 09, 2016 at 11:15:58PM +0000, Frank Muller wrote:
 > > I would like to improve the speed of my directory walker.
 > >
 > > walk(Dir) ->
 > >     {ok, Files} = prim_file:list_dir(Dir),
 > >     walk(Dir, Files).
 >
 > Why prim_file:list_dir() instead of file:list_dir()? The former is
 > undocumented internal function.

list_dir can be a very time-consuming operation, and in those cases
using file:list_dir would block the single file server for everything
else.  We routinely use prim_file:list_dir to reduce the negative
effects of accessing large directories.
_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions
Reply | Threaded
Open this post in threaded view
|

Re: Fast directory walker

Frank Muller
All in one file:

-module(dir).

-include_lib("kernel/include/file.hrl").

-export([ walker/1,
         walker1/1,
         walker2/1 ]).

%% original
%% Counts the regular files reachable from Path. Anything that
%% file:read_file_info/1 does not report as a regular file is assumed to be
%% a directory (symlinks get no special treatment for now) and is expanded
%% with filelib:wildcard/1.
walker(Path) ->
   count_entry(file:read_file_info(Path), Path).

%% A regular file counts as one; any other result is expanded as a directory
%% and the counts of its children are summed.
count_entry({ok, #file_info{type = regular}}, _Path) ->
   1;
count_entry(_NotRegular, Path) ->
   lists:sum([walker(Child) || Child <- filelib:wildcard(Path ++ "/*")]).

%% Sergej version + [raw] option
%% Sums the sizes of all regular files below Path, using prim_file directly.
%% Returns 0 when Path itself cannot be listed.
walker1(Path) ->
   case prim_file:list_dir(Path) of
       {ok, Names} -> walker1(Path, Names, 0);
       _ -> 0
   end.

%% walker1(Dir, Names, Total): folds over the entry names of Dir, threading
%% the running size Total through every subdirectory that is visited.
walker1(_Dir, [], Total) ->
   Total;
walker1(Dir, [Name | Names], Total) when Name =:= "."; Name =:= ".." ->
   walker1(Dir, Names, Total);
walker1(Dir, [Name | Names], Total) ->
   Full = Dir ++ "/" ++ Name,
   Info = prim_file:read_file_info(Full, [raw]),
   walker1(Dir, Names, walker1_entry(Full, Info, Total)).

%% Adds the contribution of one entry to Total: its size for a regular file,
%% the recursive total for a listable directory, nothing otherwise.
walker1_entry(_Full, {ok, #file_info{type = regular, size = Size}}, Total) ->
   Total + Size;
walker1_entry(Full, {ok, #file_info{type = directory}}, Total) ->
   case prim_file:list_dir(Full) of
       {ok, Children} -> walker1(Full, Children, Total);
       _ -> Total
   end;
walker1_entry(_Full, _Other, Total) ->
   Total.



%% Sergej version + Max's hint
%% Sums the sizes of all regular files below Path, doing every file
%% operation through one private prim_file port instead of the shared
%% file_server process. Returns 0 when Path itself cannot be listed.
%% NOTE(review): prim_file:start/0, stop/1 and the port-taking calls are
%% undocumented internals of the OTP releases this thread targets; confirm
%% they exist in the release you run.
walker2(Path) ->
   {ok, Port} = prim_file:start(),
   try
       case prim_file:list_dir(Port, Path) of
           {ok,L} ->
               walker2(Port,Path,L,0);
           _ ->
               0
       end
   after
       %% Close the port even when the walk fails, so repeated calls do not
       %% leak one port each.
       prim_file:stop(Port)
   end.

%% walker2(Port, Dir, Names, Sz): folds over the entry names of Dir,
%% threading the running size Sz through every visited subdirectory.
walker2(Port,Pth,["."|T],Sz) ->
   walker2(Port,Pth,T,Sz);
walker2(Port,Pth,[".."|T],Sz) ->
   walker2(Port,Pth,T,Sz);
walker2(Port,Pth,[H|T],Sz) ->
   Nm = Pth++"/"++H,
   %% Stat through the same port as list_dir; the port-less form would not
   %% use the private port at all.
   case prim_file:read_file_info(Port, Nm) of
       {ok,#file_info{type = regular, size = FS}} ->
           walker2(Port,Pth,T,Sz+FS);
       {ok,#file_info{type = directory}} ->
           case prim_file:list_dir(Port,Nm) of
               {ok,L} ->
                   walker2(Port,Pth,T,walker2(Port,Nm,L,Sz));
               _ ->
                   walker2(Port,Pth, T, Sz)
           end;
       _ ->
           walker2(Port,Pth,T,Sz)
   end;
walker2(_,_,[],Sz) ->
   Sz.


1> timer:tc(fun() -> dir:walker("/usr/share") end).
{1538933,28941}
2> timer:tc(fun() -> dir:walker1("/usr/share") end).
{1492408,447632520}
3> timer:tc(fun() -> dir:walker2("/usr/share") end).
{1477578,447632520}

Getting close to 1sec. Any other ideas for improvement?

/Frank

Le sam. 10 déc. 2016 à 15:30, Mikael Pettersson <[hidden email]> a écrit :
Stanislaw Klekot writes:

 > On Fri, Dec 09, 2016 at 11:15:58PM +0000, Frank Muller wrote:

 > > I would like to improve the speed of my directory walker.

 > >

 > > walk(Dir) ->

 > >     {ok, Files} = prim_file:list_dir(Dir),

 > >     walk(Dir, Files).

 >

 > Why prim_file:list_dir() instead of file:list_dir()? The former is

 > undocumented internal function.



list_dir can be a very time-consuming operation, and in those cases

using file:list_dir would block the single file server for everything

else.  We routinely use prim_file:list_dir to reduce the negative

effects of accessing large directories.


_______________________________________________
erlang-questions mailing list
[hidden email]
http://erlang.org/mailman/listinfo/erlang-questions
12