This is the mail archive of the cygwin mailing list for the Cygwin project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: "du -b --files0-from=-" running out of memory


Jim Meyering <jim@meyering.net> wrote:
> Eric Blake <ebb9@byu.net> wrote:
>> According to Barry Kelly on 11/23/2008 6:24 AM:
>>> I have a problem with du running out of memory.
>>>
>>> I'm feeding it a list of null-separated file names via standard input,
>>> to a command-line that looks like:
>>>
>>>   du -b --files0-from=-
>
> du's --files0-from option reads the entire list of
> file names into memory before processing the first name.

[By the way, this flaw affects wc and sort, too, since
 they have the same --files0-from option and use nearly
 identical code.  Hence the fixes you see below for du.c will
 soon be applied to those other programs.  ]

Here's how the fixed du performs,
operating on a directory produced like this:

  mkdir du-test && cd du-test
  seq --format=%032g 60000|xargs touch

I ran du like this:

  for f in new old; do
    find . -type f -print0|valgrind --tool=massif \
      --massif-out-file=m.$f -- du.$f -a --files0-from=- > out.$f
  done

The difference is striking:
  old peak usage 21MB
  new peak usage just 12 *KB*




Command:            du.old -a --files0-from=-
Massif arguments:   --massif-out-file=m.old
ms_print arguments: m.old
--------------------------------------------------------------------------------


    MB
21.21^                                       #
     |                                       #: .
     |                                       #: :.
     |                                       #: :::
     |                                       #: ::: : .
     |                                       #: ::: : : .
     |                                      .#: ::: : : : .
     |                                      :#: ::: : : : : .
     |                                      :#: ::: : : : : : .
     |                                      :#: ::: : : : : : : .
     |                                      :#: ::: : : : : : : :.
     |                                      :#: ::: : : : : : : :: .
     |                                    : :#: ::: : : : : : : :: : .
     |                                    : :#: ::: : : : : : : :: : : .
     |                                    : :#: ::: : : : : : : :: : : :::.
     |                                    : :#: ::: : : : : : : :: : : ::::.
     |                              ,   @.: :#: ::: : : : : : : :: : : ::::::,
     |                        .., .:@:: @:: :#: ::: : : : : : : :: : : ::::::@
     |               . ..:::: ::@ ::@:: @:: :#: ::: : : : : : : :: : : ::::::@
     |     ..,.@:: : : :::::: ::@ ::@:: @:: :#: ::: : : : : : : :: : : ::::::@
   0 +----------------------------------------------------------------------->Mi
     0                                                                   266.3

Number of snapshots: 55
 Detailed snapshots: [3, 6, 8, 21, 24, 27, 31 (peak), 54]









Command:            du.new -a --files0-from=-
Massif arguments:   --massif-out-file=m.new
ms_print arguments: m.new
--------------------------------------------------------------------------------


    KB
12.88^,   .      :                                      :.       :.        :
     |# : :    :::.:.   ::::::@::@::@:::::::::::::.     ::@:     :::      .::.
     |# : :    :::::::  ::::::@::@::@::::::::::::::   ::::@:  .:@:::   ..:::::
     |# : :    :::::::  ::::::@::@::@::::::::::::::   ::::@:  ::@:::   :::::::
     |# : :    :::::::  ::::::@::@::@::::::::::::::   ::::@:  ::@:::   :::::::
     |# : :    :::::::  ::::::@::@::@::::::::::::::   ::::@:  ::@:::   :::::::
     |# : :    :::::::  ::::::@::@::@::::::::::::::   ::::@:  ::@:::   :::::::
     |# : :    :::::::  ::::::@::@::@::::::::::::::   ::::@:  ::@:::   :::::::
     |# : :    ::::::: :::::::@::@::@::::::::::::::   ::::@:  ::@:::   :::::@:
     |# : ::.:.::::::: :::::::@::@::@::::::::::::::.,.::::@:..::@:::..,:::::@:
     |# : :::::::::::: :::::::@::@::@:::::::::::::::@:::::@:::::@:::::@:::::@:
     |# : :::::::::::: :::::::@::@::@:::::::::::::::@:::::@:::::@:::::@:::::@:
     |# : :::::::::::: :::::::@::@::@:::::::::::::::@:::::@:::::@:::::@:::::@:
     |# : :::::::::::: :::::::@::@::@:::::::::::::::@:::::@:::::@:::::@:::::@:
     |# : :::::::::::: :::::::@::@::@:::::::::::::::@:::::@:::::@:::::@:::::@:
     |# : :::::::::::: :::::::@::@::@:::::::::::::::@:::::@:::::@:::::@:::::@:
     |# : :::::::::::: :::::::@::@::@:::::::::::::::@:::::@:::::@:::::@:::::@:
     |# : :::::::::::: :::::::@::@::@:::::::::::::::@:::::@:::::@:::::@:::::@:
     |# : :::::::::::: :::::::@::@::@:::::::::::::::@:::::@:::::@:::::@:::::@:
     |# : :::::::::::: :::::::@::@::@:::::::::::::::@:::::@:::::@:::::@:::::@:
   0 +----------------------------------------------------------------------->Mi
     0                                                                   231.5

Number of snapshots: 97
 Detailed snapshots: [1 (peak), 26, 29, 34, 54, 64, 74, 84, 94]


Here's the patch.
I'll probably push it tomorrow.

>From 318543c4478d1b7833c9ff6b46974cd262bcf5d4 Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Mon, 24 Nov 2008 14:11:15 +0100
Subject: [PATCH 1/2] argv-iter: new module

* gl/lib/argv-iter.h: New file.
* gl/lib/argv-iter.c: New file.
* gl/modules/argv-iter: New file.
---
 gl/lib/argv-iter.c   |  113 ++++++++++++++++++++++++++++++++++++++++++++++++++
 gl/lib/argv-iter.h   |   47 +++++++++++++++++++++
 gl/modules/argv-iter |   24 +++++++++++
 3 files changed, 184 insertions(+), 0 deletions(-)
 create mode 100644 gl/lib/argv-iter.c
 create mode 100644 gl/lib/argv-iter.h
 create mode 100644 gl/modules/argv-iter

diff --git a/gl/lib/argv-iter.c b/gl/lib/argv-iter.c
new file mode 100644
index 0000000..204769a
--- /dev/null
+++ b/gl/lib/argv-iter.c
@@ -0,0 +1,113 @@
+/* Iterate over arguments from argv or --files0-from=FILE
+   Copyright (C) 2008 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* Written by Jim Meyering.  */
+
+#include <config.h>
+#include "argv-iter.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+struct argv_iterator
+{
+  /* Test FP to determine whether in read-mode or argv-mode. */
+  /* file-mode: fp records position */
+  FILE *fp;
+  size_t item_idx;
+
+  /* argv-mode: record just argv and current pointer */
+  char **arg_list;
+  char **p;
+};
+
+struct argv_iterator *
+argv_iter_init_argv (char **argv)
+{
+  struct argv_iterator *ai = malloc (sizeof *ai);
+  if (!ai)
+    return NULL;
+  ai->fp = NULL;
+  ai->arg_list = argv;
+  ai->p = argv;
+  return ai;
+}
+
+/* Initialize to read from the stream, FP.
+   The input is expected to contain a list of NUL-delimited tokens.  */
+struct argv_iterator *
+argv_iter_init_stream (FILE *fp)
+{
+  struct argv_iterator *ai = malloc (sizeof *ai);
+  if (!ai)
+    return NULL;
+  ai->fp = fp;
+  ai->item_idx = 0;
+  ai->arg_list = NULL;
+  return ai;
+}
+
+char *
+argv_iter (struct argv_iterator *ai, enum argv_iter_err *err)
+{
+  if (ai->fp)
+    {
+      char *name = NULL;
+      size_t buf_len = 0;
+      ssize_t len = getdelim (&name, &buf_len, '\0', ai->fp);
+      if (len < 0)
+        {
+          free (name);
+          *err = feof (ai->fp) ? AI_ERR_EOF : AI_ERR_READ;
+          return NULL;
+        }
+
+      *err = AI_ERR_OK;
+      ai->item_idx++;
+      return name;
+    }
+  else
+    {
+      if (*(ai->p) == NULL)
+        {
+          *err = AI_ERR_EOF;
+          return NULL;
+        }
+      else
+        {
+          *err = AI_ERR_OK;
+          return strdup (*(ai->p++));
+        }
+    }
+}
+
+size_t
+argv_iter_n_args (struct argv_iterator const *ai)
+{
+  return ai->fp ? ai->item_idx : ai->p - ai->arg_list + 1;
+}
+
+void
+argv_iter_free (struct argv_iterator *ai)
+{
+  free (ai);
+}
+
+/*
+ * Local variables:
+ *  indent-tabs-mode: nil
+ * End:
+ */
diff --git a/gl/lib/argv-iter.h b/gl/lib/argv-iter.h
new file mode 100644
index 0000000..06a397c
--- /dev/null
+++ b/gl/lib/argv-iter.h
@@ -0,0 +1,47 @@
+/* Iterate over arguments from argv or --files0-from=FILE
+   Copyright (C) 2008 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include <stdio.h>
+#include <stdbool.h>
+
+struct argv_iterator;
+enum argv_iter_err;
+
+#undef _ATTRIBUTE_NONNULL_
+#if __GNUC__ == 3 && __GNUC_MINOR__ >= 3 || 3 < __GNUC__
+# define _ATTRIBUTE_NONNULL_(m,...) __attribute__ ((__nonnull__ (m)))
+#else
+# define _ATTRIBUTE_NONNULL_(m,...)
+#endif
+
+enum argv_iter_err
+{
+  AI_ERR_OK = 1,
+  AI_ERR_EOF,
+  AI_ERR_MEM,
+  AI_ERR_READ
+};
+
+struct argv_iterator *argv_iter_init_argv (char **argv)
+  _ATTRIBUTE_NONNULL_ (1);
+struct argv_iterator *argv_iter_init_stream (FILE *fp)
+  _ATTRIBUTE_NONNULL_ (1);
+char *argv_iter (struct argv_iterator *, enum argv_iter_err *)
+  _ATTRIBUTE_NONNULL_ (1, 2);
+size_t argv_iter_n_args (struct argv_iterator const *)
+  _ATTRIBUTE_NONNULL_ (1);
+void argv_iter_free (struct argv_iterator *)
+  _ATTRIBUTE_NONNULL_ (1);
diff --git a/gl/modules/argv-iter b/gl/modules/argv-iter
new file mode 100644
index 0000000..e9fc633
--- /dev/null
+++ b/gl/modules/argv-iter
@@ -0,0 +1,24 @@
+Description:
+iterate through argv or a --files0-from=-specified file
+
+Files:
+lib/argv-iter.c
+lib/argv-iter.h
+
+Depends-on:
+getdelim
+stdbool
+
+configure.ac:
+
+Makefile.am:
+lib_SOURCES += argv-iter.c argv-iter.h
+
+Include:
+"argv-iter.h"
+
+License
+GPL
+
+Maintainer:
+Jim Meyering
--
1.6.0.4.1044.g77718


>From 8b274f247f535d99022c7316e3cab121d4c946e9 Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Mon, 24 Nov 2008 14:13:43 +0100
Subject: [PATCH 2/2] du: read and process --files0-from= input a name at a time,

rather than by reading the entire input into RAM and *then*
processing each file name.
* src/du.c: Include "argv-iter.h", not "readtokens0.h".
(main): Rewrite to use argv-iter.
Call xfts_open on each argument, rather than on the entire
argv list at once.
Diagnose read failure.
* bootstrap.conf (gnulib_modules): Add argv-iter.
* NEWS (Bug fixes): Mention it.
---
 NEWS           |    3 +
 bootstrap.conf |    4 +-
 src/du.c       |  137 +++++++++++++++++++++++++++++++-------------------------
 3 files changed, 82 insertions(+), 62 deletions(-)

diff --git a/NEWS b/NEWS
index 360cb4b..f119daf 100644
--- a/NEWS
+++ b/NEWS
@@ -12,6 +12,9 @@ GNU coreutils NEWS                                    -*- outline -*-

   cp uses much less memory in some situations

+  du --files0-from=FILE no longer reads all of FILE into RAM before
+  processing the first file name
+
   seq 9223372036854775807 9223372036854775808 now prints only two numbers
   on systems with extended long double support and good library support.
   Even with this patch, on some systems, it still produces invalid output,
diff --git a/bootstrap.conf b/bootstrap.conf
index aae307c..a3687d0 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -34,7 +34,9 @@ obsolete_gnulib_modules='
 gnulib_modules="
 	$avoided_gnulib_modules
 	$obsolete_gnulib_modules
-	acl alloca announce-gen argmatch assert
+	acl alloca announce-gen argmatch
+	argv-iter
+	assert
 	autobuild
 	backupfile base64
 	c-strcase c-strtod
diff --git a/src/du.c b/src/du.c
index 8b0894d..5ed2b12 100644
--- a/src/du.c
+++ b/src/du.c
@@ -30,6 +30,7 @@
 #include <assert.h>
 #include "system.h"
 #include "argmatch.h"
+#include "argv-iter.h"
 #include "error.h"
 #include "exclude.h"
 #include "fprintftime.h"
@@ -37,7 +38,6 @@
 #include "human.h"
 #include "quote.h"
 #include "quotearg.h"
-#include "readtokens0.h"
 #include "same.h"
 #include "stat-time.h"
 #include "xfts.h"
@@ -656,10 +656,8 @@ main (int argc, char **argv)
 {
   char *cwd_only[2];
   bool max_depth_specified = false;
-  char **files;
   bool ok = true;
   char *files_from = NULL;
-  struct Tokens tok;

   /* Bit flags that control how fts works.  */
   int bit_flags = FTS_TIGHT_CYCLE_CHECK;
@@ -922,6 +920,7 @@ main (int argc, char **argv)
         }
     }

+  struct argv_iterator *ai;
   if (files_from)
     {
       /* When using --files0-from=F, you may not specify any files
@@ -938,82 +937,98 @@ main (int argc, char **argv)
 	error (EXIT_FAILURE, errno, _("cannot open %s for reading"),
 	       quote (files_from));

-      readtokens0_init (&tok);
-
-      if (! readtokens0 (stdin, &tok) || fclose (stdin) != 0)
-	error (EXIT_FAILURE, 0, _("cannot read file names from %s"),
-	       quote (files_from));
-
-      files = tok.tok;
+      ai = argv_iter_init_stream (stdin);
     }
   else
     {
-      files = (optind < argc ? argv + optind : cwd_only);
+      char **files = (optind < argc ? argv + optind : cwd_only);
+      ai = argv_iter_init_argv (files);
     }

+  if (!ai)
+    xalloc_die ();
+
   /* Initialize the hash structure for inode numbers.  */
   hash_init ();

-  /* Report and filter out any empty file names before invoking fts.
-     This works around a glitch in fts, which fails immediately
-     (without looking at the other file names) when given an empty
-     file name.  */
-  {
-    size_t i = 0;
-    size_t j;
+  bit_flags |= symlink_deref_bits;
+  static char *temp_argv[] = { NULL, NULL };

-    for (j = 0; ; j++)
-      {
-	if (i != j)
-	  files[i] = files[j];
+  while (true)
+    {
+      bool skip_file = false;
+      enum argv_iter_err ai_err;
+      char *file_name = argv_iter (ai, &ai_err);
+      if (ai_err == AI_ERR_EOF)
+	break;
+      if (!file_name)
+	{
+	  switch (ai_err)
+	    {
+	    case AI_ERR_READ:
+	      error (0, errno, _("%s: read error"), quote (files_from));
+	      skip_file = true;
+	      goto next_file_name;

-	if ( ! files[i])
-	  break;
+	    case AI_ERR_MEM:
+	      xalloc_die ();
+	    }
+	  assert (!"unexpected error code from argv_iter");
+	}
+      if (files_from && STREQ (files_from, "-") && STREQ (file_name, "-"))
+	{
+	  /* Give a better diagnostic in an unusual case:
+	     printf - | du --files0-from=- */
+	  error (0, 0, _("when reading file names from stdin, "
+			 "no file name of %s allowed"),
+		 quote (file_name));
+	  skip_file = true;
+	}

-	if (files_from && STREQ (files_from, "-") && STREQ (files[i], "-"))
-	  {
-	    /* Give a better diagnostic in an unusual case:
-	       printf - | du --files0-from=- */
-	    error (0, 0, _("when reading file names from stdin, "
-			   "no file name of %s allowed"),
-		   quote ("-"));
-	    continue;
-	  }
+      /* Report and skip any empty file names before invoking fts.
+	 This works around a glitch in fts, which fails immediately
+	 (without looking at the other file names) when given an empty
+	 file name.  */
+      if (!file_name[0])
+	{
+	  /* Diagnose a zero-length file name.  When it's one
+	     among many, knowing the record number may help.
+	     FIXME: currently print the record number only with
+	     --files0-from=FILE.  Maybe do it for argv, too?  */
+	  if (files_from == NULL)
+	    error (0, 0, "%s", _("invalid zero-length file name"));
+	  else
+	    {
+	      /* Using the standard `filename:line-number:' prefix here is
+		 not totally appropriate, since NUL is the separator, not NL,
+		 but it might be better than nothing.  */
+	      unsigned long int file_number = argv_iter_n_args (ai);
+	      error (0, 0, "%s:%lu: %s", quotearg_colon (files_from),
+		     file_number, _("invalid zero-length file name"));
+	    }
+	  skip_file = true;
+	}

-	if (files[i][0])
-	  i++;
-	else
-	  {
-	    /* Diagnose a zero-length file name.  When it's one
-	       among many, knowing the record number may help.  */
-	    if (files_from)
-	      {
-		/* Using the standard `filename:line-number:' prefix here is
-		   not totally appropriate, since NUL is the separator, not NL,
-		   but it might be better than nothing.  */
-		unsigned long int file_number = j + 1;
-		error (0, 0, "%s:%lu: %s", quotearg_colon (files_from),
-		       file_number, _("invalid zero-length file name"));
-	      }
-	    else
-	      error (0, 0, "%s", _("invalid zero-length file name"));
-	  }
-      }
+      if (skip_file)
+	ok = false;
+      else
+	{
+	  temp_argv[0] = file_name;
+	  ok &= du_files (temp_argv, bit_flags);
+	}

-    ok = (i == j);
-  }
+    next_file_name:;
+      free (file_name);
+    }

-  bit_flags |= symlink_deref_bits;
-  ok &= du_files (files, bit_flags);
+  argv_iter_free (ai);
+
+  if (files_from && (ferror (stdin) || fclose (stdin) != 0))
+    error (EXIT_FAILURE, 0, _("error reading %s"), quote (files_from));

   if (ok && print_grand_total)
     print_size (&tot_dui, _("total"));

-  /* This isn't really necessary, but it does ensure we
-     exercise this function.  */
-  if (files_from)
-    readtokens0_free (&tok);
-
   hash_free (htab);

   exit (ok ? EXIT_SUCCESS : EXIT_FAILURE);
--
1.6.0.4.1044.g77718

--
Unsubscribe info:      http://cygwin.com/ml/#unsubscribe-simple
Problem reports:       http://cygwin.com/problems.html
Documentation:         http://cygwin.com/docs.html
FAQ:                   http://cygwin.com/faq/


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]