This is the mail archive of the glibc-bugs@sources.redhat.com mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[Bug regex/544] New: Even unneeded OP_{OPEN,CLOSE}_SUBEXP nodes slow regexec down a lot


#include <fcntl.h>
#include <locale.h>
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <unistd.h>

/* Time regexec on ../ChangeLog.14 for two patterns that should cost the
   same: the second is the first with every `.?' wrapped in ten levels of
   redundant parentheses.  Both are compiled with REG_NOSUB and neither
   is expected to match the file contents.

   Prints one "pattern <regex>: <sec>.<usec>s" line per pattern.

   Returns 0 on success, the regcomp error code if a pattern fails to
   compile, and 1 on any other failure (I/O error, allocation failure,
   or regexec returning anything but REG_NOMATCH).  */
static int
do_test (void)
{
  static const char *pat[] = {
    ".?.?.?.?.?.?.?abcde",
    "((((((((((.?))))))))))((((((((((.?))))))))))((((((((((.?))))))))))"
    "((((((((((.?))))))))))((((((((((.?))))))))))((((((((((.?))))))))))"
    "((((((((((.?))))))))))abcde" };

  int result = 1;
  /* Initialized so the shared cleanup path can always call free.  */
  char *buf = NULL;

  int fd = open ("../ChangeLog.14", O_RDONLY);
  if (fd < 0)
    {
      printf ("Couldn't open ChangeLog.14: %m\n");
      return 1;
    }

  struct stat64 st;
  if (fstat64 (fd, &st) < 0)
    {
      printf ("Couldn't fstat ChangeLog.14: %m\n");
      goto out;
    }

  /* One extra byte for the terminating NUL that regexec relies on.  */
  buf = malloc (st.st_size + 1);
  if (buf == NULL)
    {
      printf ("Couldn't allocate buffer: %m\n");
      goto out;
    }

  if (read (fd, buf, st.st_size) != (ssize_t) st.st_size)
    {
      puts ("Couldn't read ChangeLog.14");
      goto out;
    }

  close (fd);
  fd = -1;
  buf[st.st_size] = '\0';

  /* The measurement is meant to run in a UTF-8 locale.  A missing locale
     is reported but not treated as fatal, so the test still runs.  */
  if (setlocale (LC_ALL, "de_DE.UTF-8") == NULL)
    puts ("warning: couldn't set locale de_DE.UTF-8");

  for (size_t i = 0; i < sizeof (pat) / sizeof (pat[0]); ++i)
    {
      printf ("pattern %s", pat[i]);

      regex_t rbuf;
      int err = regcomp (&rbuf, pat[i], REG_EXTENDED | REG_NOSUB);
      if (err != 0)
        {
          putchar ('\n');
          char errstr[300];
          regerror (err, &rbuf, errstr, sizeof (errstr));
          puts (errstr);
          result = err;
          goto out;
        }

      struct timeval start, stop;
      gettimeofday (&start, NULL);
      err = regexec (&rbuf, buf, 0, NULL, 0);
      /* Take the stop time right away so only regexec is measured.  */
      gettimeofday (&stop, NULL);

      if (err != REG_NOMATCH)
        {
          if (err == 0)
            puts ("\nregexec unexpectedly matched");
          else
            {
              /* Distinguish a genuine regexec failure from a match.  */
              char errstr[300];
              regerror (err, &rbuf, errstr, sizeof (errstr));
              printf ("\nregexec failed: %s\n", errstr);
            }
          regfree (&rbuf);
          goto out;
        }

      /* Elapsed time = stop - start, borrowing a second when the
         microsecond difference is negative.  */
      long sec = (long) (stop.tv_sec - start.tv_sec);
      long usec = (long) stop.tv_usec - (long) start.tv_usec;
      if (usec < 0)
        {
          --sec;
          usec += 1000000;
        }
      printf (": %ld.%06lds\n", sec, usec);

      regfree (&rbuf);
    }

  result = 0;

 out:
  free (buf);
  if (fd >= 0)
    close (fd);
  return result;
}

/* NOTE(review): TIMEOUT is presumably the time limit, in seconds, that the
   harness included below allows the test to run -- confirm against
   ../test-skeleton.c.  */
#define TIMEOUT 10
/* Expands to a call of do_test, the test entry point defined in this file.  */
#define TEST_FUNCTION do_test ()
/* NOTE(review): presumably supplies main and evaluates TEST_FUNCTION --
   confirm against ../test-skeleton.c.  */
#include "../test-skeleton.c"

There is no reason why the second regexec should be any slower than the first
one, yet on my box the second regexec is about 6 times slower than the first one.

I'll look into what can be done.  I hope regcomp can keep those nodes from being
seen by regexec when they aren't needed for backreferences and REG_NOSUB is set,
or when they are nested with no intervening tokens (e.g. '((x))'), in which case
we can tell the final match computation that, say, match 0 is identical to match 1.

-- 
           Summary: Even unneeded OP_{OPEN,CLOSE}_SUBEXP nodes slow regexec
                    down a lot
           Product: glibc
           Version: unspecified
            Status: NEW
          Severity: normal
          Priority: P2
         Component: regex
        AssignedTo: gotom at debian dot or dot jp
        ReportedBy: jakub at redhat dot com
                CC: glibc-bugs-regex at sources dot redhat dot com,glibc-
                    bugs at sources dot redhat dot com


http://sources.redhat.com/bugzilla/show_bug.cgi?id=544

------- You are receiving this mail because: -------
You are on the CC list for the bug, or are watching someone who is.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]