This is the mail archive of the
glibc-bugs@sources.redhat.com
mailing list for the glibc project.
[Bug regex/544] New: Even unneeded OP_{OPEN,CLOSE}_SUBEXP nodes slow regexec down a lot
- From: "jakub at redhat dot com" <sourceware-bugzilla at sources dot redhat dot com>
- To: glibc-bugs at sources dot redhat dot com
- Date: 12 Nov 2004 16:39:02 -0000
- Subject: [Bug regex/544] New: Even unneeded OP_{OPEN,CLOSE}_SUBEXP nodes slow regexec down a lot
- Reply-to: sourceware-bugzilla at sources dot redhat dot com
#include <fcntl.h>
#include <locale.h>
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <unistd.h>
/* Time regexec over the contents of ../ChangeLog.14 for two patterns:
   a plain one, and the same pattern with every `.?' wrapped in ten
   levels of nested parentheses.  Both are compiled with REG_NOSUB and
   neither is expected to match, so the per-pattern timings printed to
   stdout can be compared directly.

   Returns 0 on success, 1 on an I/O, allocation or unexpected-match
   failure, or the regcomp error code if a pattern fails to compile.
   Every resource acquired here (file descriptor, buffer, compiled
   regex) is released on all exit paths.  */
static int
do_test (void)
{
  static const char *pat[] = {
    ".?.?.?.?.?.?.?abcde",
    "((((((((((.?))))))))))((((((((((.?))))))))))((((((((((.?))))))))))"
    "((((((((((.?))))))))))((((((((((.?))))))))))((((((((((.?))))))))))"
    "((((((((((.?))))))))))abcde" };
  char *buf = NULL;
  int result = 1;

  int fd = open ("../ChangeLog.14", O_RDONLY);
  if (fd < 0)
    {
      printf ("Couldn't open ChangeLog.14: %m\n");
      return 1;
    }

  struct stat64 st;
  if (fstat64 (fd, &st) < 0)
    {
      printf ("Couldn't fstat ChangeLog.14: %m\n");
      goto out;
    }

  /* One extra byte for the terminating NUL that regexec requires.  */
  buf = malloc (st.st_size + 1);
  if (buf == NULL)
    {
      printf ("Couldn't allocate buffer: %m\n");
      goto out;
    }

  if (read (fd, buf, st.st_size) != (ssize_t) st.st_size)
    {
      puts ("Couldn't read ChangeLog.14");
      goto out;
    }
  close (fd);
  /* Mark the descriptor as released so the cleanup code skips it.  */
  fd = -1;
  buf[st.st_size] = '\0';

  setlocale (LC_ALL, "de_DE.UTF-8");

  for (size_t i = 0; i < sizeof (pat) / sizeof (pat[0]); ++i)
    {
      printf ("pattern %s", pat[i]);

      regex_t rbuf;
      int err = regcomp (&rbuf, pat[i], REG_EXTENDED | REG_NOSUB);
      if (err != 0)
        {
          putchar ('\n');
          char errstr[300];
          regerror (err, &rbuf, errstr, sizeof (errstr));
          puts (errstr);
          /* Compilation failed, so there is nothing to regfree.  */
          result = err;
          goto out;
        }

      struct timeval start, stop;
      gettimeofday (&start, NULL);
      err = regexec (&rbuf, buf, 0, NULL, 0);
      if (err != REG_NOMATCH)
        {
          puts ("\nregexec unexpectedly matched");
          regfree (&rbuf);
          goto out;
        }
      gettimeofday (&stop, NULL);

      /* Elapsed time, borrowing one second when the microsecond
         difference is negative.  */
      long sec = (long) (stop.tv_sec - start.tv_sec);
      long usec = (long) stop.tv_usec - (long) start.tv_usec;
      if (usec < 0)
        {
          --sec;
          usec += 1000000;
        }
      printf (": %ld.%06lds\n", sec, usec);

      regfree (&rbuf);
    }

  result = 0;

 out:
  free (buf);
  if (fd >= 0)
    close (fd);
  return result;
}
/* Configuration for the shared test skeleton included below.
   NOTE(review): TIMEOUT is presumably a run-time limit in seconds
   enforced by test-skeleton.c -- confirm against that file.  */
#define TIMEOUT 10
/* Expression that invokes this file's test entry point; presumably
   expanded by test-skeleton.c to run the test -- confirm.  */
#define TEST_FUNCTION do_test ()
#include "../test-skeleton.c"
There is no reason why the second regexec should be any slower than the first
one, yet on my box the second regexec is about 6 times slower than the first.
I'll look into what can be done. I hope regcomp can hide those nodes from
regexec when they aren't needed for backreferences and REG_NOSUB is in effect,
or when they are nested with no intervening tokens (e.g. '((x))'), in which case
we can tell the final match computation that, say, match 0 is identical to match 1.
--
Summary: Even unneeded OP_{OPEN,CLOSE}_SUBEXP nodes slow regexec
down a lot
Product: glibc
Version: unspecified
Status: NEW
Severity: normal
Priority: P2
Component: regex
AssignedTo: gotom at debian dot or dot jp
ReportedBy: jakub at redhat dot com
CC: glibc-bugs-regex at sources dot redhat dot com,
glibc-bugs at sources dot redhat dot com
http://sources.redhat.com/bugzilla/show_bug.cgi?id=544
------- You are receiving this mail because: -------
You are on the CC list for the bug, or are watching someone who is.