readdir() returns inaccessible name if file was created with invalid UTF-8

Christian Franke Christian.Franke@t-online.de
Sat Jun 28 10:18:57 GMT 2025


Corinna Vinschen via Cygwin wrote:
> On Jun 27 15:32, Christian Franke via Cygwin wrote:
>> $ touch $'t-\xef\x80\x80'
>> The name mapping is:
>> "t-\xEF\x80\x80" -(open, ...)-> L"t-\xDB59" -(readdir)-> "t-"
> Did you copy/paste this from the old mail, by any chance?

Sorry, I accidentally mixed up two cases that have the same readdir() result:

"t-\xEF\x80\x80" -(open, ...)-> L"t-\xF000" -(readdir)-> "t-"
"t-\xED\xAD\x99" -(open, ...)-> L"t-\xDB59" -(readdir)-> "t-"

$ touch $'t-\xed\xad\x99'
$ touch $'t-\xef\x80\x80'
$ ls | uniq -c
       2 t-

This no longer occurs in 3.7.0-0.165.g1b60f4861b70, but see below.


> Using the latest test DLL the mapping is
>
>    "t-\xEF\x80\x80" -(open, ...)-> L"t-\xF000"
>
> And that's basically correct, albeit it leads to problems.
>
> You know that we defined the area from 0xf000 to 0xf0ff as our private
> use area to create filenames with characters invalid in DOS filenames
> by transposing these chars into the private use area.  When converting
> the filenames back, the 0xf0XX chars are transposed back to 0xXX.

Yes.


> But yeah, I found the bug here.  The problem is that the transpose table
> incorrectly contains NUL as transposable character.  So if you create
> L"t-\xF000", that's fine.  However, when converting this name back to
> UTF-8, the filename becomes L"t-\0".  Oops.
>
> I dropped the ASCII NUL from the list of transposable characters and
> now what you get is this:
>
>    $ touch $'t-\xef\x80\x80'
>    $ touch $'t-\xef\x80\x81'
>    $ ls -l
>    total 0
>    -rw-r--r-- 1 corinna vinschen 0 Jun 27 16:49 't-'$'\001'
>    -rw-r--r-- 1 corinna vinschen 0 Jun 27 16:49 't-'$'\357\200\200'
>
> Apart from the incorrect transposition of ASCII NUL, the transposition
> works transparently:
>
>    $ echo foo > $'t-\xef\x80\x81'
>    $ cat $'t-\xef\x80\x81'
>    foo
>    $ cat $'t-\x01'
>    foo
>
> I'll apply the patch shortly.

$ touch $'t-\xed\xad\x90'
$ touch $'t-\xed\xad\x91'
$ touch $'t-\xed\xad\x92'
$ touch $'t-\xed\xad\x93'
$ touch $'t-\xed\xad\x94'
$ ls | uniq -c
       5 t-

$ ls -s
ls: cannot access 't-': No such file or directory
ls: cannot access 't-': No such file or directory
ls: cannot access 't-': No such file or directory
ls: cannot access 't-': No such file or directory
ls: cannot access 't-': No such file or directory
total 0
? t-  ? t-  ? t-  ? t-  ? t-

All failures found by several runs of the attached test program with
different seeds have one thing in common: the Windows path name contains
an invalid (unpaired) 16-bit code unit in the UTF-16 high surrogate range:

$ ./randnames 42
$'t-\xEC\x9E\xB3\xEF\x82\x80\xEF\x83\xA0': access() failed, errno=2:
$'t-\xED\xA4\xA8\x80\xE0': original path
L"t-\xD928\xF080\xF0E0": Windows path

$'t-\xEE\x9E\xB3\xEF\x83\xA1': access() failed, errno=2:
$'t-\xED\xA6\xB0\xE1': original path
L"t-\xD9B0\xF0E1": Windows path
...
$'t-\xE7\xBE\xB3\xEF\x82\xB3': access() failed, errno=2:
$'t-\xED\xA2\x96\xB3': original path
L"t-\xD896\xF0B3": Windows path


-- 
Thanks,
Christian


-------------- next part --------------
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <wchar.h>
#include <windows.h>

/*
 * Write the byte string `s` to `f` in shell $'...' quoting.
 *
 * Printable ASCII (' ' through '~') is copied verbatim, a single quote is
 * emitted as '\'$' (close the quoting, escaped quote, reopen the quoting),
 * and every other byte is written as a two-digit uppercase \xHH escape.
 */
static void print_c(FILE * f, const char * s)
{
  fputs("$'", f);
  for (const char * p = s; *p != '\0'; p++) {
    const char ch = *p;
    if (ch == '\'') {
      fputs("'\\'$'", f);
      continue;
    }
    if (ch < ' ' || ch > '~') {
      /* Mask to one byte so a negative (signed) char prints as 80..FF. */
      fprintf(f, "\\x%02X", ch & 0xff);
      continue;
    }
    fputc(ch, f);
  }
  fputc('\'', f);
}

/*
 * Write the wide string `s` to `f` in the form of a C wide string
 * literal, L"...".
 *
 * A double quote or backslash is prefixed with a backslash, printable
 * ASCII (L' ' through L'~') is copied verbatim, and every other
 * character is written as a \xHHHH escape of its low 16 bits.
 */
static void print_w(FILE * f, const wchar_t * s)
{
  fputs("L\"", f);
  for (const wchar_t * p = s; *p; p++) {
    const wchar_t wc = *p;
    if (wc == L'"' || wc == L'\\') {
      fprintf(f, "\\%c", wc);
    } else if (wc < L' ' || wc > L'~') {
      fprintf(f, "\\x%04X", wc & 0xffff);
    } else {
      fputc(wc, f);
    }
  }
  fputc('"', f);
}

static void get_winname(wchar_t * name)
{
  WIN32_FIND_DATAW e;
  HANDLE h = FindFirstFileW(L"*", &e);
  if (h == INVALID_HANDLE_VALUE) {
    fprintf(stderr, "FindFirstFileW(): Error=%u\n", GetLastError());
    exit(1);
  }
  int i = 0;
  do {
    if (!wcscmp(e.cFileName, L".") || !wcscmp(e.cFileName, L".."))
      continue;
    wcscpy(name, e.cFileName);
    i++;
  } while (FindNextFileW(h, &e));
  FindClose(h);
  if (i != 1) {
    fprintf(stderr, "Error: %d Win32 files found\n", i);
    exit(1);
  }
}

/*
 * Store in `name` the name of the single entry in the current directory,
 * as reported by readdir().
 *
 * The "." and ".." entries are ignored.  The program exits with status 1
 * if the directory cannot be opened or if the number of remaining entries
 * is not exactly one.  `name` must be large enough for the copied file
 * name; no bound is checked here.
 */
static void get_cygname(char * name)
{
  DIR * dir = opendir(".");
  if (dir == NULL) {
    perror("opendir");
    exit(1);
  }

  int count = 0;
  for (const struct dirent * ent = readdir(dir); ent != NULL;
       ent = readdir(dir)) {
    const char * entry_name = ent->d_name;
    if (strcmp(entry_name, ".") == 0 || strcmp(entry_name, "..") == 0)
      continue;
    strcpy(name, entry_name);
    count++;
  }
  closedir(dir);

  if (count != 1) {
    fprintf(stderr, "Error: %d Cygwin files found\n", count);
    exit(1);
  }
}

/*
 * Fill `name` with a random NUL-terminated byte string of 1..maxlen bytes.
 *
 * Each byte is drawn from 1..255 excluding '/' and '\\', i.e. from 253
 * candidate values.  `name` must have room for maxlen + 1 bytes.
 * If maxlen is less than 1, an empty string is stored instead (the
 * length computation would otherwise take a modulus by zero).
 *
 * The candidate value is held in an int rather than a char: where plain
 * char is signed, values above 127 would turn negative, the two
 * "skip over the excluded byte" adjustments below would not apply to
 * them, and the bytes 0xFE and 0xFF could never be generated.
 */
static void randname(char * name, int maxlen)
{
  if (maxlen < 1) {
    name[0] = 0;
    return;
  }

  int len = 1 + rand() % maxlen;
  for (int i = 0; i < len; i++) {
    int c = 1 + rand() % (256 - 2 - 1);  /* 1..253 */
    if (c >= '/')   /* skip '/'; range becomes 1..254 without '/' */
      c++;
    if (c >= '\\')  /* skip '\\'; range becomes 1..255 without both */
      c++;
    name[i] = (char)c;
  }
  name[len] = 0;
}

/*
 * Create a file called `name` in the current directory, read its name
 * back through both readdir() and the Win32 API, and check that the name
 * returned by readdir() can be used to access the file.
 *
 * Returns 1 if access() succeeds on the readdir() name, 0 if it fails
 * (a report with the readdir() name, the original name and the Windows
 * name is then printed to stdout).  The file is removed again before
 * returning.  Exits with status 1 if the file cannot be created or
 * removed.
 *
 * errno is saved right after each failing call because the stdio calls
 * made while printing the name may overwrite it before it is reported.
 */
static int testname(const char * name)
{
  int fd = open(name, O_WRONLY|O_CREAT, 0644);
  if (fd < 0) {
    int err = errno;
    print_c(stdout, name); printf(": open() failed, errno=%d\n", err);
    exit(1);
  }
  close(fd);

  char cygname[MAX_PATH];
  get_cygname(cygname);
  wchar_t winname[MAX_PATH];
  get_winname(winname);

  int rc = 1;
  if (access(cygname, F_OK)) {
    int err = errno;
    print_c(stdout, cygname); printf(": access() failed, errno=%d:\n", err);
    print_c(stdout, name); printf(": original path\n");
    print_w(stdout, winname); printf(": Windows path\n\n");
    rc = 0;
  }

  /* Remove by the original name, which is known to map to the file. */
  if (unlink(name)) {
    int err = errno;
    print_c(stdout, name); printf(": unlink() failed, errno=%d\n", err);
    print_w(stdout, winname); printf(": Windows path\n");
    exit(1);
  }
  return rc;
}

/*
 * Test driver.
 *
 * An optional first argument seeds rand().  The program (re)creates the
 * directory "test.tmp", changes into it, and then tests up to 100000
 * random file names of the form "t-" plus 1..5 random bytes.  It stops
 * early once 10 names have failed the check in testname().
 *
 * Returns 1 if the directory cannot be created or entered, 0 otherwise
 * (also when failing names were found).
 */
int main(int argc, char **argv)
{
  const char * workdir = "test.tmp";

  if (argc >= 2)
    srand(atoi(argv[1]));

  /* Best effort: drop an empty directory left over from an earlier run. */
  rmdir(workdir);
  if (mkdir(workdir, 0755) != 0 || chdir(workdir) != 0) {
    perror(workdir);
    return 1;
  }

  int failures = 0;
  for (int round = 0; round < 100000 && failures < 10; round++) {
    char name[8] = "t-";
    /* Leave room for the "t-" prefix and the terminating NUL. */
    randname(name + 2, sizeof(name) - 1 - 2);
    if (testname(name) == 0)
      failures++;
  }
  return 0;
}


More information about the Cygwin mailing list