readdir() returns inaccessible name if file was created with invalid UTF-8
Christian Franke
Christian.Franke@t-online.de
Thu Jun 26 17:07:05 GMT 2025
Corinna Vinschen via Cygwin wrote:
> On Jun 25 16:59, Christian Franke via Cygwin wrote:
>> On Sun, 15 Sep 2024 19:47:11 +0200, Christian Franke wrote:
>>> If a file name contains an invalid (truncated) UTF-8 sequence, open()
>>> does not refuse to create the file. Later readdir() returns a different
>>> name which could not be used to access the file.
>>>
>>> Testcase with U+1F321 (Thermometer):
>>>
>>> $ uname -r
>>> 3.5.4-1.x86_64
>>>
>>> $ printf $'\U0001F321' | od -A none -t x1
>>> f0 9f 8c a1
>>>
>>> $ touch 'file1-'$'\xf0\x9f\x8c\xa1''.ext'
>>>
>>> $ touch 'file2-'$'\xf0\x9f\x8c''.ext'
>>>
>>> $ touch 'file3-'$'\xf0\x9f\x8c'
>>>
>>> $ ls -1
>>> ls: cannot access 'file2-.?ext': No such file or directory
>>> ls: cannot access 'file3-': No such file or directory
>>> 'file1-'$'\360\237\214\241''.ext'
>>> file2-.?ext
>>> file3-
>>>
>>>
>>> Name mapping according to "fhandler_disk_file::readdir" strace lines:
>>>
>>> "file1-\xF0\x9F\x8C\xA1.ext" -(open)-> L"file1-\xD83C\xDF21.ext"
>>> -(readdir)->
>>> "file1-\xF0\x9F\x8C\xA1.ext"
>>>
>>> "file2-\xF0\x9f\x8C.ext" -(open)-> L"file2-\xD83C\xF02Eext" -(readdir)->
>>> "file2-.\xE1\x9E\xB3ext"
>>>
>>> "file3-\xF0\x9F\x8C" -(open)-> L"file3-\xD83C\xF000" -(readdir)->
>>> "file3-"
> I don't know exactly where this happens, but the input of the
> conversion is invalid UTF-8 because it's missing the 4th byte.
> There's no way to represent these filenames on Windows
> filesystems storing filenames as UTF-16 values.
>
> So the problem here is that the conversion somehow misses that
> the 4th byte is invalid and just plods forward and converts the
> leading three bytes into the matching high surrogate value and
> then stumbles over the conversion for the low surrogate.
>
> It would be really helpful to have an STC for this problem.
With some trial and error I found a testcase for this more serious
problem reported yesterday but not quoted above:
>
>> In cases like file3-... above, the converted Windows path ends with
>> 0xF000. This suggests that this is an accidental conversion of the
>> terminating null to the 0xF0xx range.
>>
>> In some cases, the created Windows file name has random garbage
>> behind the 0xF000. Then even Cygwin is not able to access or unlink
>> the file after creation.
Testcase (attached):
$ uname -r
3.7.0-0.160.g922719ba36e0.x86_64
$ gcc -o badname badname.c
$ ./badname
unlink() failed, errno=2, Win path: L"t-\xda01\xf000a"
unlink() failed, errno=2, Win path: L"t-\xda01\xf000b"
unlink() failed, errno=2, Win path: L"t-\xda01\xf000c"
unlink() failed, errno=2, Win path: L"t-\xda01\xf000d"
unlink() failed, errno=2, Win path: L"t-\xda01\xf000e"
unlink() failed, errno=2, Win path: L"t-\xda01\xf000f"
unlink() failed, errno=2, Win path: L"t-\xda01\xf000g"
unlink() failed, errno=2, Win path: L"t-\xda01\xf000h"
unlink() failed, errno=2, Win path: L"t-\xda01\xf000i"
unlink() failed, errno=2, Win path: L"t-\xda01\xf000j"
Conclusion: The terminating null character is accidentally converted to
0xF000 and no new terminating null is appended. As a result, a trailing
fragment of a previously used path appears at the end of the file name.
>> In (fortunately very rare) cases, the created Windows file is not
>> accessible from Win32 layer itself because it looks like
>> L"file3-\xD83C\xF000garbage."
>> or
>> L"file3-\xD83C\xF000garbage "
>> which is invalid on Win32 layer due to trailing '.' or space. Then a
>> tool which removes the file via Nt*() layer is required.
Testcase: enable one of the "DON'T DO THIS" lines and make sure that a
suitable file removal tool is available :-)
--
Regards,
Christian
-------------- next part --------------
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>
#include <wchar.h>
#include <windows.h>
/*
 * Write the wide string s to f in the form of a C wide string literal,
 * i.e. L"...".
 *
 * A double quote or backslash is preceded by a backslash, characters in
 * the range ' '..'~' are written unchanged, and every other character is
 * written as \xNNNN (four lowercase hex digits, low 16 bits of the value).
 */
static void print_w(FILE * f, const wchar_t * s)
{
  fputs("L\"", f);
  for (const wchar_t *p = s; *p; p++) {
    const wchar_t ch = *p;
    if (ch == L'"' || ch == L'\\') {
      /* Characters that would terminate or corrupt the literal. */
      fprintf(f, "\\%c", ch);
      continue;
    }
    if (ch < L' ' || ch > L'~') {
      /* Outside printable ASCII: show the UTF-16 code unit in hex. */
      fprintf(f, "\\x%04x", ch & 0xffff);
      continue;
    }
    fputc(ch, f);
  }
  fputc('"', f);
}
static void get_winname(wchar_t * name)
{
WIN32_FIND_DATAW e;
HANDLE h = FindFirstFileW(L"*", &e);
if (h == INVALID_HANDLE_VALUE) {
fprintf(stderr, "FindFirstFileW(): Error=%u\n", GetLastError());
exit(1);
}
int i = 0;
do {
if (!wcscmp(e.cFileName, L".") || !wcscmp(e.cFileName, L".."))
continue;
if (++i > 1) {
fprintf(stderr, "Error: more than one Win32 file found\n");
exit(1);
}
wcscpy(name, e.cFileName);
} while (FindNextFileW(h, &e));
FindClose(h);
}
/*
 * Create a file called name in the current directory, record the name it
 * got on the Win32 side, and try to unlink() it again via the same name.
 *
 * If open() fails, the errno value is printed and the function returns.
 * If unlink() fails, the errno value and the Win32 name are printed and the
 * file is removed with DeleteFileW() instead; if that fails as well, the
 * process terminates with exit status 1.
 */
static void testname(const char * name)
{
  const int fd = open(name, O_WRONLY|O_CREAT, 0666);
  if (fd < 0) {
    printf("open() failed, errno=%d\n", errno);
    return;
  }
  close(fd);

  /* Fetch the Win32 name while the file still exists. */
  wchar_t winname[MAX_PATH];
  get_winname(winname);

  if (unlink(name) == 0)
    return;

  printf("unlink() failed, errno=%d, Win path: ", errno);
  print_w(stdout, winname);
  printf("\n");

  /* Clean up through the Win32 API using the name found on disk. */
  if (DeleteFileW(winname))
    return;

  printf("FATAL: DeleteFileW() failed, error=%u\n", GetLastError());
  exit(1);
}
/*
 * Test driver: creates the scratch directory "test.tmp", changes into it
 * and runs testname() ten times on a file name that ends in a truncated
 * UTF-8 sequence.
 *
 * Returns 0 when all iterations ran, 1 if the scratch directory could not
 * be created or entered.  (testname() may also terminate the process.)
 */
int main(void)
{
  const char * dir = "test.tmp";
  /* Best effort: remove an empty directory left over from an earlier run. */
  rmdir(dir);
  /* A directory needs search (x) permission to be entered, hence 0777. */
  if (mkdir(dir, 0777)) {
    perror(dir); return 1;
  }
  if (chdir(dir)) {
    perror(dir); return 1;
  }
  for (int i = 0; i < 10; i++) {
    /* "t-" followed by the first three bytes of a four-byte UTF-8 sequence. */
    const char name[] = "t-\xf2\x90\x90";
    /* prev is sizeof(name) 'X' characters, one marker character, and NUL. */
    char prev[sizeof(name)+2];
    memset(prev, 'X', sizeof(prev)-2); prev[sizeof(prev)-1] = 0;
    prev[sizeof(name)] = 'a' + (i % 26);
    //prev[sizeof(name)] = '.'; // DON'T DO THIS!
    //prev[sizeof(name)] = ' '; // DON'T DO THIS!
    /*
     * The result is deliberately ignored.  NOTE(review): presumably this
     * call only serves to leave prev behind in a path buffer that the
     * following conversion of name reuses -- confirm against the Cygwin
     * sources.
     */
    (void)access(prev, F_OK);
    testname(name);
  }
  return 0;
}
More information about the Cygwin
mailing list