Discussion:
[RFC PATCH] Windows: Assume all file names to be UTF-8 encoded.
Peter Krefting
2009-03-02 08:47:22 UTC
Permalink
When opening a file through open() or fopen(), the path passed is
UTF-8 encoded. To handle this on Windows, we need to convert the
path string to UTF-16 and use the Unicode-based interface.
---
Windows does support file names using arbitrary Unicode characters; you just
need to use its wchar_t interfaces instead of the char ones (the char ones
just get converted into wchar_t at the API level anyway). This is the
beginning of support for UTF-8 file names in Git on Windows.

Since there is no real file system abstraction beyond using stdio (AFAIK), I
need to hack it by replacing fopen (and open). Probably opendir/readdir as
well (might be trickier), and possibly even hack around main() to parse the
wchar_t command-line instead of the char copy.

This will lose any chance of Windows 9x compatibility, but I don't know
whether there are any attempts at supporting it anyway?

Please note that MultiByteToWideChar() will reject any invalid UTF-8
strings; perhaps it should just fall back to a regular open()/fopen() in
that case?

No Signed-Off line since this is unfinished, just presenting rough sketches
of an idea.

compat/mingw.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
compat/mingw.h | 3 ++
2 files changed, 62 insertions(+), 1 deletions(-)

diff --git a/compat/mingw.c b/compat/mingw.c
index e25cb4f..8b19b80 100644
--- a/compat/mingw.c
+++ b/compat/mingw.c
@@ -9,13 +9,30 @@ int mingw_open (const char *filename, int oflags, ...)
{
va_list args;
unsigned mode;
+ wchar_t *unicode_filename;
+ int unicode_filename_len;
va_start(args, oflags);
mode = va_arg(args, int);
va_end(args);

if (!strcmp(filename, "/dev/null"))
filename = "nul";
- int fd = open(filename, oflags, mode);
+
+ unicode_filename_len = MultiByteToWideChar(CP_UTF8, 0, filename, -1, NULL, 0);
+ if (0 == unicode_filename_len) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ unicode_filename = xmalloc(unicode_filename_len * sizeof (wchar_t));
+ if (NULL == unicode_filename) {
+ errno = ENOMEM;
+ return -1;
+ }
+ MultiByteToWideChar(CP_UTF8, 0, filename, -1, unicode_filename, unicode_filename_len);
+ int fd = _wopen(unicode_filename, oflags, mode);
+ free(unicode_filename);
+
if (fd < 0 && (oflags & O_CREAT) && errno == EACCES) {
DWORD attrs = GetFileAttributes(filename);
if (attrs != INVALID_FILE_ATTRIBUTES && (attrs & FILE_ATTRIBUTE_DIRECTORY))
@@ -24,6 +41,47 @@ int mingw_open (const char *filename, int oflags, ...)
return fd;
}

+FILE *mingw_fopen (const char *filename, const char *mode)
+{
+ wchar_t *unicode_filename, *unicode_mode;
+ int unicode_filename_len, unicode_mode_len;
+ FILE *fh;
+
+ unicode_filename_len = MultiByteToWideChar(CP_UTF8, 0, filename, -1, NULL, 0);
+ if (0 == unicode_filename_len) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ unicode_filename = xmalloc(unicode_filename_len * sizeof (wchar_t));
+ if (NULL == unicode_filename) {
+ errno = ENOMEM;
+ return NULL;
+ }
+ MultiByteToWideChar(CP_UTF8, 0, filename, -1, unicode_filename, unicode_filename_len);
+
+ unicode_mode_len = MultiByteToWideChar(CP_UTF8, 0, mode, -1, NULL, 0);
+ if (0 == unicode_mode_len) {
+ free(unicode_filename);
+ errno = EINVAL;
+ return NULL;
+ }
+
+ unicode_mode = xmalloc(unicode_mode_len * sizeof (wchar_t));
+ if (NULL == unicode_mode) {
+ free(unicode_filename);
+ errno = ENOMEM;
+ return NULL;
+ }
+ MultiByteToWideChar(CP_UTF8, 0, mode, -1, unicode_mode, unicode_mode_len);
+
+ fh = _wfopen(unicode_filename, unicode_mode);
+ free(unicode_filename);
+ free(unicode_mode);
+
+ return fh;
+}
+
static inline time_t filetime_to_time_t(const FILETIME *ft)
{
long long winTime = ((long long)ft->dwHighDateTime << 32) + ft->dwLowDateTime;
diff --git a/compat/mingw.h b/compat/mingw.h
index 4f275cb..235df0a 100644
--- a/compat/mingw.h
+++ b/compat/mingw.h
@@ -142,6 +142,9 @@ int sigaction(int sig, struct sigaction *in, struct sigaction *out);
int mingw_open (const char *filename, int oflags, ...);
#define open mingw_open

+FILE *mingw_fopen (const char *filename, const char *mode);
+#define fopen mingw_fopen
+
char *mingw_getcwd(char *pointer, int len);
#define getcwd mingw_getcwd
--
1.6.0.2.1172.ga5ed0
Johannes Sixt
2009-03-02 10:30:01 UTC
Post by Peter Krefting
When opening a file through open() or fopen(), the path passed is
UTF-8 encoded.
I don't think that this assumption is valid. Whenever the Windows API has
to convert between Unicode strings and char* strings, it uses the current
"ANSI code page". As far as I know, the UTF-8 codepage (65001) cannot be
used as the "current ANSI code page". Users will always have some code
page set that is not UTF-8.

For example, if the user specifies a file name on the command line, then
it will not enter git in UTF-8, but in the current "ANSI" or "OEM code
page" encoding. If git prints a file name under the assumption that it is
UTF-8 encoded, then it will be displayed incorrectly because the system
uses a different encoding.
Post by Peter Krefting
Since there is no real file system abstraction beyond using stdio
(AFAIK), I need to hack it by replacing fopen (and open). Probably
opendir/readdir as well (might be trickier), and possibly even hack
around main() to parse the wchar_t command-line instead of the char copy.
I think you are grossly underestimating the venture that you want to
undertake here.

Please come up with a plan how you are going to deal with the various
issues. File names enter and leave the system through different channels:

- the command line and terminal window
- object database (tree objects)
- opendir/readdir; opening files or directories for reading or writing

And there is probably some more... How do you treat encodings in these
channels? What if the file names are not valid UTF-8? Etc.

The biggest obstacle will be that git does not have a notion of "file name
encoding" - it simply treats a file name as a stream of bytes. There is no
place to write an encoding. If the byte streams are regarded as having an
encoding, then you can have ambiguities, mixed encodings, or invalid
characters. You would have to deal with this in some way.
Post by Peter Krefting
This will lose all chances of Windows 9x compatibility, but I don't know
if there are any attempts of supporting it anyway?
Windows 9x is already out of the loop. We use GetFileInformationByHandle()
that is only available since Windows 2000.

-- Hannes
Peter Krefting
2009-03-02 10:46:47 UTC
Post by Johannes Sixt
I don't think that this assumption is valid.
Depends on where you are coming from. For the files stored in the Git
repositories, I believe all file names are supposed to be UTF-8 encoded
(just like commit messages and user names are). That's the assumption I
started working from.
Post by Johannes Sixt
Users will always have some code page set that is not UTF-8.
Indeed. And as long as the char-pointer interfaces in stdio and elsewhere
work on that assumption, we have a problem.
Post by Johannes Sixt
For example, if the user specifies a file name on the command line, then
it will not enter git in UTF-8, but in the current "ANSI" or "OEM code
page" encoding.
That problem is already solved as we do have a wchar_t command line
available. If you pass a file name that is not representable in the current
"ANSI" codepage on the command line, it will come out as garbage in the
char* version, but will be correct in the wchar_t* version. Thus we need to
convert that to utf-8 and use that instead.
Post by Johannes Sixt
If git prints a file name under the assumption that it is UTF-8 encoded,
then it will be displayed incorrectly because the system uses a different
encoding.
Here setting the local codepage to UTF-8 *might* work, although I haven't
tested that. Or always use the wchar_t versions of printf and friends.
Post by Johannes Sixt
I think you are grossly underestimating the venture that you want to
undertake here.
I've done this before with other software, so, yes, I know it is quite a big
undertaking. That is also why I started out with a minimal RFC patch to see
if there was any interest in working with this.
Post by Johannes Sixt
Please come up with a plan how you are going to deal with the various
- the command line and terminal window
GetCommandLineW() as described above.
Post by Johannes Sixt
- object database (tree objects)
Those file names are supposedly always UTF-8.
Post by Johannes Sixt
- opendir/readdir; opening files or directories for reading or writing
Wrap file open and directory read to use the wchar_t versions, converting
that to UTF-8 strings at the API level.
Post by Johannes Sixt
And there is probably some more... How do you treat encodings in these
channels? What if the file names are not valid UTF-8? Etc.
Ill-formed UTF-8 should just be rejected. Invalid UTF-8 is worse. I'm not
sure what the Linux version does, when running in a UTF-8 locale. Does it
allow ill-formed or illegal UTF-8 sequences?

NTFS allows almost any sequence of wchar_t's, it doesn't even have to be
valid UTF-16.
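As for what the Linux side accepts: the kernel never looks at the encoding of
a path, only at '/' and NUL, so an ill-formed UTF-8 name goes straight through
even under a UTF-8 locale. A quick experiment sketch (not part of the patch;
try_create is a made-up helper):

```c
#include <fcntl.h>
#include <unistd.h>

/* Experiment: can a POSIX system create a file whose name is ill-formed
 * UTF-8?  The VFS only forbids '/' and NUL bytes in a name, so it can,
 * regardless of the locale.  Returns 0 on success, -1 on failure. */
static int try_create(const char *name)
{
	int fd = open(name, O_CREAT | O_WRONLY, 0600);
	if (fd < 0)
		return -1;
	close(fd);
	unlink(name);	/* remove the evidence again */
	return 0;
}
```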
Post by Johannes Sixt
The biggest obstacle will be that git does not have a notion of "file name
encoding" - it simply treats a file name as a stream of bytes.
Yeah, that is one of the major bugs in its design, IMHO. But almost everyone
seems to assume that file names are UTF-8 strings anyway, so in the absence
of any other information, it's a good assumption as any to make.
Post by Johannes Sixt
If the byte streams are regarded as having an encoding, then you can have
ambiguities, mixed encodings, or invalid characters. You would have to
deal with this in some way.
Considering we already see problems with file names that cannot properly be
represented on some file systems (case-only differences in the Linux kernel
when checked out on Windows; Mac OS' built-in Unicode normalization of file
names, etc.), this is something that will have to be dealt with in some way
regardless.
Post by Johannes Sixt
Windows 9x is already out of the loop.
Good.
--
\\// Peter - http://www.softwolves.pp.se/
Johannes Schindelin
2009-03-02 10:56:40 UTC
Hi,
Post by Peter Krefting
Post by Johannes Sixt
I don't think that this assumption is valid.
Depends on where you are coming from. For the files stored in the Git
repositories, I believe all file names are supposed to be UTF-8 encoded
(just like commit messages and user names are). That's the assumption I
started working from.
No. As far as Git is concerned, the file names are just as much blobs as
the file contents.

The fact that Windows messes with this notion just as it messes with the
file contents (think the endless story whose name is CR/LF) shows only how
"well" designed the concepts in Windows are.

And as it stands, we have at least two issues on the msysGit issue tracker
that complain that Git does not work with localized file names properly.

So no, file names are not UTF-8 at all, especially not on Windows.

Do not get me wrong, I really welcome you taking care of the issue, but I
do not think that forcing UTF-8 is a solution.

Thanks & sorry,
Dscho
Peter Krefting
2009-03-02 12:03:58 UTC
Post by Johannes Schindelin
No. As far as Git is concerned, the file names are just as much blobs as
the file contents.
I've struggled with the same problems on Linux before, since its file
systems don't have the concept of characters, either. I guess it's just
design principles, but as far as I am concerned, having file names be
constructed from characters makes a lot more sense than having them
constructed from bytes.

Git does the right thing in assuming commit messages and user names to be
UTF-8, though; it would have been nice to have file names covered by the
same constraints.
Post by Johannes Schindelin
The fact that Windows messes with this notion just as it messes with the
file contents (think the endless story whose name is CR/LF) shows only how
"well" designed the concepts in Windows are.
In this case, yes, Windows' way of doing it does make more sense, at least to
me. And as far as text files are concerned, treating text as sequences of
bytes is in most cases not a very smart thing to do, either, but it's hard
not to given how most computers are constructed.
Post by Johannes Schindelin
And as it stands, we have at least two issues on the msysGit issue tracker
that complain that Git does not work with localized file names properly.
So no, file names are not UTF-8 at all, especially not on Windows.
I am not trying to make file names *on Windows* be UTF-8. I am trying to
make file names on Windows be Windows file names, i.e. UTF-16 Unicode. It's
just that since Git internally uses the char* APIs, and from what I have
seen in most other cases assumes that char* text is UTF-8, I am trying to
convert from Windows' view of path names to Git's (UTF-16 to UTF-8) and back.

The other way would be to keep the char* APIs but convert to the Windows
locale encoding ("ANSI codepage"), but that will break horribly as not all
file names that can be used on a file system can be represented as such.
Plus, all calls to a Windows API using a char* path name *are* converted into
UTF-16 anyway, since that is what is used internally in the Windows NT
subsystems.
Post by Johannes Schindelin
Do not get me wrong, I really welcome you taking care of the issue, but I
do not think that forcing UTF-8 is a solution.
Some kind of handling of Git repositories where file names are not UTF-8
would probably need to be added, yes.
--
\\// Peter - http://www.softwolves.pp.se/
Peter Krefting
2009-03-02 13:57:52 UTC
Hi!
Makes sense too. I think the whole API would have to be changed to use
TCHAR*.
I'd rather just say wchar_t explicitly. I'm not particularly fond of macros
that change under your feet just because you fail to define a symbol
somewhere...
Then you need to do the right conversion at the right places, this will be
quite tricky, painful work, but there is probably no way around that.
In the other project I worked on we ended up wrapping all file-related calls
in our own porting interface, and then let each platform we compiled for
implement their own methods for handling Unicode paths. For Windows it's
trivial since all APIs are Unicode. For Unix-like OSes it's tricky as you
have to take the locale settings into account, but fortunately the world is
slowly moving towards UTF-8 locales, which eases the pain a bit.
Note that not only conversions will be needed but you'll also need to
adjust all routines handling filenames to use the proper Unicode version.
(strchr -> _tcschr, open -> _topen, strcpy -> _tcscpy, strlen ->
_tcslen, ...).
Not necessarily. If the code can be set up to use UTF-8 char* internally,
not everything needs to be rewritten (I've done that too, only took a
couple of years to move the codebase over to all-Unicode).
--
\\// Peter - http://www.softwolves.pp.se/
Thomas Rast
2009-03-02 14:29:54 UTC
Post by Peter Krefting
In the other project I worked on we ended up wrapping all file-related calls
in our own porting interface, and then let each platform we compiled for
implement their own methods for handling Unicode paths. For Windows it's
trivial since all APIs are Unicode. For Unix-like OSes it's tricky as you
have to take the locale settings into account, but fortunately the world is
slowly moving towards UTF-8 locales, which eases the pain a bit.
Have you thought about all the consequences this would have for the
*nix people here? [*]

Even if you pretend that Git did always enforce UTF-8 paths in its
trees, so that there's no backward compatibility to be cared for,
you're still in a world of hurt when trying to check out such paths
under a locale (or whatever setting might control this new encoding
logic) that does not support the whole range of UTF-8.

Like, say, the C locale.

Next you get to see to it that the users can spell all filenames even
if their locale doesn't let them, since they'll want to do things like
'git show $rev:$file' with them.

With backwards compatibility it's even worse as you're suddenly
imposing extra restrictions on what a valid filename in the repository
must look like.


[*] I'm _extremely_ tempted to write "people using non-broken OSes",
but let's pretend to be neutral for a second.
--
Thomas Rast
trast@{inf,student}.ethz.ch
Peter Krefting
2009-03-02 20:41:57 UTC
Have you thought about all the consequences this would have for the *nix
people here? [*]
Yeah. It will fix problems trying to check out a Git repository created by
me in an iso8859-1 locale on a machine using a utf-8 locale, where both ends
would like to have a file named "Ü".

Or, hopefully, a careful adoption of this on Windows won't affect Unixes and
other systems with pre-Unicode APIs at all, since the Windows code would be
in the "compat" directory.
you're still in a world of hurt when trying to check out such paths under
a locale (or whatever setting might control this new encoding logic) that
does not support the whole range of UTF-8.
Yeah. That would be a case similar to the casing problem on Windows.
With backwards compatibility it's even worse as you're suddenly imposing
extra restrictions on what a valid filename in the repository must look
like.
Indeed. It is unfortunate that this wasn't properly specified to start with.
It's mostly a minor issue since *most* people will not use non-ASCII file
names. At least for most of the kind of projects that Git has attracted so
far, so the problem is not that big. The problem is if Git is to attract
"the masses". Especially on Windows, where file names using non-ASCII are
common, this needs to be addressed eventually.
[*] I'm _extremely_ tempted to write "people using non-broken OSes", but
let's pretend to be neutral for a second.
In most cases, I would most definitely agree with you on calling it that,
but when it comes to Unicode support, Windows is one of the least broken
OSes (with Symbian being my favourite).

--
\\// Peter - http://www.softwolves.pp.se/
Lars Noschinski
2009-03-03 07:56:55 UTC
Indeed. It is unfortunate that this wasn't properly specified to start with.
It's mostly a minor issue since *most* people will not use non-ASCII file
names. At least for most of the kind of projects that Git has attracted so
far, so the problem is not that big. The problem is if Git is to attract
"the masses". Especially on Windows, where file names using non-ASCII are
common, this needs to be addressed eventually.
Using no encoding for filenames was the obvious (and I would argue)
correct choice. Unix filenames are specified to be a sequence of bytes,
excluding '/' and '\0'. A lot of these sequences are not valid UTF-8.
Further, the encoding needed for filenames depends on the encoding used
in the source code for referencing these files. Again, for the unix file
handling functions, this means no encoding.

Changing the filename (on checkout), so that the user sees an Ü
regardless of his or her locale (instead of an 0xDC, which only
resolves to an Ü on latin-1) would be an absolutely broken concept here.
[*] I'm _extremely_ tempted to write "people using non-broken OSes",
but let's pretend to be neutral for a second.
In most cases, I would most definitely agree with you on calling it that,
but when it comes to Unicode support, Windows is one of the least broken
OSes (with Symbian being my favourite).
IMHO having encoding specific open functions is begging for problems.

- Lars.
Peter Krefting
2009-03-03 11:54:31 UTC
Using no encoding for filenames was the obvious (and I would argue)
correct choice. Unix filenames are specified to be a sequence of bytes,
excluding '/' and '\0'.
I know the Unix way of thinking lends itself to such a design. This is one
of the few cases where I personally think Unix has got it wrong, and Windows
(NT) has got it right. But then again, Unix' design pre-dates the locale
issue by quite some time, so it is not difficult to see where it comes from.
Changing the filename (on checkout), so that the user sees an Ü regardless
of his or her locale (instead of an 0xDC, which only resolves to an Ü on
latin-1) would be an absolutely broken concept here.
Why would it? It is my view as a user on my files that defines how file names
are looked upon. If I have three machines, one Linux box using an iso8859-1
locale, an OS X box (where, I would believe, file APIs use UTF-8, someone
please correct me if I'm wrong), and a Windows box (which uses UTF-16 on the
file system layer, but does provide compatibility functions that use char
pointers), and create a file on each of these called "Ü.txt" (which would be
the sequence "DC 2E 74 78 74" on the Linux box, "C3 9C 2E 74 78 74" (or
probably something else since I believe OS X decomposes the string) on the
OS X box and "00DC 002E 0074 0078 0074" on the Windows box), I see these
three file names as equal.

If I would create a Git repo on each of the three machines and put the file
name in it, and then clone that on one of the other machines, *I* would
assume that the file names were converted to fit the host operating system.
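Since latin-1 maps 1:1 onto U+0000..U+00FF, the re-encoding such a conversion
layer would have to do for the iso8859-1 box is entirely mechanical. A sketch
(latin1_to_utf8 is a hypothetical helper, not anything in Git):

```c
#include <stddef.h>

/* Hypothetical helper: re-encode a latin-1 file name as UTF-8.
 * Latin-1 bytes map 1:1 to code points U+0000..U+00FF, so bytes below
 * 0x80 pass through unchanged and the rest become two-byte sequences.
 * Returns the number of bytes written, not counting the NUL. */
static size_t latin1_to_utf8(const char *in, char *out, size_t outlen)
{
	size_t n = 0;
	const unsigned char *s = (const unsigned char *)in;

	for (; *s; s++) {
		if (*s < 0x80) {
			if (n + 1 < outlen)
				out[n] = (char)*s;
			n++;
		} else {
			if (n + 2 < outlen) {
				out[n]     = (char)(0xc0 | (*s >> 6));   /* 0xDC -> 0xC3 */
				out[n + 1] = (char)(0x80 | (*s & 0x3f)); /*         0x9C */
			}
			n += 2;
		}
	}
	if (n < outlen)
		out[n] = '\0';
	return n;
}
```

So "Ü.txt" as the latin-1 bytes "DC 2E 74 78 74" comes out as the UTF-8 bytes
"C3 9C 2E 74 78 74", matching the sequences listed above.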
IMHO having encoding specific open functions is begging for problems.
Indeed. That's why I like Windows' wchar_t APIs, and dislike Unix' and
Linux' char APIs that, in some ways, depend on the user locale.

--
\\// Peter - http://www.softwolves.pp.se/
Lars Noschinski
2009-03-03 16:29:25 UTC
Changing the filename (on checkout), so that the user sees an Ü regardless of
his or her locale (instead of an 0xDC, which only resolves to an Ü on
latin-1) would be an absolutely broken concept here.
Why would it? It is my view as a user on my files that defines how file names
are looked upon. If I have three machines, one Linux box using an iso8859-1
locale, an OS X box (where, I would believe, file APIs use UTF-8, someone
please correct me if I'm wrong), and a Windows box (which uses UTF-16 on the
file system layer, but does provide compatibility functions that use char
pointers), and create a file on each of these called "Ü.txt" (which would be
the sequence "DC 2E 74 78 74" on the Linux box, "C3 9C 2E 74 78 74" (or
probably something else since I believe OS X decomposes the string) on the OS X
box and "00DC 002E 0074 0078 0074" on the Windows box), I see these three file
names as equal.
Because a function in the source code refers to (e.g.) "DC 2E 74 78 74",
not "C3 9C 2E 74 78 74" nor "00DC 002E 0074 0078 0074". And it does so
regardless of the locale.

The file name may look funny depending on your locale, but if you rename
the file to fit your local encoding, it would not work.
Robin Rosenberg
2009-03-03 20:59:25 UTC
Changing the filename (on checkout), so that the user sees an Ü
regardless of his or her locale (instead of an 0xDC, which only
resolves to an Ü on latin-1) would be an absolutely broken concept here.
Why would it? It is my view as a user on my files that defines how file names
are looked upon. If I have three machines, one Linux box using an iso8859-1
locale, an OS X box (where, I would believe, file APIs use UTF-8, someone
please correct me if I'm wrong), and a Windows box (which uses UTF-16 on the
file system layer, but does provide compatibility functions that use char
pointers), and create a file on each of these called "Ü.txt" (which would be
the sequence "DC 2E 74 78 74" on the Linux box, "C3 9C 2E 74 78 74" (or
probably something else since I believe OS X decomposes the string) on the
OS X box and "00DC 002E 0074 0078 0074" on the Windows box), I see these
three file names as equal.
Because a function in the source code refers to (e.g.) "DC 2E 74 78 74",
not "C3 9C 2E 74 78 74" nor "00DC 002E 0074 0078 0074". And it does so
regardless of the locale.
The only actual language I know where I've seen people use non-ascii names for
referenced files, i.e. classes, is Java, and there you specify the encoding to
the compiler. Class names are not byte sequences there. XML files are another
case where referenced files are defined in unicode. I assume this applies to
C# and other modern languages too.
The file name may look funny depending on your locale, but if you rename
the file to fit your local encoding, it would not work.
In the Java case, you /have/ to "rename" or the build will break. Build systems
like Ant or Maven require you to "rename" too regardless of what you build. A C
Git clone will produce unbuildable code, but JGit will produce a working one
for unicode-aware systems, and documentation, the case where unicode filenames
are more common than in source, will look good.

-- robin

PS. I readded the people you forgot to Cc.
Dmitry Potapov
2009-03-03 09:47:31 UTC
In most cases, I would most definitely agree with you on calling it that,
but when it comes to Unicode support, Windows is one of the least broken
OSes (with Symbian being my favourite).
The C Standard requires that the type wchar_t is capable of representing
any character in the current locale. If Windows uses UTF-16 as internal
encoding (so, it can work with symbols outside of the BMP), it means you
cannot have 16-bit wchar_t and be compliant with the C standard...

Dmitry
Peter Krefting
2009-03-03 11:48:02 UTC
Post by Dmitry Potapov
The C Standard requires that the type wchar_t is capable of representing
any character in the current locale. If Windows uses UTF-16 as internal
encoding (so, it can work with symbols outside of the BMP), it means you
cannot have 16-bit wchar_t and be compliant with the C standard...
No, that's not quite correct. wchar_t is defined to be "an integer type whose
range of values can represent distinct codes for all members of
the largest extended character set specified among the supported locales".
Since Windows defines all local character sets as Unicode-based, having
wchar_t defined as Unicode means that it can represent everything.
--
\\// Peter - http://www.softwolves.pp.se/
Dmitry Potapov
2009-03-03 17:13:58 UTC
Post by Peter Krefting
Post by Dmitry Potapov
The C Standard requires that the type wchar_t is capable of representing
any character in the current locale. If Windows uses UTF-16 as internal
encoding (so, it can work with symbols outside of the BMP), it means you
cannot have 16-bit wchar_t and be compliant with the C standard...
No, that's not quite correct. wchar_t is defined to be "an integer type
whose range of values can represent distinct codes for all members of the
largest extended character set specified among the supported locales". Since
Windows defines all local character sets as Unicode-based, having wchar_t
defined as Unicode means that it can represent everything.
No, it does not, if you have wchar_t that is only 16-bit wide, because
characters outside of the BMP have integer values in Unicode greater
than 65535...

Dmitry
Peter Krefting
2009-03-04 10:51:15 UTC
Post by Dmitry Potapov
No, it does not, if you have wchar_t that is only 16-bit wide, because
characters outside of the BMP have integer values in Unicode greater than
65535...
UTF-16 allows you to reference all of Unicode (i.e. up to U+10FFFF) using
surrogate pairs. That means that not all characters can be represented as a
single wchar_t, that is true. The problem with changing wchar_t is that it
was defined to use 16-bit values at a time where Unicode was defined to use
16-bit code points (but they soon figured out that was not enough).
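The surrogate-pair arithmetic is small enough to show. A sketch for
illustration only (utf16_encode is a made-up name, not Git code):

```c
/* How UTF-16 reaches characters outside the BMP: a code point in
 * U+10000..U+10FFFF is split across two 16-bit units, a high surrogate
 * (0xD800..0xDBFF) carrying the top 10 bits and a low surrogate
 * (0xDC00..0xDFFF) carrying the bottom 10; everything below U+10000
 * fits in a single unit.  Returns the number of units produced. */
static int utf16_encode(unsigned long cp, unsigned short unit[2])
{
	if (cp < 0x10000) {
		unit[0] = (unsigned short)cp;
		return 1;			/* one unit */
	}
	cp -= 0x10000;				/* 20 bits remain */
	unit[0] = (unsigned short)(0xd800 | (cp >> 10));	/* high 10 bits */
	unit[1] = (unsigned short)(0xdc00 | (cp & 0x3ff));	/* low 10 bits */
	return 2;				/* surrogate pair */
}
```

NTFS stores such unit sequences without checking that the pairs actually
match up, which is why unpaired surrogates are legal file names there.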

Anyway, this is getting off-topic. Please feel free to reply in private.
--
\\// Peter - http://www.softwolves.pp.se/
Dmitry Potapov
2009-03-04 14:18:41 UTC
The problem with changing wchar_t is that
it was defined to use 16-bit values at a time where Unicode was defined
to use 16-bit code points (but they soon figured out that was not
enough).
I do realize that is a problem, and unfortunately there is no easy and
quick fix to it. But you brought Windows as an example of good Unicode
support... Well, to my mind, it is not, at least, not for C programs.
You have two serious problems here:
1. wchar_t is too small to hold all Unicode characters as it is required
by C standard.
2. UTF-8 support is broken in C runtime library.

In fact, if UTF-8 were supported by the C runtime, we would not have this
thread in the first place... Now, it is possible to wrap all C functions used
by Git to make them work with UTF-8, but it is a lot of work...

Dmitry
Johannes Sixt
2009-03-02 12:34:19 UTC
Post by Peter Krefting
Post by Johannes Sixt
If git prints a file name under the assumption that it is UTF-8
encoded, then it will be displayed incorrectly because the system uses
a different encoding.
Here setting the local codepage to UTF-8 *might* work, although I
haven't tested that. Or always use the wchar_t versions of printf and
friends.
You cannot expect users to switch the locale. For example, I have to test
our software with Japanese settings: I *cannot* switch to UTF-8 just
because of git.

Can you set the local codepage per program? (I don't know.) It might help
here, but it doesn't help in all cases, particularly in certain pipelines:

git ls-files -o
git ls-files -o | git update-index --add --stdin
find . -name \*.jpg | git update-index --add --stdin

- What encoding should 'ls-files' use for its output? Certainly not always
UTF-8: stdout should use the local code page so that the file names are
interpreted correctly by the terminal window (it expects the local code page).

- What encoding should 'update-index' expect from its input? Can you be
sure that other programs generate UTF-8 output?

How do you solve that?

-- Hannes
Peter Krefting
2009-03-02 13:12:32 UTC
Post by Johannes Sixt
Can you set the local codepage per program? (I don't know.)
The locale is set per thread, and gets reset when the program exits. So
setting the codepage to UTF-8 before outputting should work. That should
also work for displaying the log to the terminal if you have UTF-8 log
messages.

Converting it to wchar_t and using wprintf and similar should be safer,
though (and I have no idea what happens if you try to pipe the output to
something else).
Post by Johannes Sixt
- What encoding should 'ls-files' use for its output? Certainly not always
UTF-8: stdout should use the local code page so that the file names are
interpreted correctly by the terminal window (it expects the local code page).
That is exactly why trying to mix "protocol" data ("plumbing" in Git's case)
and user output will always come back and bite you, one way or another. I
haven't really the faintest how pipes work with Unicode on Windows.
Somewhere along the line there will probably be some conversions, which
would cause interesting issues.

Better not use pipes, then. Heh. I sense that there is a slight problem with
the architecture of Git and trying to get it to behave on Windows... :-)
Post by Johannes Sixt
- What encoding should 'update-index' expect from its input? Can you be
sure that other programs generate UTF-8 output?
Theoretically, if all the internal stuff is hacked around to output Unicode,
and the thread codepage is set up to use UTF-8, it should "just work". And
if run directly from the shell, it should still be converted to whatever the
system is set up to emit. That would mean, however, that a Git program that
internally runs

git-foo | git-bar | git-gazonk

might behave differently compared to if a user would enter it on the
command-line.
--
\\// Peter - http://www.softwolves.pp.se/
Robin Rosenberg
2009-03-02 19:58:33 UTC
Post by Peter Krefting
Post by Johannes Sixt
Can you set the local codepage per program? (I don't know.)
The locale is set per thread, and gets reset when the program exits. So
setting the codepage to UTF-8 before outputting should work. That should
also work for displaying the log to the terminal if you have UTF-8 log
messages.
Messing with locale is probably going to break subtly. An explicit approach
is better, respecting the user's locale when necessary.
Post by Peter Krefting
Converting it to wchar_t and using wprintf and similar should be safer,
though (and I have no idea what happens if you try to pipe the output to
something else).
Post by Johannes Sixt
- What encoding should 'ls-files' use for its output? Certainly not always
UTF-8: stdout should use the local code page so that the file names are
interpreted correctly by the terminal window (it expects the local code page).
That is exactly why trying to mix "protocol" data ("plumbing" in Git's case)
and user output will always come back and bite you, one way or another. I
haven't really the faintest how pipes work with Unicode on Windows.
Somewhere along the line there will probably be some conversions, which
would cause interesting issues.
Pipes are just bytes so you have to know what you're piping by convention
or protocol. You can ask for the console output page, which may be set to
a multibyte locale or unicode and maybe trust that.... (just guessing, really).
Post by Peter Krefting
Better not use pipes, then. Heh. I sense that there is a slight problem with
the architecture of Git and trying to get it to behave on Windows... :-)
architecture? Like the "architecture" of species? No, it's evolution.
If that applies to the linux kernel, it's not so strange it applies to git too.
Post by Peter Krefting
Post by Johannes Sixt
- What encoding should 'update-index' expect from its input? Can you be
sure that other programs generate UTF-8 output?
Theoretically, if all the internal stuff is hacked around to output Unicode,
and the thread codepage is set up to use UTF-8, it should "just work". And
msys doesn't seem to understand UTF-8 at all, so depending on that to work
seems futile. Simply bypassing the locale for any internal work is probably the
most sane thing. That also won't depend on the quality of the locale support in
the runtime. Start by making the git commands work without msys bash,
and figure a way to fix msys later, unless someone has a very good idea on
how to fix msys.
Post by Peter Krefting
if run directly from the shell, it should still be converted to whatever the
system is set up to emit. That would mean, however, that a Git program that
internally runs
git-foo | git-bar | git-gazonk
might behave differently compared to if a user would enter it on the
command-line.
You might also want to check out my work in the area. See

http://www.jgit.org/cgi-bin/gitweb/gitweb.cgi?p=GIT.git;a=shortlog;h=i18n

The goal is locale neutrality yielding the "expected", in the user's eyes, result regardless
of locale as much as possible. Junio didn't want to have it for five years, so I
guess there's still three and a half to go. Hopefully he can change his mind. That branch
is heavily outdated by now, as some of the functionality has been introduced by other
means like logoutputencoding, and other parts of git have been rewritten.

Related to this, JGit assumes UTF-8 on reading. If it's not valid UTF-8 we try the user's
locale (roughly), and on writing object metadata, including any sort of identifier,
we always write UTF-8 when we have to be explicit. We let the runtime decide how
to encode file names in the file system using the user's locale.

I'd be almost happy with a solution that works when people are interacting using
the subset that is convertible between the character sets in use.

-- robin
Peter Krefting
2009-03-02 20:52:41 UTC
Permalink
Post by Robin Rosenberg
Pipes are just bytes so you have to know what you're piping by convention
or protocol. You can ask for the console output page, which may be set to
a multibyte locale or unicode and maybe trust that.... (just guessing, really).
You can get cmd.exe to write data to pipes and redirections as UTF-16
Unicode (cmd.exe /u), perhaps there is a way to capitalise on that?
"Unfortunately", the Git stuff is mostly called from a bash shell inside
msys, so it requires a "bit" more work...
Post by Robin Rosenberg
architecture? Like the "architecture" of species? No, it's evolution.
There's still an architecture there, somewhere. Perhaps not intended or
specified, but there definitely is one :-)
Post by Robin Rosenberg
http://www.jgit.org/cgi-bin/gitweb/gitweb.cgi?p=GIT.git;a=shortlog;h=i18n
The goal is locale neutrality yielding the "expected", in the users eyes,
result regardless of locale as much as possible.
Ah, yes, that looks like an interesting starting point. I already assumed
that Git on Linux would use UTF-8 for everything already, since it already
does that for the commit messages despite me using an iso8859-1 locale.
Apparently I haven't done my homework.
Post by Robin Rosenberg
We let the runtime decide on how to encode file names in the file system
using the user's locale.
That's good. That's what I'm trying to achieve. Or, rather, avoid the user
locale altogether (which is easy on Windows since the file names are always
stored in Unicode, and the user locale can be bypassed).
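The approach under discussion, bypassing the user locale by going through the
Unicode file APIs, can be sketched roughly along the lines of the RFC patch.
The name `utf8_fopen` is hypothetical, the Windows branch is an untested
sketch, and invalid UTF-8 falls back to the plain byte-oriented call, as the
patch notes suggest:

```c
#include <stdio.h>

#ifdef _WIN32
#include <windows.h>

/* Open a UTF-8 encoded path by converting it to UTF-16 and using the
 * wide-character CRT interface. Falls back to plain fopen() when the
 * path or mode is not valid UTF-8, so byte-oriented callers keep
 * working. */
FILE *utf8_fopen(const char *path, const char *mode)
{
	wchar_t wpath[MAX_PATH], wmode[16];

	/* MB_ERR_INVALID_CHARS makes the conversion reject malformed
	 * UTF-8 instead of silently substituting characters. */
	if (!MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS,
				 path, -1, wpath, MAX_PATH) ||
	    !MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS,
				 mode, -1, wmode, 16))
		return fopen(path, mode);	/* fall back on invalid input */

	return _wfopen(wpath, wmode);
}
#else
/* On POSIX systems file names are byte strings, so UTF-8 passes
 * straight through unchanged. */
FILE *utf8_fopen(const char *path, const char *mode)
{
	return fopen(path, mode);
}
#endif
```

On POSIX builds this is just fopen(); the conversion path only compiles on
Windows.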
Post by Robin Rosenberg
I'd be almost happy with a solution that works when people are interacting
using the subset that is convertible between the character sets in use.
You mean like the "invariant" character set? :-) Using Unicode internally
(in whatever encoding) is nice, the problem is when you have to interact
with the world around you.
--
\\// Peter - http://www.softwolves.pp.se/
Robin Rosenberg
2009-03-02 21:21:09 UTC
Permalink
Post by Peter Krefting
Post by Robin Rosenberg
I'd be almost happy with a solution that works when people are interacting
using the subset that is convertible between the character sets in use.
You mean like the "invariant" character set? :-) Using Unicode internally
(in whatever encoding) is nice, the problem is when you have to interact
with the world around you.
Not sure what that is. I mean that in a local nordic setting, people can
use iso-8859-1|15/windows-1252/UTF-8 for their needs by means of converting
the characters as-needed without loss, with very few practical restrictions.

For a larger setting that won't do, but then the need is typically less
since people tend to use ASCII only, or you jump to all Unicode.

Just because I use UTF-8 doesn't mean I start using more characters
in practice.

-- robin
Peter Krefting
2009-03-03 05:51:59 UTC
Permalink
Post by Robin Rosenberg
Not sure what that is.
"Invariant" is defined in an old RFC as the common subset of several
ASCII-like and ASCII-based encodings. This was back before the MIME days,
IIANM.
Post by Robin Rosenberg
I mean that in a local nordic setting, people can use
iso-8859-1|15/windows-1252/UTF-8 for their needs by means of converting
the characters as-needed without loss, with very few practical
restrictions.
Indeed. The trick is to have the storage (in this case, Git and its tree
objects) store the file name data in a commonly agreed-upon way. Then it
is simple to convert at the end-points.
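Since MultiByteToWideChar() with MB_ERR_INVALID_CHARS rejects malformed
input, an end-point needs a way to decide whether a name is valid UTF-8
before converting. A rough, portable statement of what "valid UTF-8" means
here (a hypothetical helper, not part of the patch):

```c
#include <stddef.h>

/* Return 1 if buf is well-formed UTF-8, 0 otherwise. This mirrors the
 * checks MultiByteToWideChar() makes with MB_ERR_INVALID_CHARS:
 * correct continuation bytes, no overlong forms, no UTF-16 surrogate
 * code points, nothing above U+10FFFF. */
int is_valid_utf8(const char *buf)
{
	const unsigned char *s = (const unsigned char *)buf;

	while (*s) {
		int len;
		unsigned cp;

		if (*s < 0x80) { s++; continue; }		/* ASCII */
		else if ((*s & 0xE0) == 0xC0) { len = 2; cp = *s & 0x1F; }
		else if ((*s & 0xF0) == 0xE0) { len = 3; cp = *s & 0x0F; }
		else if ((*s & 0xF8) == 0xF0) { len = 4; cp = *s & 0x07; }
		else return 0;				/* bad lead byte */

		for (int i = 1; i < len; i++) {
			if ((s[i] & 0xC0) != 0x80)
				return 0;		/* bad continuation */
			cp = (cp << 6) | (s[i] & 0x3F);
		}

		/* Reject overlong encodings, surrogates, out-of-range. */
		if ((len == 2 && cp < 0x80) ||
		    (len == 3 && cp < 0x800) ||
		    (len == 4 && cp < 0x10000) ||
		    (cp >= 0xD800 && cp <= 0xDFFF) || cp > 0x10FFFF)
			return 0;
		s += len;
	}
	return 1;
}
```

A name failing this check would take whatever fallback path the wrapper
chooses (plain byte-oriented open, or an error).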
Post by Robin Rosenberg
Just because I use UTF-8 doesn't mean I start using more characters in
practice.
Most people do not, no. But using a Unicode encoding means that they at
least have the option. Sometimes, having to mangle stuff down to ASCII is a
pain.
--
\\// Peter - http://www.softwolves.pp.se/
Dmitry Potapov
2009-03-03 09:43:55 UTC
Permalink
Post by Peter Krefting
When opening a file through open() or fopen(), the path passed is
UTF-8 encoded. To handle this on Windows, we need to convert the
path string to UTF-16 and use the Unicode-based interface.
IMHO, you grossly underestimate what is needed to enable UTF-8 encoding
in Windows. AFAIK, Microsoft C runtime library does not support UTF-8,
so you have to wrap all C functions taking 'char*' as an input parameter.
For example, think about what is going to happen if Git tries to print
a simple error message:
fprintf (stderr, "unable to open %s", path);
Post by Peter Krefting
Since there is no real file system abstraction beyond using stdio
(AFAIK), I need to hack it by replacing fopen (and open). Probably
opendir/readdir as well (might be trickier), and possibly even hack
around main() to parse the wchar_t command-line instead of the char copy.
And the command-line is not the only source of file names. Some Git
commands read a list of files from stdin, usually through a pipe. In
what encoding are they going to be?

Dmitry
Peter Krefting
2009-03-03 11:56:47 UTC
Permalink
IMHO, you grossly underestimate what is needed to enable UTF-8 encoding in
Windows. AFAIK, Microsoft C runtime library does not support UTF-8, so you
have to wrap all C functions taking 'char*' as an input parameter.
I have to wrap all file-related functions, at least.
For example, think about what is going to happen if Git tries to print a
simple error message: fprintf (stderr, "unable to open %s", path);
Yeah. That's a problem. That might be solvable by setting the thread locale
to something UTF-8 based and having the console window convert to the output
codepage (that is what it does when you use wprintf and friends).
And the command-line is not the only source of file names. Some Git
commands read a list of files from stdin, usually through a pipe. In what
encoding are they going to be?
Indeed. Pipes are a problem.
--
\\// Peter - http://www.softwolves.pp.se/
John Dlugosz
2009-03-03 18:25:14 UTC
Permalink
===Re:===
The other way would be to keep the char* APIs but convert to the Windows
locale encoding ("ANSI codepage"), but that will break horribly as not all
file names that can be used on a file system can be represented as such.
===end===

Actually, UTF-8 is a valid code page on Windows. The code page ID is
65001. So, if you set the process code page to that, =and= set the file
system API's code page to follow rather than using the OEM code page
(the default), it should work just fine.

Also, there is a national code page that =will= represent all file names
on the systems and is supported: That is the Chinese GB18030, code page
54936. That has every character that Unicode does, just encoded
differently to be forward compatible with GBK. That is fully supported
by Windows, as it is required by law to sell in Chinese markets.

Let me know if I can be of help. I know character set stuff and Win32
fairly well.

--John



TradeStation Group, Inc. is a publicly-traded holding company (NASDAQ GS: TRAD) of three operating subsidiaries, TradeStation Securities, Inc. (Member NYSE, FINRA, SIPC and NFA), TradeStation Technologies, Inc., a trading software and subscription company, and TradeStation Europe Limited, a United Kingdom, FSA-authorized introducing brokerage firm. None of these companies provides trading or investment advice, recommendations or endorsements of any kind. The information transmitted is intended only for the person or entity to which it is addressed and may contain confidential and/or privileged material. Any review, retransmission, dissemination or other use of, or taking of any action in reliance upon, this information by persons or entities other than the intended recipient is prohibited.
If you received this in error, please contact the sender and delete the material from any computer.
Peter Krefting
2009-03-04 10:53:14 UTC
Permalink
Post by John Dlugosz
Actually, UTF-8 is a valid code page on Windows.
Yes, but I am unsure whether it can be set as a thread locale for the sake
of file APIs.
Post by John Dlugosz
Also, there is a national code page that =will= represent all file names
on the systems and is supported: That is the Chinese GB18030, code page
54936.
Yeah, but unfortunately it is explicitly documented that it is only
supported in MultiByteToWideChar, WideCharToMultiByte and some text painting
APIs in Windows, i.e. the stdio functions and others may break horribly.
--
\\// Peter - http://www.softwolves.pp.se/
John Dlugosz
2009-03-04 19:34:33 UTC
Permalink
===Re:===
Yes, but I am unsure whether it [UTF-8] can be set as a thread locale
for the sake of file APIs.
===end===

Why wouldn't it? If the ANSI forms simply allocate buffers and call
WideCharToMultiByte and MultiByteToWideChar, it should work with
anything those functions handle. My only concern would be with buffer
length when converting to MultiByte, if it assumes a limit based on 2
bytes max per character. But, it works with GB18030, which can have
4-byte characters.

It's certainly easy enough to try.

===Re:===
Yeah, but unfortunately it [GB18030] is explicitly documented that it is only
supported in MultiByteToWideChar, WideCharToMultiByte and some text painting
APIs in Windows, i.e. the stdio functions and others may break horribly.
===end===

Code that works with the other multi-byte "ANSI" character sets, and GBK
in particular, will handle GB18030 "reasonably well" with no changes.
For example, printf ("xxx%sxxx", name), where each 'x' may actually be
any character, will work without problems -- it won't mis-identify the %
in the middle of a 4-byte character. But printf ("%5s",name) will count
some of the characters in 'name' as two, and print less than 5 of them;
or worse yet, break a character in half.

I can't think of anything that breaks horribly. Only situations that
involve counting them will have issues.
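The counting problem can be made concrete with precision rather than width:
`%.Ns` limits output to N *bytes*, which is exactly where multibyte text gets
split. A small portable illustration using a hypothetical helper (UTF-8 here
for convenience; the same byte-vs-character mismatch applies to GB18030):

```c
#include <stdio.h>
#include <string.h>

/* Truncate a string to at most `max` bytes the way "%.Ns" does, and
 * report whether the cut landed inside a multibyte sequence. */
int truncates_mid_character(const char *s, int max, char *out, size_t outsz)
{
	/* Precision for %s counts bytes, not characters. */
	snprintf(out, outsz, "%.*s", max, s);
	size_t n = strlen(out);
	/* If the first byte dropped from the source is a continuation
	 * byte (10xxxxxx), a character was split in half. */
	return n == (size_t)max && ((unsigned char)s[n] & 0xC0) == 0x80;
}
```

For example, "日本語" is three characters but nine bytes in UTF-8, so
truncating it to five bytes copies the first character whole plus two bytes
of the second, leaving a broken sequence, while truncating "abcdefgh" to
five bytes is harmless.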

As empirical evidence, lots of Windows software works fine in China.
You need full GB18030 support to read a newspaper on the web, because
the 4-byte characters are mostly obscure and regional words, but also
proper nouns including the names of some prominent people (Prime
Minister or something like that; I don't remember exactly). But mostly
you don't encounter them and chug along with GBK and the occasional '?'
where some character did not work.

--John

John Dlugosz
2009-03-03 19:36:45 UTC
Permalink
===Re:===
You cannot expect users to switch the locale. For example, I have to test
our software with Japanese settings: I *cannot* switch to UTF-8 just
because of git.

Can you set the local codepage per program? (I don't know.) It might help
here, but it doesn't help in all cases, particularly in certain pipelines:
===end===

Yes, you can. The code page can be set per thread. The function call is:

SetThreadLocale (lcid);

where lcid is just 65001 for UTF-8. (The other fields in the LCID are
high-order bits and all zero for no sublanguage and default sort order).

When a thread is created, it starts with the system default thread
locale. So call SetThreadLocale on every thread you create. In
particular, realize that the new thread does not inherit this from the
creating thread.

Meanwhile... the file I/O functions don't use the same code page. The
encoding of file names on a floppy disk or whatnot was historically done
using the "OEM code page", and when a different code page is used for
text editing, that shouldn't break compatibility. So, all functions
exported from Kernel32.dll that accept or return file names use a
separate setting, and setting the locale as shown above will not affect
it. This might be the source of confusion to those experimenting with
it.

So, also make a call to

SetFileApisToANSI();

This affects the entire process, not just the thread.
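Put together, the startup sequence described here might look like the sketch
below. It is untested; the 65001 value and the per-thread caveat are as
stated above, and non-Windows builds need no equivalent:

```c
#ifdef _WIN32
#include <windows.h>

/* Hypothetical process/thread setup for UTF-8, following the steps
 * described in the message above. Must be repeated on every thread,
 * since a new thread starts with the system default locale and does
 * not inherit this from its creator. */
void init_utf8_locale(void)
{
	/* 65001 is the UTF-8 code page; the other LCID fields are zero
	 * for no sublanguage and default sort order. */
	SetThreadLocale(65001);

	/* File-name functions in Kernel32 use the OEM code page by
	 * default; switch them to the "ANSI" code page instead. This
	 * call affects the whole process, not just the thread. */
	SetFileApisToANSI();
}
#else
/* No-op outside Windows: POSIX file names are byte strings, so UTF-8
 * passes straight through. */
void init_utf8_locale(void)
{
}
#endif
```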

So much for specifying UTF-8 file names in Windows. A related issue is
the console input and output of same. I don't know if the sh program
that is part of msys or Cygwin does anything to the console window it is
using, but each console window can have its own code page as well. The
default for 8-bit API (char*'s) is also the OEM character set, not the
so-called ANSI character set that is specified with SetThreadLocale.
I've not experimented with setting this (and restoring it) within a
program invoked in that console. But if you use the 16-bit API for
console I/O, it is not a problem and works regardless of how the user
chose to set it. To make it even more confusing, the console doesn't
respect the UTF-8 setting if the font is not set properly too.

--John


John Dlugosz
2009-03-03 20:39:07 UTC
Permalink
Re: AFAIK, Microsoft C runtime library does not support UTF-8,

Actually, here is a clip from the runtime library source code:

tmode = _textmode(fh);

switch (tmode) {
case __IOINFO_TM_UTF8 :
        /* For a UTF-8 file, we need 2 buffers, because after reading we
           need to convert it into UNICODE - MultiByteToWideChar doesn't
           do in-place conversions. */

        /* MultiByte To WideChar conversion may double the size of the
           buffer required & hence we divide cnt by 2 */

        /*
         * Since we are reading UTF8 stream, cnt bytes read may vary
         * from cnt wchar_t characters to cnt/4 wchar_t characters. For
         * this reason if we need to read cnt characters, we will
         * allocate MBCS buffer of cnt. In case cnt is 0, we will
         * have 4 as minimum value. This will make sure we don't
         * overflow for reading from pipe case.
         *
         * In this case the numbers of wchar_t characters that we can
         * read is cnt/2. This means that the buffer size that we will
         * require is cnt/2.
         */

        /* For UTF8 we want the count to be an even number */
This is in the _read(fd, buffer, count) function, and shows that it will
in fact read UTF-8 and automatically transform it to UTF-16LE
transparently. The documentation for _open explains this feature.

Meanwhile, a quick look at _mbslen() etc. shows that they are
implemented, and will handle UTF-8 encoded text as variable-length char*
just fine as long as suitable tables are loaded in its locale. An
internal header shows macros for generating the lead-byte information as
needed by that table.

Now, the default when a program starts is to use the "C" locale. The
locale argument to setlocale can take a form ".code_page", so calling

setlocale (LC_CTYPE, ".65001");

should do the trick. Assuming, that is, that you don't hit macros that
assume that characters are never multibyte. So define the preprocessor
symbol _MBCS when you compile.
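A cautious way to attempt this is to check setlocale()'s return value, since
runtimes that reject the ".code_page" form (or the UTF-8 code page
specifically, as discussed later in the thread) return NULL. A sketch with a
hypothetical helper name:

```c
#include <locale.h>

/* Try to select a UTF-8 C-runtime locale. The ".65001" form is the
 * Windows codepage syntax described above; runtimes that reject it
 * (older MSVCRTs, and POSIX libcs which use a different naming scheme)
 * make setlocale() return NULL, so fall back rather than continue with
 * a half-configured locale. */
const char *select_utf8_locale(void)
{
	const char *loc = setlocale(LC_CTYPE, ".65001");
	if (!loc)
		loc = setlocale(LC_CTYPE, "C");	/* conservative fallback */
	return loc;
}
```

The returned string names whichever locale actually took effect, so callers
can tell whether UTF-8 mode was obtained.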

Older versions might not work right because MBCS (multibyte character
strings) was only actually implemented to DBCS (double-byte). That is,
a single lead byte would be followed by a second byte, and no other
cases are provided for. But, GB18030 has up to 4 bytes in a single
character. It might still not be completely "clean" though because
GB18030 has a "double double" nature to it. Just like assuming 16-bit
characters period mostly works with surrogate pairs even if you didn't
code full UTF-16 support, DBCS code will see a 4-byte GB18030 character
as two double byte characters. So it gets the len (in characters)
wrong, and might still break up what is supposed to be a single
character. So it really needs some improvement from the historical
DBCS-only code to work properly.

Anyway, if UTF-8 really doesn't work with MBCS functions acceptably
well, and the goal is to allow passage of all characters through the
program, then set the program to use Chinese. GB18030 is =fully=
supported and is just another (albeit strange) encoding for Unicode.

As for what
fprintf (stderr, "unable to open %s", path);
will do, it will have no problem copying the contents of path to the
output stream no matter how it is encoded. The result will be sent to
stderr, which may be autotranslating the local code page to UTF-16 or
UTF-8, but by default just feeds the stream of bytes to the console
window's 8-bit API, which has its own code page setting.

Personally, I have printf'ed UTF-8 encoded text to standard output. It
looks OK if the console is also set to UTF-8.

--John
(please excuse the footer; it's not my idea)



Dmitry Potapov
2009-03-03 21:02:40 UTC
Permalink
Post by John Dlugosz
Now, the default when a program starts is to use the "C" locale. The
locale argument to setlocale can take a form ".code_page", so calling
setlocale (LC_CTYPE, ".65001");
should do the trick. Assuming, that is, that you don't hit macros that
assume that characters are never multibyte. So define the preprocessor
symbol _MBCS when you compile.
If Microsoft fixed the problem with UTF-8 support in the C runtime, it is
really good news, because setlocale did not work until not so long ago:

http://blogs.msdn.com/michkap/archive/2006/03/13/550191.aspx

As to the Win32 API, it has always worked correctly with UTF-8... In fact, the
documentation of the GetOEMCP function goes as far as recommending
to use UTF-8 or UTF-16: "For the most consistent results, applications should
use Unicode, such as UTF-8 or UTF-16, instead of a specific code page."

So it would be great if Git supported UTF-8 on Windows (as an option), but it
is not my itch right now....

Dmitry
John Dlugosz
2009-03-03 21:56:44 UTC
Permalink
===Re:===
If Microsoft fixed the problem with UTF-8 support in the C runtime, it is
really good news, because setlocale did not work until not so long ago:
===end===

They totally replaced it with one written by P.J.Plauger. I'm not sure
when, but I would guess around VC++7.1, which was a "sea change" and
felt more like a different brand than a simple update. That's when
templates started following the standard.

Re:
http://blogs.msdn.com/michkap/archive/2006/03/13/550191.aspx

Interesting. So it sort-of worked, as per my overlong muse as I looked
at the source code, but they started explicitly preventing it because it
doesn't always work for everything.

// verify codepage validity
if (!iCodePage || iCodePage == CP_UTF7 || iCodePage == CP_UTF8 ||
!IsValidCodePage((WORD)iCodePage))
return FALSE;


===Re:===
As to the Win32 API, it has always worked correctly with UTF-8... In fact, the
documentation of the GetOEMCP function goes as far as recommending to use
UTF-8 or UTF-16: "For the most consistent results, applications should
use Unicode, such as UTF-8 or UTF-16, instead of a specific code page."
===end===

I remember a time when it did not. I don't recall if it was NT (as
opposed to consumer windows) or some version of NT beyond 3.5 (starting
in 4?) that it became available. But I had to supply code with the
program because it could not count on it.

===Re:===
So it would be great if Git supported UTF-8 on Windows (as an option),
but it is not my itch right now....
===end===

Someone else mentioned "most people use ASCII file names", and I would
take that to be true only if "most people" == "developers". If you look
at my wife's "explorer" view, it's all Chinese. Files are downloaded
with Asian file names. Most people =in= China are used to seamless
support within Windows. It's only with Chinese MUI on English Windows
that the "ANSI" stuff doesn't match and programs that use 8-bit API
calls suddenly croak as they see "?????" for input.

--John


Robin Rosenberg
2009-03-07 10:38:14 UTC
Permalink
Slightly related: a new Cygwin (not msysgit-related) version with UTF-8 support was announced. Most notably:

- New setlocale implementation allows to specify POSIX locale strings.
You can now use, for instance in bash, `export LC_ALL=en_US.UTF-8'.
The language and territory will be ignored for now, the charset
will be used by multibyte-related functions.

- UTF-8 filenames are supported now.

- Support UTF-8 in console window.

This certainly makes it more feasible to interoperate with *nix repos that have non-ASCII metadata and file names.

-- robin
