Blame view
fs/udf/unicode.c
10.8 KB
1da177e4c
|
1 2 3 4 5 6 7 8 9 10 11 12 13 |
/* * unicode.c * * PURPOSE * Routines for converting between UTF-8 and OSTA Compressed Unicode. * Also handles filename mangling. * * DESCRIPTION * OSTA Compressed Unicode is explained in the OSTA UDF specification. * http://www.osta.org/ * UTF-8 is explained in the IETF RFC 3629. * https://www.rfc-editor.org/rfc/rfc3629.txt * |
1da177e4c
|
14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
* COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: * ftp://prep.ai.mit.edu/pub/gnu/GPL * Each contributing author retains all rights to their own work. */ #include "udfdecl.h" #include <linux/kernel.h> #include <linux/string.h> /* for memset */ #include <linux/nls.h> #include <linux/udf_fs.h> #include "udf_sb.h" static int udf_translate_to_linux(uint8_t *, uint8_t *, int, uint8_t *, int); |
cb00ea352
|
31 |
static int udf_char_to_ustr(struct ustr *dest, const uint8_t * src, int strlen) |
1da177e4c
|
32 |
{ |
cb00ea352
|
33 |
if ((!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN - 2)) |
1da177e4c
|
34 35 36 37 38 39 40 41 42 43 44 |
return 0; memset(dest, 0, sizeof(struct ustr)); memcpy(dest->u_name, src, strlen); dest->u_cmpID = 0x08; dest->u_len = strlen; return strlen; } /* * udf_build_ustr */ |
cb00ea352
|
45 |
/*
 * Unpack a fixed-size dstring field into a struct ustr.
 *
 *	dest	ustr to fill; it is zeroed first
 *	ptr	dstring field: ptr[0] is taken as the compression ID and
 *		ptr[size - 1] as the recorded length
 *	size	size of the dstring field in bytes
 *
 * Both the copied data and the recorded length are clamped to the capacity
 * of dest->u_name, so an oversized field can neither overrun the name
 * buffer nor leave u_len pointing past it.
 * NOTE(review): relies on u_name being an array member of struct ustr
 * (declared outside this file) so that sizeof yields its capacity.
 *
 * Returns 0 on success, -1 when dest or ptr is NULL or size is not positive.
 */
int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
{
	int copylen;

	if (!dest || !ptr || size <= 0)
		return -1;

	memset(dest, 0, sizeof(*dest));
	dest->u_cmpID = ptr[0];

	/* The recorded length must never claim more than u_name can hold. */
	dest->u_len = ptr[size - 1];
	if (dest->u_len > sizeof(dest->u_name))
		dest->u_len = sizeof(dest->u_name);

	/* Copy everything after the compression ID, up to u_name's capacity. */
	copylen = size - 1;
	if (copylen > (int)sizeof(dest->u_name))
		copylen = sizeof(dest->u_name);
	memcpy(dest->u_name, ptr + 1, copylen);

	return 0;
}

/*
 * udf_build_ustr_exact
 */
cb00ea352
|
62 |
static int udf_build_ustr_exact(struct ustr *dest, dstring * ptr, int exactsize) |
1da177e4c
|
63 |
{ |
cb00ea352
|
64 |
if ((!dest) || (!ptr) || (!exactsize)) |
1da177e4c
|
65 66 67 |
return -1; memset(dest, 0, sizeof(struct ustr)); |
cb00ea352
|
68 69 70 |
dest->u_cmpID = ptr[0]; dest->u_len = exactsize - 1; memcpy(dest->u_name, ptr + 1, exactsize - 1); |
1da177e4c
|
71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
return 0; } /* * udf_ocu_to_utf8 * * PURPOSE * Convert OSTA Compressed Unicode to the UTF-8 equivalent. * * DESCRIPTION * This routine is only called by udf_filldir(). * * PRE-CONDITIONS * utf Pointer to UTF-8 output buffer. * ocu Pointer to OSTA Compressed Unicode input buffer * of size UDF_NAME_LEN bytes. * both of type "struct ustr *" * * POST-CONDITIONS * <return> Zero on success. * * HISTORY * November 12, 1997 - Andrew E. Mileski * Written, tested, and released. */ int udf_CS0toUTF8(struct ustr *utf_o, struct ustr *ocu_i) { uint8_t *ocu; uint32_t c; uint8_t cmp_id, ocu_len; int i; ocu = ocu_i->u_name; ocu_len = ocu_i->u_len; cmp_id = ocu_i->u_cmpID; utf_o->u_len = 0; |
cb00ea352
|
108 |
if (ocu_len == 0) { |
1da177e4c
|
109 110 111 112 113 |
memset(utf_o, 0, sizeof(struct ustr)); utf_o->u_cmpID = 0; utf_o->u_len = 0; return 0; } |
cb00ea352
|
114 115 116 117 |
if ((cmp_id != 8) && (cmp_id != 16)) { printk(KERN_ERR "udf: unknown compression code (%d) stri=%s ", cmp_id, ocu_i->u_name); |
1da177e4c
|
118 119 |
return 0; } |
cb00ea352
|
120 |
for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) { |
1da177e4c
|
121 122 123 124 125 126 127 128 |
/* Expand OSTA compressed Unicode to Unicode */ c = ocu[i++]; if (cmp_id == 16) c = (c << 8) | ocu[i++]; /* Compress Unicode to UTF-8 */ if (c < 0x80U) |
cb00ea352
|
129 130 131 132 133 134 135 136 137 138 139 140 141 |
utf_o->u_name[utf_o->u_len++] = (uint8_t) c; else if (c < 0x800U) { utf_o->u_name[utf_o->u_len++] = (uint8_t) (0xc0 | (c >> 6)); utf_o->u_name[utf_o->u_len++] = (uint8_t) (0x80 | (c & 0x3f)); } else { utf_o->u_name[utf_o->u_len++] = (uint8_t) (0xe0 | (c >> 12)); utf_o->u_name[utf_o->u_len++] = (uint8_t) (0x80 | ((c >> 6) & 0x3f)); utf_o->u_name[utf_o->u_len++] = (uint8_t) (0x80 | (c & 0x3f)); |
1da177e4c
|
142 143 |
} } |
cb00ea352
|
144 |
utf_o->u_cmpID = 8; |
1da177e4c
|
145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
return utf_o->u_len; } /* * * udf_utf8_to_ocu * * PURPOSE * Convert UTF-8 to the OSTA Compressed Unicode equivalent. * * DESCRIPTION * This routine is only called by udf_lookup(). * * PRE-CONDITIONS * ocu Pointer to OSTA Compressed Unicode output * buffer of size UDF_NAME_LEN bytes. * utf Pointer to UTF-8 input buffer. * utf_len Length of UTF-8 input buffer in bytes. * * POST-CONDITIONS * <return> Zero on success. * * HISTORY * November 12, 1997 - Andrew E. Mileski * Written, tested, and released. */ |
cb00ea352
|
172 |
/*
 * Encode the UTF-8 string held in utf as OSTA Compressed Unicode in ocu.
 *
 *	ocu	output buffer of length bytes; ocu[0] receives the compression
 *		ID (8, or 16 once a character above 0xff is seen) and
 *		ocu[length - 1] receives the encoded length
 *	utf	UTF-8 input (u_name / u_len)
 *	length	size of ocu in bytes
 *
 * Encoding starts in 8-bit mode and restarts in 16-bit mode the first time
 * a character does not fit in one byte.  At the first malformed or
 * unrepresentable (> 0xffff) sequence the rest of the input is dropped and
 * a single '?' character is appended.
 *
 * Returns the number of bytes used in ocu including the compression ID
 * byte, or 0 when the encoded name does not fit in the buffer.
 */
static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length)
{
	unsigned c, i, max_val, utf_char;
	int utf_cnt, u_len, width, last;
	int malformed;

	if (length < 2)
		return 0;

	/* Data may occupy ocu[1] .. ocu[length - 2]. */
	last = length - 2;

	memset(ocu, 0, sizeof(dstring) * length);
	ocu[0] = 8;
	max_val = 0xffU;

try_again:
	u_len = 0;
	utf_char = 0U;
	utf_cnt = 0;
	malformed = 0;
	width = (max_val == 0xffffU) ? 2 : 1;

	for (i = 0U; i < utf->u_len; i++) {
		c = (uint8_t)utf->u_name[i];

		if (utf_cnt) {
			/* Complete a multi-byte UTF-8 character */
			utf_char = (utf_char << 6) | (c & 0x3fU);
			if (--utf_cnt)
				continue;
		} else if (c & 0x80U) {
			/* Start a multi-byte UTF-8 character */
			if ((c & 0xe0U) == 0xc0U) {
				utf_char = c & 0x1fU;
				utf_cnt = 1;
			} else if ((c & 0xf0U) == 0xe0U) {
				utf_char = c & 0x0fU;
				utf_cnt = 2;
			} else if ((c & 0xf8U) == 0xf0U) {
				utf_char = c & 0x07U;
				utf_cnt = 3;
			} else if ((c & 0xfcU) == 0xf8U) {
				utf_char = c & 0x03U;
				utf_cnt = 4;
			} else if ((c & 0xfeU) == 0xfcU) {
				utf_char = c & 0x01U;
				utf_cnt = 5;
			} else {
				malformed = 1;
				break;
			}
			continue;
		} else {
			/* Single byte UTF-8 character (most common) */
			utf_char = c;
		}

		/* Choose no compression if necessary */
		if (utf_char > max_val) {
			if (max_val == 0xffU) {
				max_val = 0xffffU;
				ocu[0] = (uint8_t)0x10U;
				goto try_again;
			}
			malformed = 1;
			break;
		}

		/* Refuse to write past the space reserved for name data. */
		if (u_len + width > last)
			return 0;
		if (width == 2)
			ocu[++u_len] = (uint8_t)(utf_char >> 8);
		ocu[++u_len] = (uint8_t)(utf_char & 0xffU);
	}

	/* A sequence still open at end of input is malformed as well. */
	if (utf_cnt || malformed) {
		if (u_len + width > last)
			return 0;
		/* keep 16-bit strings an even number of bytes long */
		if (width == 2)
			ocu[++u_len] = 0;
		ocu[++u_len] = '?';
		printk(KERN_DEBUG "udf: bad UTF-8 character\n");
	}

	ocu[length - 1] = (uint8_t)u_len + 1;

	return u_len + 1;
}
cb00ea352
|
242 243 |
static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, struct ustr *ocu_i) |
1da177e4c
|
244 245 246 247 248 249 250 251 252 253 254 |
{ uint8_t *ocu; uint32_t c; uint8_t cmp_id, ocu_len; int i; ocu = ocu_i->u_name; ocu_len = ocu_i->u_len; cmp_id = ocu_i->u_cmpID; utf_o->u_len = 0; |
cb00ea352
|
255 |
if (ocu_len == 0) { |
1da177e4c
|
256 257 258 259 260 |
memset(utf_o, 0, sizeof(struct ustr)); utf_o->u_cmpID = 0; utf_o->u_len = 0; return 0; } |
cb00ea352
|
261 262 263 264 |
if ((cmp_id != 8) && (cmp_id != 16)) { printk(KERN_ERR "udf: unknown compression code (%d) stri=%s ", cmp_id, ocu_i->u_name); |
1da177e4c
|
265 266 |
return 0; } |
cb00ea352
|
267 |
for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) { |
1da177e4c
|
268 269 270 271 |
/* Expand OSTA compressed Unicode to Unicode */ c = ocu[i++]; if (cmp_id == 16) c = (c << 8) | ocu[i++]; |
cb00ea352
|
272 273 |
utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len], UDF_NAME_LEN - utf_o->u_len); |
1da177e4c
|
274 |
} |
cb00ea352
|
275 |
utf_o->u_cmpID = 8; |
1da177e4c
|
276 277 278 |
return utf_o->u_len; } |
cb00ea352
|
279 280 |
/*
 * Encode a string in an NLS character set as OSTA Compressed Unicode.
 *
 *	nls	NLS table whose char2uni() performs the per-character
 *		conversion
 *	ocu	output buffer of length bytes; ocu[0] receives the compression
 *		ID (8, or 16 once a character above 0xff is seen) and
 *		ocu[length - 1] receives the encoded length
 *	uni	input string (u_name / u_len)
 *	length	size of ocu in bytes
 *
 * An input byte that char2uni() rejects (negative return) is encoded as
 * '?' and skipped; a zero return skips the byte without output.
 *
 * Returns the number of bytes used in ocu including the compression ID
 * byte, or 0 when the encoded name does not fit in the buffer.
 */
static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
			int length)
{
	int len, u_len, width, last;
	unsigned i, max_val;
	uint16_t uni_char;

	if (length < 2)
		return 0;

	/* Data may occupy ocu[1] .. ocu[length - 2]. */
	last = length - 2;

	memset(ocu, 0, sizeof(dstring) * length);
	ocu[0] = 8;
	max_val = 0xffU;

try_again:
	u_len = 0;
	width = (max_val == 0xffffU) ? 2 : 1;

	for (i = 0U; i < uni->u_len; i++) {
		/* len must be signed: char2uni reports errors as negatives */
		len = nls->char2uni(&uni->u_name[i], uni->u_len - i,
				    &uni_char);
		if (!len)
			continue;
		if (len < 0) {
			/* Invalid character: substitute and consume one byte */
			len = 1;
			uni_char = '?';
		}

		/* Choose no compression if necessary */
		if (uni_char > max_val) {
			max_val = 0xffffU;
			ocu[0] = (uint8_t)0x10U;
			goto try_again;
		}

		/* Refuse to write past the space reserved for name data. */
		if (u_len + width > last)
			return 0;
		if (width == 2)
			ocu[++u_len] = (uint8_t)(uni_char >> 8);
		ocu[++u_len] = (uint8_t)(uni_char & 0xffU);

		/* Skip the remaining bytes of a multi-byte input character. */
		i += len - 1;
	}

	ocu[length - 1] = (uint8_t)u_len + 1;

	return u_len + 1;
}
cb00ea352
|
309 310 |
int udf_get_filename(struct super_block *sb, uint8_t * sname, uint8_t * dname, int flen) |
1da177e4c
|
311 312 313 |
{ struct ustr filename, unifilename; int len; |
cb00ea352
|
314 |
if (udf_build_ustr_exact(&unifilename, sname, flen)) { |
1da177e4c
|
315 316 |
return 0; } |
cb00ea352
|
317 318 319 320 321 |
if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { if (!udf_CS0toUTF8(&filename, &unifilename)) { udf_debug("Failed in udf_get_filename: sname = %s ", sname); |
1da177e4c
|
322 323 |
return 0; } |
cb00ea352
|
324 325 326 327 328 329 |
} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { if (!udf_CS0toNLS (UDF_SB(sb)->s_nls_map, &filename, &unifilename)) { udf_debug("Failed in udf_get_filename: sname = %s ", sname); |
1da177e4c
|
330 331 |
return 0; } |
cb00ea352
|
332 |
} else |
1da177e4c
|
333 |
return 0; |
cb00ea352
|
334 335 336 |
if ((len = udf_translate_to_linux(dname, filename.u_name, filename.u_len, unifilename.u_name, unifilename.u_len))) { |
1da177e4c
|
337 338 339 340 |
return len; } return 0; } |
cb00ea352
|
341 342 |
int udf_put_filename(struct super_block *sb, const uint8_t * sname, uint8_t * dname, int flen) |
1da177e4c
|
343 344 345 |
{ struct ustr unifilename; int namelen; |
cb00ea352
|
346 |
if (!(udf_char_to_ustr(&unifilename, sname, flen))) { |
1da177e4c
|
347 348 |
return 0; } |
cb00ea352
|
349 350 351 352 |
if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { if (! (namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN))) { |
1da177e4c
|
353 354 |
return 0; } |
cb00ea352
|
355 356 357 358 359 |
} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { if (! (namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname, &unifilename, UDF_NAME_LEN))) { |
1da177e4c
|
360 361 |
return 0; } |
cb00ea352
|
362 |
} else |
1da177e4c
|
363 364 365 366 367 368 369 370 371 |
return 0; return namelen; } #define ILLEGAL_CHAR_MARK '_' #define EXT_MARK '.' #define CRC_MARK '#' #define EXT_SIZE 5 |
cb00ea352
|
372 373 |
static int udf_translate_to_linux(uint8_t * newName, uint8_t * udfName, int udfLen, uint8_t * fidName, int fidNameLen) |
1da177e4c
|
374 |
{ |
cb00ea352
|
375 |
int index, newIndex = 0, needsCRC = 0; |
1da177e4c
|
376 377 378 379 380 381 |
int extIndex = 0, newExtIndex = 0, hasExt = 0; unsigned short valueCRC; uint8_t curr; const uint8_t hexChar[] = "0123456789ABCDEF"; if (udfName[0] == '.' && (udfLen == 1 || |
cb00ea352
|
382 |
(udfLen == 2 && udfName[1] == '.'))) { |
1da177e4c
|
383 384 385 |
needsCRC = 1; newIndex = udfLen; memcpy(newName, udfName, udfLen); |
cb00ea352
|
386 387 |
} else { for (index = 0; index < udfLen; index++) { |
1da177e4c
|
388 |
curr = udfName[index]; |
cb00ea352
|
389 |
if (curr == '/' || curr == 0) { |
1da177e4c
|
390 391 |
needsCRC = 1; curr = ILLEGAL_CHAR_MARK; |
cb00ea352
|
392 393 394 |
while (index + 1 < udfLen && (udfName[index + 1] == '/' || udfName[index + 1] == 0)) |
1da177e4c
|
395 396 |
index++; } |
cb00ea352
|
397 398 |
if (curr == EXT_MARK && (udfLen - index - 1) <= EXT_SIZE) { |
1da177e4c
|
399 400 |
if (udfLen == index + 1) hasExt = 0; |
cb00ea352
|
401 |
else { |
1da177e4c
|
402 403 404 405 406 407 408 409 410 411 412 |
hasExt = 1; extIndex = index; newExtIndex = newIndex; } } if (newIndex < 256) newName[newIndex++] = curr; else needsCRC = 1; } } |
cb00ea352
|
413 |
if (needsCRC) { |
1da177e4c
|
414 415 |
uint8_t ext[EXT_SIZE]; int localExtIndex = 0; |
cb00ea352
|
416 |
if (hasExt) { |
1da177e4c
|
417 |
int maxFilenameLen; |
cb00ea352
|
418 419 420 |
for (index = 0; index < EXT_SIZE && extIndex + index + 1 < udfLen; index++) { |
1da177e4c
|
421 |
curr = udfName[extIndex + index + 1]; |
cb00ea352
|
422 |
if (curr == '/' || curr == 0) { |
1da177e4c
|
423 424 |
needsCRC = 1; curr = ILLEGAL_CHAR_MARK; |
cb00ea352
|
425 426 427 428 429 430 431 432 433 |
while (extIndex + index + 2 < udfLen && (index + 1 < EXT_SIZE && (udfName [extIndex + index + 2] == '/' || udfName[extIndex + index + 2] == 0))) |
1da177e4c
|
434 435 436 437 438 439 440 441 442 |
index++; } ext[localExtIndex++] = curr; } maxFilenameLen = 250 - localExtIndex; if (newIndex > maxFilenameLen) newIndex = maxFilenameLen; else newIndex = newExtIndex; |
cb00ea352
|
443 |
} else if (newIndex > 250) |
1da177e4c
|
444 445 446 447 448 449 450 |
newIndex = 250; newName[newIndex++] = CRC_MARK; valueCRC = udf_crc(fidName, fidNameLen, 0); newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12]; newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8]; newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4]; newName[newIndex++] = hexChar[(valueCRC & 0x000f)]; |
cb00ea352
|
451 |
if (hasExt) { |
1da177e4c
|
452 |
newName[newIndex++] = EXT_MARK; |
cb00ea352
|
453 |
for (index = 0; index < localExtIndex; index++) |
1da177e4c
|
454 455 456 457 458 |
newName[newIndex++] = ext[index]; } } return newIndex; } |