fs/udf/unicode.c

   1 /*
   2  * unicode.c
   3  *
   4  * PURPOSE
   5  *      Routines for converting between UTF-8 and OSTA Compressed Unicode.
   6  *      Also handles filename mangling
   7  *
   8  * DESCRIPTION
   9  *      OSTA Compressed Unicode is explained in the OSTA UDF specification.
  10  *              http://www.osta.org/
   11  *      UTF-8 is explained in IETF RFC 3629.
   12  *              https://www.rfc-editor.org/rfc/rfc3629.txt
  13  *
  14  * COPYRIGHT
  15  *      This file is distributed under the terms of the GNU General Public
  16  *      License (GPL). Copies of the GPL can be obtained from:
  17  *              ftp://prep.ai.mit.edu/pub/gnu/GPL
  18  *      Each contributing author retains all rights to their own work.
  19  */
  20
  21 #include "udfdecl.h"
  22
  23 #include <linux/kernel.h>
  24 #include <linux/string.h>       /* for memset */
  25 #include <linux/nls.h>
  26
  27 #include "udf_sb.h"
  28
  29 static int udf_translate_to_linux(uint8_t *, uint8_t *, int, uint8_t *, int);
  30
  31 static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen)
  32 {
  33         if ((!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN - 2))
  34                 return 0;
  35
  36         memset(dest, 0, sizeof(struct ustr));
  37         memcpy(dest->u_name, src, strlen);
  38         dest->u_cmpID = 0x08;
  39         dest->u_len = strlen;
  40
  41         return strlen;
  42 }
  43
  44 /*
  45  * udf_build_ustr
  46  */
  47 int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
  48 {
  49         int usesize;
  50
  51         if (!dest || !ptr || !size)
  52                 return -1;
  53         BUG_ON(size < 2);
  54
  55         usesize = min_t(size_t, ptr[size - 1], sizeof(dest->u_name));
  56         usesize = min(usesize, size - 2);
  57         dest->u_cmpID = ptr[0];
  58         dest->u_len = usesize;
  59         memcpy(dest->u_name, ptr + 1, usesize);
  60         memset(dest->u_name + usesize, 0, sizeof(dest->u_name) - usesize);
  61
  62         return 0;
  63 }
  64
  65 /*
  66  * udf_build_ustr_exact
  67  */
  68 static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
  69 {
  70         if ((!dest) || (!ptr) || (!exactsize))
  71                 return -1;
  72
  73         memset(dest, 0, sizeof(struct ustr));
  74         dest->u_cmpID = ptr[0];
  75         dest->u_len = exactsize - 1;
  76         memcpy(dest->u_name, ptr + 1, exactsize - 1);
  77
  78         return 0;
  79 }
  80
  81 /*
  82  * udf_ocu_to_utf8
  83  *
  84  * PURPOSE
  85  *      Convert OSTA Compressed Unicode to the UTF-8 equivalent.
  86  *
  87  * PRE-CONDITIONS
  88  *      utf                     Pointer to UTF-8 output buffer.
  89  *      ocu                     Pointer to OSTA Compressed Unicode input buffer
  90  *                              of size UDF_NAME_LEN bytes.
  91  *                              both of type "struct ustr *"
  92  *
  93  * POST-CONDITIONS
  94  *      <return>                Zero on success.
  95  *
  96  * HISTORY
  97  *      November 12, 1997 - Andrew E. Mileski
  98  *      Written, tested, and released.
  99  */
 100 int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
 101 {
 102         const uint8_t *ocu;
 103         uint8_t cmp_id, ocu_len;
 104         int i;
 105
 106         ocu_len = ocu_i->u_len;
 107         if (ocu_len == 0) {
 108                 memset(utf_o, 0, sizeof(struct ustr));
 109                 return 0;
 110         }
 111
 112         cmp_id = ocu_i->u_cmpID;
 113         if (cmp_id != 8 && cmp_id != 16) {
 114                 memset(utf_o, 0, sizeof(struct ustr));
 115                 printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n",
 116                        cmp_id, ocu_i->u_name);
 117                 return 0;
 118         }
 119
 120         ocu = ocu_i->u_name;
 121         utf_o->u_len = 0;
 122         for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
 123
 124                 /* Expand OSTA compressed Unicode to Unicode */
 125                 uint32_t c = ocu[i++];
 126                 if (cmp_id == 16)
 127                         c = (c << 8) | ocu[i++];
 128
 129                 /* Compress Unicode to UTF-8 */
 130                 if (c < 0x80U)
 131                         utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
 132                 else if (c < 0x800U) {
 133                         utf_o->u_name[utf_o->u_len++] =
 134                                                 (uint8_t)(0xc0 | (c >> 6));
 135                         utf_o->u_name[utf_o->u_len++] =
 136                                                 (uint8_t)(0x80 | (c & 0x3f));
 137                 } else {
 138                         utf_o->u_name[utf_o->u_len++] =
 139                                                 (uint8_t)(0xe0 | (c >> 12));
 140                         utf_o->u_name[utf_o->u_len++] =
 141                                                 (uint8_t)(0x80 |
 142                                                           ((c >> 6) & 0x3f));
 143                         utf_o->u_name[utf_o->u_len++] =
 144                                                 (uint8_t)(0x80 | (c & 0x3f));
 145                 }
 146         }
 147         utf_o->u_cmpID = 8;
 148
 149         return utf_o->u_len;
 150 }
 151
 152 /*
 153  *
 154  * udf_utf8_to_ocu
 155  *
 156  * PURPOSE
 157  *      Convert UTF-8 to the OSTA Compressed Unicode equivalent.
 158  *
 159  * DESCRIPTION
 160  *      This routine is only called by udf_lookup().
 161  *
 162  * PRE-CONDITIONS
 163  *      ocu                     Pointer to OSTA Compressed Unicode output
 164  *                              buffer of size UDF_NAME_LEN bytes.
 165  *      utf                     Pointer to UTF-8 input buffer.
 166  *      utf_len                 Length of UTF-8 input buffer in bytes.
 167  *
 168  * POST-CONDITIONS
 169  *      <return>                Zero on success.
 170  *
 171  * HISTORY
 172  *      November 12, 1997 - Andrew E. Mileski
 173  *      Written, tested, and released.
 174  */
 175 static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length)
 176 {
 177         unsigned c, i, max_val, utf_char;
 178         int utf_cnt, u_len;
 179
 180         memset(ocu, 0, sizeof(dstring) * length);
 181         ocu[0] = 8;
 182         max_val = 0xffU;
 183
 184 try_again:
 185         u_len = 0U;
 186         utf_char = 0U;
 187         utf_cnt = 0U;
 188         for (i = 0U; i < utf->u_len; i++) {
 189                 c = (uint8_t)utf->u_name[i];
 190
 191                 /* Complete a multi-byte UTF-8 character */
 192                 if (utf_cnt) {
 193                         utf_char = (utf_char << 6) | (c & 0x3fU);
 194                         if (--utf_cnt)
 195                                 continue;
 196                 } else {
 197                         /* Check for a multi-byte UTF-8 character */
 198                         if (c & 0x80U) {
 199                                 /* Start a multi-byte UTF-8 character */
 200                                 if ((c & 0xe0U) == 0xc0U) {
 201                                         utf_char = c & 0x1fU;
 202                                         utf_cnt = 1;
 203                                 } else if ((c & 0xf0U) == 0xe0U) {
 204                                         utf_char = c & 0x0fU;
 205                                         utf_cnt = 2;
 206                                 } else if ((c & 0xf8U) == 0xf0U) {
 207                                         utf_char = c & 0x07U;
 208                                         utf_cnt = 3;
 209                                 } else if ((c & 0xfcU) == 0xf8U) {
 210                                         utf_char = c & 0x03U;
 211                                         utf_cnt = 4;
 212                                 } else if ((c & 0xfeU) == 0xfcU) {
 213                                         utf_char = c & 0x01U;
 214                                         utf_cnt = 5;
 215                                 } else {
 216                                         goto error_out;
 217                                 }
 218                                 continue;
 219                         } else {
 220                                 /* Single byte UTF-8 character (most common) */
 221                                 utf_char = c;
 222                         }
 223                 }
 224
 225                 /* Choose no compression if necessary */
 226                 if (utf_char > max_val) {
 227                         if (max_val == 0xffU) {
 228                                 max_val = 0xffffU;
 229                                 ocu[0] = (uint8_t)0x10U;
 230                                 goto try_again;
 231                         }
 232                         goto error_out;
 233                 }
 234
 235                 if (max_val == 0xffffU)
 236                         ocu[++u_len] = (uint8_t)(utf_char >> 8);
 237                 ocu[++u_len] = (uint8_t)(utf_char & 0xffU);
 238         }
 239
 240         if (utf_cnt) {
 241 error_out:
 242                 ocu[++u_len] = '?';
 243                 printk(KERN_DEBUG "udf: bad UTF-8 character\n");
 244         }
 245
 246         ocu[length - 1] = (uint8_t)u_len + 1;
 247
 248         return u_len + 1;
 249 }
 250
 251 static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
 252                         const struct ustr *ocu_i)
 253 {
 254         const uint8_t *ocu;
 255         uint8_t cmp_id, ocu_len;
 256         int i;
 257
 258
 259         ocu_len = ocu_i->u_len;
 260         if (ocu_len == 0) {
 261                 memset(utf_o, 0, sizeof(struct ustr));
 262                 return 0;
 263         }
 264
 265         cmp_id = ocu_i->u_cmpID;
 266         if (cmp_id != 8 && cmp_id != 16) {
 267                 memset(utf_o, 0, sizeof(struct ustr));
 268                 printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n",
 269                        cmp_id, ocu_i->u_name);
 270                 return 0;
 271         }
 272
 273         ocu = ocu_i->u_name;
 274         utf_o->u_len = 0;
 275         for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
 276                 /* Expand OSTA compressed Unicode to Unicode */
 277                 uint32_t c = ocu[i++];
 278                 if (cmp_id == 16)
 279                         c = (c << 8) | ocu[i++];
 280
 281                 utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
 282                                               UDF_NAME_LEN - utf_o->u_len);
 283         }
 284         utf_o->u_cmpID = 8;
 285
 286         return utf_o->u_len;
 287 }
 288
 289 static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
 290                         int length)
 291 {
 292         unsigned len, i, max_val;
 293         uint16_t uni_char;
 294         int u_len;
 295
 296         memset(ocu, 0, sizeof(dstring) * length);
 297         ocu[0] = 8;
 298         max_val = 0xffU;
 299
 300 try_again:
 301         u_len = 0U;
 302         for (i = 0U; i < uni->u_len; i++) {
 303                 len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
 304                 if (len <= 0)
 305                         continue;
 306
 307                 if (uni_char > max_val) {
 308                         max_val = 0xffffU;
 309                         ocu[0] = (uint8_t)0x10U;
 310                         goto try_again;
 311                 }
 312
 313                 if (max_val == 0xffffU)
 314                         ocu[++u_len] = (uint8_t)(uni_char >> 8);
 315                 ocu[++u_len] = (uint8_t)(uni_char & 0xffU);
 316                 i += len - 1;
 317         }
 318
 319         ocu[length - 1] = (uint8_t)u_len + 1;
 320         return u_len + 1;
 321 }
 322
 323 int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
 324                      int flen)
 325 {
 326         struct ustr filename, unifilename;
 327         int len;
 328
 329         if (udf_build_ustr_exact(&unifilename, sname, flen))
 330                 return 0;
 331
 332         if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
 333                 if (!udf_CS0toUTF8(&filename, &unifilename)) {
 334                         udf_debug("Failed in udf_get_filename: sname = %s\n",
 335                                   sname);
 336                         return 0;
 337                 }
 338         } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
 339                 if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename,
 340                                   &unifilename)) {
 341                         udf_debug("Failed in udf_get_filename: sname = %s\n",
 342                                   sname);
 343                         return 0;
 344                 }
 345         } else
 346                 return 0;
 347
 348         len = udf_translate_to_linux(dname, filename.u_name, filename.u_len,
 349                                      unifilename.u_name, unifilename.u_len);
 350         if (len)
 351                 return len;
 352
 353         return 0;
 354 }
 355
 356 int udf_put_filename(struct super_block *sb, const uint8_t *sname,
 357                      uint8_t *dname, int flen)
 358 {
 359         struct ustr unifilename;
 360         int namelen;
 361
 362         if (!udf_char_to_ustr(&unifilename, sname, flen))
 363                 return 0;
 364
 365         if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
 366                 namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN);
 367                 if (!namelen)
 368                         return 0;
 369         } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
 370                 namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname,
 371                                         &unifilename, UDF_NAME_LEN);
 372                 if (!namelen)
 373                         return 0;
 374         } else
 375                 return 0;
 376
 377         return namelen;
 378 }
 379
 380 #define ILLEGAL_CHAR_MARK       '_'
 381 #define EXT_MARK                '.'
 382 #define CRC_MARK                '#'
 383 #define EXT_SIZE                5
 384
 385 static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
 386                                   int udfLen, uint8_t *fidName,
 387                                   int fidNameLen)
 388 {
 389         int index, newIndex = 0, needsCRC = 0;
 390         int extIndex = 0, newExtIndex = 0, hasExt = 0;
 391         unsigned short valueCRC;
 392         uint8_t curr;
 393         const uint8_t hexChar[] = "0123456789ABCDEF";
 394
 395         if (udfName[0] == '.' &&
 396             (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) {
 397                 needsCRC = 1;
 398                 newIndex = udfLen;
 399                 memcpy(newName, udfName, udfLen);
 400         } else {
 401                 for (index = 0; index < udfLen; index++) {
 402                         curr = udfName[index];
 403                         if (curr == '/' || curr == 0) {
 404                                 needsCRC = 1;
 405                                 curr = ILLEGAL_CHAR_MARK;
 406                                 while (index + 1 < udfLen &&
 407                                                 (udfName[index + 1] == '/' ||
 408                                                  udfName[index + 1] == 0))
 409                                         index++;
 410                         }
 411                         if (curr == EXT_MARK &&
 412                                         (udfLen - index - 1) <= EXT_SIZE) {
 413                                 if (udfLen == index + 1)
 414                                         hasExt = 0;
 415                                 else {
 416                                         hasExt = 1;
 417                                         extIndex = index;
 418                                         newExtIndex = newIndex;
 419                                 }
 420                         }
 421                         if (newIndex < 256)
 422                                 newName[newIndex++] = curr;
 423                         else
 424                                 needsCRC = 1;
 425                 }
 426         }
 427         if (needsCRC) {
 428                 uint8_t ext[EXT_SIZE];
 429                 int localExtIndex = 0;
 430
 431                 if (hasExt) {
 432                         int maxFilenameLen;
 433                         for (index = 0;
 434                              index < EXT_SIZE && extIndex + index + 1 < udfLen;
 435                              index++) {
 436                                 curr = udfName[extIndex + index + 1];
 437
 438                                 if (curr == '/' || curr == 0) {
 439                                         needsCRC = 1;
 440                                         curr = ILLEGAL_CHAR_MARK;
 441                                         while (extIndex + index + 2 < udfLen &&
 442                                               (index + 1 < EXT_SIZE &&
 443                                                 (udfName[extIndex + index + 2] == '/' ||
 444                                                  udfName[extIndex + index + 2] == 0)))
 445                                                 index++;
 446                                 }
 447                                 ext[localExtIndex++] = curr;
 448                         }
 449                         maxFilenameLen = 250 - localExtIndex;
 450                         if (newIndex > maxFilenameLen)
 451                                 newIndex = maxFilenameLen;
 452                         else
 453                                 newIndex = newExtIndex;
 454                 } else if (newIndex > 250)
 455                         newIndex = 250;
 456                 newName[newIndex++] = CRC_MARK;
 457                 valueCRC = udf_crc(fidName, fidNameLen, 0);
 458                 newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12];
 459                 newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8];
 460                 newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4];
 461                 newName[newIndex++] = hexChar[(valueCRC & 0x000f)];
 462
 463                 if (hasExt) {
 464                         newName[newIndex++] = EXT_MARK;
 465                         for (index = 0; index < localExtIndex; index++)
 466                                 newName[newIndex++] = ext[index];
 467                 }
 468         }
 469
 470         return newIndex;
 471 }