lib/tinyxml/tinyxmlparser.cpp

   1 /*
   2 www.sourceforge.net/projects/tinyxml
   3 Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
   4
   5 This software is provided 'as-is', without any express or implied
   6 warranty. In no event will the authors be held liable for any
   7 damages arising from the use of this software.
   8
   9 Permission is granted to anyone to use this software for any
  10 purpose, including commercial applications, and to alter it and
  11 redistribute it freely, subject to the following restrictions:
  12
  13 1. The origin of this software must not be misrepresented; you must
  14 not claim that you wrote the original software. If you use this
  15 software in a product, an acknowledgment in the product documentation
  16 would be appreciated but is not required.
  17
  18 2. Altered source versions must be plainly marked as such, and
  19 must not be misrepresented as being the original software.
  20
  21 3. This notice may not be removed or altered from any source
  22 distribution.
  23 */
  24
  25 #include <ctype.h>
  26 #include <stddef.h>
  27
  28 #include "tinyxml.h"
  29
  30 //#define DEBUG_PARSER
  31 #if defined( DEBUG_PARSER )
  32 #       if defined( DEBUG ) && defined( _MSC_VER )
  33 #               include <windows.h>
  34 #               define TIXML_LOG OutputDebugString
  35 #       else
  36 #               define TIXML_LOG printf
  37 #       endif
  38 #endif
  39
  40 // Note tha "PutString" hardcodes the same list. This
  41 // is less flexible than it appears. Changing the entries
  42 // or order will break putstring.
  43 TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] =
  44 {
  45         { "&amp;",  5, '&' },
  46         { "&lt;",   4, '<' },
  47         { "&gt;",   4, '>' },
  48         { "&quot;", 6, '\"' },
  49         { "&apos;", 6, '\'' }
  50 };
  51
  52 // Bunch of unicode info at:
  53 //              http://www.unicode.org/faq/utf_bom.html
  54 // Including the basic of this table, which determines the #bytes in the
  55 // sequence from the lead byte. 1 placed for invalid sequences --
  56 // although the result will be junk, pass it through as much as possible.
  57 // Beware of the non-characters in UTF-8:
  58 //                              ef bb bf (Microsoft "lead bytes")
  59 //                              ef bf be
  60 //                              ef bf bf
  61
  62 const unsigned char TIXML_UTF_LEAD_0 = 0xefU;
  63 const unsigned char TIXML_UTF_LEAD_1 = 0xbbU;
  64 const unsigned char TIXML_UTF_LEAD_2 = 0xbfU;
  65
  66 const int TiXmlBase::utf8ByteTable[256] =
  67 {
  68         //      0       1       2       3       4       5       6       7       8       9       a       b       c       d       e       f
  69                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x00
  70                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x10
  71                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x20
  72                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x30
  73                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x40
  74                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x50
  75                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x60
  76                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x70 End of ASCII range
  77                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x80 0x80 to 0xc1 invalid
  78                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0x90
  79                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0xa0
  80                 1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      // 0xb0
  81                 1,      1,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      // 0xc0 0xc2 to 0xdf 2 byte
  82                 2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      2,      // 0xd0
  83                 3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      3,      // 0xe0 0xe0 to 0xef 3 byte
  84                 4,      4,      4,      4,      4,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1,      1       // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
  85 };
  86
  87
  88 void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
  89 {
  90         const unsigned long BYTE_MASK = 0xBF;
  91         const unsigned long BYTE_MARK = 0x80;
  92         const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
  93
  94         if (input < 0x80)
  95                 *length = 1;
  96         else if ( input < 0x800 )
  97                 *length = 2;
  98         else if ( input < 0x10000 )
  99                 *length = 3;
 100         else if ( input < 0x200000 )
 101                 *length = 4;
 102         else
 103                 { *length = 0; return; }        // This code won't covert this correctly anyway.
 104
 105         output += *length;
 106
 107         // Scary scary fall throughs.
 108         switch (*length)
 109         {
 110                 case 4:
 111                         --output;
 112                         *output = (char)((input | BYTE_MARK) & BYTE_MASK);
 113                         input >>= 6;
 114                 case 3:
 115                         --output;
 116                         *output = (char)((input | BYTE_MARK) & BYTE_MASK);
 117                         input >>= 6;
 118                 case 2:
 119                         --output;
 120                         *output = (char)((input | BYTE_MARK) & BYTE_MASK);
 121                         input >>= 6;
 122                 case 1:
 123                         --output;
 124                         *output = (char)(input | FIRST_BYTE_MARK[*length]);
 125         }
 126 }
 127
 128
 129 /*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
 130 {
 131         // This will only work for low-ascii, everything else is assumed to be a valid
 132         // letter. I'm not sure this is the best approach, but it is quite tricky trying
 133         // to figure out alhabetical vs. not across encoding. So take a very
 134         // conservative approach.
 135
 136 //      if ( encoding == TIXML_ENCODING_UTF8 )
 137 //      {
 138                 if ( anyByte < 127 )
 139                         return isalpha( anyByte );
 140                 else
 141                         return 1;       // What else to do? The unicode set is huge...get the english ones right.
 142 //      }
 143 //      else
 144 //      {
 145 //              return isalpha( anyByte );
 146 //      }
 147 }
 148
 149
 150 /*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
 151 {
 152         // This will only work for low-ascii, everything else is assumed to be a valid
 153         // letter. I'm not sure this is the best approach, but it is quite tricky trying
 154         // to figure out alhabetical vs. not across encoding. So take a very
 155         // conservative approach.
 156
 157 //      if ( encoding == TIXML_ENCODING_UTF8 )
 158 //      {
 159                 if ( anyByte < 127 )
 160                         return isalnum( anyByte );
 161                 else
 162                         return 1;       // What else to do? The unicode set is huge...get the english ones right.
 163 //      }
 164 //      else
 165 //      {
 166 //              return isalnum( anyByte );
 167 //      }
 168 }
 169
 170
 171 class TiXmlParsingData
 172 {
 173         friend class TiXmlDocument;
 174   public:
 175         void Stamp( const char* now, TiXmlEncoding encoding );
 176
 177         const TiXmlCursor& Cursor()     { return cursor; }
 178
 179   private:
 180         // Only used by the document!
 181         TiXmlParsingData( const char* start, int _tabsize, int row, int col )
 182         {
 183                 assert( start );
 184                 stamp = start;
 185                 tabsize = _tabsize;
 186                 cursor.row = row;
 187                 cursor.col = col;
 188         }
 189
 190         TiXmlCursor             cursor;
 191         const char*             stamp;
 192         int                             tabsize;
 193 };
 194
 195
 196 void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
 197 {
 198         assert( now );
 199
 200         // Do nothing if the tabsize is 0.
 201         if ( tabsize < 1 )
 202         {
 203                 return;
 204         }
 205
 206         // Get the current row, column.
 207         int row = cursor.row;
 208         int col = cursor.col;
 209         const char* p = stamp;
 210         assert( p );
 211
 212         while ( p < now )
 213         {
 214                 // Treat p as unsigned, so we have a happy compiler.
 215                 const unsigned char* pU = (const unsigned char*)p;
 216
 217                 // Code contributed by Fletcher Dunn: (modified by lee)
 218                 switch (*pU) {
 219                         case 0:
 220                                 // We *should* never get here, but in case we do, don't
 221                                 // advance past the terminating null character, ever
 222                                 return;
 223
 224                         case '\r':
 225                                 // bump down to the next line
 226                                 ++row;
 227                                 col = 0;
 228                                 // Eat the character
 229                                 ++p;
 230
 231                                 // Check for \r\n sequence, and treat this as a single character
 232                                 if (*p == '\n') {
 233                                         ++p;
 234                                 }
 235                                 break;
 236
 237                         case '\n':
 238                                 // bump down to the next line
 239                                 ++row;
 240                                 col = 0;
 241
 242                                 // Eat the character
 243                                 ++p;
 244
 245                                 // Check for \n\r sequence, and treat this as a single
 246                                 // character.  (Yes, this bizarre thing does occur still
 247                                 // on some arcane platforms...)
 248                                 if (*p == '\r') {
 249                                         ++p;
 250                                 }
 251                                 break;
 252
 253                         case '\t':
 254                                 // Eat the character
 255                                 ++p;
 256
 257                                 // Skip to next tab stop
 258                                 col = (col / tabsize + 1) * tabsize;
 259                                 break;
 260
 261                         case TIXML_UTF_LEAD_0:
 262                                 if ( encoding == TIXML_ENCODING_UTF8 )
 263                                 {
 264                                         if ( *(p+1) && *(p+2) )
 265                                         {
 266                                                 // In these cases, don't advance the column. These are
 267                                                 // 0-width spaces.
 268                                                 if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 )
 269                                                         p += 3;
 270                                                 else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU )
 271                                                         p += 3;
 272                                                 else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU )
 273                                                         p += 3;
 274                                                 else
 275                                                         { p +=3; ++col; }       // A normal character.
 276                                         }
 277                                 }
 278                                 else
 279                                 {
 280                                         ++p;
 281                                         ++col;
 282                                 }
 283                                 break;
 284
 285                         default:
 286                                 if ( encoding == TIXML_ENCODING_UTF8 )
 287                                 {
 288                                         // Eat the 1 to 4 byte utf8 character.
 289                                         int step = TiXmlBase::utf8ByteTable[*((const unsigned char*)p)];
 290                                         if ( step == 0 )
 291                                                 step = 1;               // Error case from bad encoding, but handle gracefully.
 292                                         p += step;
 293
 294                                         // Just advance one column, of course.
 295                                         ++col;
 296                                 }
 297                                 else
 298                                 {
 299                                         ++p;
 300                                         ++col;
 301                                 }
 302                                 break;
 303                 }
 304         }
 305         cursor.row = row;
 306         cursor.col = col;
 307         assert( cursor.row >= -1 );
 308         assert( cursor.col >= -1 );
 309         stamp = p;
 310         assert( stamp );
 311 }
 312
 313
 314 const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
 315 {
 316         if ( !p || !*p )
 317         {
 318                 return 0;
 319         }
 320         if ( encoding == TIXML_ENCODING_UTF8 )
 321         {
 322                 while ( *p )
 323                 {
 324                         const unsigned char* pU = (const unsigned char*)p;
 325
 326                         // Skip the stupid Microsoft UTF-8 Byte order marks
 327                         if (    *(pU+0)==TIXML_UTF_LEAD_0
 328                                  && *(pU+1)==TIXML_UTF_LEAD_1
 329                                  && *(pU+2)==TIXML_UTF_LEAD_2 )
 330                         {
 331                                 p += 3;
 332                                 continue;
 333                         }
 334                         else if(*(pU+0)==TIXML_UTF_LEAD_0
 335                                  && *(pU+1)==0xbfU
 336                                  && *(pU+2)==0xbeU )
 337                         {
 338                                 p += 3;
 339                                 continue;
 340                         }
 341                         else if(*(pU+0)==TIXML_UTF_LEAD_0
 342                                  && *(pU+1)==0xbfU
 343                                  && *(pU+2)==0xbfU )
 344                         {
 345                                 p += 3;
 346                                 continue;
 347                         }
 348
 349                         if ( IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' )            // Still using old rules for white space.
 350                                 ++p;
 351                         else
 352                                 break;
 353                 }
 354         }
 355         else
 356         {
 357                 while ( *p && IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' )
 358                         ++p;
 359         }
 360
 361         return p;
 362 }
 363
 364 #ifdef TIXML_USE_STL
 365 /*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream * in, TIXML_STRING * tag )
 366 {
 367         for( ;; )
 368         {
 369                 if ( !in->good() ) return false;
 370
 371                 int c = in->peek();
 372                 // At this scope, we can't get to a document. So fail silently.
 373                 if ( !IsWhiteSpace( c ) || c <= 0 )
 374                         return true;
 375
 376                 *tag += (char) in->get();
 377         }
 378 }
 379
 380 /*static*/ bool TiXmlBase::StreamTo( std::istream * in, int character, TIXML_STRING * tag )
 381 {
 382         //assert( character > 0 && character < 128 );   // else it won't work in utf-8
 383         while ( in->good() )
 384         {
 385                 int c = in->peek();
 386                 if ( c == character )
 387                         return true;
 388                 if ( c <= 0 )           // Silent failure: can't get document at this scope
 389                         return false;
 390
 391                 in->get();
 392                 *tag += (char) c;
 393         }
 394         return false;
 395 }
 396 #endif
 397
 398 // One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The
 399 // "assign" optimization removes over 10% of the execution time.
 400 //
 401 const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
 402 {
 403         // Oddly, not supported on some comilers,
 404         //name->clear();
 405         // So use this:
 406         *name = "";
 407         assert( p );
 408
 409         // Names start with letters or underscores.
 410         // Of course, in unicode, tinyxml has no idea what a letter *is*. The
 411         // algorithm is generous.
 412         //
 413         // After that, they can be letters, underscores, numbers,
 414         // hyphens, or colons. (Colons are valid ony for namespaces,
 415         // but tinyxml can't tell namespaces from names.)
 416         if (    p && *p
 417                  && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) )
 418         {
 419                 const char* start = p;
 420                 while(          p && *p
 421                                 &&      (               IsAlphaNum( (unsigned char ) *p, encoding )
 422                                                  || *p == '_'
 423                                                  || *p == '-'
 424                                                  || *p == '.'
 425                                                  || *p == ':' ) )
 426                 {
 427                         //(*name) += *p; // expensive
 428                         ++p;
 429                 }
 430                 if ( p-start > 0 ) {
 431                         name->assign( start, p-start );
 432                 }
 433                 return p;
 434         }
 435         return 0;
 436 }
 437
 438 const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
 439 {
 440         // Presume an entity, and pull it out.
 441     TIXML_STRING ent;
 442         int i;
 443         *length = 0;
 444
 445         if ( *(p+1) && *(p+1) == '#' && *(p+2) )
 446         {
 447                 unsigned long ucs = 0;
 448                 ptrdiff_t delta = 0;
 449                 unsigned mult = 1;
 450
 451                 if ( *(p+2) == 'x' )
 452                 {
 453                         // Hexadecimal.
 454                         if ( !*(p+3) ) return 0;
 455
 456                         const char* q = p+3;
 457                         q = strchr( q, ';' );
 458
 459                         if ( !q || !*q ) return 0;
 460
 461                         delta = q-p;
 462                         --q;
 463
 464                         while ( *q != 'x' )
 465                         {
 466                                 if ( *q >= '0' && *q <= '9' )
 467                                         ucs += mult * (*q - '0');
 468                                 else if ( *q >= 'a' && *q <= 'f' )
 469                                         ucs += mult * (*q - 'a' + 10);
 470                                 else if ( *q >= 'A' && *q <= 'F' )
 471                                         ucs += mult * (*q - 'A' + 10 );
 472                                 else
 473                                         return 0;
 474                                 mult *= 16;
 475                                 --q;
 476                         }
 477                 }
 478                 else
 479                 {
 480                         // Decimal.
 481                         if ( !*(p+2) ) return 0;
 482
 483                         const char* q = p+2;
 484                         q = strchr( q, ';' );
 485
 486                         if ( !q || !*q ) return 0;
 487
 488                         delta = q-p;
 489                         --q;
 490
 491                         while ( *q != '#' )
 492                         {
 493                                 if ( *q >= '0' && *q <= '9' )
 494                                         ucs += mult * (*q - '0');
 495                                 else
 496                                         return 0;
 497                                 mult *= 10;
 498                                 --q;
 499                         }
 500                 }
 501                 if ( encoding == TIXML_ENCODING_UTF8 )
 502                 {
 503                         // convert the UCS to UTF-8
 504                         ConvertUTF32ToUTF8( ucs, value, length );
 505                 }
 506                 else
 507                 {
 508                         *value = (char)ucs;
 509                         *length = 1;
 510                 }
 511                 return p + delta + 1;
 512         }
 513
 514         // Now try to match it.
 515         for( i=0; i<NUM_ENTITY; ++i )
 516         {
 517                 if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
 518                 {
 519                         assert( strlen( entity[i].str ) == entity[i].strLength );
 520                         *value = entity[i].chr;
 521                         *length = 1;
 522                         return ( p + entity[i].strLength );
 523                 }
 524         }
 525
 526         // So it wasn't an entity, its unrecognized, or something like that.
 527         *value = *p;    // Don't put back the last one, since we return it!
 528         //*length = 1;  // Leave unrecognized entities - this doesn't really work.
 529                                         // Just writes strange XML.
 530         return p+1;
 531 }
 532
 533
 534 bool TiXmlBase::StringEqual( const char* p,
 535                                                          const char* tag,
 536                                                          bool ignoreCase,
 537                                                          TiXmlEncoding encoding )
 538 {
 539         assert( p );
 540         assert( tag );
 541         if ( !p || !*p )
 542         {
 543                 assert( 0 );
 544                 return false;
 545         }
 546
 547         const char* q = p;
 548
 549         if ( ignoreCase )
 550         {
 551                 while ( *q && *tag && ToLower( *q, encoding ) == ToLower( *tag, encoding ) )
 552                 {
 553                         ++q;
 554                         ++tag;
 555                 }
 556
 557                 if ( *tag == 0 )
 558                         return true;
 559         }
 560         else
 561         {
 562                 while ( *q && *tag && *q == *tag )
 563                 {
 564                         ++q;
 565                         ++tag;
 566                 }
 567
 568                 if ( *tag == 0 )                // Have we found the end of the tag, and everything equal?
 569                         return true;
 570         }
 571         return false;
 572 }
 573
 574 const char* TiXmlBase::ReadText(        const char* p,
 575                                                                         TIXML_STRING * text,
 576                                                                         bool trimWhiteSpace,
 577                                                                         const char* endTag,
 578                                                                         bool caseInsensitive,
 579                                                                         TiXmlEncoding encoding )
 580 {
 581     *text = "";
 582         if (    !trimWhiteSpace                 // certain tags always keep whitespace
 583                  || !condenseWhiteSpace )       // if true, whitespace is always kept
 584         {
 585                 // Keep all the white space.
 586                 while (    p && *p
 587                                 && !StringEqual( p, endTag, caseInsensitive, encoding )
 588                           )
 589                 {
 590                         int len;
 591                         char cArr[4] = { 0, 0, 0, 0 };
 592                         p = GetChar( p, cArr, &len, encoding );
 593                         text->append( cArr, len );
 594                 }
 595         }
 596         else
 597         {
 598                 bool whitespace = false;
 599
 600                 // Remove leading white space:
 601                 p = SkipWhiteSpace( p, encoding );
 602                 while (    p && *p
 603                                 && !StringEqual( p, endTag, caseInsensitive, encoding ) )
 604                 {
 605                         if ( *p == '\r' || *p == '\n' )
 606                         {
 607                                 whitespace = true;
 608                                 ++p;
 609                         }
 610                         else if ( IsWhiteSpace( *p ) )
 611                         {
 612                                 whitespace = true;
 613                                 ++p;
 614                         }
 615                         else
 616                         {
 617                                 // If we've found whitespace, add it before the
 618                                 // new character. Any whitespace just becomes a space.
 619                                 if ( whitespace )
 620                                 {
 621                                         (*text) += ' ';
 622                                         whitespace = false;
 623                                 }
 624                                 int len;
 625                                 char cArr[4] = { 0, 0, 0, 0 };
 626                                 p = GetChar( p, cArr, &len, encoding );
 627                                 if ( len == 1 )
 628                                         (*text) += cArr[0];     // more efficient
 629                                 else
 630                                         text->append( cArr, len );
 631                         }
 632                 }
 633         }
 634         if ( p )
 635                 p += strlen( endTag );
 636         return p;
 637 }
 638
 639 #ifdef TIXML_USE_STL
 640
 641 void TiXmlDocument::StreamIn( std::istream * in, TIXML_STRING * tag )
 642 {
 643         // The basic issue with a document is that we don't know what we're
 644         // streaming. Read something presumed to be a tag (and hope), then
 645         // identify it, and call the appropriate stream method on the tag.
 646         //
 647         // This "pre-streaming" will never read the closing ">" so the
 648         // sub-tag can orient itself.
 649
 650         if ( !StreamTo( in, '<', tag ) )
 651         {
 652                 SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
 653                 return;
 654         }
 655
 656         while ( in->good() )
 657         {
 658                 int tagIndex = (int) tag->length();
 659                 while ( in->good() && in->peek() != '>' )
 660                 {
 661                         int c = in->get();
 662                         if ( c <= 0 )
 663                         {
 664                                 SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
 665                                 break;
 666                         }
 667                         (*tag) += (char) c;
 668                 }
 669
 670                 if ( in->good() )
 671                 {
 672                         // We now have something we presume to be a node of
 673                         // some sort. Identify it, and call the node to
 674                         // continue streaming.
 675                         TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
 676
 677                         if ( node )
 678                         {
 679                                 node->StreamIn( in, tag );
 680                                 bool isElement = node->ToElement() != 0;
 681                                 delete node;
 682                                 node = 0;
 683
 684                                 // If this is the root element, we're done. Parsing will be
 685                                 // done by the >> operator.
 686                                 if ( isElement )
 687                                 {
 688                                         return;
 689                                 }
 690                         }
 691                         else
 692                         {
 693                                 SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
 694                                 return;
 695                         }
 696                 }
 697         }
 698         // We should have returned sooner.
 699         SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
 700 }
 701
 702 #endif
 703
 704 const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
 705 {
 706         ClearError();
 707
 708         // Parse away, at the document level. Since a document
 709         // contains nothing but other tags, most of what happens
 710         // here is skipping white space.
 711         if ( !p || !*p )
 712         {
 713                 SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
 714                 return 0;
 715         }
 716
 717         // Note that, for a document, this needs to come
 718         // before the while space skip, so that parsing
 719         // starts from the pointer we are given.
 720         location.Clear();
 721         if ( prevData )
 722         {
 723                 location.row = prevData->cursor.row;
 724                 location.col = prevData->cursor.col;
 725         }
 726         else
 727         {
 728                 location.row = 0;
 729                 location.col = 0;
 730         }
 731         TiXmlParsingData data( p, TabSize(), location.row, location.col );
 732         location = data.Cursor();
 733
 734         if ( encoding == TIXML_ENCODING_UNKNOWN )
 735         {
 736                 // Check for the Microsoft UTF-8 lead bytes.
 737                 const unsigned char* pU = (const unsigned char*)p;
 738                 if (    *(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0
 739                          && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1
 740                          && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 )
 741                 {
 742                         encoding = TIXML_ENCODING_UTF8;
 743                         useMicrosoftBOM = true;
 744                 }
 745         }
 746
 747     p = SkipWhiteSpace( p, encoding );
 748         if ( !p )
 749         {
 750                 SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
 751                 return 0;
 752         }
 753
 754         while ( p && *p )
 755         {
 756                 TiXmlNode* node = Identify( p, encoding );
 757                 if ( node )
 758                 {
 759                         p = node->Parse( p, &data, encoding );
 760                         LinkEndChild( node );
 761                 }
 762                 else
 763                 {
 764                         break;
 765                 }
 766
 767                 // Did we get encoding info?
 768                 if (    encoding == TIXML_ENCODING_UNKNOWN
 769                          && node->ToDeclaration() )
 770                 {
 771                         TiXmlDeclaration* dec = node->ToDeclaration();
 772                         const char* enc = dec->Encoding();
 773                         assert( enc );
 774
 775                         if ( *enc == 0 )
 776                                 encoding = TIXML_ENCODING_UTF8;
 777                         else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
 778                                 encoding = TIXML_ENCODING_UTF8;
 779                         else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
 780                                 encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice
 781                         else
 782                                 encoding = TIXML_ENCODING_LEGACY;
 783                 }
 784
 785                 p = SkipWhiteSpace( p, encoding );
 786         }
 787
 788         // Was this empty?
 789         if ( !firstChild ) {
 790                 SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
 791                 return 0;
 792         }
 793
 794         // All is well.
 795         return p;
 796 }
 797
 798 void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
 799 {
 800         // The first error in a chain is more accurate - don't set again!
 801         if ( error )
 802                 return;
 803
 804         assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
 805         error   = true;
 806         errorId = err;
 807         errorDesc = errorString[ errorId ];
 808
 809         errorLocation.Clear();
 810         if ( pError && data )
 811         {
 812                 data->Stamp( pError, encoding );
 813                 errorLocation = data->Cursor();
 814         }
 815 }
 816
 817
 818 TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
 819 {
 820         TiXmlNode* returnNode = 0;
 821
 822         p = SkipWhiteSpace( p, encoding );
 823         if( !p || !*p || *p != '<' )
 824         {
 825                 return 0;
 826         }
 827
 828         TiXmlDocument* doc = GetDocument();
 829         p = SkipWhiteSpace( p, encoding );
 830
 831         if ( !p || !*p )
 832         {
 833                 return 0;
 834         }
 835
 836         // What is this thing?
 837         // - Elements start with a letter or underscore, but xml is reserved.
 838         // - Comments: <!--
 839         // - Decleration: <?xml
 840         // - Everthing else is unknown to tinyxml.
 841         //
 842
 843         const char* xmlHeader = { "<?xml" };
 844         const char* commentHeader = { "<!--" };
 845         const char* dtdHeader = { "<!" };
 846         const char* cdataHeader = { "<![CDATA[" };
 847
 848         if ( StringEqual( p, xmlHeader, true, encoding ) )
 849         {
 850                 #ifdef DEBUG_PARSER
 851                         TIXML_LOG( "XML parsing Declaration\n" );
 852                 #endif
 853                 returnNode = new TiXmlDeclaration();
 854         }
 855         else if ( StringEqual( p, commentHeader, false, encoding ) )
 856         {
 857                 #ifdef DEBUG_PARSER
 858                         TIXML_LOG( "XML parsing Comment\n" );
 859                 #endif
 860                 returnNode = new TiXmlComment();
 861         }
 862         else if ( StringEqual( p, cdataHeader, false, encoding ) )
 863         {
 864                 #ifdef DEBUG_PARSER
 865                         TIXML_LOG( "XML parsing CDATA\n" );
 866                 #endif
 867                 TiXmlText* text = new TiXmlText( "" );
 868                 text->SetCDATA( true );
 869                 returnNode = text;
 870         }
 871         else if ( StringEqual( p, dtdHeader, false, encoding ) )
 872         {
 873                 #ifdef DEBUG_PARSER
 874                         TIXML_LOG( "XML parsing Unknown(1)\n" );
 875                 #endif
 876                 returnNode = new TiXmlUnknown();
 877         }
 878         else if (    IsAlpha( *(p+1), encoding )
 879                           || *(p+1) == '_' )
 880         {
 881                 #ifdef DEBUG_PARSER
 882                         TIXML_LOG( "XML parsing Element\n" );
 883                 #endif
 884                 returnNode = new TiXmlElement( "" );
 885         }
 886         else
 887         {
 888                 #ifdef DEBUG_PARSER
 889                         TIXML_LOG( "XML parsing Unknown(2)\n" );
 890                 #endif
 891                 returnNode = new TiXmlUnknown();
 892         }
 893
 894         if ( returnNode )
 895         {
 896                 // Set the parent, so it can report errors
 897                 returnNode->parent = this;
 898         }
 899         else
 900         {
 901                 if ( doc )
 902                         doc->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, TIXML_ENCODING_UNKNOWN );
 903         }
 904         return returnNode;
 905 }
 906
 907 #ifdef TIXML_USE_STL
 908
 909 void TiXmlElement::StreamIn (std::istream * in, TIXML_STRING * tag)
 910 {
 911         // We're called with some amount of pre-parsing. That is, some of "this"
 912         // element is in "tag". Go ahead and stream to the closing ">"
 913         while( in->good() )
 914         {
 915                 int c = in->get();
 916                 if ( c <= 0 )
 917                 {
 918                         TiXmlDocument* document = GetDocument();
 919                         if ( document )
 920                                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
 921                         return;
 922                 }
 923                 (*tag) += (char) c ;
 924
 925                 if ( c == '>' )
 926                         break;
 927         }
 928
 929         if ( tag->length() < 3 ) return;
 930
 931         // Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
 932         // If not, identify and stream.
 933
 934         if (    tag->at( tag->length() - 1 ) == '>'
 935                  && tag->at( tag->length() - 2 ) == '/' )
 936         {
 937                 // All good!
 938                 return;
 939         }
 940         else if ( tag->at( tag->length() - 1 ) == '>' )
 941         {
 942                 // There is more. Could be:
 943                 //              text
 944                 //              cdata text (which looks like another node)
 945                 //              closing tag
 946                 //              another node.
 947                 for ( ;; )
 948                 {
 949                         StreamWhiteSpace( in, tag );
 950
 951                         // Do we have text?
 952                         if ( in->good() && in->peek() != '<' )
 953                         {
 954                                 // Yep, text.
 955                                 TiXmlText text( "" );
 956                                 text.StreamIn( in, tag );
 957
 958                                 // What follows text is a closing tag or another node.
 959                                 // Go around again and figure it out.
 960                                 continue;
 961                         }
 962
 963                         // We now have either a closing tag...or another node.
 964                         // We should be at a "<", regardless.
 965                         if ( !in->good() ) return;
 966                         assert( in->peek() == '<' );
 967                         int tagIndex = (int) tag->length();
 968
 969                         bool closingTag = false;
 970                         bool firstCharFound = false;
 971
 972                         for( ;; )
 973                         {
 974                                 if ( !in->good() )
 975                                         return;
 976
 977                                 int c = in->peek();
 978                                 if ( c <= 0 )
 979                                 {
 980                                         TiXmlDocument* document = GetDocument();
 981                                         if ( document )
 982                                                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
 983                                         return;
 984                                 }
 985
 986                                 if ( c == '>' )
 987                                         break;
 988
 989                                 *tag += (char) c;
 990                                 in->get();
 991
 992                                 // Early out if we find the CDATA id.
 993                                 if ( c == '[' && tag->size() >= 9 )
 994                                 {
 995                                         size_t len = tag->size();
 996                                         const char* start = tag->c_str() + len - 9;
 997                                         if ( strcmp( start, "<![CDATA[" ) == 0 ) {
 998                                                 assert( !closingTag );
 999                                                 break;
1000                                         }
1001                                 }
1002
1003                                 if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
1004                                 {
1005                                         firstCharFound = true;
1006                                         if ( c == '/' )
1007                                                 closingTag = true;
1008                                 }
1009                         }
1010                         // If it was a closing tag, then read in the closing '>' to clean up the input stream.
1011                         // If it was not, the streaming will be done by the tag.
1012                         if ( closingTag )
1013                         {
1014                                 if ( !in->good() )
1015                                         return;
1016
1017                                 int c = in->get();
1018                                 if ( c <= 0 )
1019                                 {
1020                                         TiXmlDocument* document = GetDocument();
1021                                         if ( document )
1022                                                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1023                                         return;
1024                                 }
1025                                 assert( c == '>' );
1026                                 *tag += (char) c;
1027
1028                                 // We are done, once we've found our closing tag.
1029                                 return;
1030                         }
1031                         else
1032                         {
1033                                 // If not a closing tag, id it, and stream.
1034                                 const char* tagloc = tag->c_str() + tagIndex;
1035                                 TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING );
1036                                 if ( !node )
1037                                         return;
1038                                 node->StreamIn( in, tag );
1039                                 delete node;
1040                                 node = 0;
1041
1042                                 // No return: go around from the beginning: text, closing tag, or node.
1043                         }
1044                 }
1045         }
1046 }
1047 #endif
1048
1049 const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1050 {
1051         p = SkipWhiteSpace( p, encoding );
1052         TiXmlDocument* document = GetDocument();
1053
1054         if ( !p || !*p )
1055         {
1056                 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
1057                 return 0;
1058         }
1059
1060         if ( data )
1061         {
1062                 data->Stamp( p, encoding );
1063                 location = data->Cursor();
1064         }
1065
1066         if ( *p != '<' )
1067         {
1068                 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
1069                 return 0;
1070         }
1071
1072         p = SkipWhiteSpace( p+1, encoding );
1073
1074         // Read the name.
1075         const char* pErr = p;
1076
1077     p = ReadName( p, &value, encoding );
1078         if ( !p || !*p )
1079         {
1080                 if ( document ) document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
1081                 return 0;
1082         }
1083
1084     TIXML_STRING endTag ("</");
1085         endTag += value;
1086         endTag += ">";
1087
1088         // Check for and read attributes. Also look for an empty
1089         // tag or an end tag.
1090         while ( p && *p )
1091         {
1092                 pErr = p;
1093                 p = SkipWhiteSpace( p, encoding );
1094                 if ( !p || !*p )
1095                 {
1096                         if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1097                         return 0;
1098                 }
1099                 if ( *p == '/' )
1100                 {
1101                         ++p;
1102                         // Empty tag.
1103                         if ( *p  != '>' )
1104                         {
1105                                 if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );
1106                                 return 0;
1107                         }
1108                         return (p+1);
1109                 }
1110                 else if ( *p == '>' )
1111                 {
1112                         // Done with attributes (if there were any.)
1113                         // Read the value -- which can include other
1114                         // elements -- read the end tag, and return.
1115                         ++p;
1116                         p = ReadValue( p, data, encoding );             // Note this is an Element method, and will set the error if one happens.
1117                         if ( !p || !*p ) {
1118                                 // We were looking for the end tag, but found nothing.
1119                                 // Fix for [ 1663758 ] Failure to report error on bad XML
1120                                 if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
1121                                 return 0;
1122                         }
1123
1124                         // We should find the end tag now
1125                         if ( StringEqual( p, endTag.c_str(), false, encoding ) )
1126                         {
1127                                 p += endTag.length();
1128                                 return p;
1129                         }
1130                         else
1131                         {
1132                                 if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
1133                                 return 0;
1134                         }
1135                 }
1136                 else
1137                 {
1138                         // Try to read an attribute:
1139                         TiXmlAttribute* attrib = new TiXmlAttribute();
1140                         if ( !attrib )
1141                         {
1142                                 if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, pErr, data, encoding );
1143                                 return 0;
1144                         }
1145
1146                         attrib->SetDocument( document );
1147                         pErr = p;
1148                         p = attrib->Parse( p, data, encoding );
1149
1150                         if ( !p || !*p )
1151                         {
1152                                 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
1153                                 delete attrib;
1154                                 return 0;
1155                         }
1156
1157                         // Handle the strange case of double attributes:
1158                         #ifdef TIXML_USE_STL
1159                         TiXmlAttribute* node = attributeSet.Find( attrib->NameTStr() );
1160                         #else
1161                         TiXmlAttribute* node = attributeSet.Find( attrib->Name() );
1162                         #endif
1163                         if ( node )
1164                         {
1165                                 node->SetValue( attrib->Value() );
1166                                 delete attrib;
1167                                 return 0;
1168                         }
1169
1170                         attributeSet.Add( attrib );
1171                 }
1172         }
1173         return p;
1174 }
1175
1176
1177 const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1178 {
1179         TiXmlDocument* document = GetDocument();
1180
1181         // Read in text and elements in any order.
1182         const char* pWithWhiteSpace = p;
1183         p = SkipWhiteSpace( p, encoding );
1184
1185         while ( p && *p )
1186         {
1187                 if ( *p != '<' )
1188                 {
1189                         // Take what we have, make a text element.
1190                         TiXmlText* textNode = new TiXmlText( "" );
1191
1192                         if ( !textNode )
1193                         {
1194                                 if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, encoding );
1195                                     return 0;
1196                         }
1197
1198                         if ( TiXmlBase::IsWhiteSpaceCondensed() )
1199                         {
1200                                 p = textNode->Parse( p, data, encoding );
1201                         }
1202                         else
1203                         {
1204                                 // Special case: we want to keep the white space
1205                                 // so that leading spaces aren't removed.
1206                                 p = textNode->Parse( pWithWhiteSpace, data, encoding );
1207                         }
1208
1209                         if ( !textNode->Blank() )
1210                                 LinkEndChild( textNode );
1211                         else
1212                                 delete textNode;
1213                 }
1214                 else
1215                 {
1216                         // We hit a '<'
1217                         // Have we hit a new element or an end tag? This could also be
1218                         // a TiXmlText in the "CDATA" style.
1219                         if ( StringEqual( p, "</", false, encoding ) )
1220                         {
1221                                 return p;
1222                         }
1223                         else
1224                         {
1225                                 TiXmlNode* node = Identify( p, encoding );
1226                                 if ( node )
1227                                 {
1228                                         p = node->Parse( p, data, encoding );
1229                                         LinkEndChild( node );
1230                                 }
1231                                 else
1232                                 {
1233                                         return 0;
1234                                 }
1235                         }
1236                 }
1237                 pWithWhiteSpace = p;
1238                 p = SkipWhiteSpace( p, encoding );
1239         }
1240
1241         if ( !p )
1242         {
1243                 if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
1244         }
1245         return p;
1246 }
1247
1248
1249 #ifdef TIXML_USE_STL
1250 void TiXmlUnknown::StreamIn( std::istream * in, TIXML_STRING * tag )
1251 {
1252         while ( in->good() )
1253         {
1254                 int c = in->get();
1255                 if ( c <= 0 )
1256                 {
1257                         TiXmlDocument* document = GetDocument();
1258                         if ( document )
1259                                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1260                         return;
1261                 }
1262                 (*tag) += (char) c;
1263
1264                 if ( c == '>' )
1265                 {
1266                         // All is well.
1267                         return;
1268                 }
1269         }
1270 }
1271 #endif
1272
1273
1274 const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1275 {
1276         TiXmlDocument* document = GetDocument();
1277         p = SkipWhiteSpace( p, encoding );
1278
1279         if ( data )
1280         {
1281                 data->Stamp( p, encoding );
1282                 location = data->Cursor();
1283         }
1284         if ( !p || !*p || *p != '<' )
1285         {
1286                 if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding );
1287                 return 0;
1288         }
1289         ++p;
1290     value = "";
1291
1292         while ( p && *p && *p != '>' )
1293         {
1294                 value += *p;
1295                 ++p;
1296         }
1297
1298         if ( !p )
1299         {
1300                 if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
1301         }
1302         if ( *p == '>' )
1303                 return p+1;
1304         return p;
1305 }
1306
1307 #ifdef TIXML_USE_STL
1308 void TiXmlComment::StreamIn( std::istream * in, TIXML_STRING * tag )
1309 {
1310         while ( in->good() )
1311         {
1312                 int c = in->get();
1313                 if ( c <= 0 )
1314                 {
1315                         TiXmlDocument* document = GetDocument();
1316                         if ( document )
1317                                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1318                         return;
1319                 }
1320
1321                 (*tag) += (char) c;
1322
1323                 if ( c == '>'
1324                          && tag->at( tag->length() - 2 ) == '-'
1325                          && tag->at( tag->length() - 3 ) == '-' )
1326                 {
1327                         // All is well.
1328                         return;
1329                 }
1330         }
1331 }
1332 #endif
1333
1334
1335 const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1336 {
1337         TiXmlDocument* document = GetDocument();
1338         value = "";
1339
1340         p = SkipWhiteSpace( p, encoding );
1341
1342         if ( data )
1343         {
1344                 data->Stamp( p, encoding );
1345                 location = data->Cursor();
1346         }
1347         const char* startTag = "<!--";
1348         const char* endTag   = "-->";
1349
1350         if ( !StringEqual( p, startTag, false, encoding ) )
1351         {
1352                 document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
1353                 return 0;
1354         }
1355         p += strlen( startTag );
1356
1357         // [ 1475201 ] TinyXML parses entities in comments
1358         // Oops - ReadText doesn't work, because we don't want to parse the entities.
1359         // p = ReadText( p, &value, false, endTag, false, encoding );
1360         //
1361         // from the XML spec:
1362         /*
1363          [Definition: Comments may appear anywhere in a document outside other markup; in addition,
1364                       they may appear within the document type declaration at places allowed by the grammar.
1365                                   They are not part of the document's character data; an XML processor MAY, but need not,
1366                                   make it possible for an application to retrieve the text of comments. For compatibility,
1367                                   the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity
1368                                   references MUST NOT be recognized within comments.
1369
1370                                   An example of a comment:
1371
1372                                   <!-- declarations for <head> & <body> -->
1373         */
1374
1375     value = "";
1376         // Keep all the white space.
1377         while ( p && *p && !StringEqual( p, endTag, false, encoding ) )
1378         {
1379                 value.append( p, 1 );
1380                 ++p;
1381         }
1382         if ( p )
1383                 p += strlen( endTag );
1384
1385         return p;
1386 }
1387
1388
1389 const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1390 {
1391         p = SkipWhiteSpace( p, encoding );
1392         if ( !p || !*p ) return 0;
1393
1394 //      int tabsize = 4;
1395 //      if ( document )
1396 //              tabsize = document->TabSize();
1397
1398         if ( data )
1399         {
1400                 data->Stamp( p, encoding );
1401                 location = data->Cursor();
1402         }
1403         // Read the name, the '=' and the value.
1404         const char* pErr = p;
1405         p = ReadName( p, &name, encoding );
1406         if ( !p || !*p )
1407         {
1408                 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1409                 return 0;
1410         }
1411         p = SkipWhiteSpace( p, encoding );
1412         if ( !p || !*p || *p != '=' )
1413         {
1414                 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1415                 return 0;
1416         }
1417
1418         ++p;    // skip '='
1419         p = SkipWhiteSpace( p, encoding );
1420         if ( !p || !*p )
1421         {
1422                 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1423                 return 0;
1424         }
1425
1426         const char* end;
1427         const char SINGLE_QUOTE = '\'';
1428         const char DOUBLE_QUOTE = '\"';
1429
1430         if ( *p == SINGLE_QUOTE )
1431         {
1432                 ++p;
1433                 end = "\'";             // single quote in string
1434                 p = ReadText( p, &value, false, end, false, encoding );
1435         }
1436         else if ( *p == DOUBLE_QUOTE )
1437         {
1438                 ++p;
1439                 end = "\"";             // double quote in string
1440                 p = ReadText( p, &value, false, end, false, encoding );
1441         }
1442         else
1443         {
1444                 // All attribute values should be in single or double quotes.
1445                 // But this is such a common error that the parser will try
1446                 // its best, even without them.
1447                 value = "";
1448                 while (    p && *p                                                                                      // existence
1449                                 && !IsWhiteSpace( *p ) && *p != '\n' && *p != '\r'      // whitespace
1450                                 && *p != '/' && *p != '>' )                                                     // tag end
1451                 {
1452                         if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE ) {
1453                                 // [ 1451649 ] Attribute values with trailing quotes not handled correctly
1454                                 // We did not have an opening quote but seem to have a
1455                                 // closing one. Give up and throw an error.
1456                                 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1457                                 return 0;
1458                         }
1459                         value += *p;
1460                         ++p;
1461                 }
1462         }
1463         return p;
1464 }
1465
1466 #ifdef TIXML_USE_STL
1467 void TiXmlText::StreamIn( std::istream * in, TIXML_STRING * tag )
1468 {
1469         while ( in->good() )
1470         {
1471                 int c = in->peek();
1472                 if ( !cdata && (c == '<' ) )
1473                 {
1474                         return;
1475                 }
1476                 if ( c <= 0 )
1477                 {
1478                         TiXmlDocument* document = GetDocument();
1479                         if ( document )
1480                                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1481                         return;
1482                 }
1483
1484                 (*tag) += (char) c;
1485                 in->get();      // "commits" the peek made above
1486
1487                 if ( cdata && c == '>' && tag->size() >= 3 ) {
1488                         size_t len = tag->size();
1489                         if ( (*tag)[len-2] == ']' && (*tag)[len-3] == ']' ) {
1490                                 // terminator of cdata.
1491                                 return;
1492                         }
1493                 }
1494         }
1495 }
1496 #endif
1497
1498 const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1499 {
1500         value = "";
1501         TiXmlDocument* document = GetDocument();
1502
1503         if ( data )
1504         {
1505                 data->Stamp( p, encoding );
1506                 location = data->Cursor();
1507         }
1508
1509         const char* const startTag = "<![CDATA[";
1510         const char* const endTag   = "]]>";
1511
1512         if ( cdata || StringEqual( p, startTag, false, encoding ) )
1513         {
1514                 cdata = true;
1515
1516                 if ( !StringEqual( p, startTag, false, encoding ) )
1517                 {
1518                         document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding );
1519                         return 0;
1520                 }
1521                 p += strlen( startTag );
1522
1523                 // Keep all the white space, ignore the encoding, etc.
1524                 while (    p && *p
1525                                 && !StringEqual( p, endTag, false, encoding )
1526                           )
1527                 {
1528                         value += *p;
1529                         ++p;
1530                 }
1531
1532                 TIXML_STRING dummy;
1533                 p = ReadText( p, &dummy, false, endTag, false, encoding );
1534                 return p;
1535         }
1536         else
1537         {
1538                 bool ignoreWhite = true;
1539
1540                 const char* end = "<";
1541                 p = ReadText( p, &value, ignoreWhite, end, false, encoding );
1542                 if ( p )
1543                         return p-1;     // don't truncate the '<'
1544                 return 0;
1545         }
1546 }
1547
1548 #ifdef TIXML_USE_STL
1549 void TiXmlDeclaration::StreamIn( std::istream * in, TIXML_STRING * tag )
1550 {
1551         while ( in->good() )
1552         {
1553                 int c = in->get();
1554                 if ( c <= 0 )
1555                 {
1556                         TiXmlDocument* document = GetDocument();
1557                         if ( document )
1558                                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1559                         return;
1560                 }
1561                 (*tag) += (char) c;
1562
1563                 if ( c == '>' )
1564                 {
1565                         // All is well.
1566                         return;
1567                 }
1568         }
1569 }
1570 #endif
1571
1572 const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
1573 {
1574         p = SkipWhiteSpace( p, _encoding );
1575         // Find the beginning, find the end, and look for
1576         // the stuff in-between.
1577         TiXmlDocument* document = GetDocument();
1578         if ( !p || !*p || !StringEqual( p, "<?xml", true, _encoding ) )
1579         {
1580                 if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
1581                 return 0;
1582         }
1583         if ( data )
1584         {
1585                 data->Stamp( p, _encoding );
1586                 location = data->Cursor();
1587         }
1588         p += 5;
1589
1590         version = "";
1591         encoding = "";
1592         standalone = "";
1593
1594         while ( p && *p )
1595         {
1596                 if ( *p == '>' )
1597                 {
1598                         ++p;
1599                         return p;
1600                 }
1601
1602                 p = SkipWhiteSpace( p, _encoding );
1603                 if ( StringEqual( p, "version", true, _encoding ) )
1604                 {
1605                         TiXmlAttribute attrib;
1606                         p = attrib.Parse( p, data, _encoding );
1607                         version = attrib.Value();
1608                 }
1609                 else if ( StringEqual( p, "encoding", true, _encoding ) )
1610                 {
1611                         TiXmlAttribute attrib;
1612                         p = attrib.Parse( p, data, _encoding );
1613                         encoding = attrib.Value();
1614                 }
1615                 else if ( StringEqual( p, "standalone", true, _encoding ) )
1616                 {
1617                         TiXmlAttribute attrib;
1618                         p = attrib.Parse( p, data, _encoding );
1619                         standalone = attrib.Value();
1620                 }
1621                 else
1622                 {
1623                         // Read over whatever it is.
1624                         while( p && *p && *p != '>' && !IsWhiteSpace( *p ) )
1625                                 ++p;
1626                 }
1627         }
1628         return 0;
1629 }
1630
1631 bool TiXmlText::Blank() const
1632 {
1633         for ( unsigned i=0; i<value.length(); i++ )
1634                 if ( !IsWhiteSpace( value[i] ) )
1635                         return false;
1636         return true;
1637 }
1638