// Markup.cpp: implementation of the CMarkup class. // // Markup Release 10.1 // Copyright (C) 2008 First Objective Software, Inc. All rights reserved // Go to www.firstobject.com for the latest CMarkup and EDOM documentation // Use in commercial applications requires written permission // This software is provided "as is", with no warranty. // #include #include "Markup.h" #if defined(MCD_STRERROR) // C error routine #include #endif // C error routine #if defined (MARKUP_ICONV) #include #endif #if defined(MARKUP_STL) && ( defined(MARKUP_WINCONV) || (! defined(MCD_STRERROR))) #include // for MultiByteToWideChar, WideCharToMultiByte, FormatMessage #endif // need windows.h when STL and (not setlocale or not strerror), MFC afx.h includes it already #if defined(MARKUP_MBCS) // MBCS/double byte #pragma message( "Note: MBCS build (not UTF-8)" ) // For UTF-8, remove MBCS from project settings C/C++ preprocessor definitions #if defined (MARKUP_WINCONV) #include // for VC++ _mbclen #endif // WINCONV #endif // MBCS/double byte #if defined(_DEBUG) && _MSC_VER > 1000 // VC++ DEBUG #undef THIS_FILE static char THIS_FILE[]=__FILE__; #if defined(DEBUG_NEW) #define new DEBUG_NEW #endif // DEBUG_NEW #endif // VC++ DEBUG // Customization #define x_EOL MCD_T("\r\n") // can be \r\n or \n or empty #define x_EOLLEN (sizeof(x_EOL)/sizeof(MCD_CHAR)-1) // string length of x_EOL #define x_ATTRIBQUOTE MCD_T("\"") // can be double or single quote // Disable "while ( 1 )" warning in VC++ 2002 #if _MSC_VER >= 1300 // VC++ 2002 (7.0) #pragma warning(disable:4127) #endif // VC++ 2002 (7.0) void CMarkup::operator=( const CMarkup& markup ) { m_iPosParent = markup.m_iPosParent; m_iPos = markup.m_iPos; m_iPosChild = markup.m_iPosChild; m_iPosFree = markup.m_iPosFree; m_iPosDeleted = markup.m_iPosDeleted; m_nNodeType = markup.m_nNodeType; m_nNodeOffset = markup.m_nNodeOffset; m_nNodeLength = markup.m_nNodeLength; m_strDoc = markup.m_strDoc; m_strError = markup.m_strError; m_nDocFlags = 
// NOTE(review): this copy of the file has its line structure collapsed, and text that sat between
// '<' and '>' appears to have been stripped (the bare "#include" directives above have no header
// names). Restore from a pristine Markup.cpp before attempting to build.
//
// CMarkup::operator= (spans this line break): copies the position members, document text, error
// text and doc flags, then rebuilds m_aPos one segment at a time (the last segment is sized to the
// remaining element count) and duplicates every used SavedPos chain of every saved-position map.
// NOTE(review): there is no self-assignment guard; m_aPos.RemoveAll() and
// m_SavedPosMapArray.RemoveAll() run before the source arrays are read.
// NOTE(review): the SavedPos copy loop below is damaged ("nCopypTable[nSlot]"): the loop condition,
// the element copy and the destination map pointer are missing. Do not reconstruct by guesswork.
//
// SetDoc (both overloads): store the new text (or clear m_strDoc when pDoc is null), clear
// m_strError and return x_ParseDoc().
// IsWellFormed: true only when the index is non-empty, m_aPos[0] lacks MNF_ILLFORMED, a first
// child element exists and that child has no next sibling.
// Load: ReadTextFile into m_strDoc / m_strError / m_nDocFlags; returns false without parsing when
// the read fails, otherwise returns x_ParseDoc().
markup.m_nDocFlags; // Copy used part of the index array m_aPos.RemoveAll(); m_aPos.nSize = m_iPosFree; if ( m_aPos.nSize < 8 ) m_aPos.nSize = 8; m_aPos.nSegs = m_aPos.SegsUsed(); if ( m_aPos.nSegs ) { m_aPos.pSegs = (ElemPos**)(new char[m_aPos.nSegs*sizeof(char*)]); int nSegSize = 1 << m_aPos.PA_SEGBITS; for ( int nSeg=0; nSeg < m_aPos.nSegs; ++nSeg ) { if ( nSeg + 1 == m_aPos.nSegs ) nSegSize = m_aPos.GetSize() - (nSeg << m_aPos.PA_SEGBITS); m_aPos.pSegs[nSeg] = (ElemPos*)(new char[nSegSize*sizeof(ElemPos)]); memcpy( m_aPos.pSegs[nSeg], markup.m_aPos.pSegs[nSeg], nSegSize*sizeof(ElemPos) ); } } // Copy SavedPos map(s) m_SavedPosMapArray.RemoveAll(); if ( markup.m_SavedPosMapArray.pMaps ) { int nMap = 0; SavedPosMap* pMap = NULL; while ( markup.m_SavedPosMapArray.pMaps[nMap] ) { SavedPosMap* pMapSrc = markup.m_SavedPosMapArray.pMaps[nMap]; x_GetMap( pMap, nMap, pMapSrc->nMapSize ); for ( int nSlot=0; nSlot < pMap->nMapSize; ++nSlot ) { SavedPos* pCopySavedPos = pMapSrc->pTable[nSlot]; if ( pCopySavedPos ) { int nCount = 0; while ( pCopySavedPos[nCount].nSavedPosFlags & SavedPos::SPM_USED ) { ++nCount; if ( pCopySavedPos[nCount-1].nSavedPosFlags & SavedPos::SPM_LAST ) break; } if ( nCount ) { SavedPos* pNewSavedPos = new SavedPos[nCount]; for ( int nCopy=0; nCopypTable[nSlot] = pNewSavedPos; } } } ++nMap; } } MARKUP_SETDEBUGSTATE; } bool CMarkup::SetDoc( MCD_PCSZ pDoc ) { // Set document text if ( pDoc ) m_strDoc = pDoc; else MCD_STRCLEAR(m_strDoc); MCD_STRCLEAR(m_strError); return x_ParseDoc(); } bool CMarkup::SetDoc( const MCD_STR& strDoc ) { m_strDoc = strDoc; MCD_STRCLEAR(m_strError); return x_ParseDoc(); } bool CMarkup::IsWellFormed() { if ( m_aPos.GetSize() && ! (m_aPos[0].nFlags & MNF_ILLFORMED) && m_aPos[0].iElemChild && ! m_aPos[m_aPos[0].iElemChild].iElemNext ) return true; return false; } bool CMarkup::Load( MCD_CSTR_FILENAME szFileName ) { if ( ! 
// NOTE(review): collapsed line; it starts inside CMarkup::Load and ends inside CMarkup::WriteTextFile.
//
// ReadTextFile: opens szFileName with MDF_READFILE OR'ed into the caller's flags. When pstrEncoding
// is given, non-empty, differs from the detected encoding and the file uses 1-byte units, it
// overrides the detected encoding and "forced <encoding> " is appended to the result text. Reads
// nFileByteLen bytes into strDoc and closes the file.
//   *pstrError  receives the accumulated result text; if nothing was accumulated before the read it
//               is seeded with "ANSI " (no encoding) or "<encoding> ".
//   *pnDocFlags is written back only when the open succeeded.
//   Returns the x_Open result, replaced by the x_ReadText result when the open succeeded.
// Save: forwards m_strDoc, &m_strError and &m_nDocFlags to WriteTextFile.
// WriteTextFile: opens with MDF_WRITEFILE; when x_Open left strEncoding empty and strDoc is not
// empty it uses GetDeclaredEncoding(strDoc), falling back to "UTF-8" if the text starts with '<'.
ReadTextFile(szFileName, m_strDoc, &m_strError, &m_nDocFlags) ) return false; return x_ParseDoc(); } bool CMarkup::ReadTextFile( MCD_CSTR_FILENAME szFileName, MCD_STR& strDoc, MCD_STR* pstrError, int* pnDocFlags, MCD_STR* pstrEncoding ) { // Static utility method to load text file into strDoc // FilePos file; file.nDocFlags = (pnDocFlags?*pnDocFlags:0) | MDF_READFILE; bool bSuccess = x_Open( szFileName, file ); MCD_STR strCombinedIOResult = file.strIOResult; MCD_STRCLEAR(strDoc); if ( bSuccess ) { if ( pstrEncoding ) { MCD_STR strEncoding = *pstrEncoding; if ( file.nFileCharUnitSize == 1 && ! MCD_STRISEMPTY(strEncoding) && file.strEncoding != strEncoding ) { file.strEncoding = strEncoding; // override strCombinedIOResult += MCD_T("forced "); strCombinedIOResult += file.strEncoding + MCD_T(" "); } } file.nReadByteLen = file.nFileByteLen; bSuccess = x_ReadText( strDoc, file ); x_Close( file ); if ( MCD_STRISEMPTY(strCombinedIOResult) ) { if ( MCD_STRISEMPTY(file.strEncoding) ) strCombinedIOResult = MCD_T("ANSI "); else strCombinedIOResult = file.strEncoding + MCD_T(" "); } strCombinedIOResult += file.strIOResult; if ( pnDocFlags ) *pnDocFlags = file.nDocFlags; } if ( pstrError ) *pstrError = strCombinedIOResult; return bSuccess; } bool CMarkup::Save( MCD_CSTR_FILENAME szFileName ) { return WriteTextFile( szFileName, m_strDoc, &m_strError, &m_nDocFlags ); } bool CMarkup::WriteTextFile( MCD_CSTR_FILENAME szFileName, const MCD_STR& strDoc, MCD_STR* pstrError, int* pnDocFlags, MCD_STR* pstrEncoding ) { // Static utility method to save strDoc to text file // FilePos file; file.nDocFlags = (pnDocFlags?*pnDocFlags:0) | MDF_WRITEFILE; bool bSuccess = x_Open( szFileName, file ); MCD_STR strCombinedIOResult = file.strIOResult; if ( bSuccess ) { if ( MCD_STRISEMPTY(file.strEncoding) && ! MCD_STRISEMPTY(strDoc) ) { file.strEncoding = GetDeclaredEncoding( strDoc ); if ( MCD_STRISEMPTY(file.strEncoding) && strDoc[0] == '<' ) file.strEncoding = MCD_T("UTF-8"); if ( ! 
// NOTE(review): collapsed line; it starts inside CMarkup::WriteTextFile and ends inside the header
// comment of CMarkup::EscapeText.
//
// WriteTextFile (rest): applies the same "forced" encoding override as ReadTextFile, then
// x_WriteText and x_Close. *pnDocFlags is updated only after a successful open; *pstrError is
// written whenever it is non-null.
// FindElem: when the index is non-empty, searches with x_FindElem from (m_iPosParent, m_iPos). On
// a hit the main position moves to the found element and the child position becomes 0; on a miss
// the current position is left unchanged.
// FindChildElem: when there is no main position it first calls FindElem() with no name, then
// searches under m_iPos starting from m_iPosChild. On a hit, the parent and main positions are
// re-derived from the found child's iElemParent links.
MCD_STRISEMPTY(file.strEncoding) ) strCombinedIOResult += file.strEncoding + MCD_T(" "); } if ( pstrEncoding ) { MCD_STR strEncoding = *pstrEncoding; if ( file.nFileCharUnitSize == 1 && ! MCD_STRISEMPTY(strEncoding) && file.strEncoding != strEncoding ) { file.strEncoding = strEncoding; // override strCombinedIOResult += MCD_T("forced "); strCombinedIOResult += file.strEncoding + MCD_T(" "); } } bSuccess = x_WriteText( strDoc, file ); x_Close( file ); strCombinedIOResult += file.strIOResult; if ( pnDocFlags ) *pnDocFlags = file.nDocFlags; } if ( pstrError ) *pstrError = strCombinedIOResult; return bSuccess; } bool CMarkup::FindElem( MCD_CSTR szName ) { // Change current position only if found // if ( m_aPos.GetSize() ) { int iPos = x_FindElem( m_iPosParent, m_iPos, szName ); if ( iPos ) { // Assign new position x_SetPos( m_aPos[iPos].iElemParent, iPos, 0 ); return true; } } return false; } bool CMarkup::FindChildElem( MCD_CSTR szName ) { // Change current child position only if found // // Shorthand: call this with no current main position // means find child under root element if ( ! m_iPos ) FindElem(); int iPosChild = x_FindElem( m_iPos, m_iPosChild, szName ); if ( iPosChild ) { // Assign new position int iPos = m_aPos[iPosChild].iElemParent; x_SetPos( m_aPos[iPos].iElemParent, iPos, iPosChild ); return true; } return false; } MCD_STR CMarkup::EscapeText( MCD_CSTR szText, int nFlags ) { // Convert text as seen outside XML document to XML friendly // replacing special characters with ampersand escape codes // E.g. 
// CMarkup::EscapeText (this physical line starts inside the function's header comment and ends
// inside its main loop).
// Each character of szText that occurs in pFind is replaced by the entry at the same index of
// apReplace, so the two tables must stay in the same order:  < & > ' "  ->  &lt; &amp; &gt; &apos; &quot;
// The two quote entries are searched only when nFlags has MNF_ESCAPEQUOTES. With MNF_WITHREFS an
// '&' that already begins a reference of the form &name; or &#n; is copied through unchanged.
// FIX: the entity strings in apReplace (and in the comment block) had been decoded to their literal
// characters, which made escaping a no-op and left the unlexable literal MCD_T("""). They are
// restored to the five predefined XML entities; the longest is 6 characters, which is what
// MCD_BLDCHECK(strText,nDestSize,6) reserves for.
convert "6>7" to "6&gt;7" // // &lt; less than // &amp; ampersand // &gt; greater than // // and for attributes: // // &apos; apostrophe or single quote // &quot; double quote // static MCD_PCSZ apReplace[] = { MCD_T("&lt;"),MCD_T("&amp;"),MCD_T("&gt;"),MCD_T("&apos;"),MCD_T("&quot;") }; MCD_PCSZ pFind = (nFlags&MNF_ESCAPEQUOTES)?MCD_T("<&>\'\""):MCD_T("<&>"); MCD_STR strText; MCD_PCSZ pSource = szText; int nDestSize = MCD_PSZLEN(pSource); nDestSize += nDestSize / 10 + 7; MCD_BLDRESERVE(strText,nDestSize); MCD_CHAR cSource = *pSource; MCD_PCSZ pFound; int nCharLen; while ( cSource ) { MCD_BLDCHECK(strText,nDestSize,6); if ( (pFound=MCD_PSZCHR(pFind,cSource)) != NULL ) { bool bIgnoreAmpersand = false; if ( (nFlags&MNF_WITHREFS) && *pFound == '&' ) { // Do not replace ampersand if it is start of any entity reference // &[#_:A-Za-zU][_:-.A-Za-z0-9U]*; where U is > 0x7f MCD_PCSZ pCheckEntity = pSource; ++pCheckEntity; MCD_CHAR c = *pCheckEntity; if ( (c>='A'&&c<='Z') || (c>='a'&&c<='z') || c=='#' || c=='_' || c==':' || ((unsigned int)c)>0x7f ) { while ( 1 ) { pCheckEntity += MCD_CLEN( pCheckEntity ); c = *pCheckEntity; if ( c == ';' ) { int nEntityLen = (int)(pCheckEntity - pSource) + 1; MCD_BLDAPPENDN(strText,pSource,nEntityLen); pSource = pCheckEntity; bIgnoreAmpersand = true; } else if ( (c>='A'&&c<='Z') || (c>='a'&&c<='z') || (c>='0'&&c<='9') || c=='_' || c==':' || c=='-' || c=='.' || ((unsigned int)c)>0x7f ) continue; break; } } } if ( ! 
// NOTE(review): collapsed line; it starts inside CMarkup::EscapeText and ends inside the
// PredefEntityTable initializer.
//
// EscapeText (rest): a matched character is replaced through apReplace unless it was an '&' already
// copied as part of an entity reference; every other character is copied using its MCD_CLEN length.
// PredefEntityTable: 130 slots, indexed in UnescapeText by x_Hash of the lower-cased entity name.
// Each slot is a concatenation of entries laid out as
//   <1-digit name length><4-digit decimal code point><name>
// and an empty string marks an unused slot.
bIgnoreAmpersand ) { pFound = apReplace[pFound-pFind]; MCD_BLDAPPEND(strText,pFound); } ++pSource; // ASCII, so 1 byte } else { nCharLen = MCD_CLEN( pSource ); MCD_BLDAPPENDN(strText,pSource,nCharLen); pSource += nCharLen; } cSource = *pSource; } MCD_BLDRELEASE(strText); return strText; } // Predefined character entities // By default UnescapeText will decode standard HTML entities as well as the 5 in XML // To unescape only the 5 standard XML entities, use this short table instead: // MCD_PCSZ PredefEntityTable[4] = // { MCD_T("20060lt"),MCD_T("40034quot"),MCD_T("30038amp"),MCD_T("20062gt40039apos") }; // // This is a precompiled ASCII hash table for speed and minimum memory requirement // Each entry consists of a 1 digit code name length, 4 digit code point, and the code name // Each table slot can have multiple entries, table size 130 was chosen for even distribution // MCD_PCSZ PredefEntityTable[130] = { MCD_T("60216oslash60217ugrave60248oslash60249ugrave"), MCD_T("50937omega60221yacute58968lceil50969omega60253yacute"), MCD_T("50916delta50206icirc50948delta50238icirc68472weierp"),MCD_T("40185sup1"), MCD_T("68970lfloor40178sup2"), MCD_T("50922kappa60164curren50954kappa58212mdash40179sup3"), MCD_T("59830diams58211ndash"),MCD_T("68855otimes58969rceil"), MCD_T("50338oelig50212ocirc50244ocirc50339oelig58482trade"), MCD_T("50197aring50931sigma50229aring50963sigma"), MCD_T("50180acute68971rfloor50732tilde"),MCD_T("68249lsaquo"), MCD_T("58734infin68201thinsp"),MCD_T("50161iexcl"), MCD_T("50920theta50219ucirc50952theta50251ucirc"),MCD_T("58254oline"), MCD_T("58260frasl68727lowast"),MCD_T("59827clubs60191iquest68250rsaquo"), MCD_T("58629crarr50181micro"),MCD_T("58222bdquo"),MCD_T(""), MCD_T("58243prime60177plusmn58242prime"),MCD_T("40914beta40946beta"),MCD_T(""), MCD_T(""),MCD_T(""),MCD_T("50171laquo50215times"),MCD_T("40710circ"), MCD_T("49001lang"),MCD_T("58220ldquo40175macr"), MCD_T("40182para50163pound48476real"),MCD_T(""),MCD_T("58713notin50187raquo"), 
MCD_T("48773cong50223szlig50978upsih"), MCD_T("58776asymp58801equiv49002rang58218sbquo"), MCD_T("50222thorn48659darr48595darr40402fnof58221rdquo50254thorn"), MCD_T("40162cent58722minus"),MCD_T("58707exist40170ordf"),MCD_T(""), MCD_T("40921iota58709empty48660harr48596harr40953iota"),MCD_T(""), MCD_T("40196auml40228auml48226bull40167sect48838sube"),MCD_T(""), MCD_T("48656larr48592larr58853oplus"),MCD_T("30176deg58216lsquo40186ordm"), MCD_T("40203euml40039apos40235euml48712isin40160nbsp"), MCD_T("40918zeta40950zeta"),MCD_T("38743and48195emsp48719prod"), MCD_T("30935chi38745cap30967chi48194ensp"), MCD_T("40207iuml40239iuml48706part48869perp48658rarr48594rarr"), MCD_T("38736ang48836nsub58217rsquo"),MCD_T(""), MCD_T("48901sdot48657uarr48593uarr"),MCD_T("40169copy48364euro"), MCD_T("30919eta30951eta"),MCD_T("40214ouml40246ouml48839supe"),MCD_T(""), MCD_T(""),MCD_T("30038amp30174reg"),MCD_T("48733prop"),MCD_T(""), MCD_T("30208eth30934phi40220uuml30240eth30966phi40252uuml"),MCD_T(""),MCD_T(""), MCD_T(""),MCD_T("40376yuml40255yuml"),MCD_T(""),MCD_T("40034quot48204zwnj"), MCD_T("38746cup68756there4"),MCD_T("30929rho30961rho38764sim"), MCD_T("30932tau38834sub30964tau"),MCD_T("38747int38206lrm38207rlm"), MCD_T("30936psi30968psi30165yen"),MCD_T(""),MCD_T("28805ge30168uml"), MCD_T("30982piv"),MCD_T(""),MCD_T("30172not"),MCD_T(""),MCD_T("28804le"), MCD_T("30173shy"),MCD_T("39674loz28800ne38721sum"),MCD_T(""),MCD_T(""), MCD_T("38835sup"),MCD_T("28715ni"),MCD_T(""),MCD_T("20928pi20960pi38205zwj"), MCD_T(""),MCD_T("60923lambda20062gt60955lambda"),MCD_T(""),MCD_T(""), MCD_T("60199ccedil60231ccedil"),MCD_T(""),MCD_T("20060lt"), MCD_T("20926xi28744or20958xi"),MCD_T("20924mu20956mu"),MCD_T("20925nu20957nu"), MCD_T("68225dagger68224dagger"),MCD_T("80977thetasym"),MCD_T(""),MCD_T(""), MCD_T(""),MCD_T("78501alefsym"),MCD_T(""),MCD_T(""),MCD_T(""), MCD_T("60193aacute60195atilde60225aacute60227atilde"),MCD_T(""), MCD_T("70927omicron60247divide70959omicron"),MCD_T("60192agrave60224agrave"), 
// NOTE(review): collapsed lines; the first starts inside the PredefEntityTable initializer and the
// second ends inside CMarkup::UnescapeText.
//
// UnescapeText(szText, nTextLength): walks nTextLength characters (MCD_PSZLEN(szText) when -1).
// At each '&' it collects up to 9 ASCII characters before a ';', lower-casing A-Z:
//   - "#..." / "#x..." is parsed with MCD_PSZTOL in base 10 / 16;
//   - anything else is looked up in the PredefEntityTable slot chosen by x_Hash.
// A non-zero code point is encoded for the build's character type (wchar_t, MBCS or UTF-8) and the
// whole reference is skipped; otherwise a single '&' is emitted and scanning resumes after it.
// NOTE(review): names are lower-cased before lookup, so a slot that stores two entries with the
// same lower-cased name (e.g. "oslash" 0216 and 0248) always resolves to the first one.
// NOTE(review): the look-ahead reads pSource[nChar+1 ...] without comparing against nTextLength;
// presumably szText is always NUL-terminated - confirm for callers that pass an explicit length.
// NOTE(review): a numeric reference is not range-checked before it is handed to the encoder.
MCD_T("60201eacute60233eacute60962sigmaf"),MCD_T("70917epsilon70949epsilon"), MCD_T(""),MCD_T("60200egrave60232egrave"),MCD_T("60205iacute60237iacute"), MCD_T(""),MCD_T(""),MCD_T("60204igrave68230hellip60236igrave"), MCD_T("60166brvbar"), MCD_T("60209ntilde68704forall58711nabla60241ntilde69824spades"), MCD_T("60211oacute60213otilde60189frac1260183middot60243oacute60245otilde"), MCD_T(""),MCD_T("50184cedil60188frac14"), MCD_T("50198aelig50194acirc60210ograve50226acirc50230aelig60242ograve"), MCD_T("50915gamma60190frac3450947gamma58465image58730radic"), MCD_T("60352scaron60353scaron"),MCD_T("60218uacute69829hearts60250uacute"), MCD_T("50913alpha50202ecirc70933upsilon50945alpha50234ecirc70965upsilon"), MCD_T("68240permil") }; MCD_STR CMarkup::UnescapeText( MCD_CSTR szText, int nTextLength /*=-1*/ ) { // Convert XML friendly text to text as seen outside XML document // ampersand escape codes replaced with special characters e.g. convert "6&gt;7" to "6>7" // ampersand numeric codes replaced with character e.g. convert &#60; to < // Conveniently the result is always the same or shorter in byte length // MCD_STR strText; MCD_PCSZ pSource = szText; if ( nTextLength == -1 ) nTextLength = MCD_PSZLEN(szText); MCD_BLDRESERVE(strText,nTextLength); MCD_CHAR szCodeName[10]; int nCharLen; int nChar = 0; while ( nChar < nTextLength ) { if ( pSource[nChar] == '&' ) { // Get corresponding unicode code point int nUnicode = 0; // Look for terminating semi-colon within 9 ASCII characters int nCodeLen = 0; MCD_CHAR cCodeChar = pSource[nChar+1]; while ( nCodeLen < 9 && ((unsigned int)cCodeChar) < 128 && cCodeChar != ';' ) { if ( cCodeChar >= 'A' && cCodeChar <= 'Z') // upper case? cCodeChar += ('a' - 'A'); // make lower case szCodeName[nCodeLen] = cCodeChar; ++nCodeLen; cCodeChar = pSource[nChar+1+nCodeLen]; } if ( cCodeChar == ';' ) // found semi-colon? { // Decode szCodeName szCodeName[nCodeLen] = '\0'; if ( *szCodeName == '#' ) // numeric character reference? { // Is it a hex number? 
int nBase = 10; // decimal int nNumberOffset = 1; // after # if ( szCodeName[1] == 'x' ) { nNumberOffset = 2; // after #x nBase = 16; // hex } nUnicode = MCD_PSZTOL( &szCodeName[nNumberOffset], NULL, nBase ); } else // does not start with # { // Look for matching code name in PredefEntityTable MCD_PCSZ pEntry = PredefEntityTable[x_Hash(szCodeName,sizeof(PredefEntityTable)/sizeof(MCD_PCSZ))]; while ( *pEntry ) { // e.g. entry: 40039apos means length 4, code point 0039, code name apos int nEntryLen = (*pEntry - '0'); ++pEntry; MCD_PCSZ pCodePoint = pEntry; pEntry += 4; if ( nEntryLen == nCodeLen && MCD_PSZNCMP(szCodeName,pEntry,nEntryLen) == 0 ) { // Convert digits to integer up to code name which always starts with alpha nUnicode = MCD_PSZTOL( pCodePoint, NULL, 10 ); break; } pEntry += nEntryLen; } } } // If a code point found, encode it into text if ( nUnicode ) { MCD_CHAR szChar[5]; nCharLen = 0; #if defined(MARKUP_WCHAR) // WCHAR #if MARKUP_SIZEOFWCHAR == 4 // sizeof(wchar_t) == 4 szChar[0] = (MCD_CHAR)nUnicode; nCharLen = 1; #else // sizeof(wchar_t) == 2 EncodeCharUTF16( nUnicode, (unsigned short*)szChar, nCharLen ); #endif #elif defined(MARKUP_MBCS) // MBCS/double byte #if defined(MARKUP_WINCONV) int nUsedDefaultChar = 0; wchar_t wszUTF16[2]; EncodeCharUTF16( nUnicode, (unsigned short*)wszUTF16, nCharLen ); nCharLen = WideCharToMultiByte( CP_ACP, 0, wszUTF16, nCharLen, szChar, 5, NULL, &nUsedDefaultChar ); if ( nUsedDefaultChar || nCharLen <= 0 ) nUnicode = 0; #else // not WINCONV wchar_t wcUnicode = (wchar_t)nUnicode; nCharLen = wctomb( szChar, wcUnicode ); if ( nCharLen <= 0 ) nUnicode = 0; #endif // not WINCONV #else // not WCHAR and not MBCS/double byte EncodeCharUTF8( nUnicode, szChar, nCharLen ); #endif // not WCHAR and not MBCS/double byte // Increment index past ampersand semi-colon if ( nUnicode ) // must check since MBCS case can clear it { MCD_BLDAPPENDN(strText,szChar,nCharLen); nChar += nCodeLen + 2; } } if ( ! 
// NOTE(review): collapsed lines; the first starts inside CMarkup::UnescapeText (unchanged here) and
// the second ends inside CMarkup::EncodeCharUTF8 (unchanged here).
//
// DetectUTF8(pText, nTextLen, pnNonASCII): returns true when every byte up to nTextLen (or the
// first 0 byte) is ASCII or part of a well-formed UTF-8 sequence. *pnNonASCII, when given, counts
// the multi-byte sequences seen, including the one that failed.
//   FIX: pText != pTextEnd is now tested before *pText is read, so pText[nTextLen] is never touched.
// DecodeCharUTF8(pszUTF8, pszUTF8End): returns the code point and advances pszUTF8 past it, or
// returns -1 on a malformed sequence. pszUTF8End may be NULL for NUL-terminated input.
//   FIX: a first byte of the form 10xxxxxx (a stray trail byte) is rejected instead of being decoded
//   as a 2-byte lead, and trail bytes must be exactly 10xxxxxx (previously any byte with the high
//   bit set was accepted). Overlong forms and surrogate code points are still not rejected.
// EncodeCharUTF16(nUChar, pwszUTF16, nUTF16Len): appends one unit, or a surrogate pair for values
// above 0xffff, at index nUTF16Len; with a null buffer only the length is advanced.
// DecodeCharUTF16(pwszUTF16, pszUTF16End): returns the code point and advances past 1 or 2 units,
// or returns -1 for an unpaired surrogate.
//   FIX: a low surrogate in first position, or a high surrogate not followed by a low surrogate,
//   now yields -1 and consumes only the first unit. Previously any non-zero following unit was
//   merged into a bogus code point.
nUnicode ) { // If the code is not converted, leave it as is MCD_BLDAPPEND1(strText,'&'); ++nChar; } } else // not & { nCharLen = MCD_CLEN(&pSource[nChar]); MCD_BLDAPPENDN(strText,&pSource[nChar],nCharLen); nChar += nCharLen; } } MCD_BLDRELEASE(strText); return strText; } bool CMarkup::DetectUTF8( const char* pText, int nTextLen, int* pnNonASCII/*=NULL*/ ) { // return true if ASCII or all non-ASCII byte sequences are valid UTF-8 pattern: // ASCII 0xxxxxxx // 2-byte 110xxxxx 10xxxxxx // 3-byte 1110xxxx 10xxxxxx 10xxxxxx // 4-byte 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx // *pnNonASCII is set (if pnNonASCII is not NULL) to the number of non-ASCII UTF-8 sequences // or if an invalid UTF-8 sequence is found, to 1 + the valid sequences up to the invalid sequence int nUChar; if ( pnNonASCII ) *pnNonASCII = 0; const char* pTextEnd = pText + nTextLen; while ( pText != pTextEnd && *pText ) { if ( (unsigned char)(*pText) & 0x80 ) { if ( pnNonASCII ) ++(*pnNonASCII); nUChar = DecodeCharUTF8( pText, pTextEnd ); if ( nUChar == -1 ) return false; } else ++pText; } return true; } int CMarkup::DecodeCharUTF8( const char*& pszUTF8, const char* pszUTF8End/*=NULL*/ ) { // Return Unicode code point and increment pszUTF8 past 1-4 bytes // pszUTF8End can be NULL if pszUTF8 is null terminated int nUChar = (unsigned char)*pszUTF8; ++pszUTF8; if ( nUChar & 0x80 ) { int nExtraChars; if ( ! (nUChar & 0x40) ) return -1; else if ( ! (nUChar & 0x20) ) { nExtraChars = 1; nUChar &= 0x1f; } else if ( ! (nUChar & 0x10) ) { nExtraChars = 2; nUChar &= 0x0f; } else if ( ! (nUChar & 0x08) ) { nExtraChars = 3; nUChar &= 0x07; } else return -1; while ( nExtraChars-- ) { if ( pszUTF8 == pszUTF8End || ! 
((*pszUTF8 & 0xc0) == 0x80) ) return -1; nUChar = nUChar<<6; nUChar |= *pszUTF8 & 0x3f; ++pszUTF8; } } return nUChar; } void CMarkup::EncodeCharUTF16( int nUChar, unsigned short* pwszUTF16, int& nUTF16Len ) { // Write UTF-16 sequence to pwszUTF16 for Unicode code point nUChar and update nUTF16Len // Be sure pwszUTF16 has room for up to 2 wide chars if ( nUChar & ~0xffff ) { if ( pwszUTF16 ) { // Surrogate pair nUChar -= 0x10000; pwszUTF16[nUTF16Len++] = (wchar_t)(((nUChar>>10) & 0x3ff) | 0xd800); // W1 pwszUTF16[nUTF16Len++] = (wchar_t)((nUChar & 0x3ff) | 0xdc00); // W2 } else nUTF16Len += 2; } else { if ( pwszUTF16 ) pwszUTF16[nUTF16Len++] = (wchar_t)nUChar; else ++nUTF16Len; } } int CMarkup::DecodeCharUTF16( const unsigned short*& pwszUTF16, const unsigned short* pszUTF16End/*=NULL*/ ) { // Return Unicode code point and increment pwszUTF16 past 1 or 2 (if surrogrates) UTF-16 code points // pszUTF16End can be NULL if pszUTF16 is zero terminated int nUChar = *pwszUTF16; ++pwszUTF16; if ( (nUChar & ~0x000007ff) == 0xd800 ) // W1 { if ( (nUChar & 0x0400) || pwszUTF16 == pszUTF16End || (*pwszUTF16 & 0xfc00) != 0xdc00 ) // W2 return -1; // incorrect UTF-16 nUChar = (((nUChar & 0x3ff) << 10) | (*pwszUTF16 & 0x3ff)) + 0x10000; ++pwszUTF16; } return nUChar; } void CMarkup::EncodeCharUTF8( int nUChar, char* pszUTF8, int& nUTF8Len ) { // Write UTF-8 sequence to pszUTF8 for Unicode code point nUChar and update nUTF8Len // Be sure pszUTF8 has room for up to 4 bytes if ( ! (nUChar & ~0x0000007f) ) // < 0x80 { if ( pszUTF8 ) pszUTF8[nUTF8Len++] = (char)nUChar; else ++nUTF8Len; } else if ( ! (nUChar & ~0x000007ff) ) // < 0x800 { if ( pszUTF8 ) { pszUTF8[nUTF8Len++] = (char)(((nUChar&0x7c0)>>6)|0xc0); pszUTF8[nUTF8Len++] = (char)((nUChar&0x3f)|0x80); } else nUTF8Len += 2; } else if ( ! 
// NOTE(review): collapsed line; it starts inside CMarkup::EncodeCharUTF8 and ends inside the header
// comment of CMarkup::UTF8To16.
//
// EncodeCharUTF8 (rest): values with no bits above 0xffff use the 3-byte form, everything else the
// 4-byte form; with a null pszUTF8 only nUTF8Len is advanced. The value is not range-checked here.
// UTF16To8(pszUTF8, pwszUTF16, nUTF8Count): decodes the zero-terminated UTF-16 source one code
// point at a time (an undecodable sequence becomes '?'). When writing and fewer than 4 bytes of
// room remain, the next character is measured first and the bytes written so far are returned if it
// would not fit. A terminating 0 is stored only if room remains. Returns the UTF-8 byte count.
(nUChar & ~0x0000ffff) ) // < 0x10000 { if ( pszUTF8 ) { pszUTF8[nUTF8Len++] = (char)(((nUChar&0xf000)>>12)|0xe0); pszUTF8[nUTF8Len++] = (char)(((nUChar&0xfc0)>>6)|0x80); pszUTF8[nUTF8Len++] = (char)((nUChar&0x3f)|0x80); } else nUTF8Len += 3; } else // < 0x110000 { if ( pszUTF8 ) { pszUTF8[nUTF8Len++] = (char)(((nUChar&0x1c0000)>>18)|0xf0); pszUTF8[nUTF8Len++] = (char)(((nUChar&0x3f000)>>12)|0x80); pszUTF8[nUTF8Len++] = (char)(((nUChar&0xfc0)>>6)|0x80); pszUTF8[nUTF8Len++] = (char)((nUChar&0x3f)|0x80); } else nUTF8Len += 4; } } int CMarkup::UTF16To8( char* pszUTF8, const unsigned short* pwszUTF16, int nUTF8Count ) { // Supports the same arguments as wcstombs // the pwszUTF16 source must be a NULL-terminated UTF-16 string // if pszUTF8 is NULL, the number of bytes required is returned and nUTF8Count is ignored // otherwise pszUTF8 is filled with the result string and NULL-terminated if nUTF8Count allows // nUTF8Count is the byte size of pszUTF8 and must be large enough for the NULL if NULL desired // and the number of bytes (excluding NULL) is returned // int nUChar, nUTF8Len = 0; while ( *pwszUTF16 ) { // Decode UTF-16 nUChar = DecodeCharUTF16( pwszUTF16, NULL ); if ( nUChar == -1 ) nUChar = '?'; // Encode UTF-8 if ( pszUTF8 && nUTF8Len + 4 > nUTF8Count ) { int nUTF8LenSoFar = nUTF8Len; EncodeCharUTF8( nUChar, NULL, nUTF8Len ); if ( nUTF8Len > nUTF8Count ) return nUTF8LenSoFar; nUTF8Len = nUTF8LenSoFar; } EncodeCharUTF8( nUChar, pszUTF8, nUTF8Len ); } if ( pszUTF8 && nUTF8Len < nUTF8Count ) pszUTF8[nUTF8Len] = 0; return nUTF8Len; } int CMarkup::UTF8To16( unsigned short* pwszUTF16, const char* pszUTF8, int nUTF8Count ) { // Supports the same arguments as mbstowcs // the pszUTF8 source must be a UTF-8 string which will be processed up to NULL-terminator or nUTF8Count // if pwszUTF16 is NULL, the number of UTF-16 chars required is returned // nUTF8Count is maximum UTF-8 bytes to convert and should include NULL if NULL desired in result // if pwszUTF16 is not NULL it 
// NOTE(review): collapsed line; it starts inside the header comment of CMarkup::UTF8To16 and ends
// inside CMarkup::GetDeclaredEncoding.
//
// UTF8To16 (rest): decodes until nUTF8Count bytes are consumed or a 0 code point is met (the 0 is
// stored as terminator when a buffer is given); an undecodable sequence becomes '?'. Returns the
// UTF-16 unit count when pwszUTF16 is null, otherwise the number of UTF-8 bytes consumed.
// NOTE(review): there is no capacity argument for pwszUTF16; per the header comment the caller must
// size it, e.g. from a first call with a null buffer.
// UTF8ToA / AToUTF8 (compiled only without MARKUP_WCHAR): run x_ConvertEncoding between "UTF-8" and
// the empty encoding name, with an output capacity of nMBLen for UTF8ToA and nMBLen * 4 for
// AToUTF8. UTF8ToA reports convert.nFailedChars through pnFailed. Empty input gives an empty result.
is filled with the result string and it must be large enough // result will be NULL-terminated if NULL encountered in pszUTF8 before nUTF8Count // and the number of UTF-8 bytes converted is returned // const char* pszPosUTF8 = pszUTF8; const char* pszUTF8End = pszUTF8 + nUTF8Count; int nUChar, nUTF8Len = 0, nUTF16Len = 0; while ( pszPosUTF8 != pszUTF8End ) { nUChar = DecodeCharUTF8( pszPosUTF8, pszUTF8End ); if ( ! nUChar ) { if ( pwszUTF16 ) pwszUTF16[nUTF16Len] = 0; break; } else if ( nUChar == -1 ) nUChar = '?'; // Encode UTF-16 EncodeCharUTF16( nUChar, pwszUTF16, nUTF16Len ); } nUTF8Len = (int)(pszPosUTF8 - pszUTF8); if ( ! pwszUTF16 ) return nUTF16Len; return nUTF8Len; } #if ! defined(MARKUP_WCHAR) // not WCHAR MCD_STR CMarkup::UTF8ToA( MCD_CSTR pszUTF8, int* pnFailed/*=NULL*/ ) { // Converts from UTF-8 to locale ANSI charset MCD_STR strANSI; int nMBLen = (int)MCD_PSZLEN( pszUTF8 ); if ( pnFailed ) *pnFailed = 0; if ( nMBLen ) { ConvertEncoding convert( MCD_T(""), MCD_T("UTF-8"), (const void*)pszUTF8, nMBLen ); convert.nToCount = nMBLen; MCD_CHAR* pANSIBuffer = MCD_GETBUFFER(strANSI,convert.nToCount); nMBLen = x_ConvertEncoding( convert, (void*)pANSIBuffer ); MCD_RELEASEBUFFER(strANSI,pANSIBuffer,nMBLen); if ( pnFailed ) *pnFailed = convert.nFailedChars; } return strANSI; } MCD_STR CMarkup::AToUTF8( MCD_CSTR pszANSI ) { // Converts locale ANSI charset to UTF-8 MCD_STR strUTF8; int nMBLen = (int)MCD_PSZLEN( pszANSI ); if ( nMBLen ) { ConvertEncoding convert( MCD_T("UTF-8"), MCD_T(""), (const void*)pszANSI, nMBLen ); convert.nToCount = nMBLen * 4; MCD_CHAR* pUTF8Buffer = MCD_GETBUFFER(strUTF8,convert.nToCount); nMBLen = x_ConvertEncoding( convert, (void*)pUTF8Buffer ); MCD_RELEASEBUFFER(strUTF8,pUTF8Buffer,nMBLen); } return strUTF8; } #endif // not WCHAR MCD_STR CMarkup::GetDeclaredEncoding( MCD_CSTR szDoc ) { // Extract encoding attribute from XML Declaration, or HTML meta charset MCD_STR strEncoding; TokenPos token( szDoc, MDF_IGNORECASE ); NodePos node; bool 
// NOTE(review): collapsed line with missing text. GetDeclaredEncoding below is NOT intact: after
// the processing-instruction test the braces no longer balance ("break; } } else if"), two
// "// after" comments have lost their tails, and the end-tag branch runs straight into HTML <meta>
// handling. Text between '<' and '>' appears to have been stripped; restore the function from the
// original source rather than patching it here.
// Still visible: nodes are read with x_ParseNode until a negative type is returned; an "encoding"
// attribute is fetched with x_FindAttrib / x_GetToken; for http-equiv "Content-Type" the "content"
// attribute is scanned for  charset = <value>.
//
// FindNode(nType): finds the next node after the current one (any node when nType is 0, otherwise
// one whose type bit is in nType) and returns its type, or 0 when none is found.
//   Start offset: after the current node if m_nNodeType > 1; else after the current element; else
//   the parent's content start (returns 0 at once if the parent is an empty element); else 0.
bHtml = false; int nTypeFound = 0; while ( nTypeFound >= 0 ) { nTypeFound = x_ParseNode( token, node ); int nNext = token.nNext; if ( nTypeFound == MNT_PROCESSING_INSTRUCTION && token.nL == 0 ) { token.nNext = node.nStart + 2; // after if ( x_FindAttrib( token, MCD_T("encoding") ) ) strEncoding = x_GetToken( token ); break; } } else if ( nTypeFound == 0 ) // end tag { // Check for end of HTML head token.nNext = node.nStart + 2; // after int nAttribOffset = node.nStart + 1; token.nNext = nAttribOffset; if ( x_FindAttrib(token,MCD_T("http-equiv")) && token.Match(MCD_T("Content-Type")) ) { token.nNext = nAttribOffset; if ( x_FindAttrib(token,MCD_T("content")) ) { int nContentEndOffset = token.nNext; token.nNext = token.nL; while ( token.nNext < nContentEndOffset && x_FindName(token) ) { if ( token.Match(MCD_T("charset")) && x_FindName(token) && token.Match(MCD_T("=")) ) { x_FindName( token ); strEncoding = x_GetToken( token ); break; } } } break; } } } token.nNext = nNext; } return strEncoding; } int CMarkup::FindNode( int nType ) { // Change current node position only if a node is found // If nType is 0 find any node, otherwise find node of type nType // Return type of node or 0 if not found // If found node is an element, change m_iPos // Determine where in document to start scanning for node int nTypeFound = 0; int nNodeOffset = m_nNodeOffset; if ( m_nNodeType > 1 ) { // By-pass current node nNodeOffset += m_nNodeLength; } else { // Set position to begin looking for node nNodeOffset = 0; // default to start of document if ( m_iPos ) { // After element nNodeOffset = m_aPos[m_iPos].StartAfter(); } else if ( m_iPosParent ) { // Immediately after start tag of parent if ( m_aPos[m_iPosParent].IsEmptyElement() ) return 0; else nNodeOffset = m_aPos[m_iPosParent].StartContent(); } } // Get nodes until we find what we're looking for int iPosNew = m_iPos; TokenPos token( m_strDoc, m_nDocFlags ); NodePos node; token.nNext = nNodeOffset; do { nNodeOffset = token.nNext; 
// NOTE(review): collapsed line with missing text; it starts inside CMarkup::FindNode.
//
// FindNode (rest), per parsed node:
//   - an end tag located exactly at the end of the parent's content returns 0; any other end tag
//     is reported as MNT_LONE_END_TAG;
//   - result -2 returns 0; any other negative result is reported as MNT_NODE_ERROR;
//   - an element advances iPosNew through the index (next sibling, or the parent's first child). If
//     it matches nType the position is set with x_SetPos and the function returns; otherwise
//     scanning resumes after that element.
//   For a non-element match the node offset, length and type members are updated and the child
//   position is cleared.
// RemoveNode: when there is a main position or a non-zero node length, calls x_RemoveNode and
// clears the child position; returns whether anything was removed.
// NOTE(review): GetTagName is cut off after "case MNT_LONE_END_TAG: { //". The text then jumps into
// the middle of a saved-position routine ("nMapSize); SavedPos* pSavedPos = ..."); everything that
// stood between the two is absent from this copy.
nTypeFound = x_ParseNode( token, node ); if ( nTypeFound == 0 ) { // Check if we have reached the end of the parent element // Otherwise it is a lone end tag if ( m_iPosParent && nNodeOffset == m_aPos[m_iPosParent].StartContent() + m_aPos[m_iPosParent].ContentLen() ) return 0; nTypeFound = MNT_LONE_END_TAG; } else if ( nTypeFound < 0 ) { if ( nTypeFound == -2 ) return 0; // -1 is node error nTypeFound = MNT_NODE_ERROR; } else if ( nTypeFound == MNT_ELEMENT ) { if ( iPosNew ) iPosNew = m_aPos[iPosNew].iElemNext; else iPosNew = m_aPos[m_iPosParent].iElemChild; if ( ! iPosNew ) return 0; if ( ! nType || (nType & nTypeFound) ) { // Found element node, move position to this element x_SetPos( m_iPosParent, iPosNew, 0 ); return m_nNodeType; } token.nNext = m_aPos[iPosNew].StartAfter(); } } while ( nType && ! (nType & nTypeFound) ); m_iPos = iPosNew; m_iPosChild = 0; m_nNodeOffset = nNodeOffset; m_nNodeLength = token.nNext - nNodeOffset; m_nNodeType = nTypeFound; MARKUP_SETDEBUGSTATE; return m_nNodeType; } bool CMarkup::RemoveNode() { if ( m_iPos || m_nNodeLength ) { x_RemoveNode( m_iPosParent, m_iPos, m_nNodeType, m_nNodeOffset, m_nNodeLength ); m_iPosChild = 0; MARKUP_SETDEBUGSTATE; return true; } return false; } MCD_STR CMarkup::GetTagName() const { // Return the tag name at the current main position MCD_STR strTagName; // This method is primarily for elements, however // it does return something for certain other nodes if ( m_nNodeLength ) { switch ( m_nNodeType ) { case MNT_PROCESSING_INSTRUCTION: case MNT_LONE_END_TAG: { // nMapSize); SavedPos* pSavedPos = pMap->pTable[nSlot]; int nOffset = 0; if ( ! 
// NOTE(review): collapsed line; it starts inside a saved-position store routine whose head is
// missing from this copy, and ends inside CMarkup::RestorePos.
//
// Store routine (visible part): an empty slot gets a new 2-entry array with entry[1] flagged
// SPM_LAST. Otherwise the used entries are scanned for a matching name. If the entry carrying
// SPM_LAST is reached without a match, the array is regrown to (nOffset + 6) * 2 entries, SPM_LAST
// is moved to the new final entry and the following entry is used. The record being stored
// inherits SPM_LAST when it overwrites the entry that carried it.
// RestorePos(szPosName, nMap): hashes the name into map nMap and scans that slot for an entry with
// the same name. On a hit the position is restored: as a child position when SPM_CHILD is set, as
// a main position when SPM_MAIN is set, otherwise as a parent position only. Returns true on a hit,
// false otherwise.
pSavedPos ) { pSavedPos = new SavedPos[2]; pSavedPos[1].nSavedPosFlags = SavedPos::SPM_LAST; pMap->pTable[nSlot] = pSavedPos; } else { while ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_USED ) { if ( pSavedPos[nOffset].strName == (MCD_PCSZ)szPosName ) break; if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_LAST ) { int nNewSize = (nOffset + 6) * 2; SavedPos* pNewSavedPos = new SavedPos[nNewSize]; for ( int nCopy=0; nCopy<=nOffset; ++nCopy ) pNewSavedPos[nCopy] = pSavedPos[nCopy]; pNewSavedPos[nOffset].nSavedPosFlags ^= SavedPos::SPM_LAST; pNewSavedPos[nNewSize-1].nSavedPosFlags = SavedPos::SPM_LAST; delete [] pSavedPos; pSavedPos = pNewSavedPos; pMap->pTable[nSlot] = pSavedPos; ++nOffset; break; } ++nOffset; } } if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_LAST ) savedpos.nSavedPosFlags |= SavedPos::SPM_LAST; pSavedPos[nOffset] = savedpos; /* // To review hash table balance, uncomment and watch strBalance MCD_STR strBalance, strSlot; for ( nSlot=0; nSlot < pMap->nMapSize; ++nSlot ) { pSavedPos = pMap->pTable[nSlot]; int nCount = 0; while ( pSavedPos && pSavedPos->nSavedPosFlags & SavedPos::SPM_USED ) { ++nCount; if ( pSavedPos->nSavedPosFlags & SavedPos::SPM_LAST ) break; ++pSavedPos; } strSlot.Format( MCD_T("%d "), nCount ); strBalance += strSlot; } */ return true; } return false; } bool CMarkup::RestorePos( MCD_CSTR szPosName /*=""*/, int nMap /*=0*/ ) { // Restore element position if found in saved position map if ( szPosName ) { SavedPosMap* pMap; x_GetMap( pMap, nMap ); int nSlot = x_Hash( szPosName, pMap->nMapSize ); SavedPos* pSavedPos = pMap->pTable[nSlot]; if ( pSavedPos ) { int nOffset = 0; while ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_USED ) { if ( pSavedPos[nOffset].strName == (MCD_PCSZ)szPosName ) { int i = pSavedPos[nOffset].iPos; if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_CHILD ) x_SetPos( m_aPos[m_aPos[i].iElemParent].iElemParent, m_aPos[i].iElemParent, i ); else if ( pSavedPos[nOffset].nSavedPosFlags & 
// NOTE(review): collapsed line; it starts inside CMarkup::RestorePos and ends inside
// CMarkup::x_GetLastError.
//
// SetMapSize(nSize, nMap): forwards to x_GetMap with the requested size; per its own comment it
// returns false if the map already exists.
// RemoveElem: requires a main position whose node type is MNT_ELEMENT; the index returned by
// x_RemoveElem becomes the new main position and the child position is cleared.
// RemoveChildElem: the same for the child position; parent and main positions are kept.
// x_GetLastError: fills a 100-character buffer with the last error text, either from MCD_STRERROR
// (converted with x_ConvertEncoding in MARKUP_WCHAR builds) or, when MCD_STRERROR is not defined,
// from ::FormatMessage.
SavedPos::SPM_MAIN ) x_SetPos( m_aPos[i].iElemParent, i, 0 ); else x_SetPos( i, 0, 0 ); return true; } if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_LAST ) break; ++nOffset; } } } return false; } bool CMarkup::SetMapSize( int nSize, int nMap /*=0*/ ) { // Set saved position map hash table size before using it // Returns false if map already exists // Some prime numbers: 53, 101, 211, 503, 1009, 2003, 10007, 20011, 50021, 100003, 200003, 500009 SavedPosMap* pNewMap; return x_GetMap( pNewMap, nMap, nSize ); } bool CMarkup::RemoveElem() { // Remove current main position element if ( m_iPos && m_nNodeType == MNT_ELEMENT ) { int iPos = x_RemoveElem( m_iPos ); x_SetPos( m_iPosParent, iPos, 0 ); return true; } return false; } bool CMarkup::RemoveChildElem() { // Remove current child position element if ( m_iPosChild ) { int iPosChild = x_RemoveElem( m_iPosChild ); x_SetPos( m_iPosParent, m_iPos, iPosChild ); return true; } return false; } ////////////////////////////////////////////////////////////////////// // Private Methods ////////////////////////////////////////////////////////////////////// MCD_STR CMarkup::x_GetLastError() { // strerror is has difficulties cross-platform // VC++ leaves MCD_STRERROR undefined and uses FormatMessage // Non-VC++ use strerror (even for MARKUP_WCHAR and convert) // additional notes: // _WIN32_WCE (Windows CE) has no strerror (Embedded VC++ uses FormatMessage) // _MSC_VER >= 1310 (VC++ 2003/7.1) has _wcserror (but not used) // const int nErrorBufferSize = 100; MCD_CHAR szError[nErrorBufferSize+1]; #if defined(MCD_STRERROR) // C error routine #if defined(MARKUP_WCHAR) char szMBError[nErrorBufferSize+1]; strncpy( szMBError, MCD_STRERROR, nErrorBufferSize ); szMBError[nErrorBufferSize] = '\0'; ConvertEncoding convert( MCD_ENC, MCD_T(""), (const void*)szMBError, strlen(szMBError) ); convert.nToCount = nErrorBufferSize; int nWideLen = x_ConvertEncoding( convert, (void*)szError ); szError[nWideLen] = '\0'; #else MCD_PSZNCPY( szError, 
// NOTE(review): collapsed line with missing text; it starts inside CMarkup::x_GetLastError.
// "for ( int nChar=0; nChar>1);" is a splice: the rest of x_GetLastError and the head of the
// position-array grow routine that follows are absent from this copy.
// FormatMessage flags 0x1200 = FORMAT_MESSAGE_FROM_SYSTEM (0x1000) | FORMAT_MESSAGE_IGNORE_INSERTS
// (0x200).
//
// Grow routine (visible part): growth is capped at one additional complete segment per call. The
// segment pointer table is reallocated to 4 + nNewSeg * 2 entries when too small, a partially
// filled last segment is first completed to full size when a new segment is added, and the final
// segment is reallocated (contents preserved) when growing within the same segment.
// x_ParseDoc: saves the pre-parse m_strError, resets positions and saved-position maps, sizes the
// index, parses, and combines the saved text with any parse error (continues on the next line).
MCD_STRERROR, nErrorBufferSize ); szError[nErrorBufferSize] = '\0'; #endif #else // no C error routine, use Windows API if ( ::FormatMessage(0x1200,0,::GetLastError(),0,szError,nErrorBufferSize,0) < 1 ) szError[0] = '\0'; #endif // no C error routine MCD_STR strError = szError; for ( int nChar=0; nChar>1); // Grow By: multiply size by 1.5 if ( m_aPos.GetSize() < nNewSize ) { // Grow By: new size can be at most one more complete segment int nSeg = (m_aPos.GetSize()?m_aPos.GetSize()-1:0) >> m_aPos.PA_SEGBITS; int nNewSeg = (nNewSize-1) >> m_aPos.PA_SEGBITS; if ( nNewSeg > nSeg + 1 ) { nNewSeg = nSeg + 1; nNewSize = (nNewSeg+1) << m_aPos.PA_SEGBITS; } // Allocate array of segments if ( m_aPos.nSegs <= nNewSeg ) { int nNewSegments = 4 + nNewSeg * 2; char* pNewSegments = new char[nNewSegments*sizeof(char*)]; if ( m_aPos.SegsUsed() ) memcpy( pNewSegments, m_aPos.pSegs, m_aPos.SegsUsed()*sizeof(char*) ); if ( m_aPos.pSegs ) delete[] (char*)m_aPos.pSegs; m_aPos.pSegs = (ElemPos**)pNewSegments; m_aPos.nSegs = nNewSegments; } // Calculate segment sizes int nSegSize = m_aPos.GetSize() - (nSeg << m_aPos.PA_SEGBITS); int nNewSegSize = nNewSize - (nNewSeg << m_aPos.PA_SEGBITS); // Complete first segment int nFullSegSize = 1 << m_aPos.PA_SEGBITS; if ( nSeg < nNewSeg && nSegSize < nFullSegSize ) { char* pNewFirstSeg = new char[ nFullSegSize * sizeof(ElemPos) ]; if ( nSegSize ) { // Reallocate memcpy( pNewFirstSeg, m_aPos.pSegs[nSeg], nSegSize * sizeof(ElemPos) ); delete[] (char*)m_aPos.pSegs[nSeg]; } m_aPos.pSegs[nSeg] = (ElemPos*)pNewFirstSeg; } // New segment char* pNewSeg = new char[ nNewSegSize * sizeof(ElemPos) ]; if ( nNewSeg == nSeg && nSegSize ) { // Reallocate memcpy( pNewSeg, m_aPos.pSegs[nSeg], nSegSize * sizeof(ElemPos) ); delete[] (char*)m_aPos.pSegs[nSeg]; } m_aPos.pSegs[nNewSeg] = (ElemPos*)pNewSeg; m_aPos.nSize = nNewSize; } return true; } bool CMarkup::x_ParseDoc() { // Preserve pre-parse result MCD_STR strResult = m_strError; // Reset indexes ResetPos(); 
m_SavedPosMapArray.RemoveAll();

	// Starting size of position array: 1 element per 64 bytes of document
	// Tight fit when parsing small doc, only 0 to 2 reallocs when parsing large doc
	// Start at 8 when creating new document
	int nDocLen = MCD_STRLENGTH(m_strDoc);
	m_iPosFree = 1;
	x_AllocPosArray( nDocLen / 64 + 8 );
	m_iPosDeleted = 0;

	// Parse document
	// Slot 0 is set up as the virtual parent under which the parsed elements hang
	m_aPos[0].ClearVirtualParent();
	if ( nDocLen )
	{
		TokenPos token( m_strDoc, m_nDocFlags );
		int iPos = x_ParseElem( 0, token );
		m_aPos[0].nLength = nDocLen;
		if ( iPos > 0 )
		{
			m_aPos[0].iElemChild = iPos;
			// More than one top level element is reported as an error
			if ( m_aPos[iPos].iElemNext )
				m_strError = MCD_T("Root element has sibling");
		}
		else
			m_strError = MCD_T("No root element");
	}
	else
		m_strError = MCD_T("Empty document");

	ResetPos();

	// Combine preserved result with parse error
	if ( ! MCD_STRISEMPTY(strResult) )
	{
		if ( MCD_STRISEMPTY(m_strError) )
			m_strError = strResult;
		else
			m_strError = strResult + MCD_T(" ") + m_strError;
	}

	return IsWellFormed();
}

int CMarkup::x_ParseElem( int iPosParent, TokenPos& token )
{
	// This is either called by x_ParseDoc or x_AddSubDoc or x_SetElemContent
	// Returns index of the first element encountered or zero if no elements
	//
	// Each start tag gets a slot from x_GetFreePos and is linked as the last
	// child of the current parent. iVirtualParent keeps the position passed in
	// and collects the MNF_ILLFORMED flag. m_strError is cleared on entry and
	// keeps only the first problem found.
	//
	int iElemRoot = 0;
	int iPos = iPosParent;
	int iVirtualParent = iPosParent;
	int nRootDepth = m_aPos[iPos].Level();
	token.nNext = 0;
	MCD_STRCLEAR(m_strError);

	// Loop through the nodes of the document
	NodeStack aNodes;
	aNodes.Add();
	int nDepth = 0;
	int nMatchDepth;
	int iPosChild;
	int iPosMatch;
	int nTypeFound = 0;
	ElemPos* pElem;
	int iElemFirst, iElemLast;
	while ( 1 )
	{
		nTypeFound = x_ParseNode( token, aNodes.Top() );
		nMatchDepth = 0;
		if ( nTypeFound == MNT_ELEMENT ) // start tag
		{
			iPos = x_GetFreePos();
			if ( ! iElemRoot )
				iElemRoot = iPos;
			pElem = &m_aPos[iPos];
			pElem->iElemParent = iPosParent;
			pElem->iElemNext = 0;
			if ( m_aPos[iPosParent].iElemChild )
			{
				// Append after the current last sibling; the first child's
				// iElemPrev holds the last sibling, so it is updated too
				iElemFirst = m_aPos[iPosParent].iElemChild;
				iElemLast = m_aPos[iElemFirst].iElemPrev;
				m_aPos[iElemLast].iElemNext = iPos;
				pElem->iElemPrev = iElemLast;
				m_aPos[iElemFirst].iElemPrev = iPos;
				pElem->nFlags = 0;
			}
			else
			{
				// Only child so far: its iElemPrev refers to itself
				m_aPos[iPosParent].iElemChild = iPos;
				pElem->iElemPrev = iPos;
				pElem->nFlags = MNF_FIRST;
			}
			pElem->SetLevel( nRootDepth + nDepth );
			pElem->iElemChild = 0;
			pElem->nStart = aNodes.Top().nStart;
			pElem->SetStartTagLen( aNodes.Top().nLength );
			if ( aNodes.Top().nNodeFlags & MNF_EMPTY )
			{
				// Empty element: complete already, stay at the same depth
				iPos = iPosParent;
				pElem->SetEndTagLen( 0 );
				pElem->nLength = aNodes.Top().nLength;
			}
			else
			{
				// Descend: following nodes belong to this element until it ends
				iPosParent = iPos;
				++nDepth;
				aNodes.Add();
			}
		}
		else if ( nTypeFound == 0 ) // end tag
		{
			// Walk up the open elements until one matches the end tag name
			nMatchDepth = nDepth;
			iPosMatch = iPos;
			while ( nMatchDepth && ! token.Match(aNodes.At(nMatchDepth-1).strMeta) )
			{
				/*
				// Auto-switch case sensitivity
				if ( ! (token.nTokenFlags & MDF_IGNORECASE ) )
				{
					token.nTokenFlags |= MDF_IGNORECASE;
					if ( token.Match(aNodes.At(nMatchDepth-1).strMeta) )
						break;
					token.nTokenFlags |= MDF_IGNORECASE;
				}
				*/
				--nMatchDepth;
				iPosMatch = m_aPos[iPosMatch].iElemParent;
			}
			if ( nMatchDepth == 0 )
			{
				// Not matched at all, it is a lone end tag, a non-element node
				m_aPos[iVirtualParent].nFlags |= MNF_ILLFORMED;
				m_aPos[iPos].nFlags |= MNF_ILLDATA;
				if ( MCD_STRISEMPTY(m_strError) )
				{
					m_strError = MCD_T("No start tag for end tag '");
					m_strError += x_GetToken(token);
					m_strError += MCD_T("' at offset ");
					m_strError += x_IntToStr( aNodes.Top().nStart );
				}
			}
			else
			{
				// Matched: element length runs from its start to the end of this end tag
				pElem = &m_aPos[iPosMatch];
				pElem->nLength = aNodes.Top().nStart - pElem->nStart + aNodes.Top().nLength;
				pElem->SetEndTagLen( aNodes.Top().nLength );
			}
		}
		else if ( nTypeFound == -1 )
		{
			// Bad node: the node's strMeta is used as the error text
			m_aPos[iVirtualParent].nFlags |= MNF_ILLFORMED;
			m_aPos[iPos].nFlags |= MNF_ILLDATA;
			if ( MCD_STRISEMPTY(m_strError) )
				m_strError = aNodes.Top().strMeta;
		}

		// Matched end tag, or end of document
		if ( nMatchDepth || nTypeFound == -2 )
		{
			if ( nDepth > nMatchDepth )
				m_aPos[iVirtualParent].nFlags |= MNF_ILLFORMED;

			// Process any non-ended elements
			while ( nDepth > nMatchDepth )
			{
				// Element with no end tag
				// It is reduced to its start tag and its children are re-linked
				// as its following siblings under its parent
				pElem = &m_aPos[iPos];
				iPosChild = pElem->iElemChild;
				iPosParent = pElem->iElemParent;
				pElem->SetEndTagLen( 0 );
				pElem->nFlags |= MNF_NONENDED;
				pElem->iElemChild = 0;
				pElem->nLength = pElem->StartTagLen();
				if ( pElem->nFlags & MNF_ILLDATA )
				{
					// Hand the ill-data flag over to the parent
					pElem->nFlags ^= MNF_ILLDATA;
					m_aPos[iPosParent].nFlags |= MNF_ILLDATA;
				}
				while ( iPosChild )
				{
					m_aPos[iPosChild].iElemParent = iPosParent;
					m_aPos[iPosChild].iElemPrev = iPos;
					m_aPos[iPos].iElemNext = iPosChild;
					iPos = iPosChild;
					iPosChild = m_aPos[iPosChild].iElemNext;
				}
				iPos = iPosParent;
				aNodes.Remove();
				--nDepth;

				// Error string
				// if end tag did not match, top node is end tag that did not match pElem
				// if end of document, any nodes below top have no end tag
				if ( MCD_STRISEMPTY(m_strError) )
				{
					if ( nTypeFound == 0 )
					{
						m_strError = MCD_T("End tag '") + x_GetToken(token);
						m_strError += MCD_T("' at offset ") + x_IntToStr( token.nL-1 );
						m_strError += MCD_T(" does not match start tag '") + aNodes.Top().strMeta;
						m_strError += MCD_T("' at offset ") + x_IntToStr( pElem->nStart );
					}
					else
					{
						m_strError = MCD_T("Element '") + aNodes.Top().strMeta;
						m_strError += MCD_T("' at offset ") + x_IntToStr( aNodes.Top().nStart );
						m_strError += MCD_T(" not ended");
					}
				}
			}
			// -2 is the only way out of the node loop
			if ( nTypeFound == -2 )
				break;
			// Close the matched element: step back up to its parent
			iPosParent = m_aPos[iPos].iElemParent;
			iPos = iPosParent;
			aNodes.Remove();
			--nDepth;
		}
	}
	return iElemRoot;
}

bool CMarkup::x_FindAny( MCD_PCSZ pDoc, int& nChar )
{
	// Starting at nChar, find a non-whitespace char
	// return false if no non-whitespace before end of document, nChar points to end
	// otherwise return true and nChar points to non-whitespace char
	// Whitespace here means space, tab, newline and carriage return
	while ( pDoc[nChar] && MCD_PSZCHR(MCD_T(" \t\n\r"),pDoc[nChar]) )
		++nChar;
	return pDoc[nChar] != '\0';
}

bool CMarkup::x_FindName( CMarkup::TokenPos& token )
{
	// Starting at token.nNext,
bypass whitespace and find the next name // returns true on success, members of token point to token // returns false on end of document, members point to end of document MCD_PCSZ pDoc = token.pDoc; int nChar = token.nNext; // By-pass leading whitespace if ( ! x_FindAny(pDoc,nChar) ) { // No token was found before end of document token.nL = nChar; token.nR = nChar - 1; token.nNext = nChar; return false; } // Go until special char or whitespace token.nL = nChar; while ( pDoc[nChar] && ! MCD_PSZCHR(MCD_T(" \t\n\r<>=\\/?!\"';"),pDoc[nChar]) ) nChar += MCD_CLEN(&pDoc[nChar]); // Adjust end position if it is one special char if ( nChar == token.nL ) ++nChar; // it is a special char token.nR = nChar - 1; // nNext points to one past last char of token token.nNext = nChar; return true; } MCD_STR CMarkup::x_GetToken( const CMarkup::TokenPos& token ) { // The token contains indexes into the document identifying a small substring // Build the substring from those indexes and return it if ( token.nL > token.nR ) return MCD_T(""); MCD_STR strToken( &token.pDoc[token.nL], token.Length() ); return strToken; } int CMarkup::x_FindElem( int iPosParent, int iPos, MCD_PCSZ pPath ) const { // If pPath is NULL or empty, go to next sibling element // Otherwise go to next sibling element with matching path // if ( iPos ) iPos = m_aPos[iPos].iElemNext; else iPos = m_aPos[iPosParent].iElemChild; // Finished here if pPath not specified if ( pPath == NULL || !pPath[0] ) return iPos; // Search TokenPos token( m_strDoc, m_nDocFlags ); while ( iPos ) { // Compare tag name token.nNext = m_aPos[iPos].nStart + 1; x_FindName( token ); // Locate tag name if ( token.Match(pPath) ) return iPos; iPos = m_aPos[iPos].iElemNext; } return 0; } int CMarkup::x_ParseNode( CMarkup::TokenPos& token, CMarkup::NodePos& node ) { // Call this with token.nNext set to the start of the node or tag // Upon return token.nNext points to the char after the node or tag // // comment // dtd // processing instruction // cdata 
section // element start tag // element end tag // // returns the nodetype or // 0 for end tag // -1 for bad node // -2 for end of document // enum ParseBits { PD_OPENTAG = 1, PD_BANG = 2, PD_DASH = 4, PD_BRACKET = 8, PD_TEXTORWS = 16, PD_DOCTYPE = 32, PD_INQUOTE_S = 64, PD_INQUOTE_D = 128, PD_EQUALS = 256 }; int nParseFlags = 0; MCD_PCSZ pFindEnd = NULL; int nNodeType = -1; int nEndLen = 0; int nName = 0; unsigned int cDminus1 = 0, cDminus2 = 0; #define FINDNODETYPE(e,t) { pFindEnd=e; nEndLen=(sizeof(e)-1)/sizeof(MCD_CHAR); nNodeType=t; } #define FINDNODETYPENAME(e,t,n) { FINDNODETYPE(e,t) nName=(int)(pDoc-token.pDoc)+n-1; } #define FINDNODEERR(e) node.strMeta=MCD_T("Incorrect "); node.strMeta+=e; node.strMeta+=MCD_T(" at offset ")+x_IntToStr(nR) #define FINDNODEBAD(e) { pFindEnd=MCD_T(">"); nEndLen=1; FINDNODEERR(e); nNodeType=-1; } node.nStart = token.nNext; node.nNodeFlags = 0; int nR = token.nNext; MCD_PCSZ pDoc = &token.pDoc[nR]; register unsigned int cD = (unsigned int)*pDoc; if ( ! cD ) { node.nLength = 0; node.nNodeType = 0; return -2; // end of document } while ( 1 ) { cD = (unsigned int)*pDoc; if ( ! 
cD ) { nR = (int)(pDoc - token.pDoc) - 1; if ( nNodeType != MNT_WHITESPACE && nNodeType != MNT_TEXT ) { MCD_PCSZ pType = MCD_T("tag"); if ( (nParseFlags & PD_DOCTYPE) || nNodeType == MNT_DOCUMENT_TYPE ) pType = MCD_T("Doctype"); else if ( nNodeType == MNT_ELEMENT ) pType = MCD_T("Element tag"); else if ( nNodeType == 0 ) pType = MCD_T("Element end tag"); else if ( nNodeType == MNT_CDATA_SECTION ) pType = MCD_T("CDATA Section"); else if ( nNodeType == MNT_PROCESSING_INSTRUCTION ) pType = MCD_T("Processing instruction"); else if ( nNodeType == MNT_COMMENT ) pType = MCD_T("Comment"); nNodeType = -1; node.strMeta = pType; node.strMeta += MCD_T(" at offset ") + x_IntToStr(node.nStart); node.strMeta += MCD_T(" unterminated"); } break; } if ( nName ) { if ( MCD_PSZCHR(MCD_T(" \t\n\r/>"),(MCD_CHAR)cD) ) { int nNameLen = (int)(pDoc - token.pDoc) - nName; if ( nNodeType == 0 ) { token.nL = nName; token.nR = nName + nNameLen - 1; } else { MCD_STRASSIGN(node.strMeta,&token.pDoc[nName],nNameLen); } nName = 0; cDminus2 = 0; cDminus1 = 0; } else { pDoc += MCD_CLEN( pDoc ); continue; } } if ( pFindEnd ) { if ( cD == '>' && ! (nParseFlags & (PD_INQUOTE_S|PD_INQUOTE_D)) ) { nR = (int)(pDoc - token.pDoc); if ( nEndLen == 1 ) { pFindEnd = NULL; if ( nNodeType == MNT_ELEMENT && cDminus1 == '/' ) { if ( (! cDminus2) || MCD_PSZCHR(MCD_T(" \t\n\r\'\""),(MCD_CHAR)cDminus2) ) node.nNodeFlags |= MNF_EMPTY; } } else if ( nR > nEndLen ) { // Test for end of PI or comment MCD_PCSZ pEnd = pDoc - nEndLen + 1; MCD_PCSZ pInFindEnd = pFindEnd; int nLen = nEndLen; while ( --nLen && *pEnd++ == *pInFindEnd++ ); if ( nLen == 0 ) pFindEnd = NULL; } if ( ! pFindEnd && ! 
(nParseFlags & PD_DOCTYPE) ) break; } else if ( cD == '<' && (nNodeType == MNT_TEXT || nNodeType == -1) ) { nR = (int)(pDoc - token.pDoc) - 1; break; } else if ( nNodeType & MNT_ELEMENT ) { if ( (nParseFlags & (PD_INQUOTE_S|PD_INQUOTE_D)) ) { if ( cD == '\"' && (nParseFlags&PD_INQUOTE_D) ) nParseFlags ^= PD_INQUOTE_D; // off else if ( cD == '\'' && (nParseFlags&PD_INQUOTE_S) ) nParseFlags ^= PD_INQUOTE_S; // off } else // not in quotes { // Only set INQUOTE status when preceeded by equal sign if ( cD == '\"' && (nParseFlags&PD_EQUALS) ) nParseFlags ^= PD_INQUOTE_D|PD_EQUALS; // D on, equals off else if ( cD == '\'' && (nParseFlags&PD_EQUALS) ) nParseFlags ^= PD_INQUOTE_S|PD_EQUALS; // S on, equals off else if ( cD == '=' && cDminus1 != '=' && ! (nParseFlags&PD_EQUALS) ) nParseFlags ^= PD_EQUALS; // on else if ( (nParseFlags&PD_EQUALS) && ! MCD_PSZCHR(MCD_T(" \t\n\r"),(MCD_CHAR)cD) ) nParseFlags ^= PD_EQUALS; // off } cDminus2 = cDminus1; cDminus1 = cD; } else if ( nNodeType & MNT_DOCUMENT_TYPE ) { if ( cD == '\"' && ! (nParseFlags&PD_INQUOTE_S) ) nParseFlags ^= PD_INQUOTE_D; // toggle else if ( cD == '\'' && ! (nParseFlags&PD_INQUOTE_D) ) nParseFlags ^= PD_INQUOTE_S; // toggle } } else if ( nParseFlags ) { if ( nParseFlags & PD_TEXTORWS ) { if ( cD == '<' ) { nR = (int)(pDoc - token.pDoc) - 1; nNodeType = MNT_WHITESPACE; break; } else if ( ! MCD_PSZCHR(MCD_T(" \t\n\r"),(MCD_CHAR)cD) ) { nParseFlags ^= PD_TEXTORWS; FINDNODETYPE( MCD_T("<"), MNT_TEXT ) } } else if ( nParseFlags & PD_OPENTAG ) { nParseFlags ^= PD_OPENTAG; if ( cD > 0x60 || ( cD > 0x40 && cD < 0x5b ) || cD == 0x5f || cD == 0x3a ) FINDNODETYPENAME( MCD_T(">"), MNT_ELEMENT, 1 ) else if ( cD == '/' ) FINDNODETYPENAME( MCD_T(">"), 0, 2 ) else if ( cD == '!' ) nParseFlags |= PD_BANG; else if ( cD == '?' 
) FINDNODETYPENAME( MCD_T("?>"), MNT_PROCESSING_INSTRUCTION, 2 ) else FINDNODEBAD( MCD_T("tag name character") ) } else if ( nParseFlags & PD_BANG ) { nParseFlags ^= PD_BANG; if ( cD == '-' ) nParseFlags |= PD_DASH; else if ( cD == '[' && !(nParseFlags & PD_DOCTYPE) ) nParseFlags |= PD_BRACKET; else if ( cD == 'D' && !(nParseFlags & PD_DOCTYPE) ) nParseFlags |= PD_DOCTYPE; else if ( MCD_PSZCHR(MCD_T("EAN"),(MCD_CHAR)cD) ) // "), MNT_DOCUMENT_TYPE ) else FINDNODEBAD( MCD_T("! tag") ) } else if ( nParseFlags & PD_DASH ) { nParseFlags ^= PD_DASH; if ( cD == '-' ) FINDNODETYPE( MCD_T("-->"), MNT_COMMENT ) else FINDNODEBAD( MCD_T("comment tag") ) } else if ( nParseFlags & PD_BRACKET ) { nParseFlags ^= PD_BRACKET; if ( cD == 'C' ) FINDNODETYPE( MCD_T("]]>"), MNT_CDATA_SECTION ) else FINDNODEBAD( MCD_T("tag") ) } else if ( nParseFlags & PD_DOCTYPE ) { if ( cD == '<' ) nParseFlags |= PD_OPENTAG; else if ( cD == '>' ) { nR = (int)(pDoc - token.pDoc); nNodeType = MNT_DOCUMENT_TYPE; break; } } } else if ( cD == '<' ) { nParseFlags |= PD_OPENTAG; } else { nNodeType = MNT_WHITESPACE; if ( MCD_PSZCHR(MCD_T(" \t\n\r"),(MCD_CHAR)cD) ) nParseFlags |= PD_TEXTORWS; else FINDNODETYPE( MCD_T("<"), MNT_TEXT ) } pDoc += MCD_CLEN( pDoc ); } token.nNext = nR + 1; node.nLength = token.nNext - node.nStart; node.nNodeType = nNodeType; return nNodeType; } MCD_STR CMarkup::x_GetPath( int iPos ) const { MCD_STR strPath; while ( iPos ) { MCD_STR strTagName = x_GetTagName( iPos ); int iPosParent = m_aPos[iPos].iElemParent; int iPosSib = 0; int nCount = 0; while ( iPosSib != iPos ) { iPosSib = x_FindElem( iPosParent, iPosSib, MCD_2PCSZ(strTagName) ); ++nCount; } if ( nCount > 1 ) { MCD_CHAR szPred[25]; MCD_SPRINTF( MCD_SSZ(szPred), MCD_T("[%d]"), nCount ); strPath = MCD_T("/") + strTagName + szPred + strPath; } else strPath = MCD_T("/") + strTagName + strPath; iPos = iPosParent; } return strPath; } MCD_STR CMarkup::x_GetTagName( int iPos ) const { // Return the tag name at specified element 
TokenPos token( m_strDoc, m_nDocFlags );
	// The name is searched for one char after the element's start offset
	token.nNext = m_aPos[iPos].nStart + 1;
	if ( ! iPos || ! x_FindName( token ) )
		return MCD_T("");

	// Return substring of document
	return x_GetToken( token );
}

bool CMarkup::x_FindAttrib( CMarkup::TokenPos& token, MCD_PCSZ pAttrib, int n/*=0*/ )
{
	// Return true if found, otherwise false and token.nNext is new insertion point
	// If pAttrib is NULL find attrib n and leave token at attrib name
	// If pAttrib is given, find matching attrib and leave token at value
	// support non-well-formed attributes e.g. href=/advanced_search?hl=en, nowrap
	// token also holds start and length of preceeding whitespace to support remove
	//
	int nPreSpaceStart;
	int nPreSpaceLength;
	int nChar;
	MCD_CHAR cFirstChar;
	MCD_PCSZ pDoc = token.pDoc;
	int nAttrib = -1; // starts at tag name
	int nFoundAttribNameR = 0; // right offset of the matched attribute name, 0 until matched
	bool bAfterEqual = false; // true while the next token follows an equal sign
	while ( 1 )
	{
		// Starting at token.nNext, bypass whitespace and find the next token
		nChar = token.nNext;
		nPreSpaceStart = nChar;
		if ( ! x_FindAny(pDoc,nChar) )
			break;
		nPreSpaceLength = nChar - nPreSpaceStart;

		// Is it an opening quote?
		cFirstChar = pDoc[nChar];
		if ( cFirstChar == '\"' || cFirstChar == '\'' )
		{
			token.nTokenFlags |= MNF_QUOTED;

			// Move past opening quote
			++nChar;
			token.nL = nChar;

			// Look for closing quote
			while ( pDoc[nChar] && pDoc[nChar] != cFirstChar )
				nChar += MCD_CLEN( &pDoc[nChar] );

			// Set right to before closing quote
			token.nR = nChar - 1;

			// Set nChar past closing quote unless at end of document
			if ( pDoc[nChar] )
				++nChar;
		}
		else
		{
			token.nTokenFlags &= ~MNF_QUOTED;

			// Go until special char or whitespace
			// After an equal sign only whitespace and '>' end the token, so an
			// unquoted value may contain '=', '/' and '?'
			token.nL = nChar;
			if ( bAfterEqual )
			{
				while ( pDoc[nChar] && ! MCD_PSZCHR(MCD_T(" \t\n\r>"),pDoc[nChar]) )
					nChar += MCD_CLEN( &pDoc[nChar] );
			}
			else
			{
				while ( pDoc[nChar] && ! MCD_PSZCHR(MCD_T("= \t\n\r>/?"),pDoc[nChar]) )
					nChar += MCD_CLEN( &pDoc[nChar] );
			}

			// Adjust end position if it is one special char
			if ( nChar == token.nL )
				++nChar; // it is a special char
			token.nR = nChar - 1;
		}

		// nNext points to one past last char of token
		token.nNext = nChar;

		if ( ! bAfterEqual && ! (token.nTokenFlags&MNF_QUOTED) )
		{
			// Is it an equal sign?
			MCD_CHAR cChar = pDoc[token.nL];
			if ( cChar == '=' )
			{
				bAfterEqual = true;
				continue;
			}

			// Is it the right angle bracket?
			if ( cChar == '>' || cChar == '/' || cChar == '?' )
			{
				// Rewind to before the whitespace so nNext is the insertion point
				token.nNext = nPreSpaceStart;
				break; // attrib not found
			}

			// A new name after the matched name: the matched attribute had no value
			if ( nFoundAttribNameR )
				break;

			// Attribute name
			if ( nAttrib != -1 )
			{
				if ( ! pAttrib )
				{
					if ( nAttrib == n )
						return true; // found by number
				}
				else if ( token.Match(pAttrib) )
				{
					// Matched attrib name, go forward to value
					nFoundAttribNameR = token.nR;
					token.nPreSpaceStart = nPreSpaceStart;
					token.nPreSpaceLength = nPreSpaceLength;
				}
			}
			++nAttrib;
		}
		else if ( nFoundAttribNameR )
			break; // this token is the value of the matched attribute
		bAfterEqual = false;
	}

	if ( nFoundAttribNameR )
	{
		if ( !
bAfterEqual ) { // when attribute has no value the value is the attribute name token.nL = token.nPreSpaceStart + token.nPreSpaceLength; token.nR = nFoundAttribNameR; token.nNext = nFoundAttribNameR + 1; } return true; // found by name } return false; // not found } MCD_STR CMarkup::x_GetAttrib( int iPos, MCD_PCSZ pAttrib ) const { // Return the value of the attrib TokenPos token( m_strDoc, m_nDocFlags ); if ( iPos && m_nNodeType == MNT_ELEMENT ) token.nNext = m_aPos[iPos].nStart + 1; else if ( iPos == m_iPos && m_nNodeLength && m_nNodeType == MNT_PROCESSING_INSTRUCTION ) token.nNext = m_nNodeOffset + 2; else return MCD_T(""); if ( pAttrib && x_FindAttrib( token, pAttrib ) ) return UnescapeText( &token.pDoc[token.nL], token.Length() ); return MCD_T(""); } bool CMarkup::x_SetAttrib( int iPos, MCD_PCSZ pAttrib, int nValue, int nFlags /*=0*/ ) { // Convert integer to string MCD_CHAR szVal[25]; MCD_SPRINTF( MCD_SSZ(szVal), MCD_T("%d"), nValue ); return x_SetAttrib( iPos, pAttrib, szVal, nFlags ); } bool CMarkup::x_SetAttrib( int iPos, MCD_PCSZ pAttrib, MCD_PCSZ pValue, int nFlags /*=0*/ ) { // Set attribute in iPos element TokenPos token( m_strDoc, m_nDocFlags ); if ( iPos && m_nNodeType == MNT_ELEMENT ) token.nNext = m_aPos[iPos].nStart + 1; else if ( iPos == m_iPos && m_nNodeLength && m_nNodeType == MNT_PROCESSING_INSTRUCTION ) token.nNext = m_nNodeOffset + 2; else return false; // Create insertion text depending on whether attribute already exists // Decision: for empty value leaving attrib="" instead of removing attrib int nReplace = 0; int nInsertAt; MCD_STR strInsert; strInsert += x_ATTRIBQUOTE; strInsert += EscapeText( pValue, MNF_ESCAPEQUOTES|nFlags ); strInsert += x_ATTRIBQUOTE; if ( x_FindAttrib( token, pAttrib ) ) { // Replace value nInsertAt = token.nL - ((token.nTokenFlags&MNF_QUOTED)?1:0); nReplace = token.Length() + ((token.nTokenFlags&MNF_QUOTED)?2:0); } else { // Insert string name value pair MCD_STR strFormat; strFormat = MCD_T(" "); strFormat += 
pAttrib; strFormat += MCD_T("="); strFormat += strInsert; strInsert = strFormat; nInsertAt = token.nNext; } x_DocChange( nInsertAt, nReplace, strInsert ); int nAdjust = MCD_STRLENGTH(strInsert) - nReplace; if ( m_nNodeType == MNT_PROCESSING_INSTRUCTION ) { x_AdjustForNode( m_iPosParent, m_iPos, nAdjust ); m_nNodeLength += nAdjust; MARKUP_SETDEBUGSTATE; return true; } m_aPos[iPos].AdjustStartTagLen( nAdjust ); m_aPos[iPos].nLength += nAdjust; x_Adjust( iPos, nAdjust ); MARKUP_SETDEBUGSTATE; return true; } bool CMarkup::x_CreateNode( MCD_STR& strNode, int nNodeType, MCD_PCSZ pText ) { // Set strNode based on nNodeType and szData // Return false if szData would jeopardize well-formed document // switch ( nNodeType ) { case MNT_PROCESSING_INSTRUCTION: strNode = MCD_T(""); break; case MNT_COMMENT: strNode = MCD_T(""); break; case MNT_ELEMENT: strNode = MCD_T("<"); strNode += pText; strNode += MCD_T("/>"); break; case MNT_TEXT: case MNT_WHITESPACE: strNode = EscapeText( pText ); break; case MNT_DOCUMENT_TYPE: strNode = pText; break; case MNT_LONE_END_TAG: return false; case MNT_CDATA_SECTION: if ( MCD_PSZSTR(pText,MCD_T("]]>")) != NULL ) return false; strNode = MCD_T(""); break; } return true; } MCD_STR CMarkup::x_EncodeCDATASection( MCD_PCSZ szData ) { // Split CDATA Sections if there are any end delimiters MCD_STR strData = MCD_T("") ); while ( pszEnd ) { strData += MCD_STR( pszNextStart, (int)(pszEnd - pszNextStart) ); strData += MCD_T("]]]]>"); pszNextStart = pszEnd + 3; pszEnd = MCD_PSZSTR( pszNextStart, MCD_T("]]>") ); } strData += pszNextStart; strData += MCD_T("]]>"); return strData; } bool CMarkup::x_SetData( int iPos, int nValue ) { // Convert integer to string MCD_CHAR szVal[25]; MCD_SPRINTF( MCD_SSZ(szVal), MCD_T("%d"), nValue ); return x_SetData( iPos, szVal, 0 ); } bool CMarkup::x_SetData( int iPos, MCD_PCSZ szData, int nFlags ) { // Set data at specified position // if nFlags==1, set content of element to a CDATA Section MCD_STR strInsert; if ( iPos == 
m_iPos && m_nNodeLength ) { // Not an element if ( ! x_CreateNode(strInsert, m_nNodeType, szData) ) return false; x_DocChange( m_nNodeOffset, m_nNodeLength, strInsert ); x_AdjustForNode( m_iPosParent, iPos, MCD_STRLENGTH(strInsert) - m_nNodeLength ); m_nNodeLength = MCD_STRLENGTH(strInsert); MARKUP_SETDEBUGSTATE; return true; } // Set data in iPos element if ( ! iPos || m_aPos[iPos].iElemChild ) return false; // Build strInsert from szData based on nFlags if ( nFlags & MNF_WITHCDATA ) strInsert = x_EncodeCDATASection( szData ); else strInsert = EscapeText( szData, nFlags ); // Insert NodePos node( MNF_WITHNOLINES|MNF_REPLACE ); node.strMeta = strInsert; int iPosBefore = 0; int nReplace = x_InsertNew( iPos, iPosBefore, node ); int nAdjust = MCD_STRLENGTH(node.strMeta) - nReplace; x_Adjust( iPos, nAdjust ); m_aPos[iPos].nLength += nAdjust; if ( m_aPos[iPos].nFlags & MNF_ILLDATA ) m_aPos[iPos].nFlags &= ~MNF_ILLDATA; MARKUP_SETDEBUGSTATE; return true; } MCD_STR CMarkup::x_GetData( int iPos ) const { if ( iPos == m_iPos && m_nNodeLength ) { if ( m_nNodeType == MNT_COMMENT ) return MCD_STRMID( m_strDoc, m_nNodeOffset+4, m_nNodeLength-7 ); else if ( m_nNodeType == MNT_PROCESSING_INSTRUCTION ) return MCD_STRMID( m_strDoc, m_nNodeOffset+2, m_nNodeLength-4 ); else if ( m_nNodeType == MNT_CDATA_SECTION ) return MCD_STRMID( m_strDoc, m_nNodeOffset+9, m_nNodeLength-12 ); else if ( m_nNodeType == MNT_TEXT ) return UnescapeText( &(MCD_2PCSZ(m_strDoc))[m_nNodeOffset], m_nNodeLength ); else if ( m_nNodeType == MNT_LONE_END_TAG ) return MCD_STRMID( m_strDoc, m_nNodeOffset+2, m_nNodeLength-3 ); else return MCD_STRMID( m_strDoc, m_nNodeOffset, m_nNodeLength ); } // Return a string representing data between start and end tag // Return empty string if there are any children elements MCD_STR strData; if ( ! m_aPos[iPos].iElemChild && ! 
m_aPos[iPos].IsEmptyElement() ) { // Quick scan for any tags inside content int nContentLen = m_aPos[iPos].ContentLen(); int nStartContent = m_aPos[iPos].StartContent(); MCD_PCSZ pszContent = &(MCD_2PCSZ(m_strDoc))[nStartContent]; MCD_PCSZ pszTag = MCD_PSZCHR( pszContent, '<' ); if ( pszTag && ((int)(pszTag-pszContent) < nContentLen) ) { // Concatenate all CDATA Sections and text nodes, ignore other nodes TokenPos token( m_strDoc, m_nDocFlags ); token.nNext = nStartContent; NodePos node; while ( token.nNext < nStartContent + nContentLen ) { x_ParseNode( token, node ); if ( node.nNodeType == MNT_TEXT ) strData += UnescapeText( &token.pDoc[node.nStart], node.nLength ); else if ( node.nNodeType == MNT_CDATA_SECTION ) strData += MCD_STRMID( m_strDoc, node.nStart+9, node.nLength-12 ); } } else // no tags strData = UnescapeText( &(MCD_2PCSZ(m_strDoc))[nStartContent], nContentLen ); } return strData; } MCD_STR CMarkup::x_GetElemContent( int iPos ) const { if ( iPos && m_aPos[iPos].ContentLen() ) return MCD_STRMID( m_strDoc, m_aPos[iPos].StartContent(), m_aPos[iPos].ContentLen() ); return MCD_T(""); } bool CMarkup::x_SetElemContent( MCD_PCSZ szContent ) { // Set data in iPos element only if ( ! 
m_iPos ) return false; if ( m_nNodeLength ) return false; // not an element // Unlink all children int iPos = m_iPos; int iPosChild = m_aPos[iPos].iElemChild; bool bHadChild = (iPosChild != 0); while ( iPosChild ) iPosChild = x_ReleaseSubDoc( iPosChild ); if ( bHadChild ) x_CheckSavedPos(); // Parse content bool bWellFormed = true; TokenPos token( szContent, m_nDocFlags ); int iPosVirtual = x_GetFreePos(); m_aPos[iPosVirtual].ClearVirtualParent(); m_aPos[iPosVirtual].SetLevel( m_aPos[iPos].Level() + 1 ); iPosChild = x_ParseElem( iPosVirtual, token ); if ( m_aPos[iPosVirtual].nFlags & MNF_ILLFORMED ) bWellFormed = false; m_aPos[iPos].nFlags = (m_aPos[iPos].nFlags & ~MNF_ILLDATA) | (m_aPos[iPosVirtual].nFlags & MNF_ILLDATA); // Prepare insert and adjust offsets NodePos node( MNF_WITHNOLINES|MNF_REPLACE ); node.strMeta = szContent; int iPosBefore = 0; int nReplace = x_InsertNew( iPos, iPosBefore, node ); // Adjust and link in the inserted elements x_Adjust( iPosChild, node.nStart ); m_aPos[iPosChild].nStart += node.nStart; m_aPos[iPos].iElemChild = iPosChild; while ( iPosChild ) { m_aPos[iPosChild].iElemParent = iPos; iPosChild = m_aPos[iPosChild].iElemNext; } x_ReleasePos( iPosVirtual ); int nAdjust = MCD_STRLENGTH(node.strMeta) - nReplace; x_Adjust( iPos, nAdjust, true ); m_aPos[iPos].nLength += nAdjust; x_SetPos( m_iPosParent, m_iPos, 0 ); return bWellFormed; } void CMarkup::x_DocChange( int nLeft, int nReplace, const MCD_STR& strInsert ) { // Insert strInsert int m_strDoc at nLeft replacing nReplace chars // When creating a document, reduce reallocs by reserving string space // If realloc needed, allow for 1.5 times the new length // int nDocLength = MCD_STRLENGTH(m_strDoc); int nInsLength = MCD_STRLENGTH(strInsert); int nNewLength = nInsLength + nDocLength - nReplace; int nAllocLen = MCD_STRCAPACITY(m_strDoc); #if defined(MCD_STRINSERTREPLACE) // STL, replace method if ( nNewLength > nAllocLen ) MCD_BLDRESERVE( m_strDoc, (nNewLength + nNewLength/2 + 128) ); 
MCD_STRINSERTREPLACE( m_strDoc, nLeft, nReplace, strInsert );
#else // MFC, no replace method
	// Get a writable buffer, with headroom if the current allocation is too small
	int nBufferLen = nNewLength;
	if ( nNewLength > nAllocLen )
		nBufferLen += nBufferLen/2 + 128;
	MCD_CHAR* pDoc = MCD_GETBUFFER( m_strDoc, nBufferLen );
	// Move the text after the replaced span when the insert has a different length
	if ( nInsLength != nReplace && nLeft+nReplace < nDocLength )
		memmove( &pDoc[nLeft+nInsLength], &pDoc[nLeft+nReplace], (nDocLength-nLeft-nReplace)*sizeof(MCD_CHAR) );
	memcpy( &pDoc[nLeft], strInsert, nInsLength*sizeof(MCD_CHAR) );
	MCD_RELEASEBUFFER( m_strDoc, pDoc, nNewLength );
#endif // MFC, no replace method

}

void CMarkup::x_Adjust( int iPos, int nShift, bool bAfterPos /*=false*/ )
{
	// Loop through affected elements and adjust indexes
	// Algorithm:
	// 1. update children unless bAfterPos
	//    (if no children or bAfterPos is true, length of iPos not affected)
	// 2. update starts of next siblings and their children
	// 3. go up until there is a next sibling of a parent and update starts
	// 4. step 2
	//
	// Elements after iPos get nStart shifted by nShift; each containing
	// parent reached on the way up gets nLength shifted instead. The entry
	// for iPos itself is not changed here.
	int iPosTop = m_aPos[iPos].iElemParent;
	bool bPosFirst = bAfterPos; // mark as first to skip its children

	// Stop when we've reached the virtual parent (which has no tags)
	while ( m_aPos[iPos].StartTagLen() )
	{
		// Were we at containing parent of affected position?
		bool bPosTop = false;
		if ( iPos == iPosTop )
		{
			// Move iPosTop up one towards root
			iPosTop = m_aPos[iPos].iElemParent;
			bPosTop = true;
		}

		// Traverse to the next update position
		if ( ! bPosTop && ! bPosFirst && m_aPos[iPos].iElemChild )
		{
			// Depth first
			iPos = m_aPos[iPos].iElemChild;
		}
		else if ( m_aPos[iPos].iElemNext )
		{
			iPos = m_aPos[iPos].iElemNext;
		}
		else
		{
			// Look for next sibling of a parent of iPos
			// When going back up, parents have already been done except iPosTop
			while ( 1 )
			{
				iPos = m_aPos[iPos].iElemParent;
				if ( iPos == iPosTop )
					break;
				if ( m_aPos[iPos].iElemNext )
				{
					iPos = m_aPos[iPos].iElemNext;
					break;
				}
			}
		}
		bPosFirst = false;

		// Shift indexes at iPos
		// A containing parent grows or shrinks; everything else moves
		if ( iPos != iPosTop )
			m_aPos[iPos].nStart += nShift;
		else
			m_aPos[iPos].nLength += nShift;
	}
}

int CMarkup::x_InsertNew( int iPosParent, int& iPosRel, CMarkup::NodePos& node )
{
	// Parent empty tag or tags with no content?
	bool bEmptyParentTag = iPosParent && m_aPos[iPosParent].IsEmptyElement();
	bool bNoContentParentTags = iPosParent && ! m_aPos[iPosParent].ContentLen();
	// Work out the document offset for the insert and store it in node.nStart
	if ( node.nLength )
	{
		// Located at a non-element node
		if ( ! (node.nNodeFlags & MNF_INSERT) )
			node.nStart += node.nLength;
	}
	else if ( iPosRel )
	{
		// Located at an element
		node.nStart = m_aPos[iPosRel].nStart;
		if ( ! (node.nNodeFlags & MNF_INSERT) ) // follow iPosRel
			node.nStart += m_aPos[iPosRel].nLength;
	}
	else if ( bEmptyParentTag )
	{
		// Parent has no separate end tag, so split empty element
		if ( m_aPos[iPosParent].nFlags & MNF_NONENDED )
			node.nStart = m_aPos[iPosParent].StartContent();
		else
			node.nStart = m_aPos[iPosParent].StartContent() - 1;
	}
	else
	{
		if ( node.nNodeFlags & (MNF_INSERT|MNF_REPLACE) )
			node.nStart = m_aPos[iPosParent].StartContent();
		else // before end tag
			node.nStart = m_aPos[iPosParent].StartAfter() - m_aPos[iPosParent].EndTagLen();
	}

	// Go up to start of next node, unless its splitting an empty element
	if ( ! (node.nNodeFlags&(MNF_WITHNOLINES|MNF_REPLACE)) && ! bEmptyParentTag )
	{
		MCD_PCSZ pDoc = MCD_2PCSZ(m_strDoc);
		int nChar = node.nStart;
		if ( ! x_FindAny(pDoc,nChar) || pDoc[nChar] == '<' )
			node.nStart = nChar;
	}

	// Is insert relative to element position? (i.e. not other kind of node)
	if ( !
node.nLength ) { // Modify iPosRel to reflect position before if ( iPosRel ) { if ( node.nNodeFlags & MNF_INSERT ) { if ( ! (m_aPos[iPosRel].nFlags & MNF_FIRST) ) iPosRel = m_aPos[iPosRel].iElemPrev; else iPosRel = 0; } } else if ( ! (node.nNodeFlags & MNF_INSERT) ) { // If parent has a child, add after last child if ( m_aPos[iPosParent].iElemChild ) iPosRel = m_aPos[m_aPos[iPosParent].iElemChild].iElemPrev; } } // Get node length (used only by x_AddNode) node.nLength = MCD_STRLENGTH(node.strMeta); // Prepare end of lines if ( (! (node.nNodeFlags & MNF_WITHNOLINES)) && (bEmptyParentTag || bNoContentParentTags) ) node.nStart += x_EOLLEN; if ( ! (node.nNodeFlags & MNF_WITHNOLINES) ) node.strMeta += x_EOL; // Calculate insert offset and replace length int nReplace = 0; int nInsertAt = node.nStart; if ( bEmptyParentTag ) { MCD_STR strTagName = x_GetTagName( iPosParent ); MCD_STR strFormat; if ( node.nNodeFlags & MNF_WITHNOLINES ) strFormat = MCD_T(">"); else strFormat = MCD_T(">") x_EOL; strFormat += node.strMeta; strFormat += MCD_T("value or // ElemPos* pElem = &m_aPos[iPos]; int nLenName = MCD_PSZLEN(pName); if ( ! pValue || ! 
pValue[0] ) { // empty element node.strMeta = MCD_T("<"); node.strMeta += pName; if ( nFlags & MNF_WITHNOEND ) { node.strMeta += MCD_T(">"); pElem->SetStartTagLen( nLenName + 2 ); pElem->nLength = nLenName + 2; } else { if ( nFlags & MNF_WITHXHTMLSPACE ) { node.strMeta += MCD_T(" />"); pElem->SetStartTagLen( nLenName + 4 ); pElem->nLength = nLenName + 4; } else { node.strMeta += MCD_T("/>"); pElem->SetStartTagLen( nLenName + 3 ); pElem->nLength = nLenName + 3; } } pElem->SetEndTagLen( 0 ); } else { // value MCD_STR strValue; if ( nFlags & MNF_WITHCDATA ) strValue = x_EncodeCDATASection( pValue ); else strValue = EscapeText( pValue, nFlags ); int nLenValue = MCD_STRLENGTH(strValue); node.strMeta = MCD_T("<"); node.strMeta += pName; node.strMeta += MCD_T(">"); node.strMeta += strValue; node.strMeta += MCD_T(""); pElem->SetEndTagLen( nLenName + 3 ); pElem->nLength = nLenName * 2 + nLenValue + 5; pElem->SetStartTagLen( nLenName + 2 ); } // Insert int nReplace = x_InsertNew( iPosParent, iPosBefore, node ); pElem->nStart = node.nStart; pElem->iElemChild = 0; if ( nFlags & MNF_WITHNOEND ) pElem->nFlags = MNF_NONENDED; else pElem->nFlags = 0; x_LinkElem( iPosParent, iPosBefore, iPos ); x_Adjust( iPos, MCD_STRLENGTH(node.strMeta) - nReplace ); if ( nFlags & MNF_CHILD ) x_SetPos( m_iPosParent, iPosParent, iPos ); else x_SetPos( iPosParent, iPos, 0 ); return true; } MCD_STR CMarkup::x_GetSubDoc( int iPos ) const { if ( iPos ) { int nStart = m_aPos[iPos].nStart; int nNext = nStart + m_aPos[iPos].nLength; MCD_PCSZ pDoc = MCD_2PCSZ(m_strDoc); int nChar = nNext; if ( ! 
x_FindAny(pDoc,nChar) || pDoc[nChar] == '<' ) nNext = nChar; return MCD_STRMID( m_strDoc, nStart, nNext - nStart ); } return MCD_T(""); } bool CMarkup::x_AddSubDoc( MCD_PCSZ pSubDoc, int nFlags ) { // Add subdocument, parse, and modify positions of affected elements // NodePos node( nFlags ); int iPosParent, iPosBefore; if ( nFlags & MNF_CHILD ) { // Add a subdocument under main position, before or after child if ( ! m_iPos ) return false; iPosParent = m_iPos; iPosBefore = m_iPosChild; } else { // Add a subdocument under parent position, before or after main iPosParent = m_iPosParent; iPosBefore = m_iPos; node.nStart = m_nNodeOffset; node.nLength = m_nNodeLength; } // Parse subdocument bool bWellFormed = true; TokenPos token( pSubDoc, m_nDocFlags ); int iPosVirtual = x_GetFreePos(); m_aPos[iPosVirtual].ClearVirtualParent(); m_aPos[iPosVirtual].SetLevel( m_aPos[iPosParent].Level() + 1 ); int iPos = x_ParseElem( iPosVirtual, token ); if ( (!iPos) || m_aPos[iPosVirtual].nFlags & MNF_ILLFORMED ) bWellFormed = false; if ( m_aPos[iPosVirtual].nFlags & MNF_ILLDATA ) m_aPos[iPosParent].nFlags |= MNF_ILLDATA; // Extract subdocument without leading/trailing nodes int nExtractStart = 0; int iPosLast = m_aPos[iPos].iElemPrev; if ( bWellFormed ) { nExtractStart = m_aPos[iPos].nStart; int nExtractLength = m_aPos[iPos].nLength; if ( iPos != iPosLast ) { nExtractLength = m_aPos[iPosLast].nStart - nExtractStart + m_aPos[iPosLast].nLength; bWellFormed = false; // treat as subdoc here, but return not well-formed } MCD_STRASSIGN(node.strMeta,&pSubDoc[nExtractStart],nExtractLength); } else { node.strMeta = pSubDoc; node.nNodeFlags |= MNF_WITHNOLINES; } // Insert int nReplace = x_InsertNew( iPosParent, iPosBefore, node ); // Adjust and link in the inserted elements // iPosVirtual will stop it from affecting rest of document int nAdjust = node.nStart - nExtractStart; if ( iPos && nAdjust ) { x_Adjust( iPos, nAdjust ); m_aPos[iPos].nStart += nAdjust; } int iPosChild = iPos; while ( 
iPosChild ) { int iPosNext = m_aPos[iPosChild].iElemNext; x_LinkElem( iPosParent, iPosBefore, iPosChild ); iPosBefore = iPosChild; iPosChild = iPosNext; } x_ReleasePos( iPosVirtual ); // Now adjust remainder of document x_Adjust( iPosLast, MCD_STRLENGTH(node.strMeta) - nReplace, true ); // Set position to top element of subdocument if ( nFlags & MNF_CHILD ) x_SetPos( m_iPosParent, iPosParent, iPos ); else // Main x_SetPos( m_iPosParent, iPos, 0 ); return bWellFormed; } int CMarkup::x_RemoveElem( int iPos ) { // Remove element and all contained elements // Return new position // if ( ! iPos ) return 0; // Determine whether any whitespace up to next tag int nAfterEnd = m_aPos[iPos].StartAfter(); MCD_PCSZ pDoc = MCD_2PCSZ(m_strDoc); int nChar = nAfterEnd; if ( ! x_FindAny(pDoc,nChar) || pDoc[nChar] == '<' ) nAfterEnd = nChar; // Remove from document, adjust affected indexes, and unlink int nLen = nAfterEnd - m_aPos[iPos].nStart; x_DocChange( m_aPos[iPos].nStart, nLen, MCD_STR() ); x_Adjust( iPos, - nLen, true ); int iPosPrev = x_UnlinkElem( iPos ); x_CheckSavedPos(); return iPosPrev; } void CMarkup::x_LinkElem( int iPosParent, int iPosBefore, int iPos ) { // Link in element, and initialize nFlags, and iElem indexes ElemPos* pElem = &m_aPos[iPos]; pElem->iElemParent = iPosParent; if ( iPosBefore ) { // Link in after iPosBefore pElem->nFlags &= ~MNF_FIRST; pElem->iElemNext = m_aPos[iPosBefore].iElemNext; if ( pElem->iElemNext ) m_aPos[pElem->iElemNext].iElemPrev = iPos; else m_aPos[m_aPos[iPosParent].iElemChild].iElemPrev = iPos; m_aPos[iPosBefore].iElemNext = iPos; pElem->iElemPrev = iPosBefore; } else { // Link in as first child pElem->nFlags |= MNF_FIRST; if ( m_aPos[iPosParent].iElemChild ) { pElem->iElemNext = m_aPos[iPosParent].iElemChild; pElem->iElemPrev = m_aPos[pElem->iElemNext].iElemPrev; m_aPos[pElem->iElemNext].iElemPrev = iPos; m_aPos[pElem->iElemNext].nFlags ^= MNF_FIRST; } else { pElem->iElemNext = 0; pElem->iElemPrev = iPos; } 
m_aPos[iPosParent].iElemChild = iPos; } if ( iPosParent ) pElem->SetLevel( m_aPos[iPosParent].Level() + 1 ); } int CMarkup::x_UnlinkElem( int iPos ) { // Fix links to remove element and mark as deleted // return previous position or zero if none ElemPos* pElem = &m_aPos[iPos]; // Find previous sibling and bypass removed element int iPosPrev = 0; if ( pElem->nFlags & MNF_FIRST ) { if ( pElem->iElemNext ) // set next as first child { m_aPos[pElem->iElemParent].iElemChild = pElem->iElemNext; m_aPos[pElem->iElemNext].iElemPrev = pElem->iElemPrev; m_aPos[pElem->iElemNext].nFlags |= MNF_FIRST; } else // no children remaining m_aPos[pElem->iElemParent].iElemChild = 0; } else { iPosPrev = pElem->iElemPrev; m_aPos[iPosPrev].iElemNext = pElem->iElemNext; if ( pElem->iElemNext ) m_aPos[pElem->iElemNext].iElemPrev = iPosPrev; else m_aPos[m_aPos[pElem->iElemParent].iElemChild].iElemPrev = iPosPrev; } x_ReleaseSubDoc( iPos ); return iPosPrev; } int CMarkup::x_ReleasePos( int iPos ) { int iPosNext = m_aPos[iPos].iElemNext; m_aPos[iPos].iElemNext = m_iPosDeleted; m_aPos[iPos].nFlags = MNF_DELETED; m_iPosDeleted = iPos; return iPosNext; } int CMarkup::x_ReleaseSubDoc( int iPos ) { // Mark position structures as deleted by depth first traversal // Tricky because iElemNext used in traversal is overwritten for linked list of deleted // Return value is what iElemNext was before being overwritten // int iPosNext = 0, iPosTop = iPos; while ( 1 ) { if ( m_aPos[iPos].iElemChild ) iPos = m_aPos[iPos].iElemChild; else { while ( 1 ) { iPosNext = x_ReleasePos( iPos ); if ( iPosNext || iPos == iPosTop ) break; iPos = m_aPos[iPos].iElemParent; } if ( iPos == iPosTop ) break; iPos = iPosNext; } } return iPosNext; } bool CMarkup::x_GetMap( SavedPosMap*& pMap, int nMap, int nMapSize /*=7*/ ) { // Find or create map, returns true if map(s) created SavedPosMap** ppMaps = m_SavedPosMapArray.pMaps; int nMapIndex = 0; if ( ppMaps ) { // Length of array is unknown, so loop through maps while ( nMapIndex 
<= nMap ) { pMap = ppMaps[nMapIndex]; if ( ! pMap ) break; if ( nMapIndex == nMap ) return false; // not created ++nMapIndex; } nMapIndex = 0; } // Create map(s) // If you access map 1 before map 0 created, then 2 maps will be created m_SavedPosMapArray.pMaps = new SavedPosMap*[nMap+2]; if ( ppMaps ) { while ( ppMaps[nMapIndex] ) { m_SavedPosMapArray.pMaps[nMapIndex] = ppMaps[nMapIndex]; ++nMapIndex; } delete[] ppMaps; } ppMaps = m_SavedPosMapArray.pMaps; while ( nMapIndex <= nMap ) { ppMaps[nMapIndex] = new SavedPosMap( nMapSize ); ++nMapIndex; } ppMaps[nMapIndex] = NULL; pMap = ppMaps[nMap]; return true; // map(s) created }

void CMarkup::x_CheckSavedPos()
{
	// Drop every saved position whose element is flagged MNF_DELETED
	// Has to run during element removal, before the released index can be handed out again
	SavedPosMap** ppMaps = m_SavedPosMapArray.pMaps;
	if ( ! ppMaps )
		return;

	for ( int nMap = 0; ppMaps[nMap]; ++nMap )
	{
		SavedPosMap* pMap = ppMaps[nMap];
		for ( int nSlot = 0; nSlot < pMap->nMapSize; ++nSlot )
		{
			SavedPos* pEntries = pMap->pTable[nSlot];
			if ( ! pEntries )
				continue;

			// Compact the surviving entries towards the front of the slot's array
			int nKept = 0;
			for ( int nScan = 0; ; ++nScan )
			{
				SavedPos& entry = pEntries[nScan];
				if ( (entry.nSavedPosFlags & SavedPos::SPM_USED)
					&& ! (m_aPos[entry.iPos].nFlags & MNF_DELETED) )
				{
					if ( nKept < nScan )
					{
						pEntries[nKept] = entry;
						pEntries[nKept].nSavedPosFlags &= ~SavedPos::SPM_LAST;
					}
					++nKept;
				}
				if ( entry.nSavedPosFlags & SavedPos::SPM_LAST )
				{
					// End of the array: everything past the survivors is no longer in use
					while ( nKept <= nScan )
						pEntries[nKept++].nSavedPosFlags &= ~SavedPos::SPM_USED;
					break;
				}
			}
		}
	}
}

void CMarkup::x_AdjustForNode( int iPosParent, int iPos, int nShift ) { // Adjust affected indexes bool bAfterPos = true; if ( ! 
iPos ) { // Change happened before or at first element under iPosParent // If there are any children of iPosParent, adjust from there // otherwise start at parent and adjust from there iPos = m_aPos[iPosParent].iElemChild; if ( iPos ) { m_aPos[iPos].nStart += nShift; bAfterPos = false; } else { iPos = iPosParent; m_aPos[iPos].nLength += nShift; } } x_Adjust( iPos, nShift, bAfterPos ); }

bool CMarkup::x_AddNode( int nNodeType, MCD_PCSZ pText, int nNodeFlags )
{
	// Create a node of nNodeType from pText and insert it at the current node position
	// Returns false when x_CreateNode fails; otherwise the new node becomes current
	//
	// Processing instructions, comments and document types are the only node types
	// left without MNF_WITHNOLINES; all others get that flag forced on
	const int nTypesKeepingLines = MNT_PROCESSING_INSTRUCTION | MNT_COMMENT | MNT_DOCUMENT_TYPE;
	if ( (nNodeType & nTypesKeepingLines) == 0 )
		nNodeFlags |= MNF_WITHNOLINES;

	NodePos node( nNodeFlags );
	if ( ! x_CreateNode(node.strMeta, nNodeType, pText) )
		return false;

	// Insert relative to the current node
	node.nNodeType = nNodeType;
	node.nStart = m_nNodeOffset;
	node.nLength = m_nNodeLength;
	int iPosParent = m_iPosParent;
	int iPosBefore = m_iPos;
	int nReplace = x_InsertNew( iPosParent, iPosBefore, node );

	// An element node also needs an ElemPos in the index
	int iPos = iPosBefore;
	if ( nNodeType == MNT_ELEMENT )
	{
		iPos = x_GetFreePos();
		ElemPos& elem = m_aPos[iPos];
		elem.nStart = node.nStart;
		elem.SetStartTagLen( node.nLength );
		elem.SetEndTagLen( 0 );
		elem.nLength = node.nLength;
		elem.iElemChild = 0;
		elem.nFlags = 0;
		// The current position is then the element itself, so the node offset and length are zeroed
		node.nStart = 0;
		node.nLength = 0;
		x_LinkElem( iPosParent, iPosBefore, iPos );
	}

	// Shift the positions after iPos by the net change in document length
	x_AdjustForNode( iPosParent, iPos, MCD_STRLENGTH(node.strMeta) - nReplace );

	// Make the new node current
	m_iPos = iPos;
	m_iPosChild = 0;
	m_nNodeOffset = node.nStart;
	m_nNodeLength = node.nLength;
	m_nNodeType = nNodeType;
	MARKUP_SETDEBUGSTATE;
	return true;
}

void CMarkup::x_RemoveNode( int iPosParent, int& iPos, int& nNodeType, int& nNodeOffset, int& nNodeLength ) { // Remove node and return new position // int 
iPosPrev = iPos; // Removing an element? if ( nNodeType == MNT_ELEMENT ) { nNodeOffset = m_aPos[iPos].nStart; nNodeLength = m_aPos[iPos].nLength; iPosPrev = x_UnlinkElem( iPos ); x_CheckSavedPos(); } // Find previous node type, offset and length int nPrevOffset = 0; if ( iPosPrev ) nPrevOffset = m_aPos[iPosPrev].StartAfter(); else if ( iPosParent ) nPrevOffset = m_aPos[iPosParent].StartContent(); TokenPos token( m_strDoc, m_nDocFlags ); NodePos node; token.nNext = nPrevOffset; int nPrevType = 0; while ( token.nNext < nNodeOffset ) { nPrevOffset = token.nNext; nPrevType = x_ParseNode( token, node ); } int nPrevLength = nNodeOffset - nPrevOffset; if ( ! nPrevLength ) { // Previous node is iPosPrev element nPrevOffset = 0; if ( iPosPrev ) nPrevType = MNT_ELEMENT; } // Remove node from document x_DocChange( nNodeOffset, nNodeLength, MCD_STR() ); x_AdjustForNode( iPosParent, iPosPrev, - nNodeLength ); // Was removed node a lone end tag? if ( nNodeType == MNT_LONE_END_TAG ) { // See if we can unset parent MNF_ILLDATA flag token.nNext = m_aPos[iPosParent].StartContent(); int nEndOfContent = token.nNext + m_aPos[iPosParent].ContentLen(); int iPosChild = m_aPos[iPosParent].iElemChild; while ( token.nNext < nEndOfContent ) { if ( x_ParseNode(token,node) <= 0 ) break; if ( node.nNodeType == MNT_ELEMENT ) { token.nNext = m_aPos[iPosChild].StartAfter(); iPosChild = m_aPos[iPosChild].iElemNext; } } if ( token.nNext == nEndOfContent ) m_aPos[iPosParent].nFlags &= ~MNF_ILLDATA; } nNodeType = nPrevType; nNodeOffset = nPrevOffset; nNodeLength = nPrevLength; iPos = iPosPrev; } // Encoding names // This is a precompiled ASCII hash table for speed and minimum memory requirement // Each entry consists of a 2 digit name length, 5 digit code page, and the encoding name // Each table slot can have multiple entries, table size 150 was chosen for even distribution // MCD_PCSZ EncodingNameTable[150] = { MCD_T("2701148x-ebcdic-international-euro1028599iso_8859-9"), 
MCD_T("0828596ecma-1141420284x-ebcdic-spain"), MCD_T("0751949cseuckr1420127ansi_x3.4-19681420127ansi_x3.4-1986"),MCD_T(""), MCD_T("1410001x-mac-japanese"),MCD_T("0500936cn-gb0828597ecma-118"), MCD_T("0600932x-sjis"),MCD_T("0228591l1"), MCD_T("0228592l20300936gbk0300437437"),MCD_T("1420280x-ebcdic-italy0228593l3"), MCD_T("0628596arabic0228594l42701142x-ebcdic-denmarknorway-euro"), MCD_T("0228595l5"),MCD_T(""),MCD_T("0700950cn-big50900932shift-jis"),MCD_T(""), MCD_T("0800936csgb23120228605l9"),MCD_T(""),MCD_T(""),MCD_T("0920127iso646-us"), MCD_T("0600850ibm850"),MCD_T("0800949ksc_56010600437ibm437"), MCD_T("0600852ibm8520600861ibm861"),MCD_T("0620127ibm367"), MCD_T("1420002x-chinese-eten0320866koi0600737ibm737"), MCD_T("1410007x-mac-cyrillic1601146x-ebcdic-uk-euro1320107x-ia5-swedish0628591ibm819"), MCD_T("0600775ibm775"), MCD_T("0600866ibm8662750937x-ebcdic-traditionalchinese0600857ibm8571020127iso-ir-6us"), MCD_T("1710002x-mac-chinesetrad"),MCD_T(""),MCD_T("0600869ibm869"),MCD_T(""), MCD_T("0500437cspc8"),MCD_T("0551932x-euc1250221_iso-2022-jp1000932csshiftjis"), MCD_T("2000875x-ebcdic-greekmodern3350939x-ebcdic-japaneseandjapaneselatin0601200utf-16"), MCD_T("1320838x-ebcdic-thai"),MCD_T("1028591iso-ir-100"), MCD_T("0800708asmo-7081028594iso-ir-1101028592iso-ir-101"), MCD_T("0628598hebrew"),MCD_T("1057003x-iscii-be"), MCD_T("1901145x-ebcdic-spain-euro1620127iso_646.irv:1991"), MCD_T("1057002x-iscii-de0600949korean"), MCD_T("1628596csisolatinarabic1710008x-mac-chinesesimp"),MCD_T(""), MCD_T("1028595iso-ir-1440520866koi8r1028597iso-ir-1261057008x-iscii-ka"), MCD_T("1028596iso-ir-1272420880x-ebcdic-cyrillicrussian1028593iso-ir-109"), MCD_T("1000936gb_2312-801057009x-iscii-ma1128605iso_8859-15"), MCD_T("1028598iso-ir-138"), MCD_T("1901144x-ebcdic-italy-euro1028599iso-ir-1480928591iso8859-1"), MCD_T("0928592iso8859-21057011x-iscii-pa1000949iso-ir-1490820127us-ascii"), MCD_T("1220106x-ia5-german"),MCD_T(""), MCD_T("2320833x-ebcdic-koreanextended1057006x-iscii-as"), 
MCD_T("1057004x-iscii-ta"),MCD_T("1000932x-ms-cp932"),MCD_T(""), MCD_T("0800936gb2312800721866koi8-ru"), MCD_T("1057005x-iscii-te1300949csksc56011987"),MCD_T(""),MCD_T(""), MCD_T("0400950big52650935x-ebcdic-simplifiedchinese1057010x-iscii-gu"), MCD_T("0500775cp5000628598visual3321025x-ebcdic-cyrillicserbianbulgarian"), MCD_T("1150221csiso2022jp1765000unicode-1-1-utf-7"), MCD_T("1765001unicode-1-1-utf-81765001unicode-2-0-utf-8"), MCD_T("4551932extended_unix_code_packed_format_for_japanese0900932shift_jis"), MCD_T("1057007x-iscii-or1150225csiso2022kr"),MCD_T(""), MCD_T("1951932cseucpkdfmtjapanese0501361johab0910000macintosh"), MCD_T("0829001x-europa1620273x-ebcdic-germany"), MCD_T("1628598csisolatinhebrew"),MCD_T("1320000x-chinese-cns0500437cp437"), MCD_T("0500852cp8520500870cp870"),MCD_T("0520127ascii0520127cp367"), MCD_T("1500420x-ebcdic-arabic"),MCD_T("0528591cp819"),MCD_T("0700720dos-720"), MCD_T("1210004x-mac-arabic0500866cp8660700949ksc5601"), MCD_T("0528597greek1965000x-unicode-2-0-utf-7"), MCD_T("1400949ks_c_5601-19871965001x-unicode-2-0-utf-8"),MCD_T(""), MCD_T("0420866koi82520290x-ebcdic-japanesekatakana1400949ks_c_5601-1989"), MCD_T(""),MCD_T("1128592csisolatin20700862dos-862"), MCD_T("1100437codepage4370201252us"),MCD_T("1128594csisolatin40700874tis-620"), MCD_T("1128595csisolatin50700874dos-874"), MCD_T("1501143x-ebcdic-france1200037ebcdic-cp-us1520108x-ia5-norwegian"), MCD_T("1820873x-ebcdic-icelandic"), MCD_T("1150220iso-2022-jp1100874windows-874"),MCD_T("0620866koi8-r"), MCD_T("0800950x-x-big51510079x-mac-icelandic"), MCD_T("2220278x-ebcdic-finlandsweden2850933x-ebcdic-japaneseanduscanada1620905x-ebcdic-turkish1150225iso-2022-kr"), MCD_T("0621866koi8-u"),MCD_T("1028591iso-8859-1"), MCD_T("1028592iso-8859-21310081x-mac-turkish"), MCD_T("0828597elot_9280800932ms_kanji1028593iso-8859-30801252iso-ir-6"), MCD_T("1028594iso-8859-40528593csiso1565000csunicode11utf7"), MCD_T("1028595iso-8859-5"),MCD_T("1028596iso-8859-6"), 
MCD_T("1028597iso-8859-73250933x-ebcdic-koreanandkoreanextended"), MCD_T("1238598iso-8859-8-i1028598iso-8859-81520424x-ebcdic-hebrew1650222_iso-2022-jp$sio"), MCD_T("0900936gb2312-801028599iso-8859-91101201unicodefffe"), MCD_T("0600936gb2312"),MCD_T("1210005x-mac-hebrew"),MCD_T(""),MCD_T(""), MCD_T("2101143x-ebcdic-germany-euro1210003x-mac-korean"),MCD_T(""), MCD_T("0720866cskoi8r1528597csisolatingreek"),MCD_T(""), MCD_T("1500936csiso58gb2312800828595cyrillic"),MCD_T(""),MCD_T(""), MCD_T("0601026cp1026"),MCD_T(""),MCD_T(""), MCD_T("0810029x-mac-ce0900949ks_c_5601"),MCD_T(""), MCD_T("0601256cp12561052936hz-gb-23121528591iso_8859-1:1987"), MCD_T("1528592iso_8859-2:1987"), MCD_T("1000936csgb2312802450930x-ebcdic-japaneseandkana"), MCD_T("1528593iso_8859-3:19880520105x-ia5"), MCD_T("1528594iso_8859-4:19880651936euc-cn"), MCD_T("1528596iso_8859-6:19871528595iso_8859-5:19882220277x-ebcdic-denmarknorway"), MCD_T("0600950csbig51528597iso_8859-7:19872001147x-ebcdic-france-euro"), MCD_T("1201250windows-12502301149x-ebcdic-icelandic-euro"), MCD_T("1201251windows-12511528598iso_8859-8:1988"), MCD_T("0801250x-cp12501201252windows-1252"), MCD_T("0801251x-cp12511201253windows-12531400949ks_c_5601_19871528599iso_8859-9:1989"), MCD_T("2701143x-ebcdic-finlandsweden-euro1201254windows-1254"), MCD_T("1201255windows-1255"),MCD_T("1201256windows-12560651932euc-jp"), MCD_T("1201257windows-12570738598logical"), MCD_T("0628597greek81201258windows-1258"),MCD_T("0600949euc-kr"),MCD_T(""), MCD_T("0700936chinese0565000utf-70720127csascii0628591latin1"), MCD_T("0628592latin20851936x-euc-cn1120285x-ebcdic-uk0565001utf-8"), MCD_T("0628593latin3"), MCD_T("0628594latin41420423x-ebcdic-greek1901140x-ebcdic-cp-us-euro"), MCD_T("0628599latin5"),MCD_T("1828595csisolatincyrillic"), MCD_T("1110006x-mac-greek1100874iso-8859-11"), MCD_T("1028591iso_8859-10601252x-ansi"), MCD_T("1028592iso_8859-20628605latin90701200unicode"), MCD_T("1028593iso_8859-3"), 
MCD_T("1028594iso_8859-40851932x-euc-jp1128605iso-8859-15"), MCD_T("1028595iso_8859-5"),MCD_T("1028596iso_8859-6"), MCD_T("1028597iso_8859-7"), MCD_T("0900936iso-ir-581028598iso_8859-81201255iso_8859-8-i1200932cswindows31j") }; int CMarkup::x_GetEncodingCodePage( MCD_CSTR pszEncoding ) { // redo for completeness, the iconv set, UTF-32, and uppercase // Lookup strEncoding in EncodingNameTable and return Windows code page int nCodePage = -1; int nEncLen = MCD_PSZLEN( pszEncoding ); if ( ! nEncLen ) nCodePage = MCD_ACP; else if ( MCD_PSZNCMP(pszEncoding,MCD_T("UTF-32"),6) == 0 ) nCodePage = MCD_UTF32; else if ( nEncLen < 100 ) { MCD_CHAR szEncodingLower[100]; for ( int nEncChar=0; nEncChar='A' && cEncChar<='Z')? (MCD_CHAR)(cEncChar+('a'-'A')) : cEncChar; } szEncodingLower[nEncLen] = '\0'; MCD_PCSZ pEntry = EncodingNameTable[x_Hash(szEncodingLower,sizeof(EncodingNameTable)/sizeof(MCD_PCSZ))]; while ( *pEntry ) { // e.g. entry: 0565001utf-8 means length 05, code page 65001, encoding name utf-8 int nEntryLen = (*pEntry - '0') * 10; ++pEntry; nEntryLen += (*pEntry - '0'); ++pEntry; MCD_PCSZ pCodePage = pEntry; pEntry += 5; if ( nEntryLen == nEncLen && MCD_PSZNCMP(szEncodingLower,pEntry,nEntryLen) == 0 ) { // Convert digits to integer up to code name which always starts with alpha nCodePage = MCD_PSZTOL( pCodePage, NULL, 10 ); break; } pEntry += nEntryLen; } } return nCodePage; } #if ! 
defined(MARKUP_WCHAR) bool CMarkup::x_CanConvert( MCD_CSTR pszToEncoding, MCD_CSTR pszFromEncoding ) { // Return true if MB to MB conversion is possible #if defined(MARKUP_ICONV) // iconv_open should fail if either encoding not supported or one is alias for other char szTo[100], szFrom[100]; iconv_t cd = iconv_open( x_IConvName(szTo,pszToEncoding), x_IConvName(szFrom,pszFromEncoding) ); if ( cd == (iconv_t)-1 ) return false; iconv_close(cd); #else int nToCP = x_GetEncodingCodePage( pszToEncoding ); int nFromCP = x_GetEncodingCodePage( pszFromEncoding ); if ( nToCP == -1 || nFromCP == -1 ) return false; #if defined(MARKUP_WINCONV) if ( nToCP == MCD_ACP || nFromCP == MCD_ACP ) // either ACP ANSI? { int nACP = GetACP(); if ( nToCP == MCD_ACP ) nToCP = nACP; if ( nFromCP == MCD_ACP ) nFromCP = nACP; } #else // no conversion API, but we can do AToUTF8 and UTF8ToA if ( nToCP != MCD_UTF8 && nFromCP != MCD_UTF8 ) // either UTF-8? return false; #endif // no conversion API if ( nToCP == nFromCP ) return false; #endif // not ICONV return true; } #endif // not WCHAR #if defined(MARKUP_ICONV) const char* CMarkup::x_IConvName( char* szEncoding, MCD_CSTR pszEncoding ) { // Make upper case char-based name from strEncoding which consists only of characters in the ASCII range int nEncChar = 0; while ( pszEncoding[nEncChar] ) { char cEncChar = (char)pszEncoding[nEncChar]; szEncoding[nEncChar] = (cEncChar>='a' && cEncChar<='z')? 
(cEncChar-('a'-'A')) : cEncChar; ++nEncChar; } szEncoding[nEncChar] = '\0'; return szEncoding; } int CMarkup::x_IConv( ConvertEncoding& convert, void* pTo, int nToCharSize, int nFromCharSize ) { // Converts from any MB/UTF-8 to MB/UTF-8 char szTo[100], szFrom[100]; iconv_t cd = iconv_open( x_IConvName(szTo,convert.strToEncoding), x_IConvName(szFrom,convert.strFromEncoding) ); int nToLenBytes = 0; if ( cd != (iconv_t)-1 ) { size_t nFromLenRemaining = (size_t)convert.nFromLen * nFromCharSize; size_t nToCountRemaining = (size_t)convert.nToCount * nToCharSize; size_t nToCountRemainingBefore; char* pToChar = (char*)pTo; char* pFromChar = (char*)convert.pFrom; char* pToTempBuffer = NULL; const size_t nTempBufferSize = 2048; size_t nResult; if ( ! pTo ) { pToTempBuffer = new char[nTempBufferSize]; pToChar = pToTempBuffer; nToCountRemaining = nTempBufferSize; } while ( nFromLenRemaining ) { nToCountRemainingBefore = nToCountRemaining; nResult = iconv( cd, &pFromChar, &nFromLenRemaining, &pToChar, &nToCountRemaining ); nToLenBytes += (int)(nToCountRemainingBefore - nToCountRemaining); if ( nResult == (size_t)-1 ) { // Bypass bad char, question mark denotes problem in source string pFromChar += nFromCharSize; nFromLenRemaining -= nFromCharSize; if ( nToCharSize == 1 ) *pToChar = '?'; else if ( nToCharSize == 2 ) *((unsigned short*)pToChar) = (unsigned short)'?'; else if ( nToCharSize == 4 ) *((unsigned int*)pToChar) = (unsigned int)'?'; pToChar += nToCharSize; nToCountRemaining -= nToCharSize; } else convert.nFailedChars += nResult; if ( pToTempBuffer && nToCountRemaining < 10 ) { nToCountRemaining = nTempBufferSize; pToChar = pToTempBuffer; } } if ( pToTempBuffer ) delete[] pToTempBuffer; iconv_close(cd); } return nToLenBytes / nToCharSize; } #endif int CMarkup::x_ConvertEncoding( ConvertEncoding& convert, void* pTo ) { // If pTo is not NULL, it must be large enough to hold result, length of result is returned // convert.nFailedChars will be set to >0 if characters not 
supported in strToEncoding int nToLen = 0; int nToCP = x_GetEncodingCodePage( convert.strToEncoding ); if ( nToCP == -1 ) nToCP = MCD_ACP; int nFromCP = x_GetEncodingCodePage( convert.strFromEncoding ); if ( nFromCP == -1 ) nFromCP = MCD_ACP; convert.nFailedChars = 0; #if ! defined(MARKUP_WINCONV) && ! defined(MARKUP_ICONV) // Only non-Unicode encoding supported is locale charset, must call setlocale if ( nToCP != MCD_UTF8 && nToCP != MCD_UTF16 && nToCP != MCD_UTF32 ) nToCP = MCD_ACP; if ( nFromCP != MCD_UTF8 && nFromCP != MCD_UTF16 && nFromCP != MCD_UTF32 ) nFromCP = MCD_ACP; if ( nFromCP == MCD_ACP ) { const char* pA = (const char*)convert.pFrom; int nALenRemaining = convert.nFromLen; int nCharLen; wchar_t wcChar; char* pU = (char*)pTo; while ( nALenRemaining ) { nCharLen = mbtowc( &wcChar, pA, nALenRemaining ); if ( nCharLen < 1 ) { wcChar = (wchar_t)'?'; nCharLen = 1; } pA += nCharLen; nALenRemaining -= nCharLen; if ( nToCP == MCD_UTF8 ) EncodeCharUTF8( (int)wcChar, pU, nToLen ); else if ( nToCP == MCD_UTF16 ) EncodeCharUTF16( (int)wcChar, (unsigned short*)pU, nToLen ); else // UTF32 { if ( pU ) ((unsigned int*)pU)[nToLen] = (unsigned int)wcChar; ++nToLen; } } } else if ( nToCP == MCD_ACP ) { union pUnicodeUnion { const char* p8; const unsigned short* p16; const unsigned int* p32; } pU; pU.p8 = (const char*)convert.pFrom; const char* pUEnd = pU.p8 + convert.nFromLen; if ( nFromCP == MCD_UTF16 ) pUEnd = (char*)( pU.p16 + convert.nFromLen ); else if ( nFromCP == MCD_UTF32 ) pUEnd = (char*)( pU.p32 + convert.nFromLen ); int nCharLen; char* pA = (char*)pTo; char szA[8]; int nUChar; while ( pU.p8 != pUEnd ) { if ( nFromCP == MCD_UTF8 ) nUChar = DecodeCharUTF8( pU.p8, pUEnd ); else if ( nFromCP == MCD_UTF16 ) nUChar = DecodeCharUTF16( pU.p16, (const unsigned short*)pUEnd ); else // UTF32 nUChar = *(pU.p32)++; if ( nUChar == -1 ) nCharLen = -2; else if ( nUChar & ~0xffff ) nCharLen = -1; else nCharLen = wctomb( pA?pA:szA, (wchar_t)nUChar ); if ( nCharLen < 0 ) { if ( 
nCharLen == -1 ) ++convert.nFailedChars; nCharLen = 1; if ( pA ) *pA = '?'; } if ( pA ) pA += nCharLen; nToLen += nCharLen; } } #endif // not WINCONV and not ICONV if ( nFromCP == MCD_UTF32 ) { const unsigned int* p32 = (const unsigned int*)convert.pFrom; const unsigned int* p32End = p32 + convert.nFromLen; if ( nToCP == MCD_UTF8 ) { char* p8 = (char*)pTo; while ( p32 != p32End ) EncodeCharUTF8( *p32++, p8, nToLen ); } else if ( nToCP == MCD_UTF16 ) { unsigned short* p16 = (unsigned short*)pTo; while ( p32 != p32End ) EncodeCharUTF16( (int)*p32++, p16, nToLen ); } else // to ANSI { // WINCONV not supported for 32To8, since only used for sizeof(wchar_t) == 4 #if defined(MARKUP_ICONV) nToLen = x_IConv( convert, pTo, 1, 4 ); #endif // ICONV } } else if ( nFromCP == MCD_UTF16 ) { // UTF16To8 will be deprecated since weird output buffer size sensitivity not worth implementing here const unsigned short* p16 = (const unsigned short*)convert.pFrom; const unsigned short* p16End = p16 + convert.nFromLen; int nUChar; if ( nToCP == MCD_UTF32 ) { unsigned int* p32 = (unsigned int*)pTo; while ( p16 != p16End ) { nUChar = DecodeCharUTF16( p16, p16End ); if ( nUChar == -1 ) nUChar = '?'; if ( p32 ) p32[nToLen] = (unsigned int)nUChar; ++nToLen; } } #if defined(MARKUP_WINCONV) else // to UTF-8 or other multi-byte { nToLen = WideCharToMultiByte(nToCP,0,(const wchar_t*)convert.pFrom,convert.nFromLen,(char*)pTo, convert.nToCount?convert.nToCount+1:0,NULL, (nToCP==CP_UTF8)?NULL:&convert.nFailedChars); // fails with lpUsedDefaultChar and CP_UTF8 } #else // not WINCONV else if ( nToCP == MCD_UTF8 ) { char* p8 = (char*)pTo; while ( p16 != p16End ) { nUChar = DecodeCharUTF16( p16, p16End ); if ( nUChar == -1 ) nUChar = '?'; EncodeCharUTF8( nUChar, p8, nToLen ); } } else // to ANSI { #if defined(MARKUP_ICONV) nToLen = x_IConv( convert, pTo, 1, 2 ); #endif // ICONV } #endif // not WINCONV } else if ( nToCP == MCD_UTF16 ) // to UTF-16 from UTF-8/ANSI { #if defined(MARKUP_WINCONV) nToLen = 
MultiByteToWideChar(nFromCP,0,(const char*)convert.pFrom,convert.nFromLen,(wchar_t*)pTo,convert.nToCount); #else // not WINCONV if ( nFromCP == MCD_UTF8 ) { const char* p8 = (const char*)convert.pFrom; const char* p8End = p8 + convert.nFromLen; int nUChar; unsigned short* p16 = (unsigned short*)pTo; while ( p8 != p8End ) { nUChar = DecodeCharUTF8( p8, p8End ); if ( nUChar == -1 ) nUChar = '?'; if ( p16 ) p16[nToLen] = (unsigned short)nUChar; ++nToLen; } } else // from ANSI { #if defined(MARKUP_ICONV) nToLen = x_IConv( convert, pTo, 2, 1 ); #endif // ICONV } #endif // not WINCONV } else if ( nToCP == MCD_UTF32 ) // to UTF-32 from UTF-8/ANSI { if ( nFromCP == MCD_UTF8 ) { const char* p8 = (const char*)convert.pFrom; const char* p8End = p8 + convert.nFromLen; int nUChar; unsigned int* p32 = (unsigned int*)pTo; while ( p8 != p8End ) { nUChar = DecodeCharUTF8( p8, p8End ); if ( nUChar == -1 ) nUChar = '?'; if ( p32 ) p32[nToLen] = (unsigned int)nUChar; ++nToLen; } } else // from ANSI { // WINCONV not supported for ATo32, since only used for sizeof(wchar_t) == 4 #if defined(MARKUP_ICONV) nToLen = x_IConv( convert, pTo, 4, 1 ); #endif // ICONV } } else { #if defined(MARKUP_ICONV) nToLen = x_IConv( convert, pTo, 1, 1 ); #elif defined(MARKUP_WINCONV) wchar_t* pwszUTF16 = new wchar_t[convert.nFromLen]; int nUTF16Len = MultiByteToWideChar(nFromCP,0,(const char*)convert.pFrom,convert.nFromLen,pwszUTF16,convert.nFromLen); nToLen = WideCharToMultiByte(nToCP,0,pwszUTF16,nUTF16Len,(char*)pTo,convert.nToCount,NULL, (nToCP==CP_UTF8)?NULL:&convert.nFailedChars); // cannot specify lpUsedDefaultChar if UTF-8! 
delete[] pwszUTF16; #endif // WINCONV } // Store the length in case this is called again after allocating output buffer to fit convert.nToCount = nToLen; return nToLen; } bool CMarkup::x_EndianSwapRequired( int nDocFlags ) { short nWord = 1; char cFirstByte = ((char*)&nWord)[0]; if ( cFirstByte ) // LE { if ( nDocFlags & MDF_UTF16BEFILE ) return true; } else if ( nDocFlags & MDF_UTF16LEFILE ) return true; return false; } void CMarkup::x_EndianSwapUTF16( unsigned short* pBuffer, int nCharLen ) { unsigned short cChar; while ( nCharLen-- ) { cChar = pBuffer[nCharLen]; pBuffer[nCharLen] = (unsigned short)((cChar<<8) | (cChar>>8)); } } struct BomTableStruct { const char* pszBom; int nBomLen; MCD_PCSZ pszBomEnc; int nBomFlag; } BomTable[] = { { "\xef\xbb\xbf", 3, MCD_T("UTF-8"), CMarkup::MDF_UTF8PREAMBLE }, { "\xff\xfe", 2, MCD_T("UTF-16LE"), CMarkup::MDF_UTF16LEFILE }, { "\xfe\xff", 2, MCD_T("UTF-16BE"), CMarkup::MDF_UTF16BEFILE }, { NULL,0,NULL,0 } }; bool CMarkup::x_Open( MCD_CSTR_FILENAME szFileName, FilePos& file ) { MCD_STRCLEAR( file.strIOResult ); // Open file MCD_PCSZ_FILENAME pMode = MCD_T_FILENAME("rb"); if ( file.nDocFlags & MDF_APPENDFILE ) pMode = MCD_T_FILENAME("ab"); else if ( file.nDocFlags & MDF_WRITEFILE ) pMode = MCD_T_FILENAME("wb"); file.fp = NULL; MCD_FOPEN( file.fp, szFileName, pMode ); if ( ! 
file.fp ) { file.strIOResult = x_GetLastError(); return false; } // Prepare file bool bSuccess = true; int nBomLen = 0; if ( file.nDocFlags & MDF_READFILE ) { // Get file length fseek( file.fp, 0, SEEK_END ); file.nFileByteLen = ftell( file.fp ); fseek( file.fp, 0, SEEK_SET ); // Read the top of the file to check BOM and encoding int nReadTop = 1024; if ( file.nFileByteLen < nReadTop ) nReadTop = file.nFileByteLen; if ( nReadTop ) { char* pFileTop = new char[nReadTop]; if ( nReadTop ) bSuccess = ( fread( pFileTop, nReadTop, 1, file.fp ) == 1 ); if ( bSuccess ) { // Check for Byte Order Mark (preamble) int nBomCheck = 0; file.nDocFlags &= ~( MDF_UTF16LEFILE | MDF_UTF8PREAMBLE ); file.nFileCharUnitSize = 1; // unless UTF-16 BOM found while ( BomTable[nBomCheck].pszBom ) { while ( nBomLen < BomTable[nBomCheck].nBomLen ) { if ( nBomLen >= nReadTop || pFileTop[nBomLen] != BomTable[nBomCheck].pszBom[nBomLen] ) break; ++nBomLen; } if ( nBomLen == BomTable[nBomCheck].nBomLen ) { file.nDocFlags |= BomTable[nBomCheck].nBomFlag; file.nFileByteLen -= nBomLen; if ( nBomLen == 2 ) file.nFileCharUnitSize = 2; file.strEncoding = BomTable[nBomCheck].pszBomEnc; break; } ++nBomCheck; nBomLen = 0; } if ( nReadTop > nBomLen ) fseek( file.fp, nBomLen, SEEK_SET ); // Encoding check if ( ! 
nBomLen ) { MCD_STR strDeclCheck; #if defined(MARKUP_WCHAR) // WCHAR ConvertEncoding convert( MCD_ENC, MCD_T("UTF-8"), (const void*)pFileTop, nReadTop ); MCD_CHAR* pWideBuffer = MCD_GETBUFFER(strDeclCheck,nReadTop); convert.nToCount = nReadTop; int nDeclWideLen = x_ConvertEncoding( convert, (void*)pWideBuffer ); MCD_RELEASEBUFFER(strDeclCheck,pWideBuffer,nDeclWideLen); #else // not WCHAR MCD_STRASSIGN(strDeclCheck,pFileTop,nReadTop); #endif // not WCHAR file.strEncoding = GetDeclaredEncoding( strDeclCheck ); } // Assume markup files starting with < sign are UTF-8 if otherwise unknown if ( MCD_STRISEMPTY(file.strEncoding) && pFileTop[0] == '<' ) file.strEncoding = MCD_T("UTF-8"); } delete [] pFileTop; } } else if ( file.nDocFlags & MDF_WRITEFILE ) { file.nFileCharUnitSize = 1; if ( file.nDocFlags & MDF_APPENDFILE ) file.nFileByteLen = ftell( file.fp ); if ( ! file.nFileByteLen ) { int nBomCheck = 0; while ( BomTable[nBomCheck].pszBom ) { if ( file.nDocFlags & BomTable[nBomCheck].nBomFlag ) { nBomLen = BomTable[nBomCheck].nBomLen; if ( nBomLen == 2 ) { file.nFileCharUnitSize = 2; file.strEncoding = BomTable[nBomCheck].pszBomEnc; } bSuccess = ( fwrite(BomTable[nBomCheck].pszBom,nBomLen,1,file.fp) == 1 ); break; } ++nBomCheck; } } } if ( ! bSuccess ) { file.strIOResult = x_GetLastError(); return false; } if ( ! 
MCD_STRISEMPTY(file.strEncoding) ) { file.strIOResult = file.strEncoding; if ( nBomLen ) file.strIOResult += MCD_T(" BOM +"); file.strIOResult += MCD_T(" "); } return bSuccess; } bool CMarkup::x_Read( void* pBuffer, FilePos& file ) { MCD_CHAR szReadInfo[100] = {0}; bool bSuccess = ( fread( pBuffer,file.nReadByteLen,1,file.fp) == 1 ); file.nFileTextLen = file.nReadByteLen / file.nFileCharUnitSize; if ( bSuccess ) { MCD_SPRINTF( MCD_SSZ(szReadInfo), MCD_T("length %d "), file.nFileTextLen ); file.strIOResult = szReadInfo; // Microsoft components can produce apparently valid docs with some nulls at ends of values int nNullCount = 0; int nNullCheckCharsRemaining = file.nFileTextLen; char* pAfterNull = NULL; char* pNullScan = (char*)pBuffer; bool bSingleByteChar = file.nFileCharUnitSize == 1; while ( nNullCheckCharsRemaining-- ) { if ( bSingleByteChar? (! *pNullScan) : (! (*(unsigned short*)pNullScan)) ) { if ( pAfterNull && pNullScan != pAfterNull ) memmove( pAfterNull - (nNullCount*file.nFileCharUnitSize), pAfterNull, pNullScan - pAfterNull ); pAfterNull = pNullScan + file.nFileCharUnitSize; ++nNullCount; } pNullScan += file.nFileCharUnitSize; } if ( pAfterNull && pNullScan != pAfterNull ) memmove( pAfterNull - (nNullCount*file.nFileCharUnitSize), pAfterNull, pNullScan - pAfterNull ); if ( nNullCount ) { MCD_SPRINTF( MCD_SSZ(szReadInfo), MCD_T("(%d nulls removed) "), nNullCount ); file.strIOResult += szReadInfo; file.nFileTextLen -= nNullCount; } // Big endian/little endian conversion if ( file.nFileCharUnitSize > 1 && x_EndianSwapRequired(file.nDocFlags) ) { x_EndianSwapUTF16( (unsigned short*)pBuffer, file.nFileTextLen ); file.strIOResult += MCD_T("endian swap "); } } if ( ! bSuccess ) file.strIOResult = x_GetLastError(); return bSuccess; } bool CMarkup::x_ReadText( MCD_STR& strDoc, FilePos& file ) { bool bSuccess = true; MCD_CHAR szReadInfo[100] = {0}; MCD_STRCLEAR( file.strIOResult ); if ( ! 
file.nReadByteLen ) { file.strIOResult = MCD_T("0 length "); return bSuccess; } if ( file.nDocFlags & (MDF_UTF16LEFILE | MDF_UTF16BEFILE) ) { int nUTF16Len = file.nReadByteLen / 2; #if defined(MARKUP_WCHAR) // WCHAR int nBufferSizeForGrow = nUTF16Len + nUTF16Len/100; // extra 1% #if MARKUP_SIZEOFWCHAR == 4 // sizeof(wchar_t) == 4 unsigned short* pUTF16Buffer = new unsigned short[nUTF16Len+1]; bSuccess = x_Read( pUTF16Buffer, file ); if ( bSuccess ) { ConvertEncoding convert( MCD_T("UTF-32"), MCD_T("UTF-16"), (const void*)pUTF16Buffer, nUTF16Len ); convert.nToCount = nBufferSizeForGrow; MCD_CHAR* pUTF32Buffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow); int nUTF32Len = x_ConvertEncoding( convert, (void*)pUTF32Buffer ); MCD_RELEASEBUFFER(strDoc,pUTF32Buffer,nUTF32Len); MCD_SPRINTF( MCD_SSZ(szReadInfo), MCD_T("to UTF-32 length %d "), nUTF32Len ); } #else // sizeof(wchar_t) == 2 MCD_CHAR* pUTF16Buffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow); bSuccess = x_Read( pUTF16Buffer, file ); MCD_RELEASEBUFFER(strDoc,pUTF16Buffer,file.nFileTextLen); #endif // sizeof(wchar_t) == 2 #else // not WCHAR // Convert file from UTF-16; it needs to be in memory as UTF-8 or MBCS unsigned short* pUTF16Buffer = new unsigned short[nUTF16Len+1]; bSuccess = x_Read( pUTF16Buffer, file ); nUTF16Len = file.nFileTextLen; ConvertEncoding convert( MCD_ENC, MCD_T("UTF-16"), (const void*)pUTF16Buffer, nUTF16Len ); int nMBLen = x_ConvertEncoding( convert, NULL ); int nBufferSizeForGrow = nMBLen + nMBLen/100; // extra 1% MCD_CHAR* pMBBuffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow); x_ConvertEncoding( convert, (void*)pMBBuffer ); delete [] pUTF16Buffer; MCD_RELEASEBUFFER(strDoc,pMBBuffer,nMBLen); MCD_SPRINTF( MCD_SSZ(szReadInfo), MCD_T("to %d bytes "), nMBLen ); if ( convert.nFailedChars ) MCD_PSZCAT( szReadInfo, MCD_T("(chars lost in conversion!) ") ); #endif // not WCHAR } else // single or multibyte file (i.e. 
not UTF-16) { #if defined(MARKUP_WCHAR) // WCHAR char* pBuffer = new char[file.nReadByteLen]; bSuccess = x_Read( pBuffer, file ); if ( MCD_STRISEMPTY(file.strEncoding) ) { if ( DetectUTF8(pBuffer,file.nReadByteLen) ) file.strEncoding = MCD_T("UTF-8"); file.strIOResult += MCD_T("(used UTF-8 detection) "); } ConvertEncoding convert( MCD_ENC, file.strEncoding, (const void*)pBuffer, file.nFileTextLen ); int nWideLen = x_ConvertEncoding( convert, NULL ); int nBufferSizeForGrow = nWideLen + nWideLen/100; // extra 1% MCD_CHAR* pWideBuffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow); x_ConvertEncoding( convert, (void*)pWideBuffer ); MCD_RELEASEBUFFER( strDoc, pWideBuffer, nWideLen ); delete [] pBuffer; MCD_SPRINTF( MCD_SSZ(szReadInfo), MCD_T("to %d wide chars "), nWideLen ); #else // not WCHAR // After loading a file with unknown multi-byte encoding bool bAssumeUnknownIsNative = false; if ( MCD_STRISEMPTY(file.strEncoding) ) { bAssumeUnknownIsNative = true; file.strEncoding = MCD_ENC; } if ( x_CanConvert(MCD_ENC,file.strEncoding) ) { char* pBuffer = new char[file.nReadByteLen]; bSuccess = x_Read( pBuffer, file ); ConvertEncoding convert( MCD_ENC, file.strEncoding, (const void*)pBuffer, file.nFileTextLen ); int nMBLen = x_ConvertEncoding( convert, NULL ); int nBufferSizeForGrow = nMBLen + nMBLen/100; // extra 1% MCD_CHAR* pMBBuffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow); x_ConvertEncoding( convert, (void*)pMBBuffer ); MCD_RELEASEBUFFER( strDoc, pMBBuffer, nMBLen ); delete [] pBuffer; MCD_SPRINTF( MCD_SSZ(szReadInfo), MCD_T("to length %d "), nMBLen ); if ( convert.nFailedChars ) MCD_PSZCAT( szReadInfo, MCD_T("(chars lost in conversion!) 
") ); } else // load directly into string { int nBufferSizeForGrow = file.nReadByteLen + file.nReadByteLen/100; // extra 1% MCD_CHAR* pBuffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow); bSuccess = x_Read( pBuffer, file ); MCD_RELEASEBUFFER( strDoc, pBuffer, file.nFileTextLen ); // Might need additional conversion if we assumed an encoding if ( bAssumeUnknownIsNative ) { int nNonASCII; bool bIsUTF8 = DetectUTF8( MCD_2PCSZ(strDoc), file.nReadByteLen, &nNonASCII ); MCD_STR strDetectedEncoding = bIsUTF8? MCD_T("UTF-8"): MCD_T(""); if ( nNonASCII && file.strEncoding != strDetectedEncoding ) // only need to convert non-ASCII { file.strEncoding = strDetectedEncoding; ConvertEncoding convert( MCD_ENC, file.strEncoding, MCD_2PCSZ(strDoc), file.nFileTextLen ); int nMBLen = x_ConvertEncoding( convert, NULL ); nBufferSizeForGrow = nMBLen + nMBLen/100; // extra 1% MCD_STR strConvDoc; MCD_CHAR* pBuffer = MCD_GETBUFFER(strConvDoc,nBufferSizeForGrow); x_ConvertEncoding( convert, (void*)pBuffer ); MCD_RELEASEBUFFER( strConvDoc, pBuffer, nMBLen ); strDoc = strConvDoc; MCD_SPRINTF( MCD_SSZ(szReadInfo), MCD_T("to length %d "), nMBLen ); if ( convert.nFailedChars ) MCD_PSZCAT( szReadInfo, MCD_T("(chars lost in conversion!) ") ); } MCD_PSZCAT( szReadInfo, MCD_T("(used UTF-8 detection) ") ); } } #endif // not WCHAR } if ( bSuccess ) file.strIOResult += szReadInfo; else file.strIOResult = x_GetLastError(); return bSuccess; } bool CMarkup::x_Write( void* pBuffer, FilePos& file, const void* pConstBuffer /*=NULL*/ ) { MCD_CHAR szWriteInfo[100] = {0}; size_t nDocByteLength = file.nFileTextLen * file.nFileCharUnitSize; if ( ! pConstBuffer ) pConstBuffer = pBuffer; unsigned short* pTempEndianBuffer = NULL; if ( x_EndianSwapRequired(file.nDocFlags) ) { if ( ! 
pBuffer ) { pTempEndianBuffer = new unsigned short[file.nFileTextLen]; memcpy( pTempEndianBuffer, pConstBuffer, file.nFileTextLen * 2 ); pBuffer = pTempEndianBuffer; pConstBuffer = pTempEndianBuffer; } x_EndianSwapUTF16( (unsigned short*)pBuffer, file.nFileTextLen ); file.strIOResult += MCD_T("endian swap "); } bool bSuccess = ( fwrite( pConstBuffer, nDocByteLength, 1, file.fp ) == 1 ); if ( pTempEndianBuffer ) delete [] pTempEndianBuffer; if ( bSuccess ) { MCD_SPRINTF( MCD_SSZ(szWriteInfo), MCD_T("length %d "), file.nFileTextLen ); file.strIOResult += szWriteInfo; } else file.strIOResult = x_GetLastError(); return bSuccess; } bool CMarkup::x_WriteText( const MCD_STR& strDoc, FilePos& file ) { bool bSuccess = true; MCD_CHAR szWriteInfo[100] = {0}; MCD_STRCLEAR( file.strIOResult ); MCD_PCSZ pDoc = MCD_2PCSZ(strDoc); int nWriteStrLen = MCD_STRLENGTH(strDoc); if ( ! nWriteStrLen ) { file.strIOResult = MCD_T("0 length "); return bSuccess; } if ( file.nDocFlags & (MDF_UTF16LEFILE | MDF_UTF16BEFILE) ) { #if defined(MARKUP_WCHAR) // WCHAR #if MARKUP_SIZEOFWCHAR == 4 // sizeof(wchar_t) == 4 ConvertEncoding convert( MCD_T("UTF-16"), MCD_T("UTF-32"), (const void*)pDoc, nWriteStrLen ); file.nFileTextLen = x_ConvertEncoding( convert, NULL ); unsigned short* pUTF16Buffer = new unsigned short[file.nFileTextLen]; x_ConvertEncoding( convert, (void*)pUTF16Buffer ); MCD_SPRINTF( MCD_SSZ(szWriteInfo), MCD_T("from wide-32 length %d "), nWriteStrLen ); bSuccess = x_Write( pUTF16Buffer, file ); delete [] pUTF16Buffer; #else // sizeof(wchar_t) == 2 file.nFileTextLen = nWriteStrLen; bSuccess = x_Write( NULL, file, pDoc ); #endif #else // not WCHAR ConvertEncoding convert( MCD_T("UTF-16"), MCD_ENC, (const void*)pDoc, nWriteStrLen ); file.nFileTextLen = x_ConvertEncoding( convert, NULL ); unsigned short* pUTF16Buffer = new unsigned short[file.nFileTextLen]; x_ConvertEncoding( convert, (void*)pUTF16Buffer ); MCD_SPRINTF( MCD_SSZ(szWriteInfo), MCD_T("from byte length %d "), nWriteStrLen ); 
bSuccess = x_Write( pUTF16Buffer, file ); delete [] pUTF16Buffer; #endif // not WCHAR } else // single or multibyte file (i.e. not UTF-16) { #if defined(MARKUP_WCHAR) // WCHAR ConvertEncoding convert( file.strEncoding, MCD_ENC, (const void*)pDoc, nWriteStrLen ); file.nFileTextLen = x_ConvertEncoding( convert, NULL ); char* pMBBuffer = new char[file.nFileTextLen]; x_ConvertEncoding( convert, (void*)pMBBuffer ); MCD_SPRINTF( MCD_SSZ(szWriteInfo), MCD_T("from wide length %d "), nWriteStrLen ); if ( convert.nFailedChars ) MCD_PSZCAT( szWriteInfo, MCD_T("(chars lost in conversion!) ") ); bSuccess = x_Write( pMBBuffer, file ); delete [] pMBBuffer; #else // not WCHAR if ( x_CanConvert(file.strEncoding,MCD_ENC) ) { ConvertEncoding convert( file.strEncoding, MCD_ENC, (const void*)pDoc, nWriteStrLen ); file.nFileTextLen = x_ConvertEncoding( convert, NULL ); char* pMBBuffer = new char[file.nFileTextLen]; x_ConvertEncoding( convert, (void*)pMBBuffer ); MCD_SPRINTF( MCD_SSZ(szWriteInfo), MCD_T("from length %d "), nWriteStrLen ); if ( convert.nFailedChars ) MCD_PSZCAT( szWriteInfo, MCD_T("(chars lost in conversion!) ") ); bSuccess = x_Write( pMBBuffer, file ); delete [] pMBBuffer; } else // save directly from string { file.nFileTextLen = nWriteStrLen; bSuccess = x_Write( NULL, file, pDoc ); } #endif // not WCHAR } if ( bSuccess ) file.strIOResult += szWriteInfo; else file.strIOResult = x_GetLastError(); return bSuccess; } bool CMarkup::x_Close( FilePos& file ) { if ( file.fp ) { fclose( file.fp ); file.fp = NULL; file.nDocFlags &= ~(MDF_WRITEFILE|MDF_READFILE|MDF_APPENDFILE); return true; } return false; }