00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035 #ifdef HAVE_CONFIG_H
00036 #include "config.h"
00037 #endif
00038
00039
00040 #include "html/htmltokenizer.h"
00041 #include "html/html_documentimpl.h"
00042 #include "html/htmlparser.h"
00043 #include "html/dtd.h"
00044
00045 #include "misc/loader.h"
00046 #include "misc/htmlhashes.h"
00047
00048 #include "khtmlview.h"
00049 #include "khtml_part.h"
00050 #include "xml/dom_docimpl.h"
00051 #include "css/csshelper.h"
00052 #include "ecma/kjs_proxy.h"
00053 #include <kcharsets.h>
00054 #include <kglobal.h>
00055 #include <ctype.h>
00056 #include <assert.h>
00057 #include <qvariant.h>
00058 #include <kdebug.h>
00059 #include <stdlib.h>
00060
00061 #include "kentities.c"
00062
00063 using namespace khtml;
00064
// Opening sequence of an HTML comment. parseTag() compares the input against
// commentStart[searchCount] while it is reading a tag name.
00065 static const QChar commentStart [] = { '<','!','-','-', QChar::null };
00066
// Closing-tag prefixes that parseTag() installs as searchStopper before it
// hands over to parseSpecial(). The matching searchStopperLen values are
// hard-coded in parseTag() (8, 5, 7, 10, 7) and must be kept in sync with
// the lengths of these strings.
00067 static const char scriptEnd [] = "</script";
00068 static const char xmpEnd [] = "</xmp";
00069 static const char styleEnd [] = "</style";
00070 static const char textareaEnd [] = "</textarea";
00071 static const char titleEnd [] = "</title";
00072
// Raw (malloc-family) allocation helpers for QChar arrays. Memory obtained
// through these macros must be released with KHTML_DELETE_QCHAR_VEC, never
// with delete[].
//
// KHTML_ALLOC_QCHAR_VEC(N)      - returns an uninitialised array of N QChars.
// KHTML_REALLOC_QCHAR_VEC(P, N) - resizes the array held in the lvalue P to
//                                 N QChars, stores the new address back into
//                                 P and also yields it as the value of the
//                                 expression. As with plain realloc(), P
//                                 becomes null if the allocation fails.
// KHTML_DELETE_QCHAR_VEC(P)     - frees an array obtained from the above.
#define KHTML_ALLOC_QCHAR_VEC( N ) (QChar*) malloc( sizeof(QChar)*( N ) )
#define KHTML_REALLOC_QCHAR_VEC(P, N ) ( (P) = (QChar*) realloc( (P), sizeof(QChar)*( N ) ) )
#define KHTML_DELETE_QCHAR_VEC( P ) free((char*)( P ))
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
// fixUpChar(x): if x lies in the first Unicode row (row() == 0) and its cell
// is one of the values 0x80..0x9F listed below, x is overwritten with a
// replacement character: either the Unicode code point those bytes
// conventionally stand for (the values match the Windows-1252 table) or a
// plain ASCII approximation. All other characters are left untouched.
//
// x is evaluated several times, so it must be a side-effect free lvalue of a
// QChar-like type. The expansion is wrapped in do { } while (0) so that the
// macro behaves like a single statement, e.g. in an unbraced if/else.
#if 0
#define fixUpChar(x)
#else
#define fixUpChar(x) \
    do { \
        if (!(x).row()) { \
            switch ((x).cell()) { \
            case 0x80: (x) = 0x20ac; break; \
            case 0x82: (x) = ',';    break; \
            case 0x83: (x) = 0x0192; break; \
            case 0x84: (x) = '"';    break; \
            case 0x85: (x) = 0x2026; break; \
            case 0x86: (x) = 0x2020; break; \
            case 0x87: (x) = 0x2021; break; \
            case 0x88: (x) = 0x02C6; break; \
            case 0x89: (x) = 0x2030; break; \
            case 0x8A: (x) = 0x0160; break; \
            case 0x8b: (x) = '<';    break; \
            case 0x8C: (x) = 0x0152; break; \
            case 0x8E: (x) = 0x017D; break; \
            case 0x91: (x) = '\'';   break; \
            case 0x92: (x) = '\'';   break; \
            case 0x93: (x) = '"';    break; \
            case 0x94: (x) = '"';    break; \
            case 0x95: (x) = '*';    break; \
            case 0x96: (x) = '-';    break; \
            case 0x97: (x) = '-';    break; \
            case 0x98: (x) = '~';    break; \
            case 0x99: (x) = 0x2122; break; \
            case 0x9A: (x) = 0x0161; break; \
            case 0x9b: (x) = '>';    break; \
            case 0x9C: (x) = 0x0153; break; \
            case 0x9E: (x) = 0x017E; break; \
            case 0x9F: (x) = 0x0178; break; \
            default: break; \
            } \
        } \
    } while (0)
#endif
00132
00133
00134
00135 HTMLTokenizer::HTMLTokenizer(DOM::DocumentPtr *_doc, KHTMLView *_view)
00136 {
00137 view = _view;
00138 buffer = 0;
00139 scriptCode = 0;
00140 scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
00141 charsets = KGlobal::charsets();
00142 parser = new KHTMLParser(_view, _doc);
00143 m_executingScript = 0;
00144 onHold = false;
00145
00146 reset();
00147 }
00148
00149 HTMLTokenizer::HTMLTokenizer(DOM::DocumentPtr *_doc, DOM::DocumentFragmentImpl *i)
00150 {
00151 view = 0;
00152 buffer = 0;
00153 scriptCode = 0;
00154 scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
00155 charsets = KGlobal::charsets();
00156 parser = new KHTMLParser( i, _doc );
00157 m_executingScript = 0;
00158 onHold = false;
00159
00160 reset();
00161 }
00162
00163 void HTMLTokenizer::reset()
00164 {
00165 assert(m_executingScript == 0);
00166 assert(onHold == false);
00167
00168 while (!cachedScript.isEmpty())
00169 cachedScript.dequeue()->deref(this);
00170
00171 if ( buffer )
00172 KHTML_DELETE_QCHAR_VEC(buffer);
00173 buffer = dest = 0;
00174 size = 0;
00175
00176 if ( scriptCode )
00177 KHTML_DELETE_QCHAR_VEC(scriptCode);
00178 scriptCode = 0;
00179 scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
00180
00181 currToken.reset();
00182 }
00183
00184 void HTMLTokenizer::begin()
00185 {
00186 m_executingScript = 0;
00187 onHold = false;
00188 reset();
00189 size = 254;
00190 buffer = KHTML_ALLOC_QCHAR_VEC( 255 );
00191 dest = buffer;
00192 tag = NoTag;
00193 pending = NonePending;
00194 discard = NoneDiscard;
00195 pre = false;
00196 prePos = 0;
00197 plaintext = false;
00198 xmp = false;
00199 processingInstruction = false;
00200 script = false;
00201 escaped = false;
00202 style = false;
00203 skipLF = false;
00204 select = false;
00205 comment = false;
00206 server = false;
00207 textarea = false;
00208 title = false;
00209 startTag = false;
00210 tquote = NoQuote;
00211 searchCount = 0;
00212 Entity = NoEntity;
00213 noMoreData = false;
00214 brokenComments = false;
00215 brokenServer = false;
00216 brokenScript = false;
00217 lineno = 0;
00218 scriptStartLineno = 0;
00219 tagStartLineno = 0;
00220 }
00221
00222 void HTMLTokenizer::processListing(DOMStringIt list)
00223 {
00224 bool old_pre = pre;
00225
00226
00227
00228
00229 if(!style) pre = true;
00230 prePos = 0;
00231
00232 while ( list.length() )
00233 {
00234 checkBuffer(3*TAB_SIZE);
00235
00236 if (skipLF && ( *list != '\n' ))
00237 {
00238 skipLF = false;
00239 }
00240
00241 if (skipLF)
00242 {
00243 skipLF = false;
00244 ++list;
00245 }
00246 else if (( *list == '\n' ) || ( *list == '\r' ))
00247 {
00248 if (discard == LFDiscard)
00249 {
00250
00251 discard = NoneDiscard;
00252 }
00253 else
00254 {
00255
00256 if (pending)
00257 addPending();
00258 pending = LFPending;
00259 }
00260
00261 if (*list == '\r')
00262 {
00263 skipLF = true;
00264 }
00265 ++list;
00266 }
00267 else if (( *list == ' ' ) || ( *list == '\t'))
00268 {
00269 if (pending)
00270 addPending();
00271 if (*list == ' ')
00272 pending = SpacePending;
00273 else
00274 pending = TabPending;
00275
00276 ++list;
00277 }
00278 else
00279 {
00280 discard = NoneDiscard;
00281 if (pending)
00282 addPending();
00283
00284 prePos++;
00285 *dest++ = *list;
00286 ++list;
00287 }
00288
00289 }
00290
00291 if ((pending == SpacePending) || (pending == TabPending))
00292 addPending();
00293 else
00294 pending = NonePending;
00295
00296 prePos = 0;
00297 pre = old_pre;
00298 }
00299
// Collects the raw content of a <script>, <style>, <xmp>, <textarea> or
// <title> element (exactly one of the corresponding flags must be set, see
// the asserts) into scriptCode until the closing tag stored in searchStopper
// and its terminating '>' have been seen. The collected text is then handed
// to scriptHandler() (scripts) or emitted as text followed by the matching
// close-tag token (everything else). Returns early when src runs dry; all
// state lives in members, so the next call continues where this one stopped.
00300 void HTMLTokenizer::parseSpecial(DOMStringIt &src)
00301 {
00302 assert( textarea || title || !Entity );
00303 assert( !tag );
00304 assert( xmp+textarea+title+style+script == 1 );
00305 if (script)
00306 scriptStartLineno = lineno+src.lineCount();
00307
// A comment that was still open when the previous chunk ended is resumed first.
00308 if ( comment ) parseComment( src );
00309
00310 while ( src.length() ) {
00311 checkScriptBuffer();
00312 unsigned char ch = src->latin1();
// An unescaped '-' directly after "<!-" opens a comment inside the special
// content. This is not done for textarea/xmp/title, nor once comments have
// been flagged as broken.
00313 if ( !scriptCodeResync && !brokenComments && !textarea && !xmp && !title && ch == '-' && scriptCodeSize >= 3 && !src.escaped() && QConstString( scriptCode+scriptCodeSize-3, 3 ).string() == "<!-" ) {
00314 comment = true;
00315 parseComment( src );
00316 continue;
00317 }
// The closing tag was recognised earlier (scriptCodeResync != 0) and its
// unquoted '>' has now arrived: truncate scriptCode to the position just
// before the closing tag and emit the content.
00318 if ( scriptCodeResync && !tquote && ( ch == '>' ) ) {
00319 ++src;
00320 scriptCodeSize = scriptCodeResync-1;
00321 scriptCodeResync = 0;
00322 scriptCode[ scriptCodeSize ] = scriptCode[ scriptCodeSize + 1 ] = 0;
00323 if ( script )
00324 scriptHandler();
00325 else {
00326 processListing(DOMStringIt(scriptCode, scriptCodeSize));
00327 processToken();
00328 if ( style ) { currToken.id = ID_STYLE + ID_CLOSE_TAG; }
00329 else if ( textarea ) { currToken.id = ID_TEXTAREA + ID_CLOSE_TAG; }
00330 else if ( title ) { currToken.id = ID_TITLE + ID_CLOSE_TAG; }
00331 else if ( xmp ) { currToken.id = ID_XMP + ID_CLOSE_TAG; }
00332 processToken();
// NOTE(review): 'style' appears twice in this chain; harmless but redundant.
00333 style = script = style = textarea = title = xmp = false;
00334 tquote = NoQuote;
00335 scriptCodeSize = scriptCodeResync = 0;
00336 }
00337 return;
00338 }
00339
// On a character that can terminate a tag name ('>', '/' or anything <= ' '
// except NUL), test whether the text collected so far ends with
// searchStopper (find() is called with its case-sensitivity argument false
// and must return index 0). If so, remember where the closing tag starts;
// the current character is re-examined on the next iteration.
00340 if ( !scriptCodeResync && !escaped && !src.escaped() && ( ch == '>' || ch == '/' || ch <= ' ' ) && ch &&
00341 scriptCodeSize >= searchStopperLen &&
00342 !QConstString( scriptCode+scriptCodeSize-searchStopperLen, searchStopperLen ).string().find( searchStopper, 0, false )) {
00343 scriptCodeResync = scriptCodeSize-searchStopperLen+1;
00344 tquote = NoQuote;
00345 continue;
00346 }
// While skipping towards the '>' of the closing tag, track quotes so that a
// quoted '>' is ignored; a line break cancels any open quote.
00347 if ( scriptCodeResync && !escaped ) {
00348 if(ch == '\"')
00349 tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
00350 else if(ch == '\'')
00351 tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
00352 else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
00353 tquote = NoQuote;
00354 }
// A backslash escapes the following character unless it is itself escaped.
00355 escaped = ( !escaped && ch == '\\' );
// Entities are only resolved inside <textarea> and <title>; parseEntity()
// writes straight into scriptCode through scriptCodeDest.
00356 if (!scriptCodeResync && (textarea||title) && !src.escaped() && ch == '&') {
00357 QChar *scriptCodeDest = scriptCode+scriptCodeSize;
00358 ++src;
00359 parseEntity(src,scriptCodeDest,true);
00360 scriptCodeSize = scriptCodeDest-scriptCode;
00361 }
00362 else {
00363 scriptCode[ scriptCodeSize++ ] = *src;
00364 ++src;
00365 }
00366 }
00367 }
00368
// Called from parseSpecial() once the end of a <script> element has been
// found. Emits the script text and the </script> token, then either requests
// the external script named by scriptSrc (queueing it in cachedScript) or
// executes the inline script. In both cases the not yet tokenized remainder
// of src is parked (pendingSrc / prependingSrc) and src is emptied first.
00369 void HTMLTokenizer::scriptHandler()
00370 {
// Take a private copy and clear the member before anything else runs.
00371 QString currentScriptSrc = scriptSrc;
00372 scriptSrc = QString::null;
00373
// processListing() copies the script text into buffer; snapshot it as
// exScript before processToken() is called on it.
00374 processListing(DOMStringIt(scriptCode, scriptCodeSize));
00375 QString exScript( buffer, dest-buffer );
00376
00377 processToken();
00378 currToken.id = ID_SCRIPT + ID_CLOSE_TAG;
00379 processToken();
00380
00381 QString prependingSrc;
00382
00383 if ( !parser->skipMode() ) {
00384 CachedScript* cs = 0;
00385
00386
// External script: ask the document loader for it and queue the result.
00387 if ( !currentScriptSrc.isEmpty() &&
00388 (cs = parser->doc()->docLoader()->requestScript(currentScriptSrc, scriptSrcCharset) ))
00389 cachedScript.enqueue(cs);
00390
00391 if (cs) {
// Park the unparsed rest of the input in front of pendingSrc, empty src and
// register ourselves with the cached script via ref(this).
00392 pendingSrc.prepend( QString(src.current(), src.length() ) );
00393 setSrc(QString::null);
00394 scriptCodeSize = scriptCodeResync = 0;
00395 cs->ref(this);
00396
00397 }
// Inline script: only executed when there is a view and the language was
// accepted as JavaScript (see the 'javascript' flag set in parseTag()).
00398 else if (currentScriptSrc.isEmpty() && view && javascript ) {
// When already inside a running script the remainder is kept locally in
// prependingSrc and fed back through write() at the end of this function;
// otherwise it goes in front of pendingSrc.
00399 if ( !m_executingScript )
00400 pendingSrc.prepend( QString( src.current(), src.length() ) );
00401 else
00402 prependingSrc = QString( src.current(), src.length() );
00403
00404 setSrc(QString::null);
00405 scriptCodeSize = scriptCodeResync = 0;
00406 scriptExecution( exScript, QString::null, tagStartLineno );
00407 }
00408 }
00409
00410 script = false;
00411 scriptCodeSize = scriptCodeResync = 0;
00412
// Nothing is running and nothing is queued: resume with whatever is left in
// src followed by everything that accumulated in pendingSrc.
00413 if ( !m_executingScript && cachedScript.isEmpty() ) {
00414
00415 QString newStr = QString(src.current(), src.length());
00416 newStr += pendingSrc;
00417 setSrc(newStr);
00418 pendingSrc = QString::null;
00419 }
00420 else if ( !prependingSrc.isEmpty() )
00421 write( prependingSrc, false );
00422 }
00423
00424 void HTMLTokenizer::scriptExecution( const QString& str, QString scriptURL,
00425 int baseLine)
00426 {
00427 bool oldscript = script;
00428 m_executingScript++;
00429 script = false;
00430 QString url;
00431 if (scriptURL.isNull())
00432 url = static_cast<DocumentImpl*>(view->part()->document().handle())->URL();
00433 else
00434 url = scriptURL;
00435
00436 view->part()->executeScript(url,baseLine,Node(),str);
00437 m_executingScript--;
00438 script = oldscript;
00439 }
00440
00441 void HTMLTokenizer::parseComment(DOMStringIt &src)
00442 {
00443 checkScriptBuffer(src.length());
00444 while ( src.length() ) {
00445 scriptCode[ scriptCodeSize++ ] = *src;
00446 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00447 qDebug("comment is now: *%s*",
00448 QConstString((QChar*)src.current(), QMIN(16, src.length())).string().latin1());
00449 #endif
00450 if (src->unicode() == '>' &&
00451 ( ( brokenComments && !( script || style ) ) ||
00452 ( scriptCodeSize > 2 && scriptCode[scriptCodeSize-3] == '-' &&
00453 scriptCode[scriptCodeSize-2] == '-' ) ) ) {
00454 ++src;
00455 if ( !( script || xmp || textarea || style) ) {
00456 #ifdef COMMENTS_IN_DOM
00457 checkScriptBuffer();
00458 scriptCode[ scriptCodeSize ] = 0;
00459 scriptCode[ scriptCodeSize + 1 ] = 0;
00460 currToken.id = ID_COMMENT;
00461 processListing(DOMStringIt(scriptCode, scriptCodeSize - 2));
00462 processToken();
00463 currToken.id = ID_COMMENT + ID_CLOSE_TAG;
00464 processToken();
00465 #endif
00466 scriptCodeSize = 0;
00467 }
00468 comment = false;
00469 return;
00470 }
00471 ++src;
00472 }
00473 }
00474
00475 void HTMLTokenizer::parseServer(DOMStringIt &src)
00476 {
00477 checkScriptBuffer(src.length());
00478 while ( src.length() ) {
00479 scriptCode[ scriptCodeSize++ ] = *src;
00480 if (src->unicode() == '>' &&
00481 scriptCodeSize > 1 && scriptCode[scriptCodeSize-2] == '%') {
00482 ++src;
00483 server = false;
00484 scriptCodeSize = 0;
00485 return;
00486 }
00487 ++src;
00488 }
00489 }
00490
00491 void HTMLTokenizer::parseProcessingInstruction(DOMStringIt &src)
00492 {
00493 char oldchar = 0;
00494 while ( src.length() )
00495 {
00496 unsigned char chbegin = src->latin1();
00497 if(chbegin == '\'') {
00498 tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
00499 }
00500 else if(chbegin == '\"') {
00501 tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
00502 }
00503
00504
00505
00506 else if ( chbegin == '>' && ( !tquote || oldchar == '?' ) )
00507 {
00508
00509 processingInstruction = false;
00510 ++src;
00511 discard=LFDiscard;
00512 return;
00513 }
00514 ++src;
00515 oldchar = chbegin;
00516 }
00517 }
00518
00519 void HTMLTokenizer::parseText(DOMStringIt &src)
00520 {
00521 while ( src.length() )
00522 {
00523
00524 checkBuffer();
00525
00526
00527 unsigned char chbegin = src->latin1();
00528
00529 if (skipLF && ( chbegin != '\n' ))
00530 {
00531 skipLF = false;
00532 }
00533
00534 if (skipLF)
00535 {
00536 skipLF = false;
00537 ++src;
00538 }
00539 else if (( chbegin == '\n' ) || ( chbegin == '\r' ))
00540 {
00541 if (chbegin == '\r')
00542 skipLF = true;
00543
00544 *dest++ = '\n';
00545 ++src;
00546 }
00547 else {
00548 *dest++ = *src;
00549 ++src;
00550 }
00551 }
00552 }
00553
00554
// Resumable state machine that decodes a character entity. The caller has
// already consumed the '&'.
//
//   src   - input iterator; consumed as far as the entity reaches
//   dest  - output cursor (passed by reference and advanced). parseSpecial()
//           passes a pointer into scriptCode here, other callers pass the
//           token buffer cursor.
//   start - true for a fresh entity (resets cBufferPos and the state),
//           false/omitted to continue one that was cut off by the end of the
//           previous chunk.
//
// The state is kept in the members Entity, EntityChar, cBuffer and
// cBufferPos. A recognised entity is pushed back onto src as a single
// character (src.push) so that the caller processes it like normal input;
// an unrecognised one is written to dest literally as '&' plus the
// characters collected in cBuffer.
00555 void HTMLTokenizer::parseEntity(DOMStringIt &src, QChar *&dest, bool start)
00556 {
00557 if( start )
00558 {
00559 cBufferPos = 0;
00560 Entity = SearchEntity;
00561 }
00562
00563 while( src.length() )
00564 {
00565 ushort cc = src->unicode();
00566 switch(Entity) {
00567 case NoEntity:
00568 return;
00569
00570 break;
// First character after '&': '#' selects a numeric reference, anything else
// is treated as the start of a named entity (the character is not consumed).
00571 case SearchEntity:
00572 if(cc == '#') {
00573 cBuffer[cBufferPos++] = cc;
00574 ++src;
00575 Entity = NumericSearch;
00576 }
00577 else
00578 Entity = EntityName;
00579
00580 break;
00581
// After "&#": 'x'/'X' selects hexadecimal, a digit selects decimal, anything
// else ends the entity.
00582 case NumericSearch:
00583 if(cc == 'x' || cc == 'X') {
00584 cBuffer[cBufferPos++] = cc;
00585 ++src;
00586 Entity = Hexadecimal;
00587 }
00588 else if(cc >= '0' && cc <= '9')
00589 Entity = Decimal;
00590 else
00591 Entity = SearchSemicolon;
00592
00593 break;
00594
// Accumulate hex digits into EntityChar. At most 9 characters in total are
// collected in cBuffer for one entity.
00595 case Hexadecimal:
00596 {
00597 int uc = EntityChar.unicode();
00598 int ll = kMin(src.length(), 9-cBufferPos);
00599 while(ll--) {
00600 QChar csrc(src->lower());
00601 cc = csrc.cell();
00602
00603 if(csrc.row() || !((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f'))) {
00604 Entity = SearchSemicolon;
00605 break;
00606 }
00607 uc = uc*16 + (cc - ( cc < 'a' ? '0' : 'a' - 10));
00608 cBuffer[cBufferPos++] = cc;
00609 ++src;
00610 }
00611 EntityChar = QChar(uc);
00612 if(cBufferPos == 9) Entity = SearchSemicolon;
00613 break;
00614 }
// Accumulate decimal digits into EntityChar, same 9 character limit.
00615 case Decimal:
00616 {
00617 int uc = EntityChar.unicode();
00618 int ll = kMin(src.length(), 9-cBufferPos);
00619 while(ll--) {
00620 cc = src->cell();
00621
00622 if(src->row() || !(cc >= '0' && cc <= '9')) {
00623 Entity = SearchSemicolon;
00624 break;
00625 }
00626
00627 uc = uc * 10 + (cc - '0');
00628 cBuffer[cBufferPos++] = cc;
00629 ++src;
00630 }
00631 EntityChar = QChar(uc);
00632 if(cBufferPos == 9) Entity = SearchSemicolon;
00633 break;
00634 }
// Collect ASCII letters and digits of a named entity and look the name up
// with findEntity() once it is complete.
00635 case EntityName:
00636 {
00637 int ll = kMin(src.length(), 9-cBufferPos);
00638 while(ll--) {
00639 QChar csrc = *src;
00640 cc = csrc.cell();
00641
00642 if(csrc.row() || !((cc >= 'a' && cc <= 'z') ||
00643 (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
00644 Entity = SearchSemicolon;
00645 break;
00646 }
00647
00648 cBuffer[cBufferPos++] = cc;
00649 ++src;
00650 }
00651 if(cBufferPos == 9) Entity = SearchSemicolon;
00652 if(Entity == SearchSemicolon) {
00653 if(cBufferPos > 1) {
00654 const entity *e = findEntity(cBuffer, cBufferPos);
00655 if(e)
00656 EntityChar = e->code;
00657
00658
// Inside a tag (attribute values), entities above 255 are only accepted when
// they are terminated by ';'.
// NOTE(review): *src is read here even if the loop above consumed the last
// character of src - confirm DOMStringIt tolerates that.
00659 if(tag && EntityChar.unicode() > 255 && *src != ';')
00660 EntityChar = QChar::null;
00661 }
00662 }
00663 else
00664 break;
// Deliberate fall through into SearchSemicolon once the name is complete.
00665 }
00666 case SearchSemicolon:
00667
00668
00669
00670 fixUpChar(EntityChar);
00671
00672 if ( EntityChar != QChar::null ) {
// NOTE(review): 'dest' in this function is the parameter, which may point
// into scriptCode; verify that checkBuffer() guards the right buffer then.
00673 checkBuffer();
00674
// The terminating ';' is optional and swallowed when present.
00675 if (*src == ';')
00676 ++src;
00677
00678 src.push( EntityChar );
00679 } else {
00680 #ifdef TOKEN_DEBUG
00681 kdDebug( 6036 ) << "unknown entity!" << endl;
00682 #endif
00683 checkBuffer(10);
00684
// Unknown entity: emit it literally.
00685 *dest++ = '&';
00686 for(unsigned int i = 0; i < cBufferPos; i++)
00687 dest[i] = cBuffer[i];
00688 dest += cBufferPos;
00689 Entity = NoEntity;
00690 if (pre)
00691 prePos += cBufferPos+1;
00692 }
00693
00694 Entity = NoEntity;
00695 EntityChar = QChar::null;
00696 return;
00697 };
00698 }
00699 }
00700
// Resumable state machine that tokenizes everything between '<' and '>'.
// The current state is kept in the member 'tag' (TagName, SearchAttribute,
// AttributeName, SearchEqual, SearchValue, QuotedValue, Value, SearchEnd),
// so the function can return when src is exhausted and continue with the
// next chunk. Tag and attribute names are collected in cBuffer; attribute
// values are collected in the token buffer, where buffer[0] holds the
// attribute id and the value text starts at buffer+1. When the closing '>'
// is reached the token is emitted via processToken() and, for
// script/style/textarea/title/xmp start tags, parseSpecial() takes over.
00701 void HTMLTokenizer::parseTag(DOMStringIt &src)
00702 {
00703 assert(!Entity );
00704
00705 while ( src.length() )
00706 {
00707 checkBuffer();
00708 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00709 uint l = 0;
00710 while(l < src.length() && (*(src.current()+l)).latin1() != '>')
00711 l++;
00712 qDebug("src is now: *%s*, tquote: %d",
00713 QConstString((QChar*)src.current(), l).string().latin1(), tquote);
00714 #endif
00715 switch(tag) {
00716 case NoTag:
00717 {
00718 return;
00719 }
// --- TagName: read the element name (or detect "<!--") ---
00720 case TagName:
00721 {
00722 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00723 qDebug("TagName");
00724 #endif
// searchCount > 0 means the tag started with '!' (set in write()); keep
// matching against commentStart until all four characters have been seen.
00725 if (searchCount > 0)
00726 {
00727 if (*src == commentStart[searchCount])
00728 {
00729 searchCount++;
00730 if (searchCount == 4)
00731 {
00732 #ifdef TOKEN_DEBUG
00733 kdDebug( 6036 ) << "Found comment" << endl;
00734 #endif
00735
00736 ++src;
00737 dest = buffer;
00738 tag = NoTag;
00739
00740 comment = true;
00741
// Seed scriptCode with "--" so that parseComment() can match a closing "-->"
// that follows immediately.
00742 checkScriptBuffer();
00743 scriptCode[0] = scriptCode[1] = '-';
00744 scriptCodeSize = 2;
00745 parseComment(src);
00746 return;
00747 }
00748
00749 cBuffer[cBufferPos++] = src->cell();
00750 ++src;
00751 break;
00752 }
00753 else
00754 searchCount = 0;
00755 }
00756
// Copy name characters into cBuffer (at most CBUFLEN in total) until white
// space or '>' is found. OR-ing with 0x20 lower-cases ASCII letters.
00757 bool finish = false;
00758 unsigned int ll = kMin(src.length(), CBUFLEN-cBufferPos);
00759 while(ll--) {
00760 ushort curchar = *src;
00761 if(curchar <= ' ' || curchar == '>' ) {
00762 finish = true;
00763 break;
00764 }
00765
00766
00767
00768 char cc = curchar;
00769 cBuffer[cBufferPos++] = cc | 0x20;
00770 ++src;
00771 }
00772
00773
00774
// Name complete (or cBuffer full): strip a leading '/' (close tag) and a
// trailing '/' (self-closing form), then look the name up.
00775 if(finish || CBUFLEN == cBufferPos) {
00776 bool beginTag;
00777 char* ptr = cBuffer;
00778 unsigned int len = cBufferPos;
00779 cBuffer[cBufferPos] = '\0';
00780 if ((cBufferPos > 0) && (*ptr == '/'))
00781 {
00782
00783 beginTag = false;
00784 ptr++;
00785 len--;
00786 }
00787 else
00788
00789 beginTag = true;
00790
00791 if(len > 1 && ptr[len-1] == '/' ) {
00792 ptr[--len] = '\0';
00793
00794 if (*src == '>')
00795 currToken.flat = true;
00796 }
00797
00798 uint tagID = khtml::getTagID(ptr, len);
// Unknown tags leave currToken.id untouched; SearchEnd drops the token when
// the id is 0.
00799 if (!tagID) {
00800 #ifdef TOKEN_DEBUG
00801 QCString tmp(ptr, len+1);
00802 kdDebug( 6036 ) << "Unknown tag: \"" << tmp.data() << "\"" << endl;
00803 #endif
00804 dest = buffer;
00805 }
00806 else
00807 {
00808 #ifdef TOKEN_DEBUG
00809 QCString tmp(ptr, len+1);
00810 kdDebug( 6036 ) << "found tag id=" << tagID << ": " << tmp.data() << endl;
00811 #endif
00812 currToken.id = beginTag ? tagID : tagID + ID_CLOSE_TAG;
00813 dest = buffer;
00814 }
00815 tag = SearchAttribute;
00816 cBufferPos = 0;
00817 }
00818 break;
00819 }
// --- SearchAttribute: skip white space up to the next attribute or '>' ---
00820 case SearchAttribute:
00821 {
00822 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00823 qDebug("SearchAttribute");
00824 #endif
00825 bool atespace = false;
00826 ushort curchar;
00827 while(src.length()) {
00828 curchar = *src;
00829 if(curchar > ' ') {
00830 if(curchar == '>')
00831 tag = SearchEnd;
// A quote after white space starts a value without a name; it is stored
// with attribute id 0 and a null attrName.
00832 else if(atespace && (curchar == '\'' || curchar == '"'))
00833 {
00834 tag = SearchValue;
00835 *dest++ = 0;
00836 attrName = QString::null;
00837 }
00838 else
00839 tag = AttributeName;
00840
00841 cBufferPos = 0;
00842 break;
00843 }
00844 atespace = true;
00845 ++src;
00846 }
00847 break;
00848 }
// --- AttributeName: collect the name and translate it to an id ---
00849 case AttributeName:
00850 {
00851 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00852 qDebug("AttributeName");
00853 #endif
00854 ushort curchar;
00855 int ll = kMin(src.length(), CBUFLEN-cBufferPos);
00856
00857 while(ll--) {
00858 curchar = *src;
00859 if(curchar <= '>') {
00860 if(curchar <= ' ' || curchar == '=' || curchar == '>') {
00861 unsigned int a;
00862 cBuffer[cBufferPos] = '\0';
00863 a = khtml::getAttrID(cBuffer, cBufferPos);
// Unknown attributes (id 0) keep their textual name in attrName.
00864 if ( !a )
00865 attrName = QString::fromLatin1(QCString(cBuffer, cBufferPos+1).data());
00866
// The attribute id is stored as the first element of the token buffer; the
// value is appended behind it.
00867 dest = buffer;
00868 *dest++ = a;
00869 #ifdef TOKEN_DEBUG
00870 if (!a || (cBufferPos && *cBuffer == '!'))
00871 kdDebug( 6036 ) << "Unknown attribute: *" << QCString(cBuffer, cBufferPos+1).data() << "*" << endl;
00872 else
00873 kdDebug( 6036 ) << "Known attribute: " << QCString(cBuffer, cBufferPos+1).data() << endl;
00874 #endif
00875
// A lone "/" directly before '>' marks the tag as flat (self-closing).
00876 if (!a && cBufferPos == 1 && *cBuffer == '/' && curchar == '>')
00877 currToken.flat = true;
00878
00879 tag = SearchEqual;
00880 break;
00881 }
00882 }
00883 cBuffer[cBufferPos++] = (char) curchar | 0x20;
00884 ++src;
00885 }
// Name too long for cBuffer: treat what was collected as an unknown
// attribute.
00886 if ( cBufferPos == CBUFLEN ) {
00887 cBuffer[cBufferPos] = '\0';
00888 attrName = QString::fromLatin1(QCString(cBuffer, cBufferPos+1).data());
00889 dest = buffer;
00890 *dest++ = 0;
00891 tag = SearchEqual;
00892 }
00893 break;
00894 }
// --- SearchEqual: look for '=' after an attribute name ---
00895 case SearchEqual:
00896 {
00897 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00898 qDebug("SearchEqual");
00899 #endif
00900 ushort curchar;
00901 bool atespace = false;
00902 while(src.length()) {
00903 curchar = src->unicode();
00904 if(curchar > ' ') {
00905 if(curchar == '=') {
00906 #ifdef TOKEN_DEBUG
00907 kdDebug(6036) << "found equal" << endl;
00908 #endif
00909 tag = SearchValue;
00910 ++src;
00911 }
00912 else if(atespace && (curchar == '\'' || curchar == '"'))
00913 {
00914 tag = SearchValue;
00915 *dest++ = 0;
00916 attrName = QString::null;
00917 }
// Anything else: the attribute has no value; add it with an empty string.
00918 else {
00919 DOMString v("");
00920 currToken.addAttribute(parser->docPtr()->document(), buffer, attrName, v);
00921 dest = buffer;
00922 tag = SearchAttribute;
00923 }
00924 break;
00925 }
00926 atespace = true;
00927 ++src;
00928 }
00929 break;
00930 }
// --- SearchValue: skip white space, then decide quoted vs. unquoted ---
00931 case SearchValue:
00932 {
00933 ushort curchar;
00934 while(src.length()) {
00935 curchar = src->unicode();
00936 if(curchar > ' ') {
00937 if(( curchar == '\'' || curchar == '\"' )) {
00938 tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
00939 tag = QuotedValue;
00940 ++src;
00941 } else
00942 tag = Value;
00943
00944 break;
00945 }
00946 ++src;
00947 }
00948 break;
00949 }
// --- QuotedValue: copy up to the matching quote, resolving entities ---
00950 case QuotedValue:
00951 {
00952 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00953 qDebug("QuotedValue");
00954 #endif
00955 ushort curchar;
00956 while(src.length()) {
00957 checkBuffer();
00958
00959 curchar = src->unicode();
// Cheap pre-filter: '"', '&' and '\'' all have codes <= '\''.
00960 if(curchar <= '\'' && !src.escaped()) {
00961
00962 if ( curchar == '&' )
00963 {
00964 ++src;
00965 parseEntity(src, dest, true);
00966 break;
00967 }
00968 else if ( (tquote == SingleQuote && curchar == '\'') ||
00969 (tquote == DoubleQuote && curchar == '\"') )
00970 {
00971
// Strip trailing line breaks from the value before storing it.
00972 while(dest > buffer+1 && (*(dest-1) == '\n' || *(dest-1) == '\r'))
00973 dest--;
00974 DOMString v(buffer+1, dest-buffer-1);
00975 currToken.addAttribute(parser->docPtr()->document(), buffer, attrName, v);
00976
00977 dest = buffer;
00978 tag = SearchAttribute;
00979 tquote = NoQuote;
00980 ++src;
00981 break;
00982 }
00983 }
00984 *dest++ = *src;
00985 ++src;
00986 }
00987 break;
00988 }
// --- Value: unquoted value, ends at white space or '>' ---
00989 case Value:
00990 {
00991 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00992 qDebug("Value");
00993 #endif
00994 ushort curchar;
00995 while(src.length()) {
00996 checkBuffer();
00997 curchar = src->unicode();
00998 if(curchar <= '>' && !src.escaped()) {
00999
01000 if ( curchar == '&' )
01001 {
01002 ++src;
01003 parseEntity(src, dest, true);
01004 break;
01005 }
01006
01007
01008 if ( curchar <= ' ' || curchar == '>' )
01009 {
01010 DOMString v(buffer+1, dest-buffer-1);
01011 currToken.addAttribute(parser->docPtr()->document(), buffer, attrName, v);
01012 dest = buffer;
01013 tag = SearchAttribute;
01014 break;
01015 }
01016 }
01017
01018 *dest++ = *src;
01019 ++src;
01020 }
01021 break;
01022 }
// --- SearchEnd: find '>', emit the token, switch content mode ---
01023 case SearchEnd:
01024 {
01025 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
01026 qDebug("SearchEnd");
01027 #endif
01028 while(src.length()) {
01029 if(*src == '>')
01030 break;
01031
01032 if (*src == '/')
01033 currToken.flat = true;
01034
01035 ++src;
01036 }
// NOTE(review): when src is exhausted this still evaluates *src; the second
// operand looks redundant (the loop only exits early on '>') - confirm and
// consider reducing the test to !src.length().
01037 if(!src.length() && *src != '>') break;
01038
01039 searchCount = 0;
01040 tag = NoTag;
01041 tquote = NoQuote;
01042 ++src;
01043
// No id means the tag name was not recognised: drop it silently.
01044 if ( !currToken.id )
01045 return;
01046
01047 uint tagID = currToken.id;
01048 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 0
01049 kdDebug( 6036 ) << "appending Tag: " << tagID << endl;
01050 #endif
01051 bool beginTag = !currToken.flat && (tagID < ID_CLOSE_TAG);
01052
01053 if(tagID >= ID_CLOSE_TAG)
01054 tagID -= ID_CLOSE_TAG;
// <script> start tag: pick up src/charset and decide from the language or
// type attribute whether the content is to be treated as JavaScript.
01055 else if ( beginTag && !brokenScript && tagID == ID_SCRIPT ) {
01056 AttributeImpl* a = 0;
01057 scriptSrc = scriptSrcCharset = QString::null;
// NOTE(review): parser->doc()->view() is dereferenced before the member
// 'view' is tested - verify that the document's view cannot be null here.
01058 if ( currToken.attrs &&
01059 parser->doc()->view()->part()->jScriptEnabled() &&
01060 view
01061 ) {
01062 if ( ( a = currToken.attrs->getAttributeItem( ATTR_SRC ) ) )
01063 scriptSrc = parser->doc()->completeURL(khtml::parseURL( a->value() ).string() );
01064 if ( ( a = currToken.attrs->getAttributeItem( ATTR_CHARSET ) ) )
01065 scriptSrcCharset = a->value().string().stripWhiteSpace();
01066 if ( scriptSrcCharset.isEmpty() )
01067 scriptSrcCharset = parser->doc()->view()->part()->encoding();
01068 if (!(a = currToken.attrs->getAttributeItem( ATTR_LANGUAGE )))
01069 a = currToken.attrs->getAttributeItem(ATTR_TYPE);
01070 }
// Scripts count as JavaScript unless a language/type attribute is present
// that mentions none of the names below.
01071 javascript = true;
01072 if( a ) {
01073 QString lang = a->value().string();
01074 lang = lang.lower();
01075 if( !lang.contains("javascript") &&
01076 !lang.contains("ecmascript") &&
01077 !lang.contains("livescript") &&
01078 !lang.contains("jscript") )
01079 javascript = false;
01080 }
01081 }
01082
01083 processToken();
01084
01085
01086 pre = parser->preMode();
01087
// Elements with raw-text content: install the closing-tag prefix to search
// for (searchStopperLen is the length of that string) and let parseSpecial()
// consume the content.
01088 switch( tagID ) {
01089 case ID_PRE:
01090 prePos = 0;
01091 break;
01092 case ID_SCRIPT:
01093 if (beginTag) {
01094 searchStopper = scriptEnd;
01095 searchStopperLen = 8;
01096 script = true;
01097 parseSpecial(src);
01098 }
01099 break;
01100 case ID_STYLE:
01101 if (beginTag) {
01102 searchStopper = styleEnd;
01103 searchStopperLen = 7;
01104 style = true;
01105 parseSpecial(src);
01106 }
01107 break;
01108 case ID_TEXTAREA:
01109 if(beginTag) {
01110 searchStopper = textareaEnd;
01111 searchStopperLen = 10;
01112 textarea = true;
01113 discard = AllDiscard;
01114 parseSpecial(src);
01115 }
01116 break;
01117 case ID_TITLE:
01118 if (beginTag) {
01119 searchStopper = titleEnd;
01120 searchStopperLen = 7;
01121 title = true;
01122 parseSpecial(src);
01123 }
01124 break;
01125 case ID_XMP:
01126 if (beginTag) {
01127 searchStopper = xmpEnd;
01128 searchStopperLen = 5;
01129 xmp = true;
01130 parseSpecial(src);
01131 }
01132 break;
01133 case ID_SELECT:
01134 select = beginTag;
01135 break;
01136 case ID_PLAINTEXT:
01137 plaintext = beginTag;
01138 break;
01139 }
01140 return;
01141 }
01142 }
01143 }
01144 return;
01145 }
01146
01147 void HTMLTokenizer::addPending()
01148 {
01149 if ( select && !(comment || script))
01150 {
01151 *dest++ = ' ';
01152 }
01153 else if ( textarea )
01154 {
01155 switch(pending) {
01156 case LFPending: *dest++ = '\n'; prePos = 0; break;
01157 case SpacePending: *dest++ = ' '; ++prePos; break;
01158 case TabPending: *dest++ = '\t'; prePos += TAB_SIZE - (prePos % TAB_SIZE); break;
01159 case NonePending:
01160 assert(0);
01161 }
01162 }
01163 else if ( pre )
01164 {
01165 int p;
01166
01167 switch (pending)
01168 {
01169 case SpacePending:
01170
01171 *dest++ = QChar(' ');
01172 prePos++;
01173 break;
01174
01175 case LFPending:
01176 *dest = '\n';
01177 dest++;
01178 prePos = 0;
01179 break;
01180
01181 case TabPending:
01182 p = TAB_SIZE - ( prePos % TAB_SIZE );
01183 for ( int x = 0; x < p; x++ )
01184 *dest++ = QChar(' ');
01185 prePos += p;
01186 break;
01187
01188 case NonePending:
01189 assert(0);
01190 break;
01191 }
01192 }
01193 else
01194 {
01195 *dest++ = ' ';
01196 }
01197
01198 pending = NonePending;
01199 }
01200
// Main entry point: feeds a chunk of HTML source into the tokenizer.
//
//   str        - the text to tokenize
//   appendData - when true and a script is currently executing, the text is
//                appended to pendingSrc instead of being tokenized now
//                (scriptHandler() calls write() with false for the input it
//                parked itself)
//
// Does nothing before begin() has allocated the buffer. Input is also only
// queued in pendingSrc while external scripts are outstanding, and is merged
// into src without being tokenized while the tokenizer is on hold. Otherwise
// the loop below dispatches each character according to the current mode
// flags; every sub-parser consumes from the member iterator 'src'. Calls
// end() afterwards if noMoreData is set and no script is pending or running.
01201 void HTMLTokenizer::write( const QString &str, bool appendData )
01202 {
01203 #ifdef TOKEN_DEBUG
01204 kdDebug( 6036 ) << this << " Tokenizer::write(\"" << str << "\"," << appendData << ")" << endl;
01205 #endif
01206
01207 if ( !buffer )
01208 return;
01209
01210 if ( ( m_executingScript && appendData ) ||
01211 ( !m_executingScript && cachedScript.count() ) ) {
01212
01213 pendingSrc += str;
01214 return;
01215 }
01216
01217 if ( onHold ) {
01218 QString rest = QString( src.current(), src.length() );
01219 rest += str;
01220 setSrc(rest);
01221 return;
01222 }
01223 else
01224 setSrc(str);
01225
01226
01227
01228
01229 while ( src.length() )
01230 {
01231
01232 checkBuffer();
01233
01234 ushort cc = src->unicode();
01235
// A pending "skip the LF after CR" request only applies to an LF.
01236 if (skipLF && (cc != '\n'))
01237 skipLF = false;
01238
01239 if (skipLF) {
01240 skipLF = false;
01241 ++src;
01242 }
// Resume whichever sub-parser was interrupted by the end of the last chunk,
// or is responsible for the current content mode.
01243 else if ( Entity )
01244 parseEntity( src, dest );
01245 else if ( plaintext )
01246 parseText( src );
01247 else if (script)
01248 parseSpecial(src);
01249 else if (style)
01250 parseSpecial(src);
01251 else if (xmp)
01252 parseSpecial(src);
01253 else if (textarea)
01254 parseSpecial(src);
01255 else if (title)
01256 parseSpecial(src);
01257 else if (comment)
01258 parseComment(src);
01259 else if (server)
01260 parseServer(src);
01261 else if (processingInstruction)
01262 parseProcessingInstruction(src);
01263 else if (tag)
01264 parseTag(src);
// The previous character was '<': decide from the current one what kind of
// construct starts here.
01265 else if ( startTag )
01266 {
01267 startTag = false;
01268
01269 switch(cc) {
01270 case '/':
01271 break;
01272 case '!':
01273 {
01274
// Possible comment: parseTag() continues matching against commentStart.
01275 searchCount = 1;
01276
01277 break;
01278 }
01279 case '?':
01280 {
01281
01282 processingInstruction = true;
01283 tquote = NoQuote;
01284 parseProcessingInstruction(src);
01285 continue;
01286
01287 break;
01288 }
01289 case '%':
01290 if (!brokenServer) {
01291
01292 server = true;
01293 tquote = NoQuote;
01294 parseServer(src);
01295 continue;
01296 }
01297
// Falls through to default when brokenServer is set.
01298 default:
01299 {
01300 if( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z')))
01301 {
01302
01303 }
01304 else
01305 {
01306
01307
// Not a tag after all: emit the '<' as text and re-examine the current
// character as ordinary input.
01308 if (pending)
01309 addPending();
01310 *dest = '<';
01311 dest++;
01312 continue;
01313 }
01314 }
01315 };
01316
// A real tag starts here: settle pending white space, flush the text
// collected so far as a token and hand over to parseTag().
01317 if ( pending ) {
01318
01319 if ( pre )
01320 addPending();
01321
01322
01323 else if ( !parser->selectMode() &&
01324 ( !parser->noSpaces() || dest > buffer )) {
01325 addPending();
01326 discard = AllDiscard;
01327 }
01328
01329 else
01330 pending = NonePending;
01331 }
01332
01333 processToken();
01334
01335 cBufferPos = 0;
01336 tag = TagName;
01337 parseTag(src);
01338 }
01339 else if ( cc == '&' && !src.escaped())
01340 {
01341 ++src;
01342 if ( pending )
01343 addPending();
01344 parseEntity(src, dest, true);
01345 }
01346 else if ( cc == '<' && !src.escaped())
01347 {
01348 tagStartLineno = lineno+src.lineCount();
01349 ++src;
01350 startTag = true;
01351 }
// Line breaks: recorded as pending white space unless 'discard' says they
// are to be dropped.
01352 else if (( cc == '\n' ) || ( cc == '\r' ))
01353 {
01354 if ( pre || textarea)
01355 {
01356 if (discard == LFDiscard || discard == AllDiscard)
01357 {
01358
01359 discard = NoneDiscard;
01360 }
01361 else
01362 {
01363
01364 if (pending)
01365 addPending();
01366 pending = LFPending;
01367 }
01368 }
01369 else
01370 {
01371 if (discard == LFDiscard)
01372 {
01373
01374 discard = NoneDiscard;
01375 }
01376 else if(discard == AllDiscard)
01377 {
01378 }
01379 else
01380 {
01381
01382 if (pending == NonePending)
01383 pending = LFPending;
01384 }
01385 }
01386
01387 if (cc == '\r')
01388 {
01389 skipLF = true;
01390 }
01391 ++src;
01392 }
// Blanks and tabs: same scheme as for line breaks.
01393 else if (( cc == ' ' ) || ( cc == '\t' ))
01394 {
01395 if ( pre || textarea)
01396 {
01397 if (discard == SpaceDiscard || discard == AllDiscard)
01398 {
01399
01400 discard = NoneDiscard;
01401 }
01402 else {
01403 if (pending)
01404 addPending();
01405 if (cc == ' ')
01406 pending = SpacePending;
01407 else
01408 pending = TabPending;
01409 }
01410 }
01411 else
01412 {
01413 if(discard == SpaceDiscard)
01414 discard = NoneDiscard;
01415 else if(discard == AllDiscard)
01416 { }
01417 else
01418 pending = SpacePending;
01419 }
01420 ++src;
01421 }
// Ordinary text character: flush pending white space, copy the character
// and run it through fixUpChar().
01422 else
01423 {
01424 if (pending)
01425 addPending();
01426
01427 discard = NoneDiscard;
01428 if ( pre )
01429 {
01430 prePos++;
01431 }
01432 *dest = *src;
01433 fixUpChar( *dest );
01434 ++dest;
01435 ++src;
01436 }
01437 }
01438 _src = QString::null;
01439
01440 if (noMoreData && cachedScript.isEmpty() && !m_executingScript )
01441 end();
01442 }
01443
01444 void HTMLTokenizer::end()
01445 {
01446 if ( buffer == 0 ) {
01447 emit finishedParsing();
01448 return;
01449 }
01450
01451
01452 if ( !tag )
01453 processToken();
01454
01455 if(buffer)
01456 KHTML_DELETE_QCHAR_VEC(buffer);
01457
01458 if(scriptCode)
01459 KHTML_DELETE_QCHAR_VEC(scriptCode);
01460
01461 scriptCode = 0;
01462 scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
01463 buffer = 0;
01464 emit finishedParsing();
01465 }
01466
01467 void HTMLTokenizer::finish()
01468 {
01469
01470 while((title || script || comment || server) && scriptCode && scriptCodeSize)
01471 {
01472
01473 if (comment)
01474 brokenComments = true;
01475 else if (server)
01476 brokenServer = true;
01477 else if (script)
01478 brokenServer = true;
01479 checkScriptBuffer();
01480 scriptCode[ scriptCodeSize ] = 0;
01481 scriptCode[ scriptCodeSize + 1 ] = 0;
01482 int pos;
01483 QString food;
01484 if (title || script || style) {
01485 food.setUnicode(scriptCode, scriptCodeSize);
01486 }
01487 else if (server) {
01488 food = "<";
01489 food += QString(scriptCode, scriptCodeSize);
01490 }
01491 else {
01492 pos = QConstString(scriptCode, scriptCodeSize).string().find('>');
01493 food.setUnicode(scriptCode+pos+1, scriptCodeSize-pos-1);
01494 }
01495 KHTML_DELETE_QCHAR_VEC(scriptCode);
01496 scriptCode = 0;
01497 scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
01498 if ( script )
01499 scriptHandler();
01500 comment = script = title = server = false;
01501 if ( !food.isEmpty() )
01502 write(food, true);
01503 }
01504
01505
01506 noMoreData = true;
01507 if (cachedScript.isEmpty() && !m_executingScript && !onHold)
01508 end();
01509 }
01510
01511 void HTMLTokenizer::processToken()
01512 {
01513 KJSProxy *jsProxy = view ? view->part()->jScript() : 0L;
01514 if (jsProxy)
01515 jsProxy->setEventHandlerLineno(tagStartLineno);
01516 if ( dest > buffer )
01517 {
01518 #ifdef TOKEN_DEBUG
01519 if(currToken.id) {
01520 qDebug( "unexpected token id: %d, str: *%s*", currToken.id,QConstString( buffer,dest-buffer ).string().latin1() );
01521 assert(0);
01522 }
01523
01524 #endif
01525 currToken.text = new DOMStringImpl( buffer, dest - buffer );
01526 currToken.text->ref();
01527 currToken.id = ID_TEXT;
01528 }
01529 else if(!currToken.id) {
01530 currToken.reset();
01531 if (jsProxy)
01532 jsProxy->setEventHandlerLineno(lineno+src.lineCount());
01533 return;
01534 }
01535
01536 dest = buffer;
01537
01538 #ifdef TOKEN_DEBUG
01539 QString name = getTagName(currToken.id).string();
01540 QString text;
01541 if(currToken.text)
01542 text = QConstString(currToken.text->s, currToken.text->l).string();
01543
01544 kdDebug( 6036 ) << "Token --> " << name << " id = " << currToken.id << endl;
01545 if (currToken.flat)
01546 kdDebug( 6036 ) << "Token is FLAT!" << endl;
01547 if(!text.isNull())
01548 kdDebug( 6036 ) << "text: \"" << text << "\"" << endl;
01549 unsigned long l = currToken.attrs ? currToken.attrs->length() : 0;
01550 if(l) {
01551 kdDebug( 6036 ) << "Attributes: " << l << endl;
01552 for (unsigned long i = 0; i < l; ++i) {
01553 AttributeImpl* c = currToken.attrs->attributeItem(i);
01554 kdDebug( 6036 ) << " " << c->id() << " " << parser->doc()->getDocument()->attrName(c->id()).string()
01555 << "=\"" << c->value().string() << "\"" << endl;
01556 }
01557 }
01558 kdDebug( 6036 ) << endl;
01559 #endif
01560
01561 parser->parseToken(&currToken);
01562
01563 if ( currToken.flat && currToken.id != ID_TEXT && !parser->noSpaces() )
01564 discard = NoneDiscard;
01565 else if ( parser->selectMode() )
01566 discard = AllDiscard;
01567
01568 currToken.reset();
01569 if (jsProxy)
01570 jsProxy->setEventHandlerLineno(0);
01571 }
01572
01573
01574 HTMLTokenizer::~HTMLTokenizer()
01575 {
01576 reset();
01577 delete parser;
01578 }
01579
01580
01581 void HTMLTokenizer::enlargeBuffer(int len)
01582 {
01583 int newsize = kMax(size*2, size+len);
01584 int oldoffs = (dest - buffer);
01585
01586 buffer = (QChar*)realloc(buffer, newsize*sizeof(QChar));
01587 dest = buffer + oldoffs;
01588 size = newsize;
01589 }
01590
01591 void HTMLTokenizer::enlargeScriptBuffer(int len)
01592 {
01593 int newsize = kMax(scriptCodeMaxSize*2, scriptCodeMaxSize+len);
01594 scriptCode = (QChar*)realloc(scriptCode, newsize*sizeof(QChar));
01595 scriptCodeMaxSize = newsize;
01596 }
01597
01598 void HTMLTokenizer::notifyFinished(CachedObject* )
01599 {
01600 assert(!cachedScript.isEmpty());
01601 bool done = false;
01602 while (!done && cachedScript.head()->isLoaded()) {
01603 #ifdef TOKEN_DEBUG
01604 kdDebug( 6036 ) << "Finished loading an external script" << endl;
01605 #endif
01606 CachedScript* cs = cachedScript.dequeue();
01607 done = cachedScript.isEmpty();
01608 DOMString scriptSource = cs->script();
01609 #ifdef TOKEN_DEBUG
01610 kdDebug( 6036 ) << "External script is:" << endl << scriptSource.string() << endl;
01611 #endif
01612 setSrc(QString::null);
01613
01614
01615
01616 QString cachedScriptUrl( cs->url().string() );
01617 cs->deref(this);
01618
01619 scriptExecution( scriptSource.string(), cachedScriptUrl );
01620
01621
01622
01623
01624 if ( !script ) {
01625 QString rest = pendingSrc;
01626 pendingSrc = QString::null;
01627 write(rest, false);
01628
01629
01630 }
01631 }
01632 }
01633
01634 void HTMLTokenizer::setSrc(const QString& source)
01635 {
01636 lineno += src.lineCount();
01637 _src = source;
01638 src = DOMStringIt(_src);
01639 }
01640
01641 void HTMLTokenizer::setOnHold(bool _onHold)
01642 {
01643 if (onHold == _onHold) return;
01644 onHold = _onHold;
01645 if (onHold)
01646 setSrc(QString(src.current(), src.length()));
01647 }
01648