00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #ifdef HAVE_CONFIG_H
00025 #include <config.h>
00026 #endif
00027
00028 #include <ctype.h>
00029 #include <stdlib.h>
00030 #include <stdio.h>
00031 #include <string.h>
00032 #include <assert.h>
00033
00034 #include "value.h"
00035 #include "object.h"
00036 #include "types.h"
00037 #include "interpreter.h"
00038 #include "nodes.h"
00039 #include "lexer.h"
00040 #include "ustring.h"
00041 #include "lookup.h"
00042 #include "internal.h"
00043
00044
00045 using namespace KJS;
00046
00047 static Lexer *currLexer = 0;
00048
00049 #ifndef KDE_USE_FINAL
00050 #include "grammar.h"
00051 #endif
00052
00053 #include "lexer.lut.h"
00054
00055 extern YYLTYPE yylloc;
00056
00057
00058 int kjsyylex()
00059 {
00060 return Lexer::curr()->lex();
00061 }
00062
00063 Lexer::Lexer()
00064 : yylineno(1),
00065 size8(128), size16(128), restrKeyword(false),
00066 eatNextIdentifier(false), stackToken(-1), lastToken(-1), pos(0),
00067 code(0), length(0),
00068 #ifndef KJS_PURE_ECMA
00069 bol(true),
00070 #endif
00071 current(0), next1(0), next2(0), next3(0)
00072 {
00073
00074 buffer8 = new char[size8];
00075 buffer16 = new UChar[size16];
00076 currLexer = this;
00077
00078 }
00079
00080 Lexer::~Lexer()
00081 {
00082 delete [] buffer8;
00083 delete [] buffer16;
00084 }
00085
00086 Lexer *Lexer::curr()
00087 {
00088 if (!currLexer) {
00089
00090 currLexer = new Lexer();
00091 }
00092 return currLexer;
00093 }
00094
#ifdef KJS_DEBUG_MEM
// Destroys the current lexer (if any) and resets the global pointer.
void Lexer::globalClear()
{
  Lexer *doomed = currLexer;
  currLexer = 0;
  delete doomed;
}
#endif
00102
00103 void Lexer::setCode(const UChar *c, unsigned int len)
00104 {
00105 yylineno = 1;
00106 restrKeyword = false;
00107 delimited = false;
00108 eatNextIdentifier = false;
00109 stackToken = -1;
00110 lastToken = -1;
00111 pos = 0;
00112 code = c;
00113 length = len;
00114 skipLF = false;
00115 skipCR = false;
00116 #ifndef KJS_PURE_ECMA
00117 bol = true;
00118 #endif
00119
00120
00121 current = (length > 0) ? code[0].unicode() : 0;
00122 next1 = (length > 1) ? code[1].unicode() : 0;
00123 next2 = (length > 2) ? code[2].unicode() : 0;
00124 next3 = (length > 3) ? code[3].unicode() : 0;
00125 }
00126
00127 void Lexer::shift(unsigned int p)
00128 {
00129 while (p--) {
00130 pos++;
00131 current = next1;
00132 next1 = next2;
00133 next2 = next3;
00134 next3 = (pos + 3 < length) ? code[pos+3].unicode() : 0;
00135 }
00136 }
00137
00138
00139 void Lexer::nextLine()
00140 {
00141 yylineno++;
00142 #ifndef KJS_PURE_ECMA
00143 bol = true;
00144 #endif
00145 }
00146
00147 void Lexer::setDone(State s)
00148 {
00149 state = s;
00150 done = true;
00151 }
00152
00153 int Lexer::lex()
00154 {
00155 int token = 0;
00156 state = Start;
00157 unsigned short stringType = 0;
00158 pos8 = pos16 = 0;
00159 done = false;
00160 terminator = false;
00161 skipLF = false;
00162 skipCR = false;
00163
00164
00165
00166 if (stackToken >= 0) {
00167 setDone(Other);
00168 token = stackToken;
00169 stackToken = 0;
00170 }
00171
00172 while (!done) {
00173 if (skipLF && current != '\n')
00174 skipLF = false;
00175 if (skipCR && current != '\r')
00176 skipCR = false;
00177 if (skipLF || skipCR)
00178 {
00179 skipLF = false;
00180 skipCR = false;
00181 shift(1);
00182 }
00183
00184 bool cr = (current == '\r');
00185 bool lf = (current == '\n');
00186 if (cr)
00187 skipLF = true;
00188 else if (lf)
00189 skipCR = true;
00190 bool isLineTerminator = cr || lf;
00191
00192 switch (state) {
00193 case Start:
00194 if (isWhiteSpace()) {
00195
00196 } else if (current == '/' && next1 == '/') {
00197 shift(1);
00198 state = InSingleLineComment;
00199 } else if (current == '/' && next1 == '*') {
00200 shift(1);
00201 state = InMultiLineComment;
00202 } else if (current == 0) {
00203 if (!terminator && !delimited) {
00204
00205 token = ';';
00206 stackToken = 0;
00207 setDone(Other);
00208 } else
00209 setDone(Eof);
00210 } else if (isLineTerminator) {
00211 nextLine();
00212 terminator = true;
00213 if (restrKeyword) {
00214 token = ';';
00215 setDone(Other);
00216 }
00217 } else if (current == '"' || current == '\'') {
00218 state = InString;
00219 stringType = current;
00220 } else if (isIdentLetter(current)) {
00221 record16(current);
00222 state = InIdentifier;
00223 } else if (current == '0') {
00224 record8(current);
00225 state = InNum0;
00226 } else if (isDecimalDigit(current)) {
00227 record8(current);
00228 state = InNum;
00229 } else if (current == '.' && isDecimalDigit(next1)) {
00230 record8(current);
00231 state = InDecimal;
00232 #ifndef KJS_PURE_ECMA
00233
00234 } else if (current == '<' && next1 == '!' &&
00235 next2 == '-' && next3 == '-') {
00236 shift(3);
00237 state = InSingleLineComment;
00238
00239 } else if (bol && current == '-' && next1 == '-' && next2 == '>') {
00240 shift(2);
00241 state = InSingleLineComment;
00242 #endif
00243 } else {
00244 token = matchPunctuator(current, next1, next2, next3);
00245 if (token != -1) {
00246 setDone(Other);
00247 } else {
00248
00249 setDone(Bad);
00250 }
00251 }
00252 break;
00253 case InString:
00254 if (current == stringType) {
00255 shift(1);
00256 setDone(String);
00257 } else if (current == 0 || isLineTerminator) {
00258 setDone(Bad);
00259 } else if (current == '\\') {
00260 state = InEscapeSequence;
00261 } else {
00262 record16(current);
00263 }
00264 break;
00265
00266 case InEscapeSequence:
00267 if (isOctalDigit(current)) {
00268 if (current >= '0' && current <= '3' &&
00269 isOctalDigit(next1) && isOctalDigit(next2)) {
00270 record16(convertOctal(current, next1, next2));
00271 shift(2);
00272 state = InString;
00273 } else if (isOctalDigit(current) && isOctalDigit(next1)) {
00274 record16(convertOctal('0', current, next1));
00275 shift(1);
00276 state = InString;
00277 } else if (isOctalDigit(current)) {
00278 record16(convertOctal('0', '0', current));
00279 state = InString;
00280 } else {
00281 setDone(Bad);
00282 }
00283 } else if (current == 'x')
00284 state = InHexEscape;
00285 else if (current == 'u')
00286 state = InUnicodeEscape;
00287 else {
00288 record16(singleEscape(current));
00289 state = InString;
00290 }
00291 break;
00292 case InHexEscape:
00293 if (isHexDigit(current) && isHexDigit(next1)) {
00294 state = InString;
00295 record16(convertHex(current, next1));
00296 shift(1);
00297 } else if (current == stringType) {
00298 record16('x');
00299 shift(1);
00300 setDone(String);
00301 } else {
00302 record16('x');
00303 record16(current);
00304 state = InString;
00305 }
00306 break;
00307 case InUnicodeEscape:
00308 if (isHexDigit(current) && isHexDigit(next1) &&
00309 isHexDigit(next2) && isHexDigit(next3)) {
00310 record16(convertUnicode(current, next1, next2, next3));
00311 shift(3);
00312 state = InString;
00313 } else if (current == stringType) {
00314 record16('u');
00315 shift(1);
00316 setDone(String);
00317 } else {
00318 setDone(Bad);
00319 }
00320 break;
00321 case InSingleLineComment:
00322 if (isLineTerminator) {
00323 nextLine();
00324 terminator = true;
00325 if (restrKeyword) {
00326 token = ';';
00327 setDone(Other);
00328 } else
00329 state = Start;
00330 } else if (current == 0) {
00331 setDone(Eof);
00332 }
00333 break;
00334 case InMultiLineComment:
00335 if (current == 0) {
00336 setDone(Bad);
00337 } else if (isLineTerminator) {
00338 nextLine();
00339 } else if (current == '*' && next1 == '/') {
00340 state = Start;
00341 shift(1);
00342 }
00343 break;
00344 case InIdentifier:
00345 if (isIdentLetter(current) || isDecimalDigit(current)) {
00346 record16(current);
00347 break;
00348 }
00349 setDone(Identifier);
00350 break;
00351 case InNum0:
00352 if (current == 'x' || current == 'X') {
00353 record8(current);
00354 state = InHex;
00355 } else if (current == '.') {
00356 record8(current);
00357 state = InDecimal;
00358 } else if (current == 'e' || current == 'E') {
00359 record8(current);
00360 state = InExponentIndicator;
00361 } else if (isOctalDigit(current)) {
00362 record8(current);
00363 state = InOctal;
00364 } else if (isDecimalDigit(current)) {
00365 record8(current);
00366 state = InDecimal;
00367 } else {
00368 setDone(Number);
00369 }
00370 break;
00371 case InHex:
00372 if (isHexDigit(current)) {
00373 record8(current);
00374 } else {
00375 setDone(Hex);
00376 }
00377 break;
00378 case InOctal:
00379 if (isOctalDigit(current)) {
00380 record8(current);
00381 }
00382 else if (isDecimalDigit(current)) {
00383 record8(current);
00384 state = InDecimal;
00385 } else
00386 setDone(Octal);
00387 break;
00388 case InNum:
00389 if (isDecimalDigit(current)) {
00390 record8(current);
00391 } else if (current == '.') {
00392 record8(current);
00393 state = InDecimal;
00394 } else if (current == 'e' || current == 'E') {
00395 record8(current);
00396 state = InExponentIndicator;
00397 } else
00398 setDone(Number);
00399 break;
00400 case InDecimal:
00401 if (isDecimalDigit(current)) {
00402 record8(current);
00403 } else if (current == 'e' || current == 'E') {
00404 record8(current);
00405 state = InExponentIndicator;
00406 } else
00407 setDone(Number);
00408 break;
00409 case InExponentIndicator:
00410 if (current == '+' || current == '-') {
00411 record8(current);
00412 } else if (isDecimalDigit(current)) {
00413 record8(current);
00414 state = InExponent;
00415 } else
00416 setDone(Bad);
00417 break;
00418 case InExponent:
00419 if (isDecimalDigit(current)) {
00420 record8(current);
00421 } else
00422 setDone(Number);
00423 break;
00424 default:
00425 assert(!"Unhandled state in switch statement");
00426 }
00427
00428
00429 if (!done)
00430 shift(1);
00431 #ifndef KJS_PURE_ECMA
00432 if (state != Start && state != InSingleLineComment)
00433 bol = false;
00434 #endif
00435 }
00436
00437
00438 if ((state == Number || state == Octal || state == Hex)
00439 && isIdentLetter(current))
00440 state = Bad;
00441
00442
00443 buffer8[pos8] = '\0';
00444
00445 #ifdef KJS_DEBUG_LEX
00446 fprintf(stderr, "line: %d ", lineNo());
00447 fprintf(stderr, "yytext (%x): ", buffer8[0]);
00448 fprintf(stderr, "%s ", buffer8);
00449 #endif
00450
00451 double dval = 0;
00452 if (state == Number) {
00453 dval = strtod(buffer8, 0L);
00454 } else if (state == Hex) {
00455
00456 unsigned int i;
00457 sscanf(buffer8, "%x", &i);
00458 dval = i;
00459 state = Number;
00460 } else if (state == Octal) {
00461 unsigned int ui;
00462 sscanf(buffer8, "%o", &ui);
00463 dval = ui;
00464 state = Number;
00465 }
00466
00467 #ifdef KJS_DEBUG_LEX
00468 switch (state) {
00469 case Eof:
00470 printf("(EOF)\n");
00471 break;
00472 case Other:
00473 printf("(Other)\n");
00474 break;
00475 case Identifier:
00476 printf("(Identifier)/(Keyword)\n");
00477 break;
00478 case String:
00479 printf("(String)\n");
00480 break;
00481 case Number:
00482 printf("(Number)\n");
00483 break;
00484 default:
00485 printf("(unknown)");
00486 }
00487 #endif
00488
00489 if (state != Identifier && eatNextIdentifier)
00490 eatNextIdentifier = false;
00491
00492 restrKeyword = false;
00493 delimited = false;
00494 yylloc.first_line = yylineno;
00495 yylloc.last_line = yylineno;
00496
00497 switch (state) {
00498 case Eof:
00499 token = 0;
00500 break;
00501 case Other:
00502 if(token == '}' || token == ';') {
00503 delimited = true;
00504 }
00505 break;
00506 case Identifier:
00507 if ((token = Lookup::find(&mainTable, buffer16, pos16)) < 0) {
00508
00509
00510 if (eatNextIdentifier) {
00511 eatNextIdentifier = false;
00512 UString debugstr(buffer16, pos16); fprintf(stderr,"Anonymous function hack: eating identifier %s\n",debugstr.ascii());
00513 token = lex();
00514 break;
00515 }
00516
00517 kjsyylval.ustr = new UString(buffer16, pos16);
00518 token = IDENT;
00519 break;
00520 }
00521
00522 eatNextIdentifier = false;
00523
00524
00525
00526 if ( token == FUNCTION &&
00527 ( lastToken == '=' || lastToken == ',' ) )
00528 eatNextIdentifier = true;
00529
00530 if (token == CONTINUE || token == BREAK ||
00531 token == RETURN || token == THROW)
00532 restrKeyword = true;
00533 break;
00534 case String:
00535 kjsyylval.ustr = new UString(buffer16, pos16);
00536 token = STRING;
00537 break;
00538 case Number:
00539 kjsyylval.dval = dval;
00540 token = NUMBER;
00541 break;
00542 case Bad:
00543 fprintf(stderr, "KJS: yylex: ERROR.\n");
00544 return -1;
00545 default:
00546 assert(!"unhandled numeration value in switch");
00547 return -1;
00548 }
00549 lastToken = token;
00550 return token;
00551 }
00552
00553 bool Lexer::isWhiteSpace() const
00554 {
00555 return (current == ' ' || current == '\t' ||
00556 current == 0x0b || current == 0x0c);
00557 }
00558
00559 bool Lexer::isIdentLetter(unsigned short c)
00560 {
00561
00562 return (c >= 'a' && c <= 'z' ||
00563 c >= 'A' && c <= 'Z' ||
00564 c == '$' || c == '_');
00565 }
00566
00567 bool Lexer::isDecimalDigit(unsigned short c)
00568 {
00569 return (c >= '0' && c <= '9');
00570 }
00571
00572 bool Lexer::isHexDigit(unsigned short c) const
00573 {
00574 return (c >= '0' && c <= '9' ||
00575 c >= 'a' && c <= 'f' ||
00576 c >= 'A' && c <= 'F');
00577 }
00578
00579 bool Lexer::isOctalDigit(unsigned short c) const
00580 {
00581 return (c >= '0' && c <= '7');
00582 }
00583
00584 int Lexer::matchPunctuator(unsigned short c1, unsigned short c2,
00585 unsigned short c3, unsigned short c4)
00586 {
00587 if (c1 == '>' && c2 == '>' && c3 == '>' && c4 == '=') {
00588 shift(4);
00589 return URSHIFTEQUAL;
00590 } else if (c1 == '=' && c2 == '=' && c3 == '=') {
00591 shift(3);
00592 return STREQ;
00593 } else if (c1 == '!' && c2 == '=' && c3 == '=') {
00594 shift(3);
00595 return STRNEQ;
00596 } else if (c1 == '>' && c2 == '>' && c3 == '>') {
00597 shift(3);
00598 return URSHIFT;
00599 } else if (c1 == '<' && c2 == '<' && c3 == '=') {
00600 shift(3);
00601 return LSHIFTEQUAL;
00602 } else if (c1 == '>' && c2 == '>' && c3 == '=') {
00603 shift(3);
00604 return RSHIFTEQUAL;
00605 } else if (c1 == '<' && c2 == '=') {
00606 shift(2);
00607 return LE;
00608 } else if (c1 == '>' && c2 == '=') {
00609 shift(2);
00610 return GE;
00611 } else if (c1 == '!' && c2 == '=') {
00612 shift(2);
00613 return NE;
00614 } else if (c1 == '+' && c2 == '+') {
00615 shift(2);
00616 if (terminator)
00617 return AUTOPLUSPLUS;
00618 else
00619 return PLUSPLUS;
00620 } else if (c1 == '-' && c2 == '-') {
00621 shift(2);
00622 if (terminator)
00623 return AUTOMINUSMINUS;
00624 else
00625 return MINUSMINUS;
00626 } else if (c1 == '=' && c2 == '=') {
00627 shift(2);
00628 return EQEQ;
00629 } else if (c1 == '+' && c2 == '=') {
00630 shift(2);
00631 return PLUSEQUAL;
00632 } else if (c1 == '-' && c2 == '=') {
00633 shift(2);
00634 return MINUSEQUAL;
00635 } else if (c1 == '*' && c2 == '=') {
00636 shift(2);
00637 return MULTEQUAL;
00638 } else if (c1 == '/' && c2 == '=') {
00639 shift(2);
00640 return DIVEQUAL;
00641 } else if (c1 == '&' && c2 == '=') {
00642 shift(2);
00643 return ANDEQUAL;
00644 } else if (c1 == '^' && c2 == '=') {
00645 shift(2);
00646 return XOREQUAL;
00647 } else if (c1 == '%' && c2 == '=') {
00648 shift(2);
00649 return MODEQUAL;
00650 } else if (c1 == '|' && c2 == '=') {
00651 shift(2);
00652 return OREQUAL;
00653 } else if (c1 == '<' && c2 == '<') {
00654 shift(2);
00655 return LSHIFT;
00656 } else if (c1 == '>' && c2 == '>') {
00657 shift(2);
00658 return RSHIFT;
00659 } else if (c1 == '&' && c2 == '&') {
00660 shift(2);
00661 return AND;
00662 } else if (c1 == '|' && c2 == '|') {
00663 shift(2);
00664 return OR;
00665 }
00666
00667 switch(c1) {
00668 case '=':
00669 case '>':
00670 case '<':
00671 case ',':
00672 case '!':
00673 case '~':
00674 case '?':
00675 case ':':
00676 case '.':
00677 case '+':
00678 case '-':
00679 case '*':
00680 case '/':
00681 case '&':
00682 case '|':
00683 case '^':
00684 case '%':
00685 case '(':
00686 case ')':
00687 case '{':
00688 case '}':
00689 case '[':
00690 case ']':
00691 case ';':
00692 shift(1);
00693 return static_cast<int>(c1);
00694 default:
00695 return -1;
00696 }
00697 }
00698
00699 unsigned short Lexer::singleEscape(unsigned short c) const
00700 {
00701 switch(c) {
00702 case 'b':
00703 return 0x08;
00704 case 't':
00705 return 0x09;
00706 case 'n':
00707 return 0x0A;
00708 case 'v':
00709 return 0x0B;
00710 case 'f':
00711 return 0x0C;
00712 case 'r':
00713 return 0x0D;
00714 case '"':
00715 return 0x22;
00716 case '\'':
00717 return 0x27;
00718 case '\\':
00719 return 0x5C;
00720 default:
00721 return c;
00722 }
00723 }
00724
00725 unsigned short Lexer::convertOctal(unsigned short c1, unsigned short c2,
00726 unsigned short c3) const
00727 {
00728 return ((c1 - '0') * 64 + (c2 - '0') * 8 + c3 - '0');
00729 }
00730
00731 unsigned char Lexer::convertHex(unsigned short c)
00732 {
00733 if (c >= '0' && c <= '9')
00734 return (c - '0');
00735 else if (c >= 'a' && c <= 'f')
00736 return (c - 'a' + 10);
00737 else
00738 return (c - 'A' + 10);
00739 }
00740
00741 unsigned char Lexer::convertHex(unsigned short c1, unsigned short c2)
00742 {
00743 return ((convertHex(c1) << 4) + convertHex(c2));
00744 }
00745
00746 UChar Lexer::convertUnicode(unsigned short c1, unsigned short c2,
00747 unsigned short c3, unsigned short c4)
00748 {
00749 return UChar((convertHex(c1) << 4) + convertHex(c2),
00750 (convertHex(c3) << 4) + convertHex(c4));
00751 }
00752
00753 void Lexer::record8(unsigned short c)
00754 {
00755 assert(c <= 0xff);
00756
00757
00758 if (pos8 >= size8 - 1) {
00759 char *tmp = new char[2 * size8];
00760 memcpy(tmp, buffer8, size8 * sizeof(char));
00761 delete [] buffer8;
00762 buffer8 = tmp;
00763 size8 *= 2;
00764 }
00765
00766 buffer8[pos8++] = (char) c;
00767 }
00768
00769 void Lexer::record16(UChar c)
00770 {
00771
00772 if (pos16 >= size16 - 1) {
00773 UChar *tmp = new UChar[2 * size16];
00774 memcpy(tmp, buffer16, size16 * sizeof(UChar));
00775 delete [] buffer16;
00776 buffer16 = tmp;
00777 size16 *= 2;
00778 }
00779
00780 buffer16[pos16++] = c;
00781 }
00782
00783 bool Lexer::scanRegExp()
00784 {
00785 pos16 = 0;
00786 bool lastWasEscape = false;
00787 bool inBrackets = false;
00788
00789 while (1) {
00790 if (current == '\r' || current == '\n' || current == 0)
00791 return false;
00792 else if (current != '/' || lastWasEscape == true || inBrackets == true)
00793 {
00794
00795 if ( !lastWasEscape ) {
00796 if ( current == '[' && !inBrackets )
00797 inBrackets = true;
00798 if ( current == ']' && inBrackets )
00799 inBrackets = false;
00800 }
00801 record16(current);
00802 lastWasEscape =
00803 !lastWasEscape && (current == '\\');
00804 }
00805 else {
00806 pattern = UString(buffer16, pos16);
00807 pos16 = 0;
00808 shift(1);
00809 break;
00810 }
00811 shift(1);
00812 }
00813
00814 while (isIdentLetter(current)) {
00815 record16(current);
00816 shift(1);
00817 }
00818 flags = UString(buffer16, pos16);
00819
00820 return true;
00821 }