%{
// Copyright (c) 2011 CZ.NIC z.s.p.o. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// blame: jnml, labs.nic.cz

package scanner

import (
	"fmt"
	"github.com/cznic/golex/lex"
)

// Start conditions of the scanner. INITIAL is the implicit zero state; the
// remaining names are declared to golex by the %s line below, in this order.
const (
	INITIAL = iota
	PHP
	STRING
	STRING_VAR
	STRING_VAR_INDEX
	STRING_VAR_NAME
	PROPERTY
	HEREDOC_END
	NOWDOC
	HEREDOC
	BACKQUOTE
)

// isValidFirstVarNameRune reports whether r is an ASCII letter, an
// underscore, or a rune in the range U+007F..U+00FF. The string rules use it
// to decide whether a '$' starts an interpolated variable.
func isValidFirstVarNameRune(r rune) bool {
	return r >= 'A' && r <= 'Z' || r == '_' || r >= 'a' && r <= 'z' || r >= '\u007f' && r <= 'ÿ'
}

// Lex scans one token. It resets l.Comments, stores the token built by
// l.createToken through lval.Token, and returns the token class (a T_*
// constant or the result of Rune2Class). When no rule matches and l.Abort
// reports ok, the aborted rune is returned as an int.
//
// NOTE(review): every rule below carries an explicit <STATE> start-condition
// prefix. The text under review had none (and had lost its line structure),
// so the prefixes were re-derived from the l.begin/l.pushState transitions in
// the neighbouring actions. Confirm them against the project's history before
// regenerating the scanner.
//
// NOTE(review): the %yyb expression previously read "last = '\0'", i.e. an
// assignment with an invalid Go rune literal. No rule is anchored with '^',
// so the expression is presumably never emitted, but it is corrected anyway.
func (l *Lexer) Lex(lval Lval) int {
	l.Comments = nil
	c := l.Enter()

%}

%s PHP STRING STRING_VAR STRING_VAR_INDEX STRING_VAR_NAME PROPERTY HEREDOC_END NOWDOC HEREDOC BACKQUOTE

%yyb last == '\n' || last == '\x00'
%yyt l.getCurrentState()
%yyc c
%yyn c = l.Next()
%yym l.Mark()
%optioncase-insensitive

LNUM            [0-9]+
DNUM            ([0-9]*"."[0-9]+)|([0-9]+"."[0-9]*)
HNUM            0x[0-9a-fA-F]+
BNUM            0b[01]+
EXPONENT_DNUM   (({LNUM}|{DNUM})[eE][+-]?{LNUM})
VAR_NAME        [a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*
OPERATORS       [;:,.\[\]()|\/\^&\+-*=%!~$<>?@]
NEW_LINE        (\r|\n|\r\n)

%%
    c = l.Rule0()

<INITIAL>[ \t\n\r]+
<INITIAL>.
    // Collect inline HTML up to end of input or up to a "<?" opener; the
    // opener itself is handed back so the rules below can match it.
    tb := []lex.Char{}
    for {
        if c == -1 {
            tb = l.Token();
            break;
        }
        if '?' == rune(c) {
            tb = l.Token();
            if (len(tb) < 2 || tb[len(tb)-1].Rune != '<') {
                c = l.Next()
                continue;
            }
            tb = l.ungetChars(1)
            break;
        }
        c = l.Next()
    }
    lval.Token(l.createToken(tb))
    return T_INLINE_HTML

<INITIAL>\<\?php([ \t]|{NEW_LINE})              l.begin(PHP);
<INITIAL>\<\?                                   l.begin(PHP);
<INITIAL>\<\?=                                  l.begin(PHP);lval.Token(l.createToken(l.Token())); return T_ECHO;

<PHP>[ \t\n\r]+
<PHP>[;][ \t\n\r]*\?\>{NEW_LINE}?               l.begin(INITIAL);lval.Token(l.createToken(l.Token())); return Rune2Class(';');
<PHP>\?\>{NEW_LINE}?                            l.begin(INITIAL);lval.Token(l.createToken(l.Token())); return Rune2Class(';');

<PHP>{DNUM}|{EXPONENT_DNUM}                     lval.Token(l.createToken(l.Token())); return T_DNUMBER
<PHP>{BNUM}
    // Skip the "0b" prefix and leading zeros; fewer than 64 remaining binary
    // digits yield T_LNUMBER, anything longer T_DNUMBER.
    tb := l.Token()
    i:=2
    BNUMFOR:for {
        if i > len(tb)-1 {
            break BNUMFOR;
        }
        switch tb[i].Rune {
            case '0': i++;
            default: break BNUMFOR;
        }
    }
    if len(tb) - i < 64 {
        lval.Token(l.createToken(l.Token())); return T_LNUMBER
    } else {
        lval.Token(l.createToken(l.Token())); return T_DNUMBER
    }
<PHP>{LNUM}
    if len(l.Token()) < 20 {
        lval.Token(l.createToken(l.Token())); return T_LNUMBER
    } else {
        lval.Token(l.createToken(l.Token())); return T_DNUMBER
    }
<PHP>{HNUM}
    // Skip the "0x" prefix and leading zeros; up to 15 remaining hex digits,
    // or 16 whose first digit is at most '7', yield T_LNUMBER.
    tb := l.Token()
    i:=2
    HNUMFOR:for {
        if i > len(tb)-1 {
            break HNUMFOR;
        }
        switch tb[i].Rune {
            case '0': i++;
            default: break HNUMFOR;
        }
    }
    length := len(tb) - i
    if length < 16 || (length == 16 && tb[i].Rune <= '7') {
        lval.Token(l.createToken(l.Token())); return T_LNUMBER
    } else {
        lval.Token(l.createToken(l.Token())); return T_DNUMBER
    }

<PHP>abstract                                   lval.Token(l.createToken(l.Token())); return T_ABSTRACT
<PHP>array                                      lval.Token(l.createToken(l.Token())); return T_ARRAY
<PHP>as                                         lval.Token(l.createToken(l.Token())); return T_AS
<PHP>break                                      lval.Token(l.createToken(l.Token())); return T_BREAK
<PHP>callable                                   lval.Token(l.createToken(l.Token())); return T_CALLABLE
<PHP>case                                       lval.Token(l.createToken(l.Token())); return T_CASE
<PHP>catch                                      lval.Token(l.createToken(l.Token())); return T_CATCH
<PHP>class                                      lval.Token(l.createToken(l.Token())); return T_CLASS
<PHP>clone                                      lval.Token(l.createToken(l.Token())); return T_CLONE
<PHP>const                                      lval.Token(l.createToken(l.Token())); return T_CONST
<PHP>continue                                   lval.Token(l.createToken(l.Token())); return T_CONTINUE
<PHP>declare                                    lval.Token(l.createToken(l.Token())); return T_DECLARE
<PHP>default                                    lval.Token(l.createToken(l.Token())); return T_DEFAULT
<PHP>do                                         lval.Token(l.createToken(l.Token())); return T_DO
<PHP>echo                                       lval.Token(l.createToken(l.Token())); return T_ECHO
<PHP>else                                       lval.Token(l.createToken(l.Token())); return T_ELSE
<PHP>elseif                                     lval.Token(l.createToken(l.Token())); return T_ELSEIF
<PHP>empty                                      lval.Token(l.createToken(l.Token())); return T_EMPTY
<PHP>enddeclare                                 lval.Token(l.createToken(l.Token())); return T_ENDDECLARE
<PHP>endfor                                     lval.Token(l.createToken(l.Token())); return T_ENDFOR
<PHP>endforeach                                 lval.Token(l.createToken(l.Token())); return T_ENDFOREACH
<PHP>endif                                      lval.Token(l.createToken(l.Token())); return T_ENDIF
<PHP>endswitch                                  lval.Token(l.createToken(l.Token())); return T_ENDSWITCH
<PHP>endwhile                                   lval.Token(l.createToken(l.Token())); return T_ENDWHILE
<PHP>eval                                       lval.Token(l.createToken(l.Token())); return T_EVAL
<PHP>exit|die                                   lval.Token(l.createToken(l.Token())); return T_EXIT
<PHP>extends                                    lval.Token(l.createToken(l.Token())); return T_EXTENDS
<PHP>final                                      lval.Token(l.createToken(l.Token())); return T_FINAL
<PHP>finally                                    lval.Token(l.createToken(l.Token())); return T_FINALLY
<PHP>for                                        lval.Token(l.createToken(l.Token())); return T_FOR
<PHP>foreach                                    lval.Token(l.createToken(l.Token())); return T_FOREACH
<PHP>function|cfunction                         lval.Token(l.createToken(l.Token())); return T_FUNCTION
<PHP>global                                     lval.Token(l.createToken(l.Token())); return T_GLOBAL
<PHP>goto                                       lval.Token(l.createToken(l.Token())); return T_GOTO
<PHP>if                                         lval.Token(l.createToken(l.Token())); return T_IF
<PHP>isset                                      lval.Token(l.createToken(l.Token())); return T_ISSET
<PHP>implements                                 lval.Token(l.createToken(l.Token())); return T_IMPLEMENTS
<PHP>instanceof                                 lval.Token(l.createToken(l.Token())); return T_INSTANCEOF
<PHP>insteadof                                  lval.Token(l.createToken(l.Token())); return T_INSTEADOF
<PHP>interface                                  lval.Token(l.createToken(l.Token())); return T_INTERFACE
<PHP>list                                       lval.Token(l.createToken(l.Token())); return T_LIST
<PHP>namespace                                  lval.Token(l.createToken(l.Token())); return T_NAMESPACE
<PHP>private                                    lval.Token(l.createToken(l.Token())); return T_PRIVATE
<PHP>public                                     lval.Token(l.createToken(l.Token())); return T_PUBLIC
<PHP>print                                      lval.Token(l.createToken(l.Token())); return T_PRINT
<PHP>protected                                  lval.Token(l.createToken(l.Token())); return T_PROTECTED
<PHP>return                                     lval.Token(l.createToken(l.Token())); return T_RETURN
<PHP>static                                     lval.Token(l.createToken(l.Token())); return T_STATIC
<PHP>switch                                     lval.Token(l.createToken(l.Token())); return T_SWITCH
<PHP>throw                                      lval.Token(l.createToken(l.Token())); return T_THROW
<PHP>trait                                      lval.Token(l.createToken(l.Token())); return T_TRAIT
<PHP>try                                        lval.Token(l.createToken(l.Token())); return T_TRY
<PHP>unset                                      lval.Token(l.createToken(l.Token())); return T_UNSET
<PHP>use                                        lval.Token(l.createToken(l.Token())); return T_USE
<PHP>var                                        lval.Token(l.createToken(l.Token())); return T_VAR
<PHP>while                                      lval.Token(l.createToken(l.Token())); return T_WHILE
<PHP>yield[ \t\n\r]+from[^a-zA-Z0-9_\x80-\xff]
    // The trailing class only stops "from" matching a longer identifier.
    // Hand that rune back so it is neither part of this token nor lost to
    // the next one (same ungetChars(1) idiom as the heredoc end-label rule).
    lval.Token(l.createToken(l.ungetChars(1))); return T_YIELD_FROM
<PHP>yield                                      lval.Token(l.createToken(l.Token())); return T_YIELD
<PHP>include                                    lval.Token(l.createToken(l.Token())); return T_INCLUDE
<PHP>include_once                               lval.Token(l.createToken(l.Token())); return T_INCLUDE_ONCE
<PHP>require                                    lval.Token(l.createToken(l.Token())); return T_REQUIRE
<PHP>require_once                               lval.Token(l.createToken(l.Token())); return T_REQUIRE_ONCE
<PHP>__CLASS__                                  lval.Token(l.createToken(l.Token())); return T_CLASS_C
<PHP>__DIR__                                    lval.Token(l.createToken(l.Token())); return T_DIR
<PHP>__FILE__                                   lval.Token(l.createToken(l.Token())); return T_FILE
<PHP>__FUNCTION__                               lval.Token(l.createToken(l.Token())); return T_FUNC_C
<PHP>__LINE__                                   lval.Token(l.createToken(l.Token())); return T_LINE
<PHP>__NAMESPACE__                              lval.Token(l.createToken(l.Token())); return T_NS_C
<PHP>__METHOD__                                 lval.Token(l.createToken(l.Token())); return T_METHOD_C
<PHP>__TRAIT__                                  lval.Token(l.createToken(l.Token())); return T_TRAIT_C
<PHP>__halt_compiler                            lval.Token(l.createToken(l.Token())); return T_HALT_COMPILER
<PHP>\([ \t]*array[ \t]*\)                      lval.Token(l.createToken(l.Token())); return T_ARRAY_CAST
<PHP>\([ \t]*(bool|boolean)[ \t]*\)             lval.Token(l.createToken(l.Token())); return T_BOOL_CAST
<PHP>\([ \t]*(real|double|float)[ \t]*\)        lval.Token(l.createToken(l.Token())); return T_DOUBLE_CAST
<PHP>\([ \t]*(int|integer)[ \t]*\)              lval.Token(l.createToken(l.Token())); return T_INT_CAST
<PHP>\([ \t]*object[ \t]*\)                     lval.Token(l.createToken(l.Token())); return T_OBJECT_CAST
<PHP>\([ \t]*(string|binary)[ \t]*\)            lval.Token(l.createToken(l.Token())); return T_STRING_CAST
<PHP>\([ \t]*unset[ \t]*\)                      lval.Token(l.createToken(l.Token())); return T_UNSET_CAST
<PHP>new                                        lval.Token(l.createToken(l.Token())); return T_NEW
<PHP>and                                        lval.Token(l.createToken(l.Token())); return T_LOGICAL_AND
<PHP>or                                         lval.Token(l.createToken(l.Token())); return T_LOGICAL_OR
<PHP>xor                                        lval.Token(l.createToken(l.Token())); return T_LOGICAL_XOR
<PHP>\\                                         lval.Token(l.createToken(l.Token())); return T_NS_SEPARATOR
<PHP>\.\.\.                                     lval.Token(l.createToken(l.Token())); return T_ELLIPSIS
<PHP>::                                         lval.Token(l.createToken(l.Token())); return T_PAAMAYIM_NEKUDOTAYIM // T_DOUBLE_COLON
<PHP>&&                                         lval.Token(l.createToken(l.Token())); return T_BOOLEAN_AND
<PHP>\|\|                                       lval.Token(l.createToken(l.Token())); return T_BOOLEAN_OR
<PHP>&=                                         lval.Token(l.createToken(l.Token())); return T_AND_EQUAL
<PHP>\|=                                        lval.Token(l.createToken(l.Token())); return T_OR_EQUAL
<PHP>\.=                                        lval.Token(l.createToken(l.Token())); return T_CONCAT_EQUAL
<PHP>\*=                                        lval.Token(l.createToken(l.Token())); return T_MUL_EQUAL
<PHP>\*\*=                                      lval.Token(l.createToken(l.Token())); return T_POW_EQUAL
<PHP>[/]=                                       lval.Token(l.createToken(l.Token())); return T_DIV_EQUAL
<PHP>\+=                                        lval.Token(l.createToken(l.Token())); return T_PLUS_EQUAL
<PHP>-=                                         lval.Token(l.createToken(l.Token())); return T_MINUS_EQUAL
<PHP>\^=                                        lval.Token(l.createToken(l.Token())); return T_XOR_EQUAL
<PHP>%=                                         lval.Token(l.createToken(l.Token())); return T_MOD_EQUAL
<PHP>--                                         lval.Token(l.createToken(l.Token())); return T_DEC
<PHP>\+\+                                       lval.Token(l.createToken(l.Token())); return T_INC
<PHP>=>                                         lval.Token(l.createToken(l.Token())); return T_DOUBLE_ARROW
<PHP>\<=\>                                      lval.Token(l.createToken(l.Token())); return T_SPACESHIP
<PHP>\!=|\<\>                                   lval.Token(l.createToken(l.Token())); return T_IS_NOT_EQUAL
<PHP>\!==                                       lval.Token(l.createToken(l.Token())); return T_IS_NOT_IDENTICAL
<PHP>==                                         lval.Token(l.createToken(l.Token())); return T_IS_EQUAL
<PHP>===                                        lval.Token(l.createToken(l.Token())); return T_IS_IDENTICAL
<PHP>\<\<=                                      lval.Token(l.createToken(l.Token())); return T_SL_EQUAL
<PHP>\>\>=                                      lval.Token(l.createToken(l.Token())); return T_SR_EQUAL
<PHP>\>=                                        lval.Token(l.createToken(l.Token())); return T_IS_GREATER_OR_EQUAL
<PHP>\<=                                        lval.Token(l.createToken(l.Token())); return T_IS_SMALLER_OR_EQUAL
<PHP>\*\*                                       lval.Token(l.createToken(l.Token())); return T_POW
<PHP>\<\<                                       lval.Token(l.createToken(l.Token())); return T_SL
<PHP>\>\>                                       lval.Token(l.createToken(l.Token())); return T_SR
<PHP>\?\?                                       lval.Token(l.createToken(l.Token())); return T_COALESCE
<PHP>(#|[/][/])
    // Single-line comment: runs to the end of the line, end of input, or a
    // "?>" closer; the closer is handed back and dropped from the comment.
    tb := l.Token()
    for {
        if c == -1 {
            break
        }
        tb = append(tb, l.Last)
        switch c {
            case '\r':
                c = l.Next()
                if c == '\n' {
                    continue
                }
            case '\n':
            case '?':
                c = l.Next()
                if c == '>' {
                    l.ungetChars(1)
                    tb = tb[:len(tb)-1]
                    break
                }
                continue
            default:
                c = l.Next()
                continue
        }
        break;
    }
    l.addComment(tb)
<PHP>([/][*])|([/][*][*])
    // Block comment; a three-rune opener ("/**") marks a doc comment whose
    // text is also stored in l.PhpDocComment.
    tb := l.Token()
    is_doc_comment := false
    if len(tb) > 2 {
        is_doc_comment = true
        l.PhpDocComment = ""
    }
    c = l.Next()
    for {
        if c == -1 {
            break; // TODO: Unterminated comment starting line %d
        }
        if l.Prev.Rune == '*' && l.Last.Rune == '/' {
            c = l.Next()
            break;
        }
        c = l.Next()
    }
    if is_doc_comment {
        l.PhpDocComment = string(l.TokenBytes(nil))
        l.addComment(l.Token())
    } else {
        l.addComment(l.Token())
    }

<PHP>{OPERATORS}                                lval.Token(l.createToken(l.Token())); return Rune2Class(rune(l.TokenBytes(nil)[0]))

<PHP>\{                                         l.pushState(PHP); lval.Token(l.createToken(l.Token())); return Rune2Class(rune(l.TokenBytes(nil)[0]))
<PHP>\}
    l.popState()
    lval.Token(l.createToken(l.Token()))
    // This reset used to sit after the return statement and never executed.
    l.PhpDocComment = ""
    return Rune2Class(rune(l.TokenBytes(nil)[0]))
<PHP>\${VAR_NAME}                               lval.Token(l.createToken(l.Token())); return T_VARIABLE
<PHP>{VAR_NAME}                                 lval.Token(l.createToken(l.Token())); return T_STRING

<PHP>->                                         l.begin(PROPERTY);lval.Token(l.createToken(l.Token())); return T_OBJECT_OPERATOR;
<PROPERTY>[ \t\n\r]+
<PROPERTY>->                                    lval.Token(l.createToken(l.Token())); return T_OBJECT_OPERATOR;
<PROPERTY>{VAR_NAME}                            l.begin(PHP);lval.Token(l.createToken(l.Token())); return T_STRING;
<PROPERTY>.                                     l.ungetChars(1);l.begin(PHP)

<PHP>[\']([^\\\']*([\\].)*)*[\']                lval.Token(l.createToken(l.Token())); return T_CONSTANT_ENCAPSED_STRING;

<PHP>`                                          l.begin(BACKQUOTE); lval.Token(l.createToken(l.Token())); return Rune2Class(rune(l.TokenBytes(nil)[0]))
<BACKQUOTE>`                                    l.begin(PHP); lval.Token(l.createToken(l.Token())); return Rune2Class(rune(l.TokenBytes(nil)[0]))

<PHP>[b]?\<\<\<[ \t]*({VAR_NAME}|([']{VAR_NAME}['])|(["]{VAR_NAME}["])){NEW_LINE}
    // Heredoc/nowdoc opener. lblFirst..lblLast delimit the (possibly quoted)
    // label inside the matched text; quotes select NOWDOC or HEREDOC.
    tb := l.Token()
    binPrefix := 0
    if tb[0].Rune == 'b' {
        binPrefix = 1
    }
    lblFirst := 3 + binPrefix
    lblLast := len(tb)-2
    if tb[lblLast].Rune == '\r' {
        lblLast--
    }
    for {
        if tb[lblFirst].Rune == ' ' || tb[lblFirst].Rune == '\t' {
            lblFirst++
            continue
        }
        break
    }
    heredocToken := make([]lex.Char, lblLast - lblFirst + 1)
    copy(heredocToken, tb[lblFirst:lblLast+1])
    switch tb[lblFirst].Rune {
        case '\'' :
            lblFirst++
            lblLast--
            l.begin(NOWDOC)
        case '"' :
            lblFirst++
            lblLast--
            l.begin(HEREDOC)
        default:
            l.begin(HEREDOC)
    }
    l.heredocLabel = l.tokenString(tb[lblFirst:lblLast+1])
    // Look ahead for an immediately following closing label (empty body);
    // everything read here is handed back through ungetChars below.
    ungetCnt := len(l.heredocLabel)
    searchLabelAhead := []byte{}
    for i := 0; i < len(l.heredocLabel); i++ {
        if c == -1 {
            break;
        }
        searchLabelAhead = append(searchLabelAhead, byte(rune(c)))
        c = l.Next()
    }
    if l.heredocLabel == string(searchLabelAhead) && ';' == rune(c) {
        ungetCnt++
        c = l.Next()
        if '\n' == rune(c) || '\r' == rune(c) {
            l.begin(HEREDOC_END)
        }
    }
    l.ungetChars(ungetCnt)
    lval.Token(l.createToken(heredocToken));
    return T_START_HEREDOC

<NOWDOC>.|[ \t\n\r]
    // Consume raw text until a line consisting of the label (optionally
    // followed by ';') is seen; the label is handed back for HEREDOC_END.
    searchLabel := []byte{}
    tb := []lex.Char{}
    for {
        if c == -1 {
            break;
        }
        if '\n' == rune(c) || '\r' == rune(c) {
            if l.heredocLabel + ";" == string(searchLabel) {
                l.begin(HEREDOC_END)
                tb = l.ungetChars(len(l.heredocLabel)+1)
                break;
            }
            if l.heredocLabel == string(searchLabel) {
                l.begin(HEREDOC_END)
                tb = l.ungetChars(len(l.heredocLabel))
                break;
            }
            searchLabel = []byte{}
        } else {
            searchLabel = append(searchLabel, byte(rune(c)))
        }
        c = l.Next()
    }
    lval.Token(l.createToken(tb) )
    return T_ENCAPSED_AND_WHITESPACE

<HEREDOC_END>{VAR_NAME}\;                       l.begin(PHP);lval.Token(l.createToken(l.ungetChars(1))); return T_END_HEREDOC
<HEREDOC_END>{VAR_NAME}                         l.begin(PHP);lval.Token(l.createToken(l.Token())); return T_END_HEREDOC

<PHP>[b]?[\"]
    // Scan ahead: a string without "$name", "${" or "{$" is returned whole
    // as T_CONSTANT_ENCAPSED_STRING; otherwise only the opening quote (and
    // optional 'b') is emitted and the STRING state takes over.
    binPrefix := l.Token()[0].Rune == 'b'
    beginString := func() int {
        cnt := 1; if (binPrefix) {cnt = 2}
        l.ungetChars(len(l.Token())-cnt)
        chars := l.Token()[:cnt]
        l.pushState(STRING)
        lval.Token(l.createToken(chars)); return Rune2Class('"')
    }
    F:for {
        if c == -1 {
            break;
        }
        switch c {
            case '"' :
                c = l.Next();
                lval.Token(l.createToken(l.Token())); return T_CONSTANT_ENCAPSED_STRING
                break F;
            case '$':
                c = l.Next();
                if rune(c) == '{' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z' || c >= '\u007f' && c <= 'ÿ' {
                    return beginString()
                    break F;
                }
                l.ungetChars(0)
            case '{':
                c = l.Next();
                if rune(c) == '$' {
                    return beginString()
                    break F;
                }
                l.ungetChars(0)
            case '\\':
                c = l.Next();
        }
        c = l.Next()
    }

<STRING>\"                                      l.popState(); lval.Token(l.createToken(l.Token())); return Rune2Class(l.Token()[0].Rune)
<STRING,HEREDOC,BACKQUOTE>\{\$                  lval.Token(l.createToken(l.ungetChars(1))); l.pushState(PHP); return T_CURLY_OPEN
<STRING,HEREDOC,BACKQUOTE>\$\{                  l.pushState(STRING_VAR_NAME); lval.Token(l.createToken(l.Token())); return T_DOLLAR_OPEN_CURLY_BRACES
<STRING,HEREDOC,BACKQUOTE>\${VAR_NAME}          l.ungetChars(len(l.Token()));l.pushState(STRING_VAR)
<STRING>.|[ \t\n\r]
    // Literal part of an interpolated string: stop before '"' or before the
    // start of an interpolation ("$name", "${", "{$").
    currentChar := l.Prev
    tb := []lex.Char{currentChar}
    for {
        switch currentChar.Rune {
            case '$':
                if c == '{' || isValidFirstVarNameRune(rune(c)) {
                    l.ungetChars(1)
                    lval.Token(l.createToken(tb[:len(tb)-1]));
                    return T_ENCAPSED_AND_WHITESPACE
                }
            case '{':
                if rune(c) == '$' {
                    l.ungetChars(1)
                    lval.Token(l.createToken(tb[:len(tb)-1]));
                    return T_ENCAPSED_AND_WHITESPACE
                }
            case '\\':
                currentChar := l.Last
                tb = append(tb, currentChar)
                c = l.Next();
        }
        if rune(c) == '"' {
            lval.Token(l.createToken(l.Token()));
            return T_ENCAPSED_AND_WHITESPACE
        }
        currentChar = l.Last
        tb = append(tb, currentChar)
        c = l.Next()
        if c == -1 {
            break;
        }
    }

<BACKQUOTE>.|[ \t\n\r]
    // Same scan as the STRING rule above, terminated by a backquote.
    currentChar := l.Prev
    tb := []lex.Char{currentChar}
    for {
        switch currentChar.Rune {
            case '$':
                if c == '{' || isValidFirstVarNameRune(rune(c)) {
                    l.ungetChars(1)
                    lval.Token(l.createToken(tb[:len(tb)-1]));
                    return T_ENCAPSED_AND_WHITESPACE
                }
            case '{':
                if rune(c) == '$' {
                    l.ungetChars(1)
                    lval.Token(l.createToken(tb[:len(tb)-1]));
                    return T_ENCAPSED_AND_WHITESPACE
                }
            case '\\':
                currentChar := l.Last
                tb = append(tb, currentChar)
                c = l.Next();
        }
        if rune(c) == '`' {
            lval.Token(l.createToken(l.Token()));
            return T_ENCAPSED_AND_WHITESPACE
        }
        currentChar = l.Last
        tb = append(tb, currentChar)
        c = l.Next()
        if c == -1 {
            break;
        }
    }

<HEREDOC>.|[ \t\n\r]
    // Literal part of a heredoc body: stop at the closing label line or
    // before the start of an interpolation. nls is 1 when the line ended in
    // "\r\n", so that the extra rune is handed back as well.
    searchLabel := []byte{}
    tb := []lex.Char{}
    for {
        if c == -1 {
            break;
        }
        nls := 0
        switch c {
            case '\r':
                nls = 1
                c := l.Next()
                if c != '\n' {
                    nls = 0
                    l.ungetChars(0)
                }
                fallthrough
            case '\n':
                if l.heredocLabel + ";" == string(searchLabel) {
                    l.begin(HEREDOC_END)
                    tb = l.ungetChars(len(l.heredocLabel)+1+nls)
                    lval.Token(l.createToken(tb));
                    return T_ENCAPSED_AND_WHITESPACE
                }
                if l.heredocLabel == string(searchLabel) {
                    l.begin(HEREDOC_END)
                    tb = l.ungetChars(len(l.heredocLabel)+nls)
                    lval.Token(l.createToken(tb));
                    return T_ENCAPSED_AND_WHITESPACE
                }
                searchLabel = []byte{}
            case '$':
                c = l.Next();
                if rune(c) == '{' || isValidFirstVarNameRune(rune(c)) {
                    tb = l.ungetChars(1)
                    lval.Token(l.createToken(tb));
                    return T_ENCAPSED_AND_WHITESPACE
                }
                l.ungetChars(0)
            case '{':
                c = l.Next();
                if rune(c) == '$' {
                    tb = l.ungetChars(1)
                    lval.Token(l.createToken(tb));
                    return T_ENCAPSED_AND_WHITESPACE
                }
                l.ungetChars(0)
            case '\\':
                c = l.Next();
                if c == '\n' || c == '\r' {
                    l.ungetChars(0)
                }
            default:
                searchLabel = append(searchLabel, byte(rune(c)))
        }
        c = l.Next()
    }

<STRING_VAR>\${VAR_NAME}                        lval.Token(l.createToken(l.Token())); return T_VARIABLE
<STRING_VAR>->{VAR_NAME}                        lval.Token(l.createToken(l.ungetChars(len(l.Token())-2))); return T_OBJECT_OPERATOR
<STRING_VAR>{VAR_NAME}                          l.popState();lval.Token(l.createToken(l.Token())); return T_STRING
<STRING_VAR>\[                                  l.pushState(STRING_VAR_INDEX);lval.Token(l.createToken(l.Token())); return Rune2Class(rune(l.TokenBytes(nil)[0]))
<STRING_VAR>.|[ \t\n\r]                         l.ungetChars(1);l.popState()

<STRING_VAR_INDEX>{LNUM}|{HNUM}|{BNUM}          lval.Token(l.createToken(l.Token())); return T_NUM_STRING
<STRING_VAR_INDEX>\${VAR_NAME}                  lval.Token(l.createToken(l.Token())); return T_VARIABLE
<STRING_VAR_INDEX>{VAR_NAME}                    lval.Token(l.createToken(l.Token())); return T_STRING
<STRING_VAR_INDEX>\]                            l.popState(); l.popState();lval.Token(l.createToken(l.Token())); return Rune2Class(rune(l.TokenBytes(nil)[0]))
<STRING_VAR_INDEX>[ \n\r\t\\'#]                 l.popState(); l.popState();lval.Token(l.createToken(l.Token())); return T_ENCAPSED_AND_WHITESPACE
<STRING_VAR_INDEX>{OPERATORS}                   lval.Token(l.createToken(l.Token())); return Rune2Class(rune(l.TokenBytes(nil)[0]))
<STRING_VAR_INDEX>.                             lval.Token(l.createToken(l.Token())); return Rune2Class(rune(l.TokenBytes(nil)[0]))

<STRING_VAR_NAME>{VAR_NAME}[\[\}]               l.popState();l.pushState(PHP);lval.Token(l.createToken(l.ungetChars(1))); return T_STRING_VARNAME
<STRING_VAR_NAME>.                              l.ungetChars(1);l.popState();l.pushState(PHP)

%%
    if c, ok := l.Abort(); ok { return int(c) }
    goto yyAction
}