diff --git a/lexer.go b/lexer.go index 2ca82e1..2fea9af 100644 --- a/lexer.go +++ b/lexer.go @@ -4,7 +4,6 @@ import ( "bufio" "go/token" "io" - "unicode" "github.com/cznic/golex/lex" ) @@ -16,50 +15,9 @@ const ( classOther ) -var sc int - type lexer struct { *lex.Lexer -} - -var stateStack = []int{PHP} -var heredocLabel []byte - -func pushState(state int) { - sc = state - stateStack = append(stateStack, state) -} - -func popState() { - len := len(stateStack) - if len <= 1 { - return - } - - sc = stateStack[len-2] - stateStack = stateStack[:len-1] -} - -func begin(state int) { - len := len(stateStack) - stateStack = stateStack[:len-1] - stateStack = append(stateStack, state) - - sc = state -} - -func rune2Class(r rune) int { - if r >= 0 && r < 0x80 { // Keep ASCII as it is. - return int(r) - } - if unicode.IsLetter(r) { - return classUnicodeLeter - } - if unicode.IsDigit(r) { - return classUnicodeDigit - } - // return classOther - return -1 + stateStack []int } func newLexer(src io.Reader, dst io.Writer, fName string) *lexer { @@ -68,25 +26,7 @@ func newLexer(src io.Reader, dst io.Writer, fName string) *lexer { if err != nil { panic(err) } - return &lexer{lx} -} - -func (l *lexer) unget(r rune) []byte { - l.Unget(l.Lookahead()) - - chars := l.Token() - lastChar := chars[len(chars)-1] - - if lastChar.Rune != r { - return l.TokenBytes(nil) - } - - l.Unget(lastChar) - - buf := l.TokenBytes(nil) - buf = buf[:len(buf)-1] - - return buf + return &lexer{lx, []int{0}} } func (l *lexer) ungetN(n int) []byte { @@ -104,3 +44,26 @@ func (l *lexer) ungetN(n int) []byte { return buf } + +func (l *lexer) pushState(state int) { + l.stateStack = append(l.stateStack, state) +} + +func (l *lexer) popState() { + len := len(l.stateStack) + if len <= 1 { + return + } + + l.stateStack = l.stateStack[:len-1] +} + +func (l *lexer) begin(state int) { + len := len(l.stateStack) + l.stateStack = l.stateStack[:len-1] + l.stateStack = append(l.stateStack, state) +} + +func (l *lexer) getCurrentState() int { + return l.stateStack[len(l.stateStack)-1] +} diff --git a/main.go b/main.go index 9e96ba6..edc1d99 100644 --- a/main.go +++ b/main.go @@ -3,6 +3,7 @@ package main import ( "bytes" "os" + "unicode" ) const src = ` @@ -24,3 +25,17 @@ func main() { l := newLexer(bytes.NewBufferString(src), os.Stdout, "file.name") yyParse(l) } + +func rune2Class(r rune) int { + if r >= 0 && r < 0x80 { // Keep ASCII as it is. + return int(r) + } + if unicode.IsLetter(r) { + return classUnicodeLeter + } + if unicode.IsDigit(r) { + return classUnicodeDigit + } + // return classOther + return -1 +} diff --git a/scanner.go b/scanner.go index db7c9df..328a11c 100644 --- a/scanner.go +++ b/scanner.go @@ -27,6 +27,8 @@ const ( BACKQUOTE ) +var heredocLabel []byte + func (l *lexer) Lex(lval *yySymType) int { // Lex(lval *yySymType) c := l.Enter() @@ -35,7 +37,7 @@ yystate0: _ = yyrule c = l.Rule0() - switch yyt := sc; yyt { + switch yyt := l.getCurrentState(); yyt { default: panic(fmt.Errorf(`invalid start condition %d`, yyt)) case 0: // start condition: INITIAL @@ -7680,17 +7682,17 @@ yyrule2: // . } yyrule3: // \<\?php([ \t]|{NEW_LINE}) { - begin(PHP) //lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG; + l.begin(PHP) //lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG; goto yystate0 } yyrule4: // \<\? { - begin(PHP) //lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG; + l.begin(PHP) //lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG; goto yystate0 } yyrule5: // \<\?= { - begin(PHP) + l.begin(PHP) lval.token = string(l.TokenBytes(nil)) return T_OPEN_TAG_WITH_ECHO goto yystate0 @@ -7702,7 +7704,7 @@ yyrule6: // [ \t\n\r]+ } yyrule7: // \?\>{NEW_LINE}? { - begin(INITIAL) //lval.token = string(l.TokenBytes(nil)); return T_CLOSE_TAG; + l.begin(INITIAL) //lval.token = string(l.TokenBytes(nil)); return T_CLOSE_TAG; goto yystate0 } yyrule8: // {DNUM}|{EXPONENT_DNUM} @@ -8477,14 +8479,14 @@ yyrule129: // {OPERATORS} } yyrule130: // \{ { - pushState(PHP) + l.pushState(PHP) lval.token = string(l.TokenBytes(nil)) return rune2Class(rune(l.TokenBytes(nil)[0])) goto yystate0 } yyrule131: // \} { - popState() + l.popState() lval.token = string(l.TokenBytes(nil)) return rune2Class(rune(l.TokenBytes(nil)[0])) goto yystate0 @@ -8503,7 +8505,7 @@ yyrule133: // {VAR_NAME} } yyrule134: // -> { - begin(PROPERTY) + l.begin(PROPERTY) lval.token = string(l.TokenBytes(nil)) return T_OBJECT_OPERATOR goto yystate0 @@ -8522,7 +8524,7 @@ yyrule136: // -> } yyrule137: // {VAR_NAME} { - begin(PHP) + l.begin(PHP) lval.token = string(l.TokenBytes(nil)) return T_STRING goto yystate0 @@ -8530,7 +8532,7 @@ yyrule137: // {VAR_NAME} yyrule138: // . { l.ungetN(1) - begin(PHP) + l.begin(PHP) goto yystate0 } yyrule139: // [\']([^\\\']*([\\][\'])*)*[\'] @@ -8541,14 +8543,14 @@ yyrule139: // [\']([^\\\']*([\\][\'])*)*[\'] } yyrule140: // ` { - begin(BACKQUOTE) + l.begin(BACKQUOTE) lval.token = string(l.TokenBytes(nil)) rune2Class(rune(l.TokenBytes(nil)[0])) goto yystate0 } yyrule141: // ` { - begin(PHP) + l.begin(PHP) lval.token = string(l.TokenBytes(nil)) rune2Class(rune(l.TokenBytes(nil)[0])) goto yystate0 @@ -8578,13 +8580,13 @@ yyrule142: // [b]?\<\<\<[ \t]*({VAR_NAME}|([']{VAR_NAME}['])|(["]{VAR_NAME}["])) case '\'': lblFirst++ lblLast-- - begin(NOWDOC) + l.begin(NOWDOC) case '"': lblFirst++ lblLast-- - begin(HEREDOC) + l.begin(HEREDOC) default: - begin(HEREDOC) + l.begin(HEREDOC) } heredocLabel = make([]byte, lblLast-lblFirst+1) copy(heredocLabel, tb[lblFirst:lblLast+1]) @@ -8602,7 +8604,7 @@ yyrule142: // [b]?\<\<\<[ \t]*({VAR_NAME}|([']{VAR_NAME}['])|(["]{VAR_NAME}["])) ungetCnt++ c = l.Next() if '\n' == rune(c) || '\r' == rune(c) { - begin(HEREDOC_END) + l.begin(HEREDOC_END) } } l.ungetN(ungetCnt) @@ -8622,7 +8624,7 @@ yyrule143: // . } if '\n' == rune(c) || '\r' == rune(c) { if bytes.Equal(append(heredocLabel, ';'), searchLabel) { - begin(HEREDOC_END) + l.begin(HEREDOC_END) tb = l.ungetN(len(heredocLabel) + 1) break } @@ -8639,7 +8641,7 @@ yyrule143: // . } yyrule144: // {VAR_NAME}\; { - begin(PHP) + l.begin(PHP) lval.token = string(l.ungetN(1)) return T_END_HEREDOC goto yystate0 @@ -8656,7 +8658,7 @@ yyrule145: // [b]?[\"] l.ungetN(len(l.TokenBytes(nil)) - cnt) tokenBytes := l.TokenBytes(nil)[:cnt] - pushState(STRING) + l.pushState(STRING) lval.token = string(tokenBytes) return rune2Class('"') } @@ -8696,7 +8698,7 @@ yyrule145: // [b]?[\"] } yyrule146: // \" { - popState() + l.popState() lval.token = "\"" return rune2Class(rune(l.TokenBytes(nil)[0])) goto yystate0 @@ -8704,13 +8706,13 @@ yyrule146: // \" yyrule147: // \{\$ { lval.token = string(l.ungetN(1)) - pushState(PHP) + l.pushState(PHP) return T_CURLY_OPEN goto yystate0 } yyrule148: // \$\{ { - pushState(STRING_VAR_NAME) + l.pushState(STRING_VAR_NAME) lval.token = string(l.TokenBytes(nil)) return T_DOLLAR_OPEN_CURLY_BRACES goto yystate0 @@ -8718,7 +8720,7 @@ yyrule148: // \$\{ yyrule149: // \$ { l.ungetN(1) - pushState(STRING_VAR) + l.pushState(STRING_VAR) goto yystate0 } yyrule150: // . @@ -8821,7 +8823,7 @@ yyrule152: // .|[ \t\n\r] fallthrough case '\r': if bytes.Equal(append(heredocLabel, ';'), searchLabel) { // TODO handle ';' as optional - begin(HEREDOC_END) + l.begin(HEREDOC_END) tb = l.ungetN(len(heredocLabel) + 1) break HEREDOCFOR } @@ -8873,14 +8875,14 @@ yyrule154: // ->{VAR_NAME} } yyrule155: // {VAR_NAME} { - popState() + l.popState() lval.token = string(l.TokenBytes(nil)) return T_STRING goto yystate0 } yyrule156: // \[ { - pushState(STRING_VAR_INDEX) + l.pushState(STRING_VAR_INDEX) lval.token = string(l.TokenBytes(nil)) return rune2Class(rune(l.TokenBytes(nil)[0])) goto yystate0 @@ -8888,7 +8890,7 @@ yyrule156: // \[ yyrule157: // .|[ \t\n\r] { l.ungetN(1) - popState() + l.popState() goto yystate0 } yyrule158: // {LNUM}|{HNUM}|{BNUM} @@ -8911,16 +8913,16 @@ yyrule160: // {VAR_NAME} } yyrule161: // \] { - popState() - popState() + l.popState() + l.popState() lval.token = string(l.TokenBytes(nil)) return rune2Class(rune(l.TokenBytes(nil)[0])) goto yystate0 } yyrule162: // [ \n\r\t\\'#] { - popState() - popState() + l.popState() + l.popState() lval.token = string(l.TokenBytes(nil)) return T_ENCAPSED_AND_WHITESPACE goto yystate0 @@ -8939,8 +8941,8 @@ yyrule164: // . } yyrule165: // {VAR_NAME}[\[\}] { - popState() - pushState(PHP) + l.popState() + l.pushState(PHP) lval.token = string(l.ungetN(1)) return T_STRING_VARNAME goto yystate0 @@ -8948,8 +8950,8 @@ yyrule165: // {VAR_NAME}[\[\}] yyrule166: // . { l.ungetN(1) - popState() - pushState(PHP) + l.popState() + l.pushState(PHP) goto yystate0 } panic("unreachable") diff --git a/scanner.l b/scanner.l index 9b9f826..7bde821 100644 --- a/scanner.l +++ b/scanner.l @@ -13,19 +13,21 @@ import ( ) const ( - INITIAL = iota - PHP - STRING - STRING_VAR - STRING_VAR_INDEX - STRING_VAR_NAME - PROPERTY - HEREDOC_END - NOWDOC - HEREDOC - BACKQUOTE + INITIAL = iota + PHP + STRING + STRING_VAR + STRING_VAR_INDEX + STRING_VAR_NAME + PROPERTY + HEREDOC_END + NOWDOC + HEREDOC + BACKQUOTE ) +var heredocLabel []byte + func (l *lexer) Lex(lval *yySymType) int { // Lex(lval *yySymType) c := l.Enter() @@ -34,7 +36,7 @@ func (l *lexer) Lex(lval *yySymType) int { // Lex(lval *yySymType) %s PHP STRING STRING_VAR STRING_VAR_INDEX STRING_VAR_NAME PROPERTY HEREDOC_END NOWDOC HEREDOC BACKQUOTE %yyb last == '\n' || last = '\0' -%yyt sc +%yyt l.getCurrentState() %yyc c %yyn c = l.Next() %yym l.Mark() @@ -77,12 +79,12 @@ NEW_LINE (\r|\n|\r\n) lval.token = string(tb); return T_INLINE_HTML -\<\?php([ \t]|{NEW_LINE}) begin(PHP);//lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG; -\<\? begin(PHP);//lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG; -\<\?= begin(PHP);lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG_WITH_ECHO; +\<\?php([ \t]|{NEW_LINE}) l.begin(PHP);//lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG; +\<\? l.begin(PHP);//lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG; +\<\?= l.begin(PHP);lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG_WITH_ECHO; [ \t\n\r]+ //lval.token = string(l.TokenBytes(nil)); return T_WHITESPACE -\?\>{NEW_LINE}? begin(INITIAL);//lval.token = string(l.TokenBytes(nil)); return T_CLOSE_TAG; +\?\>{NEW_LINE}? l.begin(INITIAL);//lval.token = string(l.TokenBytes(nil)); return T_CLOSE_TAG; {DNUM}|{EXPONENT_DNUM} lval.token = string(l.TokenBytes(nil)); return T_DNUMBER {BNUM} @@ -240,21 +242,21 @@ NEW_LINE (\r|\n|\r\n) '[^']*(\\')*' lval.token = string(l.TokenBytes(nil)); return T_CONSTANT_ENCAPSED_STRING {OPERATORS} lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0])) -\{ pushState(PHP); lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0])) -\} popState(); lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0])) +\{ l.pushState(PHP); lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0])) +\} l.popState(); lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0])) \${VAR_NAME} lval.token = string(l.TokenBytes(nil)); return T_VARIABLE {VAR_NAME} lval.token = string(l.TokenBytes(nil)); return T_STRING --> begin(PROPERTY);lval.token = string(l.TokenBytes(nil)); return T_OBJECT_OPERATOR; +-> l.begin(PROPERTY);lval.token = string(l.TokenBytes(nil)); return T_OBJECT_OPERATOR; [ \t\n\r]+ lval.token = string(l.TokenBytes(nil)); return T_WHITESPACE; -> lval.token = string(l.TokenBytes(nil)); return T_OBJECT_OPERATOR; -{VAR_NAME} begin(PHP);lval.token = string(l.TokenBytes(nil)); return T_STRING; -. l.ungetN(1);begin(PHP) +{VAR_NAME} l.begin(PHP);lval.token = string(l.TokenBytes(nil)); return T_STRING; +. l.ungetN(1);l.begin(PHP) [\']([^\\\']*([\\][\'])*)*[\'] lval.token = string(l.TokenBytes(nil)); return T_CONSTANT_ENCAPSED_STRING; -` begin(BACKQUOTE); lval.token = string(l.TokenBytes(nil)); rune2Class(rune(l.TokenBytes(nil)[0])) -` begin(PHP); lval.token = string(l.TokenBytes(nil)); rune2Class(rune(l.TokenBytes(nil)[0])) +` l.begin(BACKQUOTE); lval.token = string(l.TokenBytes(nil)); rune2Class(rune(l.TokenBytes(nil)[0])) +` l.begin(PHP); lval.token = string(l.TokenBytes(nil)); rune2Class(rune(l.TokenBytes(nil)[0])) [b]?\<\<\<[ \t]*({VAR_NAME}|([']{VAR_NAME}['])|(["]{VAR_NAME}["])){NEW_LINE} tb := l.TokenBytes(nil) @@ -282,13 +284,13 @@ NEW_LINE (\r|\n|\r\n) case '\'' : lblFirst++ lblLast-- - begin(NOWDOC) + l.begin(NOWDOC) case '"' : lblFirst++ lblLast-- - begin(HEREDOC) + l.begin(HEREDOC) default: - begin(HEREDOC) + l.begin(HEREDOC) } heredocLabel = make([]byte, lblLast - lblFirst + 1) @@ -308,7 +310,7 @@ NEW_LINE (\r|\n|\r\n) ungetCnt++ c = l.Next() if '\n' == rune(c) || '\r' == rune(c) { - begin(HEREDOC_END) + l.begin(HEREDOC_END) } } @@ -327,7 +329,7 @@ NEW_LINE (\r|\n|\r\n) if '\n' == rune(c) || '\r' == rune(c) { if bytes.Equal(append(heredocLabel, ';'), searchLabel) { - begin(HEREDOC_END) + l.begin(HEREDOC_END) tb = l.ungetN(len(heredocLabel)+1) break; } @@ -342,7 +344,7 @@ NEW_LINE (\r|\n|\r\n) lval.token = string(tb); return T_ENCAPSED_AND_WHITESPACE -{VAR_NAME}\; begin(PHP);lval.token = string(l.ungetN(1)); return T_END_HEREDOC +{VAR_NAME}\; l.begin(PHP);lval.token = string(l.ungetN(1)); return T_END_HEREDOC [b]?[\"] binPrefix := l.TokenBytes(nil)[0] == 'b' @@ -352,7 +354,7 @@ NEW_LINE (\r|\n|\r\n) l.ungetN(len(l.TokenBytes(nil))-cnt) tokenBytes := l.TokenBytes(nil)[:cnt] - pushState(STRING) + l.pushState(STRING) lval.token = string(tokenBytes); return rune2Class('"') } @@ -391,10 +393,10 @@ NEW_LINE (\r|\n|\r\n) c = l.Next() } -\" popState(); lval.token = "\""; return rune2Class(rune(l.TokenBytes(nil)[0])) -\{\$ lval.token = string(l.ungetN(1)); pushState(PHP); return T_CURLY_OPEN -\$\{ pushState(STRING_VAR_NAME);lval.token = string(l.TokenBytes(nil)); return T_DOLLAR_OPEN_CURLY_BRACES -\$ l.ungetN(1);pushState(STRING_VAR) +\" l.popState(); lval.token = "\""; return rune2Class(rune(l.TokenBytes(nil)[0])) +\{\$ lval.token = string(l.ungetN(1)); l.pushState(PHP); return T_CURLY_OPEN +\$\{ l.pushState(STRING_VAR_NAME);lval.token = string(l.TokenBytes(nil)); return T_DOLLAR_OPEN_CURLY_BRACES +\$ l.ungetN(1);l.pushState(STRING_VAR) . F1:for { if c == -1 { @@ -484,7 +486,7 @@ NEW_LINE (\r|\n|\r\n) case '\n': fallthrough case '\r': if bytes.Equal(append(heredocLabel, ';'), searchLabel) { // TODO handle ';' as optional - begin(HEREDOC_END) + l.begin(HEREDOC_END) tb = l.ungetN(len(heredocLabel)+1) break HEREDOCFOR; } @@ -524,20 +526,20 @@ NEW_LINE (\r|\n|\r\n) \${VAR_NAME} lval.token = string(l.TokenBytes(nil)); return T_VARIABLE ->{VAR_NAME} lval.token = string(l.ungetN(len(l.TokenBytes(nil))-2)); return T_OBJECT_OPERATOR -{VAR_NAME} popState();lval.token = string(l.TokenBytes(nil)); return T_STRING -\[ pushState(STRING_VAR_INDEX);lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0])) -.|[ \t\n\r] l.ungetN(1);popState() +{VAR_NAME} l.popState();lval.token = string(l.TokenBytes(nil)); return T_STRING +\[ l.pushState(STRING_VAR_INDEX);lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0])) +.|[ \t\n\r] l.ungetN(1);l.popState() {LNUM}|{HNUM}|{BNUM} lval.token = string(l.TokenBytes(nil)); return T_NUM_STRING \${VAR_NAME} lval.token = string(l.TokenBytes(nil)); return T_VARIABLE {VAR_NAME} lval.token = string(l.TokenBytes(nil)); return T_STRING -\] popState(); popState();lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0])) -[ \n\r\t\\'#] popState(); popState();lval.token = string(l.TokenBytes(nil)); return T_ENCAPSED_AND_WHITESPACE +\] l.popState(); l.popState();lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0])) +[ \n\r\t\\'#] l.popState(); l.popState();lval.token = string(l.TokenBytes(nil)); return T_ENCAPSED_AND_WHITESPACE {OPERATORS} lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0])) . lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0])) -{VAR_NAME}[\[\}] popState();pushState(PHP);lval.token = string(l.ungetN(1)); return T_STRING_VARNAME -. l.ungetN(1);popState();pushState(PHP) +{VAR_NAME}[\[\}] l.popState();l.pushState(PHP);lval.token = string(l.ungetN(1)); return T_STRING_VARNAME +. l.ungetN(1);l.popState();l.pushState(PHP) %% if c, ok := l.Abort(); ok { return int(c) }