%{
// Copyright (c) 2011 CZ.NIC z.s.p.o. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// blame: jnml, labs.nic.cz

package main

import (
	"bufio"
	"bytes"
	"fmt"
	"go/token"
	"io"
	"unicode"

	"github.com/cznic/golex/lex"
)

// Allocate Character classes anywhere in [0x80, 0xFF].
const (
	classUnicodeLeter = iota + 0x80
	classUnicodeDigit
	classOther
)

// sc is the active start condition; the generated scanner reads it through
// the %yyt directive below.
var sc int

// Start conditions, in the same order as the %s declaration below.
const (
	INITIAL = iota
	PHP
	STRING
	STRING_VAR
	STRING_VAR_INDEX
	STRING_VAR_NAME
	PROPERTY
	HEREDOC_END
	NOWDOC
	HEREDOC
	BACKQUOTE
)

// lexer embeds *lex.Lexer and adds the Lex method defined by this file.
type lexer struct {
	*lex.Lexer
}

// stateStack holds the nested start conditions; its top mirrors sc once
// begin or pushState has been called.
var stateStack = []int{PHP}

// heredocLabel is the label of the heredoc/nowdoc currently being scanned.
var heredocLabel []byte

// pushState makes state the active start condition and pushes it on
// stateStack so that popState can return to the previous one.
func pushState(state int) {
	sc = state
	stateStack = append(stateStack, state)
}

// popState drops the top of stateStack and re-activates the state beneath
// it. It does nothing when only one entry is left.
func popState() {
	n := len(stateStack)
	if n <= 1 {
		return
	}
	sc = stateStack[n-2]
	stateStack = stateStack[:n-1]
}

// begin replaces the top of stateStack with state and makes it active.
func begin(state int) {
	stateStack[len(stateStack)-1] = state
	sc = state
}

// rune2Class maps a rune to the character class handed to lex.RuneClass:
// ASCII runes map to themselves, Unicode letters and digits map to
// classUnicodeLeter and classUnicodeDigit, and every other rune maps to -1.
func rune2Class(r rune) int {
	if r >= 0 && r < 0x80 { // Keep ASCII as it is.
		return int(r)
	}
	if unicode.IsLetter(r) {
		return classUnicodeLeter
	}
	if unicode.IsDigit(r) {
		return classUnicodeDigit
	}
	// return classOther
	return -1
}

// newLexer returns a lexer reading from src. fName is registered as the file
// name in a fresh token.FileSet; dst is not used. It panics if lex.New
// returns an error.
func newLexer(src io.Reader, dst io.Writer, fName string) *lexer {
	file := token.NewFileSet().AddFile(fName, -1, 1<<31-1)
	lx, err := lex.New(file, bufio.NewReader(src), lex.RuneClass(rune2Class))
	if err != nil {
		panic(err)
	}
	return &lexer{lx}
}

// unget ungets the lookahead and, if the last char of the current token is r,
// ungets that char as well and returns the token bytes without their final
// byte. Otherwise it returns the token bytes unchanged.
// NOTE(review): trims one byte, so r is presumably always ASCII - confirm.
func (l *lexer) unget(r rune) []byte {
	l.Unget(l.Lookahead())
	chars := l.Token()
	lastChar := chars[len(chars)-1]
	if lastChar.Rune != r {
		return l.TokenBytes(nil)
	}
	l.Unget(lastChar)
	buf := l.TokenBytes(nil)
	return buf[:len(buf)-1]
}

// ungetN ungets the lookahead followed by the last n chars of the current
// token (last char first) and returns the token bytes with the final n bytes
// removed. ungetN(0) only ungets the lookahead.
// NOTE(review): n counts chars for Unget but bytes for the trim, so the
// ungot chars are presumably single-byte - confirm.
func (l *lexer) ungetN(n int) []byte {
	l.Unget(l.Lookahead())
	chars := l.Token()
	for i := 1; i <= n; i++ {
		l.Unget(chars[len(chars)-i])
	}
	buf := l.TokenBytes(nil)
	return buf[:len(buf)-n]
}

// Lex scans the next token, stores its text in lval.token and returns the
// token code. Single-character tokens return rune2Class of the character.
func (l *lexer) Lex(lval *yySymType) int { // Lex(lval *yySymType)
	c := l.Enter()

%}

%s PHP STRING STRING_VAR STRING_VAR_INDEX STRING_VAR_NAME PROPERTY HEREDOC_END NOWDOC HEREDOC BACKQUOTE

%yyb last == '\n' || last == '\0'
%yyt sc
%yyc c
%yyn c = l.Next()
%yym l.Mark()

LNUM            [0-9]+
DNUM            ([0-9]*"."[0-9]+)|([0-9]+"."[0-9]*)
HNUM            0x[0-9a-fA-F]+
BNUM            0b[01]+
EXPONENT_DNUM   (({LNUM}|{DNUM})[eE][+-]?{LNUM})
VAR_NAME        [a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*
OPERATORS       [;:,.\[\]()|\/\^&\+-*=%!~$<>?@]
NEW_LINE        (\r|\n|\r\n)

%%
    // NOTE(review): every rule below carries an explicit start-condition
    // prefix that follows the begin/pushState/popState transitions made by
    // the actions; re-check the prefix whenever a rule is added or moved.
    c = l.Rule0()

<INITIAL>[ \t\n\r]+
<INITIAL>.
<INITIAL>\<\?php([ \t]|{NEW_LINE})              begin(PHP);//lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG;
<INITIAL>\<\?                                   begin(PHP);//lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG;
<INITIAL>\<\?=                                  begin(PHP);lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG_WITH_ECHO;

<PHP>[ \t\n\r]+                                 //lval.token = string(l.TokenBytes(nil)); return T_WHITESPACE
<PHP>\?\>{NEW_LINE}?                            begin(INITIAL);lval.token = string(l.TokenBytes(nil)); return T_CLOSE_TAG;

<PHP>{DNUM}|{EXPONENT_DNUM}                     lval.token = string(l.TokenBytes(nil)); return T_DNUMBER
<PHP>{BNUM}
    // Skip the "0b" prefix and leading zeros. The bounds check keeps
    // all-zero literals such as "0b0" from indexing past the token.
    tb := l.TokenBytes(nil)
    i := 2
    for i < len(tb) && tb[i] == '0' {
        i++
    }
    lval.token = string(tb)
    if len(tb) - i < 64 {
        return T_LNUMBER
    }
    return T_DNUMBER
<PHP>{LNUM}
    tb := l.TokenBytes(nil)
    lval.token = string(tb)
    if len(tb) < 20 {
        return T_LNUMBER
    }
    return T_DNUMBER
<PHP>{HNUM}
    // Skip the "0x" prefix and leading zeros. The bounds check keeps
    // all-zero literals such as "0x0" from indexing past the token.
    tb := l.TokenBytes(nil)
    i := 2
    for i < len(tb) && tb[i] == '0' {
        i++
    }
    length := len(tb) - i
    lval.token = string(tb)
    // tb[i] is only read when 16 significant digits remain.
    if length < 16 || (length == 16 && tb[i] <= '7') {
        return T_LNUMBER
    }
    return T_DNUMBER

<PHP>abstract                                   lval.token = string(l.TokenBytes(nil)); return T_ABSTRACT
<PHP>array                                      lval.token = string(l.TokenBytes(nil)); return T_ARRAY
<PHP>as                                         lval.token = string(l.TokenBytes(nil)); return T_AS
<PHP>break                                      lval.token = string(l.TokenBytes(nil)); return T_BREAK
<PHP>callable                                   lval.token = string(l.TokenBytes(nil)); return T_CALLABLE
<PHP>case                                       lval.token = string(l.TokenBytes(nil)); return T_CASE
<PHP>catch                                      lval.token = string(l.TokenBytes(nil)); return T_CATCH
<PHP>class                                      lval.token = string(l.TokenBytes(nil)); return T_CLASS
<PHP>clone                                      lval.token = string(l.TokenBytes(nil)); return T_CLONE
<PHP>const                                      lval.token = string(l.TokenBytes(nil)); return T_CONST;
<PHP>continue                                   lval.token = string(l.TokenBytes(nil)); return T_CONTINUE;
<PHP>declare                                    lval.token = string(l.TokenBytes(nil)); return T_DECLARE;
<PHP>default                                    lval.token = string(l.TokenBytes(nil)); return T_DEFAULT;
<PHP>do                                         lval.token = string(l.TokenBytes(nil)); return T_DO;
<PHP>echo                                       lval.token = string(l.TokenBytes(nil)); return T_ECHO;
<PHP>else                                       lval.token = string(l.TokenBytes(nil)); return T_ELSE;
<PHP>elseif                                     lval.token = string(l.TokenBytes(nil)); return T_ELSEIF;
<PHP>empty                                      lval.token = string(l.TokenBytes(nil)); return T_EMPTY;
<PHP>endfor                                     lval.token = string(l.TokenBytes(nil)); return T_ENDFOR
<PHP>endforeach                                 lval.token = string(l.TokenBytes(nil)); return T_ENDFOREACH
<PHP>endif                                      lval.token = string(l.TokenBytes(nil)); return T_ENDIF
<PHP>endswitch                                  lval.token = string(l.TokenBytes(nil)); return T_ENDSWITCH
<PHP>endwhile                                   lval.token = string(l.TokenBytes(nil)); return T_ENDWHILE
<PHP>eval                                       lval.token = string(l.TokenBytes(nil)); return T_EVAL
<PHP>exit|die                                   lval.token = string(l.TokenBytes(nil)); return T_EXIT
<PHP>extends                                    lval.token = string(l.TokenBytes(nil)); return T_EXTENDS
<PHP>final                                      lval.token = string(l.TokenBytes(nil)); return T_FINAL
<PHP>finally                                    lval.token = string(l.TokenBytes(nil)); return T_FINALLY
<PHP>for                                        lval.token = string(l.TokenBytes(nil)); return T_FOR
<PHP>foreach                                    lval.token = string(l.TokenBytes(nil)); return T_FOREACH
<PHP>function|cfunction                         lval.token = string(l.TokenBytes(nil)); return T_FUNCTION
<PHP>global                                     lval.token = string(l.TokenBytes(nil)); return T_GLOBAL
<PHP>goto                                       lval.token = string(l.TokenBytes(nil)); return T_GOTO
<PHP>if                                         lval.token = string(l.TokenBytes(nil)); return T_IF
<PHP>isset                                      lval.token = string(l.TokenBytes(nil)); return T_ISSET
<PHP>implements                                 lval.token = string(l.TokenBytes(nil)); return T_IMPLEMENTS
<PHP>instanceof                                 lval.token = string(l.TokenBytes(nil)); return T_INSTANCEOF
<PHP>insteadof                                  lval.token = string(l.TokenBytes(nil)); return T_INSTEADOF
<PHP>interface                                  lval.token = string(l.TokenBytes(nil)); return T_INTERFACE
<PHP>list                                       lval.token = string(l.TokenBytes(nil)); return T_LIST
<PHP>namespace                                  lval.token = string(l.TokenBytes(nil)); return T_NAMESPACE
<PHP>private                                    lval.token = string(l.TokenBytes(nil)); return T_PRIVATE
<PHP>public                                     lval.token = string(l.TokenBytes(nil)); return T_PUBLIC
<PHP>print                                      lval.token = string(l.TokenBytes(nil)); return T_PRINT
<PHP>protected                                  lval.token = string(l.TokenBytes(nil)); return T_PROTECTED
<PHP>return                                     lval.token = string(l.TokenBytes(nil)); return T_RETURN
<PHP>static                                     lval.token = string(l.TokenBytes(nil)); return T_STATIC
<PHP>switch                                     lval.token = string(l.TokenBytes(nil)); return T_SWITCH
<PHP>throw                                      lval.token = string(l.TokenBytes(nil)); return T_THROW
<PHP>trait                                      lval.token = string(l.TokenBytes(nil)); return T_TRAIT
<PHP>try                                        lval.token = string(l.TokenBytes(nil)); return T_TRY
<PHP>unset                                      lval.token = string(l.TokenBytes(nil)); return T_UNSET
<PHP>use                                        lval.token = string(l.TokenBytes(nil)); return T_USE
<PHP>var                                        lval.token = string(l.TokenBytes(nil)); return T_VAR
<PHP>while                                      lval.token = string(l.TokenBytes(nil)); return T_WHILE
<PHP>yield[ \t\n\r]+from[^a-zA-Z0-9_\x80-\xff]  lval.token = string(l.TokenBytes(nil)); return T_YIELD_FROM
<PHP>yield                                      lval.token = string(l.TokenBytes(nil)); return T_YIELD
<PHP>include                                    lval.token = string(l.TokenBytes(nil)); return T_INCLUDE
<PHP>include_once                               lval.token = string(l.TokenBytes(nil)); return T_INCLUDE_ONCE
<PHP>require                                    lval.token = string(l.TokenBytes(nil)); return T_REQUIRE
<PHP>require_once                               lval.token = string(l.TokenBytes(nil)); return T_REQUIRE_ONCE
<PHP>__CLASS__                                  lval.token = string(l.TokenBytes(nil)); return T_CLASS_C
<PHP>__DIR__                                    lval.token = string(l.TokenBytes(nil)); return T_DIR
<PHP>__FILE__                                   lval.token = string(l.TokenBytes(nil)); return T_FILE
<PHP>__FUNCTION__                               lval.token = string(l.TokenBytes(nil)); return T_FUNC_C
<PHP>__LINE__                                   lval.token = string(l.TokenBytes(nil)); return T_LINE
<PHP>__NAMESPACE__                              lval.token = string(l.TokenBytes(nil)); return T_NS_C
<PHP>__METHOD__                                 lval.token = string(l.TokenBytes(nil)); return T_METHOD_C
<PHP>__TRAIT__                                  lval.token = string(l.TokenBytes(nil)); return T_TRAIT_C
<PHP>__halt_compiler                            lval.token = string(l.TokenBytes(nil)); return T_HALT_COMPILER
<PHP>\([ \t]*array[ \t]*\)                      lval.token = string(l.TokenBytes(nil)); return T_ARRAY_CAST
<PHP>\([ \t]*(bool|boolean)[ \t]*\)             lval.token = string(l.TokenBytes(nil)); return T_BOOL_CAST
<PHP>\([ \t]*(real|double|float)[ \t]*\)        lval.token = string(l.TokenBytes(nil)); return T_DOUBLE_CAST
<PHP>\([ \t]*(int|integer)[ \t]*\)              lval.token = string(l.TokenBytes(nil)); return T_INT_CAST
<PHP>\([ \t]*object[ \t]*\)                     lval.token = string(l.TokenBytes(nil)); return T_OBJECT_CAST
<PHP>\([ \t]*string[ \t]*\)                     lval.token = string(l.TokenBytes(nil)); return T_STRING_CAST
<PHP>\([ \t]*unset[ \t]*\)                      lval.token = string(l.TokenBytes(nil)); return T_UNSET_CAST
<PHP>new                                        lval.token = string(l.TokenBytes(nil)); return T_NEW
<PHP>and                                        lval.token = string(l.TokenBytes(nil)); return T_LOGICAL_AND
<PHP>or                                         lval.token = string(l.TokenBytes(nil)); return T_LOGICAL_OR
<PHP>xor                                        lval.token = string(l.TokenBytes(nil)); return T_LOGICAL_XOR
<PHP>\\                                         lval.token = string(l.TokenBytes(nil)); return T_NS_SEPARATOR
<PHP>\.\.\.                                     lval.token = string(l.TokenBytes(nil)); return T_ELLIPSIS;
<PHP>::                                         lval.token = string(l.TokenBytes(nil)); return T_PAAMAYIM_NEKUDOTAYIM; // T_DOUBLE_COLON
<PHP>&&                                         lval.token = string(l.TokenBytes(nil)); return T_BOOLEAN_AND
<PHP>\|\|                                       lval.token = string(l.TokenBytes(nil)); return T_BOOLEAN_OR
<PHP>&=                                         lval.token = string(l.TokenBytes(nil)); return T_AND_EQUAL
<PHP>\|=                                        lval.token = string(l.TokenBytes(nil)); return T_OR_EQUAL
<PHP>\.=                                        lval.token = string(l.TokenBytes(nil)); return T_CONCAT_EQUAL;
<PHP>\*=                                        lval.token = string(l.TokenBytes(nil)); return T_MUL_EQUAL
<PHP>\*\*=                                      lval.token = string(l.TokenBytes(nil)); return T_POW_EQUAL
<PHP>[/]=                                       lval.token = string(l.TokenBytes(nil)); return T_DIV_EQUAL;
<PHP>\+=                                        lval.token = string(l.TokenBytes(nil)); return T_PLUS_EQUAL
<PHP>-=                                         lval.token = string(l.TokenBytes(nil)); return T_MINUS_EQUAL
<PHP>\^=                                        lval.token = string(l.TokenBytes(nil)); return T_XOR_EQUAL
<PHP>%=                                         lval.token = string(l.TokenBytes(nil)); return T_MOD_EQUAL
<PHP>--                                         lval.token = string(l.TokenBytes(nil)); return T_DEC;
<PHP>\+\+                                       lval.token = string(l.TokenBytes(nil)); return T_INC
<PHP>=>                                         lval.token = string(l.TokenBytes(nil)); return T_DOUBLE_ARROW;
<PHP>\<=\>                                      lval.token = string(l.TokenBytes(nil)); return T_SPACESHIP
<PHP>\!=|\<\>                                   lval.token = string(l.TokenBytes(nil)); return T_IS_NOT_EQUAL
<PHP>\!==                                       lval.token = string(l.TokenBytes(nil)); return T_IS_NOT_IDENTICAL
<PHP>==                                         lval.token = string(l.TokenBytes(nil)); return T_IS_EQUAL
<PHP>===                                        lval.token = string(l.TokenBytes(nil)); return T_IS_IDENTICAL
<PHP>\<\<=                                      lval.token = string(l.TokenBytes(nil)); return T_SL_EQUAL
<PHP>\>\>=                                      lval.token = string(l.TokenBytes(nil)); return T_SR_EQUAL
<PHP>\>=                                        lval.token = string(l.TokenBytes(nil)); return T_IS_GREATER_OR_EQUAL
<PHP>\<=                                        lval.token = string(l.TokenBytes(nil)); return T_IS_SMALLER_OR_EQUAL
<PHP>\*\*                                       lval.token = string(l.TokenBytes(nil)); return T_POW
<PHP>\<\<                                       lval.token = string(l.TokenBytes(nil)); return T_SL
<PHP>\>\>                                       lval.token = string(l.TokenBytes(nil)); return T_SR
<PHP>\?\?                                       lval.token = string(l.TokenBytes(nil)); return T_COALESCE
<PHP>(#|[/][/]){NEW_LINE}                       lval.token = string(l.TokenBytes(nil)); return T_COMMENT; // TODO: handle ?>
<PHP>[/][*][^*]*[*]+([^*/][^*]*[*]+)*[/]        lval.token = string(l.TokenBytes(nil)); return T_COMMENT; // TODO: handle ?>
<PHP>[/][*][*][^*]*[*]+([^*/][^*]*[*]+)*[/]     lval.token = string(l.TokenBytes(nil)); return T_DOC_COMMENT; // TODO: handle ?>
<PHP>'[^']*(\\')*'                              lval.token = string(l.TokenBytes(nil)); return T_CONSTANT_ENCAPSED_STRING
<PHP>{OPERATORS}                                lval.token = string(l.TokenBytes(nil)); return rune2Class(l.Prev.Rune)

<PHP>\{                                         pushState(PHP); lval.token = string(l.TokenBytes(nil)); return rune2Class(l.Prev.Rune)
<PHP>\}                                         popState();     lval.token = string(l.TokenBytes(nil)); return rune2Class(l.Prev.Rune)
<PHP>\${VAR_NAME}                               lval.token = string(l.TokenBytes(nil)); return T_VARIABLE
<PHP>{VAR_NAME}                                 if c == -1 {fmt.Printf("%q\n", string(l.TokenBytes(nil)))};lval.token = string(l.TokenBytes(nil)); return T_STRING

<PHP>->                                         begin(PROPERTY);lval.token = string(l.TokenBytes(nil)); return T_OBJECT_OPERATOR;
<PROPERTY>[ \t\n\r]+                            lval.token = string(l.TokenBytes(nil)); return T_WHITESPACE;
<PROPERTY>->                                    lval.token = string(l.TokenBytes(nil)); return T_OBJECT_OPERATOR;
<PROPERTY>{VAR_NAME}                            begin(PHP);lval.token = string(l.TokenBytes(nil)); return T_STRING;
<PROPERTY>.                                     l.ungetN(1);begin(PHP)

<PHP>[\']([^\\\']*([\\][\'])*)*[\']             lval.token = string(l.TokenBytes(nil)); return T_CONSTANT_ENCAPSED_STRING;

<PHP>`                                          begin(BACKQUOTE); lval.token = string(l.TokenBytes(nil)); return rune2Class(l.Prev.Rune)
<BACKQUOTE>`                                    begin(PHP); lval.token = string(l.TokenBytes(nil)); return rune2Class(l.Prev.Rune)

<PHP>[b]?\<\<\<[ \t]*({VAR_NAME}|([']{VAR_NAME}['])|(["]{VAR_NAME}["])){NEW_LINE}
    tb := l.TokenBytes(nil)
    binPrefix := 0
    if tb[0] == 'b' {
        binPrefix = 1
    }
    // Locate the label between the "<<<" opener and the trailing newline.
    lblFirst := 3 + binPrefix
    lblLast := len(tb)-2
    if tb[lblLast] == '\r' {
        lblLast--
    }
    for tb[lblFirst] == ' ' || tb[lblFirst] == '\t' {
        lblFirst++
    }
    // A quoted label is stripped of its quotes; single quotes select NOWDOC.
    switch tb[lblFirst] {
    case '\'':
        lblFirst++
        lblLast--
        begin(NOWDOC)
    case '"':
        lblFirst++
        lblLast--
        begin(HEREDOC)
    default:
        begin(HEREDOC)
    }
    heredocLabel = make([]byte, lblLast - lblFirst + 1)
    copy(heredocLabel, tb[lblFirst:lblLast+1])
    // Look ahead for "label;" directly after the opener (an empty body),
    // then unget everything that was read ahead.
    ungetCnt := len(heredocLabel)
    searchLabelAhead := []byte{}
    for i := 0; i < len(heredocLabel); i++ {
        if c == -1 {
            break
        }
        searchLabelAhead = append(searchLabelAhead, byte(rune(c)))
        c = l.Next()
    }
    if bytes.Equal(heredocLabel, searchLabelAhead) && ';' == rune(c) {
        ungetCnt++
        c = l.Next()
        if '\n' == rune(c) || '\r' == rune(c) {
            begin(HEREDOC_END)
        }
    }
    l.ungetN(ungetCnt)
    lval.token = string(tb); return T_START_HEREDOC

<NOWDOC>.
    // Consume raw text until a line consisting of "label;" is found.
    searchLabel := []byte{}
    tb := []byte{}
    for {
        if c == -1 {
            break
        }
        if '\n' == rune(c) || '\r' == rune(c) {
            if bytes.Equal(append(heredocLabel, ';'), searchLabel) {
                begin(HEREDOC_END)
                tb = l.ungetN(len(heredocLabel)+1)
                break
            }
            searchLabel = []byte{}
        } else {
            searchLabel = append(searchLabel, byte(rune(c)))
        }
        c = l.Next()
    }
    lval.token = string(tb); return T_ENCAPSED_AND_WHITESPACE

<HEREDOC_END>{VAR_NAME}\;                       begin(PHP);lval.token = string(l.ungetN(1)); return T_END_HEREDOC

<PHP>[b]?[\"]
    binPrefix := l.TokenBytes(nil)[0] == 'b'
    // beginString rewinds to just after the opening quote, enters STRING
    // and returns the quote itself as the token.
    beginString := func() int {
        cnt := 1
        if binPrefix {
            cnt = 2
        }
        l.ungetN(len(l.TokenBytes(nil))-cnt)
        tokenBytes := l.TokenBytes(nil)[:cnt]
        // NOTE(review): this prints every interpolated-string opener to
        // stdout; looks like leftover debugging - confirm before removing.
        fmt.Println(string(tokenBytes)) // TODO: RETURN TOKEN
        pushState(STRING)
        lval.token = string(tokenBytes); return rune2Class('"')
    }
    // Scan ahead: a string without "$name", "${" or "{$" is returned whole
    // as T_CONSTANT_ENCAPSED_STRING.
    for {
        if c == -1 {
            break
        }
        switch c {
        case '"':
            c = l.Next()
            lval.token = string(l.TokenBytes(nil)); return T_CONSTANT_ENCAPSED_STRING
        case '$':
            c = l.Next()
            if rune(c) == '{' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z' || c >= '\u007f' && c <= 'ÿ' {
                return beginString()
            }
            l.ungetN(0)
        case '{':
            c = l.Next()
            if rune(c) == '$' {
                return beginString()
            }
            l.ungetN(0)
        case '\\':
            c = l.Next()
        }
        c = l.Next()
    }

<STRING>\"                                      popState(); lval.token = "\""; return rune2Class(l.Prev.Rune)
<STRING,HEREDOC,BACKQUOTE>\{\$                  lval.token = string(l.ungetN(1)); return T_CURLY_OPEN
<STRING,HEREDOC,BACKQUOTE>\$\{                  pushState(STRING_VAR_NAME);lval.token = string(l.TokenBytes(nil)); return T_DOLLAR_OPEN_CURLY_BRACES
<STRING,HEREDOC,BACKQUOTE>\$                    l.ungetN(1);pushState(STRING_VAR)
<STRING>.
    // Collect literal text up to the closing quote or the next "$name",
    // "${" or "{$".
    for {
        if c == -1 {
            break
        }
        switch c {
        case '"':
            lval.token = string(l.TokenBytes(nil)); return T_ENCAPSED_AND_WHITESPACE
        case '$':
            c = l.Next()
            if rune(c) == '{' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z' || c >= '\u007f' && c <= 'ÿ' {
                l.ungetN(1)
                tb := l.TokenBytes(nil)
                lval.token = string(tb[:len(tb)-1]); return T_ENCAPSED_AND_WHITESPACE
            }
            l.ungetN(0)
        case '{':
            c = l.Next()
            if rune(c) == '$' {
                l.ungetN(1)
                tb := l.TokenBytes(nil)
                lval.token = string(tb[:len(tb)-1]); return T_ENCAPSED_AND_WHITESPACE
            }
            l.ungetN(0)
        case '\\':
            c = l.Next()
        }
        c = l.Next()
    }

<BACKQUOTE>.
    // Same scan as the STRING rule above, terminated by a backquote.
    for {
        if c == -1 {
            break
        }
        switch c {
        case '`':
            lval.token = string(l.TokenBytes(nil)); return T_ENCAPSED_AND_WHITESPACE
        case '$':
            c = l.Next()
            if rune(c) == '{' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z' || c >= '\u007f' && c <= 'ÿ' {
                l.ungetN(1)
                tb := l.TokenBytes(nil)
                lval.token = string(tb[:len(tb)-1]); return T_ENCAPSED_AND_WHITESPACE
            }
            l.ungetN(0)
        case '{':
            c = l.Next()
            if rune(c) == '$' {
                l.ungetN(1)
                tb := l.TokenBytes(nil)
                lval.token = string(tb[:len(tb)-1]); return T_ENCAPSED_AND_WHITESPACE
            }
            l.ungetN(0)
        case '\\':
            c = l.Next()
        }
        c = l.Next()
    }

<HEREDOC>.|[ \t\n\r]
    // Collect heredoc text up to the closing "label;" line or the next
    // "$name", "${" or "{$".
    searchLabel := []byte{}
    tb := []byte{}
    HEREDOCFOR:for {
        if c == -1 {
            break
        }
        switch c {
        case '\n', '\r':
            if bytes.Equal(append(heredocLabel, ';'), searchLabel) { // TODO handle ';' as optional
                begin(HEREDOC_END)
                tb = l.ungetN(len(heredocLabel)+1)
                break HEREDOCFOR
            }
            searchLabel = []byte{}
        case '$':
            c = l.Next()
            if rune(c) == '{' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z' || c >= '\u007f' && c <= 'ÿ' {
                tb = l.ungetN(1)
                break HEREDOCFOR
            }
            l.ungetN(0)
            searchLabel = []byte{}
        case '{':
            c = l.Next()
            if rune(c) == '$' {
                tb = l.ungetN(1)
                break HEREDOCFOR
            }
            l.ungetN(0)
            searchLabel = []byte{}
        case '\\':
            c = l.Next()
            searchLabel = []byte{}
        default:
            searchLabel = append(searchLabel, byte(rune(c)))
        }
        c = l.Next()
    }
    lval.token = string(tb); return T_ENCAPSED_AND_WHITESPACE

<STRING_VAR>\${VAR_NAME}                        lval.token = string(l.TokenBytes(nil)); return T_VARIABLE
<STRING_VAR>->{VAR_NAME}                        lval.token = string(l.ungetN(len(l.TokenBytes(nil))-2)); return T_OBJECT_OPERATOR
<STRING_VAR>{VAR_NAME}                          popState();lval.token = string(l.TokenBytes(nil)); return T_STRING
<STRING_VAR>\[                                  pushState(STRING_VAR_INDEX);lval.token = string(l.TokenBytes(nil)); return rune2Class(l.Prev.Rune)
<STRING_VAR>.|[ \t\n\r]                         l.ungetN(1);popState()

<STRING_VAR_INDEX>{LNUM}|{HNUM}|{BNUM}          lval.token = string(l.TokenBytes(nil)); return T_NUM_STRING
<STRING_VAR_INDEX>\${VAR_NAME}                  lval.token = string(l.TokenBytes(nil)); return T_VARIABLE
<STRING_VAR_INDEX>{VAR_NAME}                    lval.token = string(l.TokenBytes(nil)); return T_STRING
<STRING_VAR_INDEX>\]                            popState(); popState();lval.token = string(l.TokenBytes(nil)); return rune2Class(l.Prev.Rune)
<STRING_VAR_INDEX>[ \n\r\t\\'#]                 popState(); popState();lval.token = string(l.TokenBytes(nil)); return T_ENCAPSED_AND_WHITESPACE
<STRING_VAR_INDEX>{OPERATORS}                   lval.token = string(l.TokenBytes(nil)); return rune2Class(l.Prev.Rune)
<STRING_VAR_INDEX>.                             lval.token = string(l.TokenBytes(nil)); return rune2Class(l.Prev.Rune)

<STRING_VAR_NAME>{VAR_NAME}[\[\}]               popState();pushState(PHP);lval.token = string(l.ungetN(1)); return T_STRING_VARNAME
<STRING_VAR_NAME>.                              l.ungetN(1);popState();pushState(PHP)

%%
    if c, ok := l.Abort(); ok { return int(c) }
    goto yyAction
}