%{ // Copyright (c) 2011 CZ.NIC z.s.p.o. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // blame: jnml, labs.nic.cz package main import ( "bufio" "go/token" "io" "unicode" "fmt" "os" "bytes" "github.com/cznic/golex/lex" ) // Allocate Character classes anywhere in [0x80, 0xFF]. const ( classUnicodeLeter = iota + 0x80 classUnicodeDigit classOther ) var sc int const ( INITIAL = iota PHP STRING STRING_VAR STRING_VAR_INDEX STRING_VAR_NAME PROPERTY HEREDOC_END NOWDOC HEREDOC BACKQUOTE ) type lexer struct { *lex.Lexer } var stateStack = []int{PHP} var heredocLabel []byte func pushState(state int) { sc = state stateStack = append(stateStack, state) } func popState() { len := len(stateStack) if len <= 1 { return } sc = stateStack[len - 2] stateStack = stateStack[:len-1] } func begin(state int) { len := len(stateStack) stateStack = stateStack[:len-1] stateStack = append(stateStack, state) sc = state } func rune2Class(r rune) int { if r >= 0 && r < 0x80 { // Keep ASCII as it is. 
return int(r) } if unicode.IsLetter(r) { return classUnicodeLeter } if unicode.IsDigit(r) { return classUnicodeDigit } return classOther } func newLexer(src io.Reader, dst io.Writer, fName string) *lexer { file := token.NewFileSet().AddFile(fName, -1, 1<<31-1) lx, err := lex.New(file, bufio.NewReader(src), lex.RuneClass(rune2Class)) if (err != nil) { panic(err) } return &lexer{lx} } type yySymType struct {} func (l *lexer) unget(r rune) []byte{ l.Unget(l.Lookahead()) chars := l.Token(); lastChar := chars[len(chars)-1]; if lastChar.Rune != r { return l.TokenBytes(nil) } l.Unget(lastChar); buf := l.TokenBytes(nil) buf = buf[:len(buf)-1] return buf } func (l *lexer) ungetN(n int) []byte{ l.Unget(l.Lookahead()) chars := l.Token(); for i := 1; i <= n; i++ { char := chars[len(chars)-i]; l.Unget(char); } buf := l.TokenBytes(nil) buf = buf[:len(buf)-n] return buf } func (l *lexer) Lex() int { // Lex(lval *yySymType) c := l.Enter() %} %s PHP STRING STRING_VAR STRING_VAR_INDEX STRING_VAR_NAME PROPERTY HEREDOC_END NOWDOC HEREDOC BACKQUOTE %yyb last == '\n' || last = '\0' %yyt sc %yyc c %yyn c = l.Next() %yym l.Mark() LNUM [0-9]+ DNUM ([0-9]*"."[0-9]+)|([0-9]+"."[0-9]*) HNUM 0x[0-9a-fA-F]+ BNUM 0b[01]+ EXPONENT_DNUM (({LNUM}|{DNUM})[eE][+-]?{LNUM}) VAR_NAME [a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]* OPERATORS [;:,.\[\]()|\/\^&\+-*=%!~$<>?@] NEW_LINE (\r|\n|\r\n) %% c = l.Rule0() // ([\$]{NCH})* [ \t\n\r]+ . 
\<\?|\<\?php                            fmt.Println("T_OPEN_TAG");begin(PHP)
\<\?=                                   fmt.Println("T_OPEN_TAG_WITH_ECHO");begin(PHP)
[ \t\n\r]+                              fmt.Println("T_WHITESPACE")
\?\>                                    fmt.Println("T_CLOSE_TAG");begin(INITIAL)
abstract                                fmt.Println("T_ABSTRACT")
array                                   fmt.Println("T_ARRAY")
as                                      fmt.Println("T_AS")
break                                   fmt.Println("T_BREAK")
callable                                fmt.Println("T_CALLABLE")
case                                    fmt.Println("T_CASE")
catch                                   fmt.Println("T_CATCH")
class                                   fmt.Println("T_CLASS")
clone                                   fmt.Println("T_CLONE")
const                                   fmt.Println("T_CONST");
continue                                fmt.Println("T_CONTINUE");
declare                                 fmt.Println("T_DECLARE");
default                                 fmt.Println("T_DEFAULT");
do                                      fmt.Println("T_DO");
echo                                    fmt.Println("T_ECHO");
else                                    fmt.Println("T_ELSE");
elseif                                  fmt.Println("T_ELSEIF");
empty                                   fmt.Println("T_EMPTY");
endfor                                  fmt.Println("T_ENDFOR")
endforeach                              fmt.Println("T_ENDFOREACH")
endif                                   fmt.Println("T_ENDIF")
endswitch                               fmt.Println("T_ENDSWITCH")
endwhile                                fmt.Println("T_ENDWHILE")
exit|die                                fmt.Println("T_EXIT")
extends                                 fmt.Println("T_EXTENDS")
final                                   fmt.Println("T_FINAL")
finally                                 fmt.Println("T_FINALLY")
for                                     fmt.Println("T_FOR")
foreach                                 fmt.Println("T_FOREACH")
function|cfunction                      fmt.Println("T_FUNCTION")
global                                  fmt.Println("T_GLOBAL")
goto                                    fmt.Println("T_GOTO")
if                                      fmt.Println("T_IF")
implements                              fmt.Println("T_IMPLEMENTS")
instanceof                              fmt.Println("T_INSTANCEOF")
insteadof                               fmt.Println("T_INSTEADOF")
interface                               fmt.Println("T_INTERFACE")
namespace                               fmt.Println("T_NAMESPACE")
private                                 fmt.Println("T_PRIVATE")
public                                  fmt.Println("T_PUBLIC")
protected                               fmt.Println("T_PROTECTED")
return                                  fmt.Println("T_RETURN")
static                                  fmt.Println("T_STATIC")
switch                                  fmt.Println("T_SWITCH")
throw                                   fmt.Println("T_THROW")
trait                                   fmt.Println("T_TRAIT")
try                                     fmt.Println("T_TRY")
use                                     fmt.Println("T_USE")
var                                     fmt.Println("T_VAR")
while                                   fmt.Println("T_WHILE")
yield[ \t\n\r]+from[^a-zA-Z0-9_\x80-\xff]  fmt.Println("T_YIELD_FROM")
yield                                   fmt.Println("T_YIELD")
__CLASS__                               fmt.Println("T_CLASS_C")
__DIR__                                 fmt.Println("T_DIR")
__FILE__                                fmt.Println("T_FILE")
__FUNCTION__                            fmt.Println("T_FUNC_C")
__LINE__                                fmt.Println("T_LINE")
__NAMESPACE__                           fmt.Println("T_NS_C")
__METHOD__                              fmt.Println("T_METHOD_C")
__TRAIT__                               fmt.Println("T_TRAIT_C")
\([ \t]*array[ \t]*\)                   fmt.Println("T_ARRAY_CAST")
\([ \t]*(bool|boolean)[ \t]*\)          fmt.Println("T_BOOL_CAST")
\([ \t]*(real|double|float)[ \t]*\)     fmt.Println("T_DOUBLE_CAST")
\([ \t]*(int|integer)[ \t]*\)           fmt.Println("T_INT_CAST")
\([ \t]*object[ \t]*\)                  fmt.Println("T_OBJECT_CAST")
\([ \t]*string[ \t]*\)                  fmt.Println("T_STRING_CAST")
\([ \t]*unset[ \t]*\)                   fmt.Println("T_UNSET_CAST")
new                                     fmt.Println("T_NEW")
and                                     fmt.Println("T_LOGICAL_AND")
or                                      fmt.Println("T_LOGICAL_OR")
xor                                     fmt.Println("T_LOGICAL_XOR")
\\                                      fmt.Println("T_NS_SEPARATOR")
\.\.\.                                  fmt.Println("T_ELLIPSIS");
::                                      fmt.Println("T_PAAMAYIM_NEKUDOTAYIM"); // T_DOUBLE_COLON
&&                                      fmt.Println("T_BOOLEAN_AND")
\|\|                                    fmt.Println("T_BOOLEAN_OR")
&=                                      fmt.Println("T_AND_EQUAL")
\|=                                     fmt.Println("T_OR_EQUAL")
\.=                                     fmt.Println("T_CONCAT_EQUAL");
\*=                                     fmt.Println("T_MUL_EQUAL")
\*\*=                                   fmt.Println("T_POW_EQUAL")
[/]=                                    fmt.Println("T_DIV_EQUAL");
\+=                                     fmt.Println("T_PLUS_EQUAL")
-=                                      fmt.Println("T_MINUS_EQUAL")
\^=                                     fmt.Println("T_XOR_EQUAL")
%=                                      fmt.Println("T_MOD_EQUAL")
--                                      fmt.Println("T_DEC");
\+\+                                    fmt.Println("T_INC")
=>                                      fmt.Println("T_DOUBLE_ARROW");
\<=\>                                   fmt.Println("T_SPACESHIP")
\!=|\<\>                                fmt.Println("T_IS_NOT_EQUAL")
\!==                                    fmt.Println("T_IS_NOT_IDENTICAL")
==                                      fmt.Println("T_IS_EQUAL")
===                                     fmt.Println("T_IS_IDENTICAL")
\<\<=                                   fmt.Println("T_SL_EQUAL")
\>\>=                                   fmt.Println("T_SR_EQUAL")
\>=                                     fmt.Println("T_IS_GREATER_OR_EQUAL")
\<=                                     fmt.Println("T_IS_SMALLER_OR_EQUAL")
\*\*                                    fmt.Println("T_POW")
\<\<                                    fmt.Println("T_SL")
\>\>                                    fmt.Println("T_SR")
(#|[/][/]){NEW_LINE}                    fmt.Println("T_COMMENT"); // TODO: handle \r\n and allow ?>
'[^']*(\\')*'                           fmt.Println("T_CONSTANT_ENCAPSED_STRING")
{OPERATORS}                             fmt.Printf("%s\n", l.TokenBytes(nil));
\}                                      fmt.Println("}"); popState();
\${VAR_NAME}                            fmt.Printf("T_VARIABLE: %q\n", l.TokenBytes(nil))
->                                      fmt.Println("T_OBJECT_OPERATOR");begin(PROPERTY)
[ \t\n\r]+                              fmt.Println("T_WHITESPACE");
->                                      fmt.Println("T_OBJECT_OPERATOR");
{VAR_NAME}                              fmt.Println("T_STRING");begin(PHP)
.                                       l.ungetN(1);begin(PHP)
[\']([^\\\']*([\\][\'])*)*[\']          fmt.Printf("T_CONSTANT_ENCAPSED_STRING: %s\n", l.TokenBytes(nil));
`                                       fmt.Println("`");begin(BACKQUOTE)
`                                       fmt.Println("`");begin(PHP)
[b]?\<\<\<[ \t]*({VAR_NAME}|([']{VAR_NAME}['])|(["]{VAR_NAME}["])){NEW_LINE}
    // Heredoc/nowdoc opener. The token is an optional 'b', "<<<", optional
    // blanks, a bare or quoted label and a line break. lblFirst and lblLast
    // are narrowed until they index the first and last byte of the label.
    tb := l.TokenBytes(nil)
    binPrefix := 0
    if tb[0] == 'b' {
        binPrefix = 1
    }
    lblFirst := 3 + binPrefix
    lblLast := len(tb)-2
    // For a "\r\n" line break the byte before the final '\n' is '\r'.
    if tb[lblLast] == '\r' {
        lblLast--
    }
    // Skip blanks between "<<<" and the label.
    for {
        if tb[lblFirst] == ' ' || tb[lblFirst] == '\t' {
            lblFirst++
            continue
        }
        break
    }
    // A single-quoted label selects NOWDOC; a double-quoted or bare label
    // selects HEREDOC. Quotes are excluded from the label.
    switch tb[lblFirst] {
        case '\'' :
            lblFirst++
            lblLast--
            begin(NOWDOC)
        case '"' :
            lblFirst++
            lblLast--
            begin(HEREDOC)
        default:
            begin(HEREDOC)
    }
    // Keep a private copy of the label; tb is owned by the lexer.
    heredocLabel = make([]byte, lblLast - lblFirst + 1)
    copy(heredocLabel, tb[lblFirst:lblLast+1])
    // Read ahead as many characters as the label is long. If they spell the
    // label and are followed by ';' and a line break, the body is empty and
    // the state becomes HEREDOC_END. ungetN then pushes ungetCnt characters
    // back.
    ungetCnt := len(heredocLabel)
    searchLabelAhead := []byte{}
    for i := 0; i < len(heredocLabel); i++ {
        if c == -1 {
            break;
        }
        searchLabelAhead = append(searchLabelAhead, byte(rune(c)))
        c = l.Next()
    }
    if bytes.Equal(heredocLabel, searchLabelAhead) && ';' == rune(c) {
        ungetCnt++
        c = l.Next()
        if '\n' == rune(c) || '\r' == rune(c) {
            begin(HEREDOC_END)
        }
    }
    l.ungetN(ungetCnt)
    fmt.Printf("T_START_HEREDOC: %q\n", tb);
.
    // Body scan without interpolation handling: searchLabel accumulates the
    // current line. When a line break is reached and the line equals
    // heredocLabel followed by ';', the state becomes HEREDOC_END and the
    // label plus ';' is pushed back; tb receives what ungetN returns.
    // If the input ends first (c == -1), tb stays empty.
    searchLabel := []byte{}
    tb := []byte{}
    for {
        if c == -1 {
            break;
        }
        if '\n' == rune(c) || '\r' == rune(c) {
            if bytes.Equal(append(heredocLabel, ';'), searchLabel) {
                begin(HEREDOC_END)
                tb = l.ungetN(len(heredocLabel)+1)
                break;
            }
            searchLabel = []byte{}
        } else {
            searchLabel = append(searchLabel, byte(rune(c)))
        }
        c = l.Next()
    }
    fmt.Printf("T_ENCAPSED_AND_WHITESPACE: %q\n", tb);
{VAR_NAME}\;                            fmt.Printf("T_END_HEREDOC: %q\n", l.ungetN(1));begin(PHP)
.                                       fmt.Printf("ERROR HEREDOC: %q\n", l.ungetN(1));
[b]?[\"]
    // Opening of a double-quoted string (optionally prefixed with 'b').
    // The loop scans ahead: reaching the closing quote prints the whole
    // literal as T_CONSTANT_ENCAPSED_STRING. Meeting "{$", or '$' followed
    // by '{', '_', an ASCII letter or a class in ['\u007f', 'ÿ'], calls
    // beginString instead.
    binPrefix := l.TokenBytes(nil)[0] == 'b'
    // beginString pushes back everything after the opener (cnt bytes: the
    // quote, plus 'b' when present), prints the opener and pushes STRING.
    beginString := func() {
        cnt := 1; if (binPrefix) {cnt = 2}
        l.ungetN(len(l.TokenBytes(nil))-cnt)
        tokenBytes := l.TokenBytes(nil)[:cnt]
        fmt.Println(string(tokenBytes)) // TODO: RETURN TOKEN
        pushState(STRING)
    }
    F:for {
        if c == -1 {
            break;
        }
        switch c {
            case '"' :
                c = l.Next();
                fmt.Printf("T_CONSTANT_ENCAPSED_STRING: %s\n", l.TokenBytes(nil));
                break F;
            case '$':
                c = l.Next();
                if rune(c) == '{' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z' || c >= '\u007f' && c <= 'ÿ' {
                    beginString()
                    break F;
                }
                // ungetN(0) re-queues only the lookahead; presumably so the
                // Next() after the switch yields this character again.
                l.ungetN(0)
            case '{':
                c = l.Next();
                if rune(c) == '$' {
                    beginString()
                    break F;
                }
                l.ungetN(0)
            case '\\':
                // Together with the Next() below this skips the character
                // following the backslash.
                c = l.Next();
        }
        c = l.Next()
    }
\"                                      fmt.Println("\""); popState()
\{\$                                    fmt.Printf("T_CURLY_OPEN: %q\n", l.ungetN(1));pushState(PHP)
\$\{                                    fmt.Printf("T_DOLLAR_OPEN_CURLY_BRACES: %q\n", l.TokenBytes(nil));pushState(STRING_VAR_NAME)
\$                                      l.ungetN(1);pushState(STRING_VAR)
.
    // Literal text inside a double-quoted string: scan until the closing
    // quote or the start of an interpolation and print what was collected
    // as T_ENCAPSED_AND_WHITESPACE. On an interpolation start the last
    // character is pushed back and the last byte is dropped from the output.
    F1:for {
        if c == -1 {
            break;
        }
        switch c {
            case '"' :
                fmt.Printf("T_ENCAPSED_AND_WHITESPACE: %s\n", l.TokenBytes(nil));
                break F1;
            case '$':
                c = l.Next();
                if rune(c) == '{' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z' || c >= '\u007f' && c <= 'ÿ' {
                    l.ungetN(1)
                    tb := l.TokenBytes(nil)
                    fmt.Printf("T_ENCAPSED_AND_WHITESPACE: %s\n", tb[:len(tb)-1]);
                    break F1;
                }
                l.ungetN(0)
            case '{':
                c = l.Next();
                if rune(c) == '$' {
                    l.ungetN(1)
                    tb := l.TokenBytes(nil)
                    fmt.Printf("T_ENCAPSED_AND_WHITESPACE: %s\n", tb[:len(tb)-1]);
                    break F1;
                }
                l.ungetN(0)
            case '\\':
                c = l.Next();
        }
        c = l.Next()
    }
.
    // Same scan as the double-quoted case, but terminated by a backquote.
    F2:for {
        if c == -1 {
            break;
        }
        switch c {
            case '`' :
                fmt.Printf("T_ENCAPSED_AND_WHITESPACE: %s\n", l.TokenBytes(nil));
                break F2;
            case '$':
                c = l.Next();
                if rune(c) == '{' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z' || c >= '\u007f' && c <= 'ÿ' {
                    l.ungetN(1)
                    tb := l.TokenBytes(nil)
                    fmt.Printf("T_ENCAPSED_AND_WHITESPACE: %s\n", tb[:len(tb)-1]);
                    break F2;
                }
                l.ungetN(0)
            case '{':
                c = l.Next();
                if rune(c) == '$' {
                    l.ungetN(1)
                    tb := l.TokenBytes(nil)
                    fmt.Printf("T_ENCAPSED_AND_WHITESPACE: %s\n", tb[:len(tb)-1]);
                    break F2;
                }
                l.ungetN(0)
            case '\\':
                c = l.Next();
        }
        c = l.Next()
    }
.|[ \t\n\r]
    // Body scan with interpolation handling. searchLabel accumulates the
    // current line and is reset on a line break, '$', '{' and '\\'. A line
    // equal to heredocLabel followed by ';' switches to HEREDOC_END and
    // pushes the label plus ';' back. An interpolation start pushes one
    // character back and ends the loop. tb holds what ungetN returned and
    // stays empty if the input ends first.
    searchLabel := []byte{}
    tb := []byte{}
    HEREDOCFOR:for {
        if c == -1 {
            break;
        }
        switch c {
            case '\n': fallthrough
            case '\r':
                if bytes.Equal(append(heredocLabel, ';'), searchLabel) {
                    begin(HEREDOC_END)
                    tb = l.ungetN(len(heredocLabel)+1)
                    break HEREDOCFOR;
                }
                searchLabel = []byte{}
            case '$':
                c = l.Next();
                if rune(c) == '{' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z' || c >= '\u007f' && c <= 'ÿ' {
                    tb = l.ungetN(1)
                    break HEREDOCFOR;
                }
                l.ungetN(0)
                searchLabel = []byte{}
            case '{':
                c = l.Next();
                if rune(c) == '$' {
                    tb = l.ungetN(1)
                    break HEREDOCFOR;
                }
                l.ungetN(0)
                searchLabel = []byte{}
            case '\\':
                c = l.Next();
                searchLabel = []byte{}
            default:
                searchLabel = append(searchLabel, byte(rune(c)))
        }
        c = l.Next()
    }
    fmt.Printf("T_ENCAPSED_AND_WHITESPACE(HEREDOC): %q\n", tb);
\${VAR_NAME}                            fmt.Printf("T_VARIABLE: %q\n", l.TokenBytes(nil));
->{VAR_NAME}                            fmt.Printf("T_OBJECT_OPERATOR: %q\n", l.ungetN(len(l.TokenBytes(nil))-2));
{VAR_NAME}                              fmt.Printf("T_STRING: %q\n", l.TokenBytes(nil));popState()
\[                                      fmt.Println("["); pushState(STRING_VAR_INDEX)
.|[ \t\n\r]                             l.ungetN(1);popState()
{LNUM}                                  fmt.Printf("T_NUM_STRING: %q\n", l.TokenBytes(nil));
\${VAR_NAME}                            fmt.Printf("T_VARIABLE: %q\n", l.TokenBytes(nil));
{VAR_NAME}                              fmt.Printf("T_STRING: %q\n", l.TokenBytes(nil));
\]                                      fmt.Println("\"]\""); popState(); popState()
[ \n\r\t\\'#]                           fmt.Printf("T_ENCAPSED_AND_WHITESPACE: %q\n", l.ungetN(1)); popState(); popState()
{OPERATORS}                             fmt.Printf("%q\n", l.TokenBytes(nil));
.                                       fmt.Printf("%q\n", l.TokenBytes(nil));
{VAR_NAME}[\[\}]                        fmt.Printf("T_STRING_VARNAME: %q\n", l.ungetN(1));popState();pushState(PHP)
.                                       l.ungetN(1);popState();pushState(PHP)
.                                       fmt.Printf("other: %q\n", l.TokenBytes(nil))
%%
    // Tail of Lex: return the value from l.Abort() when it reports ok,
    // otherwise jump to the yyAction label (not defined in this file;
    // presumably emitted by golex).
    if c, ok := l.Abort(); ok { return int(c) }
    goto yyAction
}

// main builds a lexer over standard input and calls Lex once; tokens are
// reported by the rule actions printing to standard output.
func main() {
    l := newLexer(os.Stdin, os.Stdout, "file.name")
    l.Lex();
}