commit 599def13841ef8fdab267c60211728f134e971f5 Author: z7zmey Date: Tue Nov 7 08:21:38 2017 +0200 first commit diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..7d43b18 --- /dev/null +++ b/Makefile @@ -0,0 +1,21 @@ +# Copyright (c) 2011 CZ.NIC z.s.p.o. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +# blame: jnml, labs.nic.cz + +all: php-parser.go + go build + +run: all + ./php-parser + +php-parser.go: php-parser.l + golex -t $< | gofmt > $@ + +clean: + rm -f php-parser.go lex.yy.go y.output *~ + +# nuke also removes the php-parser binary that `go build` produces and `run` executes. +nuke: clean + rm -f php-parser diff --git a/c-like.l b/c-like.l new file mode 100644 index 0000000..d8accc7 --- /dev/null +++ b/c-like.l @@ -0,0 +1,79 @@ +%{ +// Copyright (c) 2015 The golex Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This is an example program using golex run time library. +package main + +import ( + "bufio" + "go/token" + "io" + "unicode" + + "github.com/cznic/golex/lex" +) + +// Allocate Character classes anywhere in [0x80, 0xFF]. +const ( + classUnicodeLeter = iota + 0x80 + classUnicodeDigit + classOther +) + +type lexer struct { + *lex.Lexer +} + +func rune2Class(r rune) int { + if r >= 0 && r < 0x80 { // Keep ASCII as it is. 
+ return int(r) + } + if unicode.IsLetter(r) { + return classUnicodeLeter + } + if unicode.IsDigit(r) { + return classUnicodeDigit + } + return classOther +} + +func newLexer(src io.Reader, dst io.Writer, fName string) *lexer { + file := token.NewFileSet().AddFile(fName, -1, 1<<31-1) + lx, err := lex.New(file, bufio.NewReader(src), lex.RuneClass(rune2Class)) + if (err != nil) { panic(err) } + return &lexer{lx} +} + +func (l *lexer) Lex(lval *yySymType) int { + c := l.Enter() + +%} + +%yyc c +%yyn c = l.Next() +%yym l.Mark() + +digit [0-9]|{unicodeDigit} +identifier {letter}({letter}|{digit})* +int {digit}+ +letter [_a-zA-Z]|{unicodeLetter} +unicodeDigit \x81 +unicodeLetter \x80 +op [-+*/] + +%% + c = l.Rule0() + +[ \t\r\n]+ +[/][/][^\n]+ lval.token = string(l.TokenBytes(nil)); return COMMENT +func lval.token = string(l.TokenBytes(nil)); return FUNC +{identifier} lval.token = string(l.TokenBytes(nil)); return IDENT +{int} lval.token = string(l.TokenBytes(nil)); return INT +{op} lval.token = string(l.TokenBytes(nil)); return OP + +%% + if c, ok := l.Abort(); ok { return int(c) } + goto yyAction +} diff --git a/example.l b/example.l new file mode 100644 index 0000000..193ebfa --- /dev/null +++ b/example.l @@ -0,0 +1,237 @@ +%{ +// Copyright (c) 2011 CZ.NIC z.s.p.o. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// blame: jnml, labs.nic.cz + +package main + +import ( + "bufio" + "fmt" + "os" + "bytes" +) + +var ( + src = bufio.NewReader(os.Stdin) + buf []byte + current byte + isPhp bool + sc int +) + +const ( + INITIAL = iota + PHP +) + +func skipHtml() { + for !isPhp{ + t_open_tag := []byte("\0 return +[ \t\n\r]+ +. +\<\?|\<\?php fmt.Println("T_OPEN_TAG");begin(PHP) +\<\?= fmt.Println("T_OPEN_TAG_WITH_ECHO");begin(PHP) + +. 
fmt.Printf("%q\n", buf) +[ \t\n\r]+ fmt.Println("T_WHITESPACE") + // \<\?|\<\?php fmt.Println("T_OPEN_TAG"); + // \<\?= fmt.Println("T_OPEN_TAG_WITH_ECHO"); +\?\> fmt.Println("T_CLOSE_TAG");begin(INITIAL) + // abstract fmt.Println("T_ABSTRACT") + // &= fmt.Println("T_AND_EQUAL") + // \(array\) fmt.Println("T_ARRAY_CAST") + // \(bool\)|\(boolean\) fmt.Println("T_BOOL_CAST") + // \(real\)|\(double\)|\(float\) fmt.Println("T_DOUBLE_CAST") + // \(int\)|\(integer\) fmt.Println("T_INT_CAST") + // \(object\) fmt.Println("T_OBJECT_CAST") + // \(string\) fmt.Println("T_STRING_CAST") + // \(unset\) fmt.Println("T_UNSET_CAST") + // array fmt.Println("T_ARRAY") + // as fmt.Println("T_AS") + // && fmt.Println("T_BOOLEAN_AND") + // \|\| fmt.Println("T_BOOLEAN_OR") + // break fmt.Println("T_BREAK") + // callable fmt.Println("T_CALLABLE") + // case fmt.Println("T_CASE") + // catch fmt.Println("T_CATCH") + // class fmt.Println("T_CLASS") + // __CLASS__ fmt.Println("T_CLASS_C") + // __DIR__ fmt.Println("T_DIR") + // clone fmt.Println("T_CLONE") + // [/][/][^\n]+ fmt.Println("T_COMMENT"); // TODO: multiline comment + // \.= fmt.Println("T_CONCAT_EQUAL"); + // [/]= fmt.Println("T_DIV_EQUAL"); + // const fmt.Println("T_CONST"); + // continue fmt.Println("T_CONTINUE"); + // -- fmt.Println("T_DEC"); + // declare fmt.Println("T_DECLARE"); + // default fmt.Println("T_DEFAULT"); + // do fmt.Println("T_DO"); + // {D}\.{D}?|\.{D} fmt.Println("T_DNUMBER"); + // => fmt.Println("T_DOUBLE_ARROW"); + // :: fmt.Println("T_DOUBLE_COLON"); + // echo fmt.Println("T_ECHO"); + // \.\.\. 
fmt.Println("T_ELLIPSIS"); + // else fmt.Println("T_ELSE"); + // elseif fmt.Println("T_ELSEIF"); + // empty fmt.Println("T_EMPTY"); + // endfor fmt.Println("T_ENDFOR") + // endforeach fmt.Println("T_ENDFOREACH") + // endif fmt.Println("T_ENDIF") + // endswitch fmt.Println("T_ENDSWITCH") + // endwhile fmt.Println("T_ENDWHILE") + // exit|die fmt.Println("T_EXIT") + // extends fmt.Println("T_EXTENDS") + // __FILE__ fmt.Println("T_FILE") + // final fmt.Println("T_FINAL") + // finally fmt.Println("T_FINALLY") + // for fmt.Println("T_FOR") + // foreach fmt.Println("T_FOREACH") + // function|cfunction fmt.Println("T_FUNCTION") + // __FUNCTION__ fmt.Println("T_FUNC_C") + // global fmt.Println("T_GLOBAL") + // goto fmt.Println("T_GOTO") + // if fmt.Println("T_IF") + // implements fmt.Println("T_IMPLEMENTS") + // \+\+ fmt.Println("T_INC") + // instanceof fmt.Println("T_INSTANCEOF") + // insteadof fmt.Println("T_INSTEADOF") + // interface fmt.Println("T_INTERFACE") + // == fmt.Println("T_IS_EQUAL") + // \>= fmt.Println("T_IS_GREATER_OR_EQUAL") + // === fmt.Println("T_IS_IDENTICAL") + // \!=|\<\> fmt.Println("T_IS_NOT_EQUAL") + // \!== fmt.Println("T_IS_NOT_IDENTICAL") + // \<= fmt.Println("T_IS_SMALLER_OR_EQUAL") + // \<=\> fmt.Println("T_SPACESHIP") + // __LINE__ fmt.Println("T_LINE") + // {D} fmt.Println("T_LNUMBER") // TODO: parse 0x1ac + // and fmt.Println("T_LOGICAL_AND") + // or fmt.Println("T_LOGICAL_OR") + // xor fmt.Println("T_LOGICAL_XOR") + // __METHOD__ fmt.Println("T_METHOD_C") + // -= fmt.Println("T_MINUS_EQUAL") + // %= fmt.Println("T_MOD_EQUAL") + // \*= fmt.Println("T_MUL_EQUAL") + // namespace fmt.Println("T_NAMESPACE") + // __NAMESPACE__ fmt.Println("T_NS_C") + // \\ fmt.Println("T_NS_SEPARATOR") + // new fmt.Println("T_NEW") + // -> fmt.Println("T_OBJECT_OPERATOR") + // \|= fmt.Println("T_OR_EQUAL") + // \+= fmt.Println("T_PLUS_EQUAL") + // \*\* fmt.Println("T_POW") + // \*\*= fmt.Println("T_POW_EQUAL") + // private fmt.Println("T_PRIVATE") + // public 
fmt.Println("T_PUBLIC") + // protected fmt.Println("T_PROTECTED") + // return fmt.Println("T_RETURN") + // \<\< fmt.Println("T_SL") + // \<\<= fmt.Println("T_SL_EQUAL") + // \>\> fmt.Println("T_SR") + // \>\>= fmt.Println("T_SR_EQUAL") + // static fmt.Println("T_STATIC") + // switch fmt.Println("T_SWITCH") + // throw fmt.Println("T_THROW") + // trait fmt.Println("T_TRAIT") + // __TRAIT__ fmt.Println("T_TRAIT_C") + // try fmt.Println("T_TRY") + // use fmt.Println("T_USE") + // var fmt.Println("T_VAR") + // while fmt.Println("T_WHILE") + // \^= fmt.Println("T_XOR_EQUAL") + // yield fmt.Println("T_YIELD") + // yield\nfrom fmt.Println("T_YIELD_FROM") + // '[^']*' fmt.Println("T_CONSTANT_ENCAPSED_STRING") + + // \{\$ fmt.Println("T_CURLY_OPEN");src.UnreadByte();c='$' + // \$[A-Za-z][A-Za-z0-9]* fmt.Println("T_VARIABLE") // TODO allow -_ and other + + // [/][*][^*]*[*]+([^*/][^*]*[*]+)*[/] fmt.Println("T_COMMENT"); + // [/][*] panic("Unterminated comment") + +%% +// \{\$ fmt.Println("T_CURLY_OPEN"); +// \$\{ fmt.Println("T_DOLLAR_OPEN_CURLY_BRACES"); + +// T_DOC_COMMENT /** */ PHPDoc comments +// T_ENCAPSED_AND_WHITESPACE " $a" constant part of a string containing variables +// T_ENDDECLARE enddeclare declare, alternative syntax +// T_END_HEREDOC heredoc syntax +// T_START_HEREDOC <<< heredoc syntax +// T_EVAL eval() eval() +// T_HALT_COMPILER __halt_compiler() __halt_compiler (available since PHP 5.1.0) +// T_INCLUDE include() include +// T_INCLUDE_ONCE include_once() include_once +// T_INLINE_HTML text outside PHP +// T_ISSET isset() isset() +// T_LIST list() list() +// T_NUM_STRING "$a[0]" numeric array index inside a string +// T_PRINT print() print +// T_REQUIRE require() require +// T_REQUIRE_ONCE require_once() require_once +// T_STRING parent, self, etc. identifiers, e.g. keywords such as parent and self; function names, class names and some others also belong here. 
See also T_CONSTANT_ENCAPSED_STRING +// T_STRING_VARNAME "${a variables inside a string +// T_UNSET unset() unset() +// T_VARIABLE $foo variables + +// T_BAD_CHARACTER anything below ASCII 32 except \t (0x09), \n (0x0a) and \r (0x0d) +// T_CONSTANT_ENCAPSED_STRING "foo" or 'bar' string syntax + + // The golex generated scanner enters top of the user code section when + // lexeme recognition fails. In this example it should never happen. + fmt.Println("UNDEFENED LEXEM") + goto yystate0 + // panic("scanner internal error") + +} // *1 this right brace diff --git a/php-parser.go b/php-parser.go new file mode 100644 index 0000000..2ac153e --- /dev/null +++ b/php-parser.go @@ -0,0 +1,765 @@ +// Code generated by golex. DO NOT EDIT. + +// Copyright (c) 2011 CZ.NIC z.s.p.o. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// blame: jnml, labs.nic.cz + +package main + +import ( + "bufio" + "fmt" + "go/token" + "io" + "os" + "unicode" + + "github.com/cznic/golex/lex" +) + +// Allocate Character classes anywhere in [0x80, 0xFF]. +const ( + classUnicodeLeter = iota + 0x80 + classUnicodeDigit + classOther +) + +var sc int + +const ( + INITIAL = iota + PHP + STRING +) + +type lexer struct { + *lex.Lexer +} + +func begin(cond int) { + sc = cond +} + +func rune2Class(r rune) int { + if r >= 0 && r < 0x80 { // Keep ASCII as it is. 
+ return int(r) + } + if unicode.IsLetter(r) { + return classUnicodeLeter + } + if unicode.IsDigit(r) { + return classUnicodeDigit + } + return classOther +} + +func newLexer(src io.Reader, dst io.Writer, fName string) *lexer { + file := token.NewFileSet().AddFile(fName, -1, 1<<31-1) + lx, err := lex.New(file, bufio.NewReader(src), lex.RuneClass(rune2Class)) + if err != nil { + panic(err) + } + return &lexer{lx} +} + +type yySymType struct{} + +func (l *lexer) unget(r rune) []byte { + l.Unget(l.Lookahead()) + + chars := l.Token() + lastChar := chars[len(chars)-1] + + if lastChar.Rune != r { + return l.TokenBytes(nil) + } + + l.Unget(lastChar) + + buf := l.TokenBytes(nil) + buf = buf[:len(buf)-1] + + return buf +} + +func (l *lexer) Lex() int { // Lex(lval *yySymType) + c := l.Enter() + +yystate0: + yyrule := -1 + _ = yyrule + c = l.Rule0() + // ([\$]{NCH})* + + switch yyt := sc; yyt { + default: + panic(fmt.Errorf(`invalid start condition %d`, yyt)) + case 0: // start condition: INITIAL + goto yystart1 + case 1: // start condition: PHP + goto yystart11 + case 2: // start condition: STRING + goto yystart32 + } + + goto yystate0 // silence unused label error + goto yyAction // silence unused label error +yyAction: + switch yyrule { + case 1: + goto yyrule1 + case 2: + goto yyrule2 + case 3: + goto yyrule3 + case 4: + goto yyrule4 + case 5: + goto yyrule5 + case 6: + goto yyrule6 + case 7: + goto yyrule7 + case 8: + goto yyrule8 + case 9: + goto yyrule9 + case 10: + goto yyrule10 + case 11: + goto yyrule11 + case 12: + goto yyrule12 + case 13: + goto yyrule13 + case 14: + goto yyrule14 + case 15: + goto yyrule15 + case 16: + goto yyrule16 + } + goto yystate1 // silence unused label error +yystate1: + c = l.Next() +yystart1: + switch { + default: + goto yyabort + case c == '<': + goto yystate5 + case c == '\n': + goto yystate4 + case c == '\t' || c == '\r' || c == ' ': + goto yystate3 + case c >= '\x01' && c <= '\b' || c == '\v' || c == '\f' || c >= '\x0e' && c <= 
'\x1f' || c >= '!' && c <= ';' || c >= '=' && c <= 'ÿ': + goto yystate2 + } + +yystate2: + c = l.Next() + yyrule = 2 + l.Mark() + goto yyrule2 + +yystate3: + c = l.Next() + yyrule = 1 + l.Mark() + switch { + default: + goto yyrule1 + case c == '\t' || c == '\n' || c == '\r' || c == ' ': + goto yystate4 + } + +yystate4: + c = l.Next() + yyrule = 1 + l.Mark() + switch { + default: + goto yyrule1 + case c == '\t' || c == '\n' || c == '\r' || c == ' ': + goto yystate4 + } + +yystate5: + c = l.Next() + yyrule = 2 + l.Mark() + switch { + default: + goto yyrule2 + case c == '?': + goto yystate6 + } + +yystate6: + c = l.Next() + yyrule = 3 + l.Mark() + switch { + default: + goto yyrule3 + case c == '=': + goto yystate7 + case c == 'p': + goto yystate8 + } + +yystate7: + c = l.Next() + yyrule = 4 + l.Mark() + goto yyrule4 + +yystate8: + c = l.Next() + switch { + default: + goto yyabort + case c == 'h': + goto yystate9 + } + +yystate9: + c = l.Next() + switch { + default: + goto yyabort + case c == 'p': + goto yystate10 + } + +yystate10: + c = l.Next() + yyrule = 3 + l.Mark() + goto yyrule3 + + goto yystate11 // silence unused label error +yystate11: + c = l.Next() +yystart11: + switch { + default: + goto yyabort + case c == '"': + goto yystate15 + case c == '$': + goto yystate24 + case c == '?': + goto yystate30 + case c == '\'': + goto yystate26 + case c == '\n': + goto yystate14 + case c == '\t' || c == '\r' || c == ' ': + goto yystate13 + case c >= '\x01' && c <= '\b' || c == '\v' || c == '\f' || c >= '\x0e' && c <= '\x1f' || c == '!' 
|| c == '#' || c == '%' || c == '&' || c >= '(' && c <= '>' || c >= '@' && c <= 'ÿ': + goto yystate12 + } + +yystate12: + c = l.Next() + yyrule = 10 + l.Mark() + goto yyrule10 + +yystate13: + c = l.Next() + yyrule = 5 + l.Mark() + switch { + default: + goto yyrule5 + case c == '\t' || c == '\n' || c == '\r' || c == ' ': + goto yystate14 + } + +yystate14: + c = l.Next() + yyrule = 5 + l.Mark() + switch { + default: + goto yyrule5 + case c == '\t' || c == '\n' || c == '\r' || c == ' ': + goto yystate14 + } + +yystate15: + c = l.Next() + yyrule = 9 + l.Mark() + switch { + default: + goto yyrule9 + case c == '"': + goto yystate17 + case c == '$': + goto yystate18 + case c == '\\': + goto yystate19 + case c == '{': + goto yystate20 + case c >= '\x01' && c <= '!' || c == '#' || c >= '%' && c <= '[' || c >= ']' && c <= 'z' || c >= '|' && c <= 'ÿ': + goto yystate16 + } + +yystate16: + c = l.Next() + switch { + default: + goto yyabort + case c == '"': + goto yystate17 + case c == '$': + goto yystate18 + case c == '\\': + goto yystate19 + case c == '{': + goto yystate20 + case c >= '\x01' && c <= '!' || c == '#' || c >= '%' && c <= '[' || c >= ']' && c <= 'z' || c >= '|' && c <= 'ÿ': + goto yystate16 + } + +yystate17: + c = l.Next() + yyrule = 7 + l.Mark() + goto yyrule7 + +yystate18: + c = l.Next() + switch { + default: + goto yyabort + case c == '"': + goto yystate17 + case c == '\\': + goto yystate19 + case c >= '\x01' && c <= '!' || c == '#' || c >= '%' && c <= '@' || c == '[' || c == ']' || c == '^' || c == '`' || c >= '|' && c <= '~': + goto yystate16 + } + +yystate19: + c = l.Next() + switch { + default: + goto yyabort + case c >= '\x01' && c <= '\t' || c >= '\v' && c <= 'ÿ': + goto yystate16 + } + +yystate20: + c = l.Next() + switch { + default: + goto yyabort + case c == '"': + goto yystate17 + case c == '\\': + goto yystate21 + case c >= '\x01' && c <= '!' 
|| c == '#' || c >= '%' && c <= '[' || c >= ']' && c <= 'ÿ': + goto yystate16 + } + +yystate21: + c = l.Next() + switch { + default: + goto yyabort + case c == '"': + goto yystate22 + case c == '\\': + goto yystate21 + case c == '{': + goto yystate23 + case c >= '\x01' && c <= '!' || c >= '#' && c <= '[' || c >= ']' && c <= 'z' || c >= '|' && c <= 'ÿ': + goto yystate16 + } + +yystate22: + c = l.Next() + yyrule = 7 + l.Mark() + switch { + default: + goto yyrule7 + case c == '"': + goto yystate17 + case c == '$': + goto yystate18 + case c == '\\': + goto yystate19 + case c == '{': + goto yystate20 + case c >= '\x01' && c <= '!' || c == '#' || c >= '%' && c <= '[' || c >= ']' && c <= 'z' || c >= '|' && c <= 'ÿ': + goto yystate16 + } + +yystate23: + c = l.Next() + switch { + default: + goto yyabort + case c == '"': + goto yystate17 + case c == '$': + goto yystate18 + case c == '\\': + goto yystate21 + case c == '{': + goto yystate23 + case c >= '\x01' && c <= '!' || c == '#' || c >= '%' && c <= '[' || c >= ']' && c <= 'z' || c >= '|' && c <= 'ÿ': + goto yystate16 + } + +yystate24: + c = l.Next() + yyrule = 10 + l.Mark() + switch { + default: + goto yyrule10 + case c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z' || c >= '\u007f' && c <= 'ÿ': + goto yystate25 + } + +yystate25: + c = l.Next() + yyrule = 16 + l.Mark() + switch { + default: + goto yyrule16 + case c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z' || c >= '\u007f' && c <= 'ÿ': + goto yystate25 + } + +yystate26: + c = l.Next() + yyrule = 10 + l.Mark() + switch { + default: + goto yyrule10 + case c == '\'': + goto yystate28 + case c == '\\': + goto yystate29 + case c >= '\x01' && c <= '&' || c >= '(' && c <= '[' || c >= ']' && c <= 'ÿ': + goto yystate27 + } + +yystate27: + c = l.Next() + switch { + default: + goto yyabort + case c == '\'': + goto yystate28 + case c == '\\': + goto yystate29 + case c >= '\x01' && c <= '&' || c >= '(' && c <= '[' || c >= ']' && c <= 'ÿ': + 
goto yystate27 + } + +yystate28: + c = l.Next() + yyrule = 8 + l.Mark() + goto yyrule8 + +yystate29: + c = l.Next() + switch { + default: + goto yyabort + case c == '\'': + goto yystate27 + } + +yystate30: + c = l.Next() + yyrule = 10 + l.Mark() + switch { + default: + goto yyrule10 + case c == '>': + goto yystate31 + } + +yystate31: + c = l.Next() + yyrule = 6 + l.Mark() + goto yyrule6 + + goto yystate32 // silence unused label error +yystate32: + c = l.Next() +yystart32: + switch { + default: + goto yyabort + case c == '"': + goto yystate40 + case c == '$': + goto yystate41 + case c == '\\': + goto yystate36 + case c == '{': + goto yystate44 + case c >= '\x01' && c <= '!' || c == '#' || c >= '%' && c <= '[' || c >= ']' && c <= 'z' || c >= '|' && c <= 'ÿ': + goto yystate33 + } + +yystate33: + c = l.Next() + yyrule = 15 + l.Mark() + switch { + default: + goto yyrule15 + case c == '"': + goto yystate34 + case c == '$': + goto yystate35 + case c == '\\': + goto yystate36 + case c == '{': + goto yystate37 + case c >= '\x01' && c <= '!' || c == '#' || c >= '%' && c <= '[' || c >= ']' && c <= 'z' || c >= '|' && c <= 'ÿ': + goto yystate33 + } + +yystate34: + c = l.Next() + yyrule = 15 + l.Mark() + goto yyrule15 + +yystate35: + c = l.Next() + switch { + default: + goto yyabort + case c == '"': + goto yystate34 + case c == '\\': + goto yystate36 + case c >= '\x01' && c <= '!' || c == '#' || c >= '%' && c <= '@' || c == '[' || c == ']' || c == '^' || c == '`' || c >= '|' && c <= '~': + goto yystate33 + } + +yystate36: + c = l.Next() + switch { + default: + goto yyabort + case c >= '\x01' && c <= '\t' || c >= '\v' && c <= 'ÿ': + goto yystate33 + } + +yystate37: + c = l.Next() + switch { + default: + goto yyabort + case c == '"': + goto yystate34 + case c == '\\': + goto yystate38 + case c >= '\x01' && c <= '!' 
|| c == '#' || c >= '%' && c <= '[' || c >= ']' && c <= 'ÿ': + goto yystate33 + } + +yystate38: + c = l.Next() + yyrule = 15 + l.Mark() + switch { + default: + goto yyrule15 + case c == '\\': + goto yystate38 + case c == '{': + goto yystate39 + case c >= '\x01' && c <= '[' || c >= ']' && c <= 'z' || c >= '|' && c <= 'ÿ': + goto yystate33 + } + +yystate39: + c = l.Next() + yyrule = 15 + l.Mark() + switch { + default: + goto yyrule15 + case c == '"': + goto yystate34 + case c == '$': + goto yystate35 + case c == '\\': + goto yystate38 + case c == '{': + goto yystate39 + case c >= '\x01' && c <= '!' || c == '#' || c >= '%' && c <= '[' || c >= ']' && c <= 'z' || c >= '|' && c <= 'ÿ': + goto yystate33 + } + +yystate40: + c = l.Next() + yyrule = 11 + l.Mark() + goto yyrule11 + +yystate41: + c = l.Next() + yyrule = 14 + l.Mark() + switch { + default: + goto yyrule14 + case c == '\\': + goto yystate36 + case c == '{': + goto yystate43 + case c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z' || c >= '\u007f' && c <= 'ÿ': + goto yystate42 + case c >= '\x01' && c <= '!' || c == '#' || c >= '%' && c <= '@' || c == '[' || c == ']' || c == '^' || c == '`' || c >= '|' && c <= '~': + goto yystate33 + } + +yystate42: + c = l.Next() + yyrule = 14 + l.Mark() + switch { + default: + goto yyrule14 + case c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z' || c >= '\u007f' && c <= 'ÿ': + goto yystate42 + } + +yystate43: + c = l.Next() + yyrule = 13 + l.Mark() + goto yyrule13 + +yystate44: + c = l.Next() + yyrule = 12 + l.Mark() + switch { + default: + goto yyrule12 + case c == '\\': + goto yystate38 + case c >= '\x01' && c <= '!' || c == '#' || c >= '%' && c <= '[' || c >= ']' && c <= 'ÿ': + goto yystate33 + } + +yyrule1: // [ \t\n\r]+ + + goto yystate0 +yyrule2: // . 
+ + goto yystate0 +yyrule3: // \<\?|\<\?php + { + fmt.Println("T_OPEN_TAG") + begin(PHP) + goto yystate0 + } +yyrule4: // \<\?= + { + fmt.Println("T_OPEN_TAG_WITH_ECHO") + begin(PHP) + goto yystate0 + } +yyrule5: // [ \t\n\r]+ + { + fmt.Println("T_WHITESPACE") + goto yystate0 + } +yyrule6: // \?\> + { + fmt.Println("T_CLOSE_TAG") + begin(INITIAL) + goto yystate0 + } +yyrule7: // [\"]{STR}*[\{\$]?[\"] + { + fmt.Printf("T_CONSTANT_ENCAPSED_STRING: %s\n", l.TokenBytes(nil)) + goto yystate0 + } +yyrule8: // [\']([^\\\']*([\\][\'])*)*[\'] + { + fmt.Printf("T_CONSTANT_ENCAPSED_STRING: %s\n", l.TokenBytes(nil)) + goto yystate0 + } +yyrule9: // [\"] + { + fmt.Println("\"") + begin(STRING) + goto yystate0 + } +yyrule10: // . + { + fmt.Printf("other: %q\n", l.TokenBytes(nil)) + goto yystate0 + } +yyrule11: // \" + { + fmt.Println("\"") + begin(PHP) + goto yystate0 + } +yyrule12: // \{ + { + fmt.Printf("T_CURLY_OPEN: %q\n", l.TokenBytes(nil)) + goto yystate0 + } +yyrule13: // \$\{ + { + fmt.Printf("T_DOLLAR_OPEN_CURLY_BRACES: %q\n", l.TokenBytes(nil)) + goto yystate0 + } +yyrule14: // \${VAR}? + { + fmt.Printf("T_VARIABLE: %q\n", l.TokenBytes(nil)) + goto yystate0 + } +yyrule15: // {TPL} + { + fmt.Printf("T_ENCAPSED_AND_WHITESPACE: %q\n", l.unget('"')) + goto yystate0 + } +yyrule16: // \${VAR} + { + fmt.Println("T_VARIABLE") + goto yystate0 + } + panic("unreachable") + + goto yyabort // silence unused label error + +yyabort: // no lexem recognized + if c, ok := l.Abort(); ok { + return int(c) + } + goto yyAction +} + +func main() { + l := newLexer(os.Stdin, os.Stdout, "file.name") + l.Lex() +} diff --git a/php-parser.l b/php-parser.l new file mode 100644 index 0000000..5bd6cfd --- /dev/null +++ b/php-parser.l @@ -0,0 +1,144 @@ +%{ +// Copyright (c) 2011 CZ.NIC z.s.p.o. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+
+// blame: jnml, labs.nic.cz
+
+package main
+
+import (
+	"bufio"
+	"fmt"
+	"go/token"
+	"io"
+	"os"
+	"unicode"
+
+	"github.com/cznic/golex/lex"
+)
+
+// Classes returned by rune2Class for non-ASCII runes.
+// Allocate Character classes anywhere in [0x80, 0xFF].
+const (
+	classUnicodeLeter = iota + 0x80
+	classUnicodeDigit
+	classOther
+)
+
+// sc holds the current start condition; the scanner reads it via %yyt.
+var sc int
+
+// Start conditions selectable with begin. INITIAL is the zero value, so the
+// scanner starts there; PHP and STRING are the names declared with %s below.
+const (
+	INITIAL = iota
+	PHP
+	STRING
+)
+
+// lexer embeds the golex runtime lexer so rule actions can call its methods.
+type lexer struct {
+	*lex.Lexer
+}
+
+// begin switches the scanner to start condition cond.
+func begin(cond int) {
+	sc = cond
+}
+
+// rune2Class maps r to the class the scanner matches on: ASCII runes map to
+// themselves, other letters to classUnicodeLeter, other digits to
+// classUnicodeDigit and everything else to classOther.
+func rune2Class(r rune) int {
+	if r >= 0 && r < 0x80 { // Keep ASCII as it is.
+		return int(r)
+	}
+	if unicode.IsLetter(r) {
+		return classUnicodeLeter
+	}
+	if unicode.IsDigit(r) {
+		return classUnicodeDigit
+	}
+	return classOther
+}
+
+// newLexer returns a lexer that reads from src, registering fName as the
+// file name in a fresh token.FileSet. dst is accepted but not used yet.
+// It panics if lex.New returns an error.
+func newLexer(src io.Reader, dst io.Writer, fName string) *lexer {
+	file := token.NewFileSet().AddFile(fName, -1, 1<<31-1)
+	lx, err := lex.New(file, bufio.NewReader(src), lex.RuneClass(rune2Class))
+	if err != nil {
+		panic(err)
+	}
+	return &lexer{lx}
+}
+
+// yySymType is a placeholder for the parser value type; Lex does not take it
+// yet (see the signature comment on Lex).
+type yySymType struct{}
+
+// unget returns the bytes of the current token and hands characters back to
+// the scanner: the lookahead always, and the token's last character as well
+// when it equals r, in which case the returned slice omits its final byte.
+// NOTE(review): trimming exactly one byte assumes r occupies a single byte,
+// as the '"' passed by the {TPL} rule does - confirm before reusing.
+func (l *lexer) unget(r rune) []byte {
+	l.Unget(l.Lookahead())
+
+	chars := l.Token()
+	if len(chars) == 0 {
+		// No last character to inspect; avoid indexing chars[-1].
+		return l.TokenBytes(nil)
+	}
+
+	lastChar := chars[len(chars)-1]
+	if lastChar.Rune != r {
+		return l.TokenBytes(nil)
+	}
+
+	l.Unget(lastChar)
+
+	buf := l.TokenBytes(nil)
+	return buf[:len(buf)-1]
+}
+
+// Lex runs the scanner: every rule action prints the token it recognized and
+// scanning resumes, until no rule matches and l.Abort reports ok, at which
+// point the character returned by Abort is the result.
+func (l *lexer) Lex() int { // Lex(lval *yySymType)
+	c := l.Enter()
+	// NOTE(review): the %yyb expression below names a variable "last" that
+	// this file never declares; presumably it is only emitted when a rule
+	// uses the ^ anchor, which none does - confirm before adding one.
+
+%}
+
+%s PHP STRING
+
+%yyb last == '\n' || last == '\x00'
+%yyt sc
+%yyc c
+%yyn c = l.Next()
+%yym l.Mark()
+
+D           [0-9]+
+NC          ([^\\\$\"\{])
+NCH         [^a-zA-Z_\x7f-\xff]
+ENSCAPED    ([\\].)
+DOLLAR_E    ([\$]{ENSCAPED})
+DOLLAR_N    ([\$][^a-zA-Z_\x7f-\xff\\\$\"\{])
+CURVE_E     ([\{]{ENSCAPED})
+CURVE       ([\{][^\$\"])
+ALLOWED     ({NC}|{ENSCAPED}|{DOLLAR_E}|{DOLLAR_N}|{CURVE_E}|{CURVE})
+STR_END     ([\{\$]?[\"])?
+STR         {ALLOWED}+{ALLOWED}*
+TPL         {STR}{STR_END}
+VAR         [a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*
+
+%%
+	c = l.Rule0()
+	// ([\$]{NCH})*
+	// Rules are grouped by start condition: INITIAL skips input up to an
+	// opening tag, PHP scans between the opening and closing tags, STRING
+	// scans the inside of a double-quoted string.
+
+<INITIAL>[ \t\n\r]+
+<INITIAL>.
+<INITIAL>\<\?|\<\?php                   fmt.Println("T_OPEN_TAG");begin(PHP)
+<INITIAL>\<\?=                          fmt.Println("T_OPEN_TAG_WITH_ECHO");begin(PHP)
+
+<PHP>[ \t\n\r]+                         fmt.Println("T_WHITESPACE")
+<PHP>\?\>                               fmt.Println("T_CLOSE_TAG");begin(INITIAL)
+
+<PHP>[\"]{STR}*[\{\$]?[\"]              fmt.Printf("T_CONSTANT_ENCAPSED_STRING: %s\n", l.TokenBytes(nil));
+<PHP>[\']([^\\\']*([\\][\'])*)*[\']     fmt.Printf("T_CONSTANT_ENCAPSED_STRING: %s\n", l.TokenBytes(nil));
+<PHP>[\"]                               fmt.Println("\"");begin(STRING)
+<PHP>.                                  fmt.Printf("other: %q\n", l.TokenBytes(nil))
+
+<STRING>\"                              fmt.Println("\""); begin(PHP)
+<STRING>\{                              fmt.Printf("T_CURLY_OPEN: %q\n", l.TokenBytes(nil));
+<STRING>\$\{                            fmt.Printf("T_DOLLAR_OPEN_CURLY_BRACES: %q\n", l.TokenBytes(nil))
+<STRING>\${VAR}?                        fmt.Printf("T_VARIABLE: %q\n", l.TokenBytes(nil))
+<STRING>{TPL}                           fmt.Printf("T_ENCAPSED_AND_WHITESPACE: %q\n", l.unget('"'));
+
+<PHP>\${VAR}                            fmt.Println("T_VARIABLE")
+
+%%
+	if c, ok := l.Abort(); ok { return int(c) }
+	goto yyAction
+}
+
+// main tokenizes standard input and prints one line per recognized token.
+func main() {
+	l := newLexer(os.Stdin, os.Stdout, "file.name")
+	l.Lex()
+}
\ No newline at end of file