From c23b899f558f1cb787f7fe2ce4ab4ab2a9431181 Mon Sep 17 00:00:00 2001 From: z7zmey Date: Sun, 3 Dec 2017 09:17:05 +0200 Subject: [PATCH] handle new lines --- example.php | 38 +++++++++++++++++-------- lexer.go | 39 +++++++++++++++++++++++++- scanner.go | 53 +++++++++++++++++------------------ scanner.l | 81 ++++++++++++++++++++++++++++------------------------- 4 files changed, 133 insertions(+), 78 deletions(-) diff --git a/example.php b/example.php index ac5f3bb..2c2c9cb 100644 --- a/example.php +++ b/example.php @@ -1,26 +1,40 @@ \ No newline at end of file diff --git a/lexer.go b/lexer.go index 3d36953..8bbbf90 100644 --- a/lexer.go +++ b/lexer.go @@ -4,6 +4,7 @@ import ( "bufio" "go/token" "io" + "unicode" "github.com/cznic/golex/lex" ) @@ -18,6 +19,21 @@ const ( type lexer struct { *lex.Lexer stateStack []int + lineNumber int +} + +func rune2Class(r rune) int { + if r >= 0 && r < 0x80 { // Keep ASCII as it is. + return int(r) + } + if unicode.IsLetter(r) { + return classUnicodeLeter + } + if unicode.IsDigit(r) { + return classUnicodeDigit + } + // return classOther + return -1 } func newLexer(src io.Reader, fName string) *lexer { @@ -26,7 +42,7 @@ func newLexer(src io.Reader, fName string) *lexer { if err != nil { panic(err) } - return &lexer{lx, []int{0}} + return &lexer{lx, []int{0}, 1} } func (l *lexer) ungetN(n int) []byte { @@ -67,3 +83,24 @@ func (l *lexer) begin(state int) { func (l *lexer) getCurrentState() int { return l.stateStack[len(l.stateStack)-1] } + +func (l *lexer) handleNewLine(str []byte) (int, int) { + startln := l.lineNumber + + var prev byte + + for _, b := range str { + if b == '\n' || prev == '\r' { + l.lineNumber++ + } + + prev = b + } + + // handle last \r + if prev == '\r' { + l.lineNumber++ + } + + return startln, l.lineNumber +} diff --git a/scanner.go b/scanner.go index f91e40f..7b4d0c1 100644 --- a/scanner.go +++ b/scanner.go @@ -11,7 +11,6 @@ package main import ( "bytes" "fmt" - "unicode" ) const ( @@ -30,20 +29,6 @@ const ( var heredocLabel []byte -func rune2Class(r rune) int { - if r >= 0 && r < 0x80 { // Keep ASCII as it is. - return int(r) - } - if unicode.IsLetter(r) { - return classUnicodeLeter - } - if unicode.IsDigit(r) { - return classUnicodeDigit - } - // return classOther - return -1 -} - func (l *lexer) Lex(lval *yySymType) int { // Lex(lval *yySymType) c := l.Enter() @@ -7066,7 +7051,7 @@ yystart554: goto yystate557 case c == '{': goto yystate559 - case c >= '\x01' && c <= '\t' || c >= '\v' && c <= '!' || c == '#' || c >= '%' && c <= 'z' || c >= '|' && c <= 'ÿ': + case c >= '\x01' && c <= '!' || c == '#' || c >= '%' && c <= 'z' || c >= '|' && c <= 'ÿ': goto yystate555 } @@ -7675,8 +7660,10 @@ yystate614: } yyrule1: // [ \t\n\r]+ - - goto yystate0 + { + l.handleNewLine(l.TokenBytes(nil)) + goto yystate0 + } yyrule2: // . { @@ -7697,12 +7684,13 @@ yyrule2: // . } c = l.Next() } + l.handleNewLine(tb) lval.token = string(tb) return T_INLINE_HTML - goto yystate0 } yyrule3: // \<\?php([ \t]|{NEW_LINE}) { + l.handleNewLine(l.TokenBytes(nil)) l.begin(PHP) //lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG; goto yystate0 } @@ -7720,11 +7708,12 @@ yyrule5: // \<\?= } yyrule6: // [ \t\n\r]+ { - //lval.token = string(l.TokenBytes(nil)); return T_WHITESPACE + l.handleNewLine(l.TokenBytes(nil)) //lval.token = string(l.TokenBytes(nil)); return T_WHITESPACE goto yystate0 } yyrule7: // \?\>{NEW_LINE}? { + l.handleNewLine(l.TokenBytes(nil)) l.begin(INITIAL) lval.token = ";" return rune2Class(';') @@ -8133,6 +8122,7 @@ yyrule67: // while } yyrule68: // yield[ \t\n\r]+from[^a-zA-Z0-9_\x80-\xff] { + l.handleNewLine(l.TokenBytes(nil)) lval.token = string(l.TokenBytes(nil)) return T_YIELD_FROM goto yystate0 @@ -8475,21 +8465,22 @@ yyrule124: // \?\? } yyrule125: // (#|[/][/]).*{NEW_LINE} { - // lval.token = string(l.TokenBytes(nil)); return T_COMMENT; // TODO: handle ?> + l.handleNewLine(l.TokenBytes(nil)) // lval.token = string(l.TokenBytes(nil)); return T_COMMENT; // TODO: handle ?> goto yystate0 } yyrule126: // [/][*][^*]*[*]+([^*/][^*]*[*]+)*[/] { - // lval.token = string(l.TokenBytes(nil)); return T_COMMENT; // TODO: handle ?> + l.handleNewLine(l.TokenBytes(nil)) // lval.token = string(l.TokenBytes(nil)); return T_COMMENT; // TODO: handle ?> goto yystate0 } yyrule127: // [/][*][*][^*]*[*]+([^*/][^*]*[*]+)*[/] { - // lval.token = string(l.TokenBytes(nil)); return T_DOC_COMMENT; // TODO: handle ?> + l.handleNewLine(l.TokenBytes(nil)) // lval.token = string(l.TokenBytes(nil)); return T_DOC_COMMENT; // TODO: handle ?> goto yystate0 } yyrule128: // '[^']*(\\')*' { + l.handleNewLine(l.TokenBytes(nil)) lval.token = string(l.TokenBytes(nil)) return T_CONSTANT_ENCAPSED_STRING goto yystate0 @@ -8535,6 +8526,7 @@ yyrule134: // -> } yyrule135: // [ \t\n\r]+ { + l.handleNewLine(l.TokenBytes(nil)) lval.token = string(l.TokenBytes(nil)) return T_WHITESPACE goto yystate0 @@ -8631,9 +8623,9 @@ yyrule142: // [b]?\<\<\<[ \t]*({VAR_NAME}|([']{VAR_NAME}['])|(["]{VAR_NAME}["])) } } l.ungetN(ungetCnt) + l.handleNewLine(tb) lval.token = string(tb) return T_START_HEREDOC - goto yystate0 } yyrule143: // . { @@ -8658,9 +8650,9 @@ yyrule143: // . } c = l.Next() } + l.handleNewLine(tb) lval.token = string(tb) return T_ENCAPSED_AND_WHITESPACE - goto yystate0 } yyrule144: // {VAR_NAME}\; { @@ -8746,7 +8738,7 @@ yyrule149: // \$ l.pushState(STRING_VAR) goto yystate0 } -yyrule150: // . +yyrule150: // .|[ \t\n\r] { F1: @@ -8756,6 +8748,7 @@ yyrule150: // . } switch c { case '"': + l.handleNewLine(l.TokenBytes(nil)) lval.token = string(l.TokenBytes(nil)) return T_ENCAPSED_AND_WHITESPACE break F1 @@ -8765,6 +8758,7 @@ yyrule150: // . if rune(c) == '{' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z' || c >= '\u007f' && c <= 'ÿ' { l.ungetN(1) tb := l.TokenBytes(nil) + l.handleNewLine(tb[:len(tb)-1]) lval.token = string(tb[:len(tb)-1]) return T_ENCAPSED_AND_WHITESPACE break F1 @@ -8776,6 +8770,7 @@ yyrule150: // . if rune(c) == '$' { l.ungetN(1) tb := l.TokenBytes(nil) + l.handleNewLine(tb[:len(tb)-1]) lval.token = string(tb[:len(tb)-1]) return T_ENCAPSED_AND_WHITESPACE break F1 @@ -8798,6 +8793,7 @@ yyrule151: // . } switch c { case '`': + l.handleNewLine(l.TokenBytes(nil)) lval.token = string(l.TokenBytes(nil)) return T_ENCAPSED_AND_WHITESPACE break F2 @@ -8807,6 +8803,7 @@ yyrule151: // . if rune(c) == '{' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z' || c >= '\u007f' && c <= 'ÿ' { l.ungetN(1) tb := l.TokenBytes(nil) + l.handleNewLine(tb[:len(tb)-1]) lval.token = string(tb[:len(tb)-1]) return T_ENCAPSED_AND_WHITESPACE break F2 @@ -8818,6 +8815,7 @@ yyrule151: // . if rune(c) == '$' { l.ungetN(1) tb := l.TokenBytes(nil) + l.handleNewLine(tb[:len(tb)-1]) lval.token = string(tb[:len(tb)-1]) return T_ENCAPSED_AND_WHITESPACE break F2 @@ -8880,9 +8878,9 @@ yyrule152: // .|[ \t\n\r] c = l.Next() } + l.handleNewLine(tb) lval.token = string(tb) return T_ENCAPSED_AND_WHITESPACE - goto yystate0 } yyrule153: // \${VAR_NAME} { @@ -8944,6 +8942,7 @@ yyrule161: // \] } yyrule162: // [ \n\r\t\\'#] { + l.handleNewLine(l.TokenBytes(nil)) l.popState() l.popState() lval.token = string(l.TokenBytes(nil)) diff --git a/scanner.l b/scanner.l index e51f1e2..7b48e63 100644 --- a/scanner.l +++ b/scanner.l @@ -10,7 +10,6 @@ package main import ( "fmt" "bytes" - "unicode" ) const ( @@ -29,20 +28,6 @@ const ( var heredocLabel []byte -func rune2Class(r rune) int { - if r >= 0 && r < 0x80 { // Keep ASCII as it is. - return int(r) - } - if unicode.IsLetter(r) { - return classUnicodeLeter - } - if unicode.IsDigit(r) { - return classUnicodeDigit - } - // return classOther - return -1 -} - func (l *lexer) Lex(lval *yySymType) int { // Lex(lval *yySymType) c := l.Enter() @@ -68,7 +53,7 @@ NEW_LINE (\r|\n|\r\n) %% c = l.Rule0() -[ \t\n\r]+ +[ \t\n\r]+ l.handleNewLine(l.TokenBytes(nil)); . tb := []byte{} @@ -92,14 +77,16 @@ NEW_LINE (\r|\n|\r\n) c = l.Next() } - lval.token = string(tb); return T_INLINE_HTML + l.handleNewLine(tb); + lval.token = string(tb); + return T_INLINE_HTML -\<\?php([ \t]|{NEW_LINE}) l.begin(PHP);//lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG; +\<\?php([ \t]|{NEW_LINE}) l.handleNewLine(l.TokenBytes(nil));l.begin(PHP);//lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG; \<\? l.begin(PHP);//lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG; \<\?= l.begin(PHP);lval.token = string(l.TokenBytes(nil)); return T_ECHO; -[ \t\n\r]+ //lval.token = string(l.TokenBytes(nil)); return T_WHITESPACE -\?\>{NEW_LINE}? l.begin(INITIAL);lval.token = ";"; return rune2Class(';'); +[ \t\n\r]+ l.handleNewLine(l.TokenBytes(nil));//lval.token = string(l.TokenBytes(nil)); return T_WHITESPACE +\?\>{NEW_LINE}? l.handleNewLine(l.TokenBytes(nil));l.begin(INITIAL);lval.token = ";"; return rune2Class(';'); {DNUM}|{EXPONENT_DNUM} lval.token = string(l.TokenBytes(nil)); return T_DNUMBER {BNUM} @@ -194,7 +181,7 @@ NEW_LINE (\r|\n|\r\n) use lval.token = string(l.TokenBytes(nil)); return T_USE var lval.token = string(l.TokenBytes(nil)); return T_VAR while lval.token = string(l.TokenBytes(nil)); return T_WHILE -yield[ \t\n\r]+from[^a-zA-Z0-9_\x80-\xff] lval.token = string(l.TokenBytes(nil)); return T_YIELD_FROM +yield[ \t\n\r]+from[^a-zA-Z0-9_\x80-\xff] l.handleNewLine(l.TokenBytes(nil));lval.token = string(l.TokenBytes(nil)); return T_YIELD_FROM yield lval.token = string(l.TokenBytes(nil)); return T_YIELD include lval.token = string(l.TokenBytes(nil)); return T_INCLUDE include_once lval.token = string(l.TokenBytes(nil)); return T_INCLUDE_ONCE @@ -251,10 +238,10 @@ NEW_LINE (\r|\n|\r\n) \<\< lval.token = string(l.TokenBytes(nil)); return T_SL \>\> lval.token = string(l.TokenBytes(nil)); return T_SR \?\? lval.token = string(l.TokenBytes(nil)); return T_COALESCE -(#|[/][/]).*{NEW_LINE} // lval.token = string(l.TokenBytes(nil)); return T_COMMENT; // TODO: handle ?> -[/][*][^*]*[*]+([^*/][^*]*[*]+)*[/] // lval.token = string(l.TokenBytes(nil)); return T_COMMENT; // TODO: handle ?> -[/][*][*][^*]*[*]+([^*/][^*]*[*]+)*[/] // lval.token = string(l.TokenBytes(nil)); return T_DOC_COMMENT; // TODO: handle ?> -'[^']*(\\')*' lval.token = string(l.TokenBytes(nil)); return T_CONSTANT_ENCAPSED_STRING +(#|[/][/]).*{NEW_LINE} l.handleNewLine(l.TokenBytes(nil));// lval.token = string(l.TokenBytes(nil)); return T_COMMENT; // TODO: handle ?> +[/][*][^*]*[*]+([^*/][^*]*[*]+)*[/] l.handleNewLine(l.TokenBytes(nil));// lval.token = string(l.TokenBytes(nil)); return T_COMMENT; // TODO: handle ?> +[/][*][*][^*]*[*]+([^*/][^*]*[*]+)*[/] l.handleNewLine(l.TokenBytes(nil));// lval.token = string(l.TokenBytes(nil)); return T_DOC_COMMENT; // TODO: handle ?> +'[^']*(\\')*' l.handleNewLine(l.TokenBytes(nil));lval.token = string(l.TokenBytes(nil)); return T_CONSTANT_ENCAPSED_STRING {OPERATORS} lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0])) \{ l.pushState(PHP); lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0])) @@ -263,7 +250,7 @@ NEW_LINE (\r|\n|\r\n) {VAR_NAME} lval.token = string(l.TokenBytes(nil)); return T_STRING -> l.begin(PROPERTY);lval.token = string(l.TokenBytes(nil)); return T_OBJECT_OPERATOR; -[ \t\n\r]+ lval.token = string(l.TokenBytes(nil)); return T_WHITESPACE; +[ \t\n\r]+ l.handleNewLine(l.TokenBytes(nil));lval.token = string(l.TokenBytes(nil)); return T_WHITESPACE; -> lval.token = string(l.TokenBytes(nil)); return T_OBJECT_OPERATOR; {VAR_NAME} l.begin(PHP);lval.token = string(l.TokenBytes(nil)); return T_STRING; . l.ungetN(1);l.begin(PHP) @@ -331,7 +318,9 @@ NEW_LINE (\r|\n|\r\n) l.ungetN(ungetCnt) - lval.token = string(tb); return T_START_HEREDOC + l.handleNewLine(tb); + lval.token = string(tb); + return T_START_HEREDOC . searchLabel := []byte{} @@ -357,7 +346,9 @@ NEW_LINE (\r|\n|\r\n) c = l.Next() } - lval.token = string(tb); return T_ENCAPSED_AND_WHITESPACE + l.handleNewLine(tb); + lval.token = string(tb); + return T_ENCAPSED_AND_WHITESPACE {VAR_NAME}\; l.begin(PHP);lval.token = string(l.ungetN(1)); return T_END_HEREDOC @@ -412,7 +403,7 @@ NEW_LINE (\r|\n|\r\n) \{\$ lval.token = string(l.ungetN(1)); l.pushState(PHP); return T_CURLY_OPEN \$\{ l.pushState(STRING_VAR_NAME);lval.token = string(l.TokenBytes(nil)); return T_DOLLAR_OPEN_CURLY_BRACES \$ l.ungetN(1);l.pushState(STRING_VAR) -. +.|[ \t\n\r] F1:for { if c == -1 { break; @@ -420,7 +411,9 @@ NEW_LINE (\r|\n|\r\n) switch c { case '"' : - lval.token = string(l.TokenBytes(nil)); return T_ENCAPSED_AND_WHITESPACE + l.handleNewLine(l.TokenBytes(nil)); + lval.token = string(l.TokenBytes(nil)); + return T_ENCAPSED_AND_WHITESPACE break F1; case '$': @@ -428,7 +421,9 @@ NEW_LINE (\r|\n|\r\n) if rune(c) == '{' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z' || c >= '\u007f' && c <= 'ÿ' { l.ungetN(1) tb := l.TokenBytes(nil) - lval.token = string(tb[:len(tb)-1]); return T_ENCAPSED_AND_WHITESPACE + l.handleNewLine(tb[:len(tb)-1]); + lval.token = string(tb[:len(tb)-1]); + return T_ENCAPSED_AND_WHITESPACE break F1; } l.ungetN(0) @@ -438,7 +433,9 @@ NEW_LINE (\r|\n|\r\n) if rune(c) == '$' { l.ungetN(1) tb := l.TokenBytes(nil) - lval.token = string(tb[:len(tb)-1]); return T_ENCAPSED_AND_WHITESPACE + l.handleNewLine(tb[:len(tb)-1]); + lval.token = string(tb[:len(tb)-1]); + return T_ENCAPSED_AND_WHITESPACE break F1; } l.ungetN(0) @@ -457,8 +454,10 @@ NEW_LINE (\r|\n|\r\n) } switch c { - case '`' : - lval.token = string(l.TokenBytes(nil)); return T_ENCAPSED_AND_WHITESPACE + case '`' : + l.handleNewLine(l.TokenBytes(nil)); + lval.token = string(l.TokenBytes(nil)); + return T_ENCAPSED_AND_WHITESPACE break F2; case '$': @@ -466,7 +465,9 @@ NEW_LINE (\r|\n|\r\n) if rune(c) == '{' || c >= 'A' && c <= 'Z' || c == '_' || c >= 'a' && c <= 'z' || c >= '\u007f' && c <= 'ÿ' { l.ungetN(1) tb := l.TokenBytes(nil) - lval.token = string(tb[:len(tb)-1]); return T_ENCAPSED_AND_WHITESPACE + l.handleNewLine(tb[:len(tb)-1]); + lval.token = string(tb[:len(tb)-1]); + return T_ENCAPSED_AND_WHITESPACE break F2; } l.ungetN(0) @@ -476,7 +477,9 @@ NEW_LINE (\r|\n|\r\n) if rune(c) == '$' { l.ungetN(1) tb := l.TokenBytes(nil) - lval.token = string(tb[:len(tb)-1]); return T_ENCAPSED_AND_WHITESPACE + l.handleNewLine(tb[:len(tb)-1]); + lval.token = string(tb[:len(tb)-1]); + return T_ENCAPSED_AND_WHITESPACE break F2; } l.ungetN(0) @@ -537,7 +540,9 @@ NEW_LINE (\r|\n|\r\n) c = l.Next() } - lval.token = string(tb); return T_ENCAPSED_AND_WHITESPACE + l.handleNewLine(tb); + lval.token = string(tb); + return T_ENCAPSED_AND_WHITESPACE \${VAR_NAME} lval.token = string(l.TokenBytes(nil)); return T_VARIABLE ->{VAR_NAME} lval.token = string(l.ungetN(len(l.TokenBytes(nil))-2)); return T_OBJECT_OPERATOR @@ -549,7 +554,7 @@ NEW_LINE (\r|\n|\r\n) \${VAR_NAME} lval.token = string(l.TokenBytes(nil)); return T_VARIABLE {VAR_NAME} lval.token = string(l.TokenBytes(nil)); return T_STRING \] l.popState(); l.popState();lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0])) -[ \n\r\t\\'#] l.popState(); l.popState();lval.token = string(l.TokenBytes(nil)); return T_ENCAPSED_AND_WHITESPACE +[ \n\r\t\\'#] l.handleNewLine(l.TokenBytes(nil));l.popState(); l.popState();lval.token = string(l.TokenBytes(nil)); return T_ENCAPSED_AND_WHITESPACE {OPERATORS} lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0])) . lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0]))