refactor lexer state

This commit is contained in:
vadim 2017-12-01 16:04:53 +02:00
parent bc6d25a7cc
commit 610a935929
4 changed files with 121 additions and 139 deletions

View File

@ -4,7 +4,6 @@ import (
"bufio"
"go/token"
"io"
"unicode"
"github.com/cznic/golex/lex"
)
@ -16,50 +15,9 @@ const (
classOther
)
var sc int
type lexer struct {
*lex.Lexer
}
var stateStack = []int{PHP}
var heredocLabel []byte
func pushState(state int) {
sc = state
stateStack = append(stateStack, state)
}
// popState leaves the current start condition and makes the one beneath it
// current again, updating both stateStack and sc.
//
// The bottom entry is never removed: with one entry or fewer the call is a
// no-op and sc keeps its value.
func popState() {
	// Named depth rather than len so the builtin len is not shadowed.
	depth := len(stateStack)
	if depth <= 1 {
		return
	}

	stateStack = stateStack[:depth-1]
	sc = stateStack[depth-2]
}
func begin(state int) {
len := len(stateStack)
stateStack = stateStack[:len-1]
stateStack = append(stateStack, state)
sc = state
}
func rune2Class(r rune) int {
if r >= 0 && r < 0x80 { // Keep ASCII as it is.
return int(r)
}
if unicode.IsLetter(r) {
return classUnicodeLeter
}
if unicode.IsDigit(r) {
return classUnicodeDigit
}
// return classOther
return -1
stateStack []int
}
func newLexer(src io.Reader, dst io.Writer, fName string) *lexer {
@ -68,25 +26,7 @@ func newLexer(src io.Reader, dst io.Writer, fName string) *lexer {
if err != nil {
panic(err)
}
return &lexer{lx}
}
func (l *lexer) unget(r rune) []byte {
l.Unget(l.Lookahead())
chars := l.Token()
lastChar := chars[len(chars)-1]
if lastChar.Rune != r {
return l.TokenBytes(nil)
}
l.Unget(lastChar)
buf := l.TokenBytes(nil)
buf = buf[:len(buf)-1]
return buf
return &lexer{lx, []int{0}}
}
func (l *lexer) ungetN(n int) []byte {
@ -104,3 +44,26 @@ func (l *lexer) ungetN(n int) []byte {
return buf
}
// pushState enters a nested start condition by appending state to the
// lexer's own stack; the new entry is what getCurrentState reports next.
func (l *lexer) pushState(state int) {
	grown := append(l.stateStack, state)
	l.stateStack = grown
}
// popState leaves the current start condition so that the one beneath it
// becomes current again.
//
// The bottom entry is never removed: with one entry or fewer the call is a
// no-op.
func (l *lexer) popState() {
	// Named depth rather than len so the builtin len is not shadowed.
	depth := len(l.stateStack)
	if depth <= 1 {
		return
	}

	l.stateStack = l.stateStack[:depth-1]
}
// begin switches to the given start condition without changing the nesting
// depth: the top entry of the lexer's stack is overwritten with state.
//
// If the stack is empty (for example on a zero-value lexer), state becomes
// its first entry instead of the call panicking on an out-of-range slice
// expression.
func (l *lexer) begin(state int) {
	if len(l.stateStack) == 0 {
		l.stateStack = append(l.stateStack, state)
		return
	}
	// Overwriting in place is equivalent to dropping the top entry and
	// appending the new one, without the intermediate re-slice.
	l.stateStack[len(l.stateStack)-1] = state
}
// getCurrentState reports the active start condition, i.e. the most recently
// pushed entry of the lexer's stack. It panics if the stack is empty.
func (l *lexer) getCurrentState() int {
	top := len(l.stateStack) - 1
	return l.stateStack[top]
}

15
main.go
View File

@ -3,6 +3,7 @@ package main
import (
"bytes"
"os"
"unicode"
)
const src = `
@ -24,3 +25,17 @@ func main() {
l := newLexer(bytes.NewBufferString(src), os.Stdout, "file.name")
yyParse(l)
}
// rune2Class maps a rune to the integer class the scanner works with.
//
// ASCII runes (0 through 0x7f) are returned unchanged. Other runes collapse
// to classUnicodeLeter when unicode.IsLetter reports true, to
// classUnicodeDigit when unicode.IsDigit reports true, and to -1 otherwise.
func rune2Class(r rune) int {
	switch {
	case r >= 0 && r < 0x80:
		// ASCII is its own class.
		return int(r)
	case unicode.IsLetter(r):
		return classUnicodeLeter
	case unicode.IsDigit(r):
		return classUnicodeDigit
	default:
		// Anything else is reported as -1 rather than classOther.
		return -1
	}
}

View File

@ -27,6 +27,8 @@ const (
BACKQUOTE
)
var heredocLabel []byte
func (l *lexer) Lex(lval *yySymType) int { // Lex(lval *yySymType)
c := l.Enter()
@ -35,7 +37,7 @@ yystate0:
_ = yyrule
c = l.Rule0()
switch yyt := sc; yyt {
switch yyt := l.getCurrentState(); yyt {
default:
panic(fmt.Errorf(`invalid start condition %d`, yyt))
case 0: // start condition: INITIAL
@ -7680,17 +7682,17 @@ yyrule2: // .
}
yyrule3: // \<\?php([ \t]|{NEW_LINE})
{
begin(PHP) //lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG;
l.begin(PHP) //lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG;
goto yystate0
}
yyrule4: // \<\?
{
begin(PHP) //lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG;
l.begin(PHP) //lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG;
goto yystate0
}
yyrule5: // \<\?=
{
begin(PHP)
l.begin(PHP)
lval.token = string(l.TokenBytes(nil))
return T_OPEN_TAG_WITH_ECHO
goto yystate0
@ -7702,7 +7704,7 @@ yyrule6: // [ \t\n\r]+
}
yyrule7: // \?\>{NEW_LINE}?
{
begin(INITIAL) //lval.token = string(l.TokenBytes(nil)); return T_CLOSE_TAG;
l.begin(INITIAL) //lval.token = string(l.TokenBytes(nil)); return T_CLOSE_TAG;
goto yystate0
}
yyrule8: // {DNUM}|{EXPONENT_DNUM}
@ -8477,14 +8479,14 @@ yyrule129: // {OPERATORS}
}
yyrule130: // \{
{
pushState(PHP)
l.pushState(PHP)
lval.token = string(l.TokenBytes(nil))
return rune2Class(rune(l.TokenBytes(nil)[0]))
goto yystate0
}
yyrule131: // \}
{
popState()
l.popState()
lval.token = string(l.TokenBytes(nil))
return rune2Class(rune(l.TokenBytes(nil)[0]))
goto yystate0
@ -8503,7 +8505,7 @@ yyrule133: // {VAR_NAME}
}
yyrule134: // ->
{
begin(PROPERTY)
l.begin(PROPERTY)
lval.token = string(l.TokenBytes(nil))
return T_OBJECT_OPERATOR
goto yystate0
@ -8522,7 +8524,7 @@ yyrule136: // ->
}
yyrule137: // {VAR_NAME}
{
begin(PHP)
l.begin(PHP)
lval.token = string(l.TokenBytes(nil))
return T_STRING
goto yystate0
@ -8530,7 +8532,7 @@ yyrule137: // {VAR_NAME}
yyrule138: // .
{
l.ungetN(1)
begin(PHP)
l.begin(PHP)
goto yystate0
}
yyrule139: // [\']([^\\\']*([\\][\'])*)*[\']
@ -8541,14 +8543,14 @@ yyrule139: // [\']([^\\\']*([\\][\'])*)*[\']
}
yyrule140: // `
{
begin(BACKQUOTE)
l.begin(BACKQUOTE)
lval.token = string(l.TokenBytes(nil))
rune2Class(rune(l.TokenBytes(nil)[0]))
goto yystate0
}
yyrule141: // `
{
begin(PHP)
l.begin(PHP)
lval.token = string(l.TokenBytes(nil))
rune2Class(rune(l.TokenBytes(nil)[0]))
goto yystate0
@ -8578,13 +8580,13 @@ yyrule142: // [b]?\<\<\<[ \t]*({VAR_NAME}|([']{VAR_NAME}['])|(["]{VAR_NAME}["]))
case '\'':
lblFirst++
lblLast--
begin(NOWDOC)
l.begin(NOWDOC)
case '"':
lblFirst++
lblLast--
begin(HEREDOC)
l.begin(HEREDOC)
default:
begin(HEREDOC)
l.begin(HEREDOC)
}
heredocLabel = make([]byte, lblLast-lblFirst+1)
copy(heredocLabel, tb[lblFirst:lblLast+1])
@ -8602,7 +8604,7 @@ yyrule142: // [b]?\<\<\<[ \t]*({VAR_NAME}|([']{VAR_NAME}['])|(["]{VAR_NAME}["]))
ungetCnt++
c = l.Next()
if '\n' == rune(c) || '\r' == rune(c) {
begin(HEREDOC_END)
l.begin(HEREDOC_END)
}
}
l.ungetN(ungetCnt)
@ -8622,7 +8624,7 @@ yyrule143: // .
}
if '\n' == rune(c) || '\r' == rune(c) {
if bytes.Equal(append(heredocLabel, ';'), searchLabel) {
begin(HEREDOC_END)
l.begin(HEREDOC_END)
tb = l.ungetN(len(heredocLabel) + 1)
break
}
@ -8639,7 +8641,7 @@ yyrule143: // .
}
yyrule144: // {VAR_NAME}\;
{
begin(PHP)
l.begin(PHP)
lval.token = string(l.ungetN(1))
return T_END_HEREDOC
goto yystate0
@ -8656,7 +8658,7 @@ yyrule145: // [b]?[\"]
l.ungetN(len(l.TokenBytes(nil)) - cnt)
tokenBytes := l.TokenBytes(nil)[:cnt]
pushState(STRING)
l.pushState(STRING)
lval.token = string(tokenBytes)
return rune2Class('"')
}
@ -8696,7 +8698,7 @@ yyrule145: // [b]?[\"]
}
yyrule146: // \"
{
popState()
l.popState()
lval.token = "\""
return rune2Class(rune(l.TokenBytes(nil)[0]))
goto yystate0
@ -8704,13 +8706,13 @@ yyrule146: // \"
yyrule147: // \{\$
{
lval.token = string(l.ungetN(1))
pushState(PHP)
l.pushState(PHP)
return T_CURLY_OPEN
goto yystate0
}
yyrule148: // \$\{
{
pushState(STRING_VAR_NAME)
l.pushState(STRING_VAR_NAME)
lval.token = string(l.TokenBytes(nil))
return T_DOLLAR_OPEN_CURLY_BRACES
goto yystate0
@ -8718,7 +8720,7 @@ yyrule148: // \$\{
yyrule149: // \$
{
l.ungetN(1)
pushState(STRING_VAR)
l.pushState(STRING_VAR)
goto yystate0
}
yyrule150: // .
@ -8821,7 +8823,7 @@ yyrule152: // .|[ \t\n\r]
fallthrough
case '\r':
if bytes.Equal(append(heredocLabel, ';'), searchLabel) { // TODO handle ';' as optional
begin(HEREDOC_END)
l.begin(HEREDOC_END)
tb = l.ungetN(len(heredocLabel) + 1)
break HEREDOCFOR
}
@ -8873,14 +8875,14 @@ yyrule154: // ->{VAR_NAME}
}
yyrule155: // {VAR_NAME}
{
popState()
l.popState()
lval.token = string(l.TokenBytes(nil))
return T_STRING
goto yystate0
}
yyrule156: // \[
{
pushState(STRING_VAR_INDEX)
l.pushState(STRING_VAR_INDEX)
lval.token = string(l.TokenBytes(nil))
return rune2Class(rune(l.TokenBytes(nil)[0]))
goto yystate0
@ -8888,7 +8890,7 @@ yyrule156: // \[
yyrule157: // .|[ \t\n\r]
{
l.ungetN(1)
popState()
l.popState()
goto yystate0
}
yyrule158: // {LNUM}|{HNUM}|{BNUM}
@ -8911,16 +8913,16 @@ yyrule160: // {VAR_NAME}
}
yyrule161: // \]
{
popState()
popState()
l.popState()
l.popState()
lval.token = string(l.TokenBytes(nil))
return rune2Class(rune(l.TokenBytes(nil)[0]))
goto yystate0
}
yyrule162: // [ \n\r\t\\'#]
{
popState()
popState()
l.popState()
l.popState()
lval.token = string(l.TokenBytes(nil))
return T_ENCAPSED_AND_WHITESPACE
goto yystate0
@ -8939,8 +8941,8 @@ yyrule164: // .
}
yyrule165: // {VAR_NAME}[\[\}]
{
popState()
pushState(PHP)
l.popState()
l.pushState(PHP)
lval.token = string(l.ungetN(1))
return T_STRING_VARNAME
goto yystate0
@ -8948,8 +8950,8 @@ yyrule165: // {VAR_NAME}[\[\}]
yyrule166: // .
{
l.ungetN(1)
popState()
pushState(PHP)
l.popState()
l.pushState(PHP)
goto yystate0
}
panic("unreachable")

View File

@ -26,6 +26,8 @@ const (
BACKQUOTE
)
var heredocLabel []byte
func (l *lexer) Lex(lval *yySymType) int { // Lex(lval *yySymType)
c := l.Enter()
@ -34,7 +36,7 @@ func (l *lexer) Lex(lval *yySymType) int { // Lex(lval *yySymType)
%s PHP STRING STRING_VAR STRING_VAR_INDEX STRING_VAR_NAME PROPERTY HEREDOC_END NOWDOC HEREDOC BACKQUOTE
%yyb last == '\n' || last = '\0'
%yyt sc
%yyt l.getCurrentState()
%yyc c
%yyn c = l.Next()
%yym l.Mark()
@ -77,12 +79,12 @@ NEW_LINE (\r|\n|\r\n)
lval.token = string(tb); return T_INLINE_HTML
<INITIAL>\<\?php([ \t]|{NEW_LINE}) begin(PHP);//lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG;
<INITIAL>\<\? begin(PHP);//lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG;
<INITIAL>\<\?= begin(PHP);lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG_WITH_ECHO;
<INITIAL>\<\?php([ \t]|{NEW_LINE}) l.begin(PHP);//lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG;
<INITIAL>\<\? l.begin(PHP);//lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG;
<INITIAL>\<\?= l.begin(PHP);lval.token = string(l.TokenBytes(nil)); return T_OPEN_TAG_WITH_ECHO;
<PHP>[ \t\n\r]+ //lval.token = string(l.TokenBytes(nil)); return T_WHITESPACE
<PHP>\?\>{NEW_LINE}? begin(INITIAL);//lval.token = string(l.TokenBytes(nil)); return T_CLOSE_TAG;
<PHP>\?\>{NEW_LINE}? l.begin(INITIAL);//lval.token = string(l.TokenBytes(nil)); return T_CLOSE_TAG;
<PHP>{DNUM}|{EXPONENT_DNUM} lval.token = string(l.TokenBytes(nil)); return T_DNUMBER
<PHP>{BNUM}
@ -240,21 +242,21 @@ NEW_LINE (\r|\n|\r\n)
<PHP>'[^']*(\\')*' lval.token = string(l.TokenBytes(nil)); return T_CONSTANT_ENCAPSED_STRING
<PHP>{OPERATORS} lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0]))
<PHP>\{ pushState(PHP); lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0]))
<PHP>\} popState(); lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0]))
<PHP>\{ l.pushState(PHP); lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0]))
<PHP>\} l.popState(); lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0]))
<PHP>\${VAR_NAME} lval.token = string(l.TokenBytes(nil)); return T_VARIABLE
<PHP>{VAR_NAME} lval.token = string(l.TokenBytes(nil)); return T_STRING
<PHP>-> begin(PROPERTY);lval.token = string(l.TokenBytes(nil)); return T_OBJECT_OPERATOR;
<PHP>-> l.begin(PROPERTY);lval.token = string(l.TokenBytes(nil)); return T_OBJECT_OPERATOR;
<PROPERTY>[ \t\n\r]+ lval.token = string(l.TokenBytes(nil)); return T_WHITESPACE;
<PROPERTY>-> lval.token = string(l.TokenBytes(nil)); return T_OBJECT_OPERATOR;
<PROPERTY>{VAR_NAME} begin(PHP);lval.token = string(l.TokenBytes(nil)); return T_STRING;
<PROPERTY>. l.ungetN(1);begin(PHP)
<PROPERTY>{VAR_NAME} l.begin(PHP);lval.token = string(l.TokenBytes(nil)); return T_STRING;
<PROPERTY>. l.ungetN(1);l.begin(PHP)
<PHP>[\']([^\\\']*([\\][\'])*)*[\'] lval.token = string(l.TokenBytes(nil)); return T_CONSTANT_ENCAPSED_STRING;
<PHP>` begin(BACKQUOTE); lval.token = string(l.TokenBytes(nil)); rune2Class(rune(l.TokenBytes(nil)[0]))
<BACKQUOTE>` begin(PHP); lval.token = string(l.TokenBytes(nil)); rune2Class(rune(l.TokenBytes(nil)[0]))
<PHP>` l.begin(BACKQUOTE); lval.token = string(l.TokenBytes(nil)); rune2Class(rune(l.TokenBytes(nil)[0]))
<BACKQUOTE>` l.begin(PHP); lval.token = string(l.TokenBytes(nil)); rune2Class(rune(l.TokenBytes(nil)[0]))
<PHP>[b]?\<\<\<[ \t]*({VAR_NAME}|([']{VAR_NAME}['])|(["]{VAR_NAME}["])){NEW_LINE}
tb := l.TokenBytes(nil)
@ -282,13 +284,13 @@ NEW_LINE (\r|\n|\r\n)
case '\'' :
lblFirst++
lblLast--
begin(NOWDOC)
l.begin(NOWDOC)
case '"' :
lblFirst++
lblLast--
begin(HEREDOC)
l.begin(HEREDOC)
default:
begin(HEREDOC)
l.begin(HEREDOC)
}
heredocLabel = make([]byte, lblLast - lblFirst + 1)
@ -308,7 +310,7 @@ NEW_LINE (\r|\n|\r\n)
ungetCnt++
c = l.Next()
if '\n' == rune(c) || '\r' == rune(c) {
begin(HEREDOC_END)
l.begin(HEREDOC_END)
}
}
@ -327,7 +329,7 @@ NEW_LINE (\r|\n|\r\n)
if '\n' == rune(c) || '\r' == rune(c) {
if bytes.Equal(append(heredocLabel, ';'), searchLabel) {
begin(HEREDOC_END)
l.begin(HEREDOC_END)
tb = l.ungetN(len(heredocLabel)+1)
break;
}
@ -342,7 +344,7 @@ NEW_LINE (\r|\n|\r\n)
lval.token = string(tb); return T_ENCAPSED_AND_WHITESPACE
<HEREDOC_END>{VAR_NAME}\; begin(PHP);lval.token = string(l.ungetN(1)); return T_END_HEREDOC
<HEREDOC_END>{VAR_NAME}\; l.begin(PHP);lval.token = string(l.ungetN(1)); return T_END_HEREDOC
<PHP>[b]?[\"]
binPrefix := l.TokenBytes(nil)[0] == 'b'
@ -352,7 +354,7 @@ NEW_LINE (\r|\n|\r\n)
l.ungetN(len(l.TokenBytes(nil))-cnt)
tokenBytes := l.TokenBytes(nil)[:cnt]
pushState(STRING)
l.pushState(STRING)
lval.token = string(tokenBytes); return rune2Class('"')
}
@ -391,10 +393,10 @@ NEW_LINE (\r|\n|\r\n)
c = l.Next()
}
<STRING>\" popState(); lval.token = "\""; return rune2Class(rune(l.TokenBytes(nil)[0]))
<STRING,HEREDOC,BACKQUOTE>\{\$ lval.token = string(l.ungetN(1)); pushState(PHP); return T_CURLY_OPEN
<STRING,HEREDOC,BACKQUOTE>\$\{ pushState(STRING_VAR_NAME);lval.token = string(l.TokenBytes(nil)); return T_DOLLAR_OPEN_CURLY_BRACES
<STRING,HEREDOC,BACKQUOTE>\$ l.ungetN(1);pushState(STRING_VAR)
<STRING>\" l.popState(); lval.token = "\""; return rune2Class(rune(l.TokenBytes(nil)[0]))
<STRING,HEREDOC,BACKQUOTE>\{\$ lval.token = string(l.ungetN(1)); l.pushState(PHP); return T_CURLY_OPEN
<STRING,HEREDOC,BACKQUOTE>\$\{ l.pushState(STRING_VAR_NAME);lval.token = string(l.TokenBytes(nil)); return T_DOLLAR_OPEN_CURLY_BRACES
<STRING,HEREDOC,BACKQUOTE>\$ l.ungetN(1);l.pushState(STRING_VAR)
<STRING>.
F1:for {
if c == -1 {
@ -484,7 +486,7 @@ NEW_LINE (\r|\n|\r\n)
case '\n': fallthrough
case '\r':
if bytes.Equal(append(heredocLabel, ';'), searchLabel) { // TODO handle ';' as optional
begin(HEREDOC_END)
l.begin(HEREDOC_END)
tb = l.ungetN(len(heredocLabel)+1)
break HEREDOCFOR;
}
@ -524,20 +526,20 @@ NEW_LINE (\r|\n|\r\n)
<STRING_VAR>\${VAR_NAME} lval.token = string(l.TokenBytes(nil)); return T_VARIABLE
<STRING_VAR>->{VAR_NAME} lval.token = string(l.ungetN(len(l.TokenBytes(nil))-2)); return T_OBJECT_OPERATOR
<STRING_VAR>{VAR_NAME} popState();lval.token = string(l.TokenBytes(nil)); return T_STRING
<STRING_VAR>\[ pushState(STRING_VAR_INDEX);lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0]))
<STRING_VAR>.|[ \t\n\r] l.ungetN(1);popState()
<STRING_VAR>{VAR_NAME} l.popState();lval.token = string(l.TokenBytes(nil)); return T_STRING
<STRING_VAR>\[ l.pushState(STRING_VAR_INDEX);lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0]))
<STRING_VAR>.|[ \t\n\r] l.ungetN(1);l.popState()
<STRING_VAR_INDEX>{LNUM}|{HNUM}|{BNUM} lval.token = string(l.TokenBytes(nil)); return T_NUM_STRING
<STRING_VAR_INDEX>\${VAR_NAME} lval.token = string(l.TokenBytes(nil)); return T_VARIABLE
<STRING_VAR_INDEX>{VAR_NAME} lval.token = string(l.TokenBytes(nil)); return T_STRING
<STRING_VAR_INDEX>\] popState(); popState();lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0]))
<STRING_VAR_INDEX>[ \n\r\t\\'#] popState(); popState();lval.token = string(l.TokenBytes(nil)); return T_ENCAPSED_AND_WHITESPACE
<STRING_VAR_INDEX>\] l.popState(); l.popState();lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0]))
<STRING_VAR_INDEX>[ \n\r\t\\'#] l.popState(); l.popState();lval.token = string(l.TokenBytes(nil)); return T_ENCAPSED_AND_WHITESPACE
<STRING_VAR_INDEX>{OPERATORS} lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0]))
<STRING_VAR_INDEX>. lval.token = string(l.TokenBytes(nil)); return rune2Class(rune(l.TokenBytes(nil)[0]))
<STRING_VAR_NAME>{VAR_NAME}[\[\}] popState();pushState(PHP);lval.token = string(l.ungetN(1)); return T_STRING_VARNAME
<STRING_VAR_NAME>. l.ungetN(1);popState();pushState(PHP)
<STRING_VAR_NAME>{VAR_NAME}[\[\}] l.popState();l.pushState(PHP);lval.token = string(l.ungetN(1)); return T_STRING_VARNAME
<STRING_VAR_NAME>. l.ungetN(1);l.popState();l.pushState(PHP)
%%
if c, ok := l.Abort(); ok { return int(c) }