php-parser/scanner/lexer.go

// Package scanner transforms an input string into a stream of PHP tokens.
package scanner

import (
	"bufio"
	"bytes"
	"go/token"
	"io"
	"sync"
	"unicode"

	"github.com/cznic/golex/lex"
	"github.com/z7zmey/php-parser/comment"
	"github.com/z7zmey/php-parser/position"
)

// Character classes are allocated anywhere in [0x80, 0xFF], above the
// ASCII range, so they never collide with literal ASCII rune values.
const (
	classUnicodeLetter = iota + 0x80
	classUnicodeDigit
	classUnicodeGraphic
	classOther
)
// T_INCLUDE token
const T_INCLUDE = 57346
// T_INCLUDE_ONCE token
const T_INCLUDE_ONCE = 57347
// T_EXIT token
const T_EXIT = 57348
// T_IF token
const T_IF = 57349
// T_LNUMBER token
const T_LNUMBER = 57350
// T_DNUMBER token
const T_DNUMBER = 57351
// T_STRING token
const T_STRING = 57352
// T_STRING_VARNAME token
const T_STRING_VARNAME = 57353
// T_VARIABLE token
const T_VARIABLE = 57354
// T_NUM_STRING token
const T_NUM_STRING = 57355
// T_INLINE_HTML token
const T_INLINE_HTML = 57356
// T_CHARACTER token
const T_CHARACTER = 57357
// T_BAD_CHARACTER token
const T_BAD_CHARACTER = 57358
// T_ENCAPSED_AND_WHITESPACE token
const T_ENCAPSED_AND_WHITESPACE = 57359
// T_CONSTANT_ENCAPSED_STRING token
const T_CONSTANT_ENCAPSED_STRING = 57360
// T_ECHO token
const T_ECHO = 57361
// T_DO token
const T_DO = 57362
// T_WHILE token
const T_WHILE = 57363
// T_ENDWHILE token
const T_ENDWHILE = 57364
// T_FOR token
const T_FOR = 57365
// T_ENDFOR token
const T_ENDFOR = 57366
// T_FOREACH token
const T_FOREACH = 57367
// T_ENDFOREACH token
const T_ENDFOREACH = 57368
// T_DECLARE token
const T_DECLARE = 57369
// T_ENDDECLARE token
const T_ENDDECLARE = 57370
// T_AS token
const T_AS = 57371
// T_SWITCH token
const T_SWITCH = 57372
// T_ENDSWITCH token
const T_ENDSWITCH = 57373
// T_CASE token
const T_CASE = 57374
// T_DEFAULT token
const T_DEFAULT = 57375
// T_BREAK token
const T_BREAK = 57376
// T_CONTINUE token
const T_CONTINUE = 57377
// T_GOTO token
const T_GOTO = 57378
// T_FUNCTION token
const T_FUNCTION = 57379
// T_CONST token
const T_CONST = 57380
// T_RETURN token
const T_RETURN = 57381
// T_TRY token
const T_TRY = 57382
// T_CATCH token
const T_CATCH = 57383
// T_FINALLY token
const T_FINALLY = 57384
// T_THROW token
const T_THROW = 57385
// T_USE token
const T_USE = 57386
// T_INSTEADOF token
const T_INSTEADOF = 57387
// T_GLOBAL token
const T_GLOBAL = 57388
// T_VAR token
const T_VAR = 57389
// T_UNSET token
const T_UNSET = 57390
// T_ISSET token
const T_ISSET = 57391
// T_EMPTY token
const T_EMPTY = 57392
// T_HALT_COMPILER token
const T_HALT_COMPILER = 57393
// T_CLASS token
const T_CLASS = 57394
// T_TRAIT token
const T_TRAIT = 57395
// T_INTERFACE token
const T_INTERFACE = 57396
// T_EXTENDS token
const T_EXTENDS = 57397
// T_IMPLEMENTS token
const T_IMPLEMENTS = 57398
// T_OBJECT_OPERATOR token
const T_OBJECT_OPERATOR = 57399
// T_DOUBLE_ARROW token
const T_DOUBLE_ARROW = 57400
// T_LIST token
const T_LIST = 57401
// T_ARRAY token
const T_ARRAY = 57402
// T_CALLABLE token
const T_CALLABLE = 57403
// T_CLASS_C token
const T_CLASS_C = 57404
// T_TRAIT_C token
const T_TRAIT_C = 57405
// T_METHOD_C token
const T_METHOD_C = 57406
// T_FUNC_C token
const T_FUNC_C = 57407
// T_LINE token
const T_LINE = 57408
// T_FILE token
const T_FILE = 57409
// T_COMMENT token
const T_COMMENT = 57410
// T_DOC_COMMENT token
const T_DOC_COMMENT = 57411
// T_OPEN_TAG token
const T_OPEN_TAG = 57412
// T_OPEN_TAG_WITH_ECHO token
const T_OPEN_TAG_WITH_ECHO = 57413
// T_CLOSE_TAG token
const T_CLOSE_TAG = 57414
// T_WHITESPACE token
const T_WHITESPACE = 57415
// T_START_HEREDOC token
const T_START_HEREDOC = 57416
// T_END_HEREDOC token
const T_END_HEREDOC = 57417
// T_DOLLAR_OPEN_CURLY_BRACES token
const T_DOLLAR_OPEN_CURLY_BRACES = 57418
// T_CURLY_OPEN token
const T_CURLY_OPEN = 57419
// T_PAAMAYIM_NEKUDOTAYIM token
const T_PAAMAYIM_NEKUDOTAYIM = 57420
// T_NAMESPACE token
const T_NAMESPACE = 57421
// T_NS_C token
const T_NS_C = 57422
// T_DIR token
const T_DIR = 57423
// T_NS_SEPARATOR token
const T_NS_SEPARATOR = 57424
// T_ELLIPSIS token
const T_ELLIPSIS = 57425
// T_EVAL token
const T_EVAL = 57426
// T_REQUIRE token
const T_REQUIRE = 57427
// T_REQUIRE_ONCE token
const T_REQUIRE_ONCE = 57428
// T_LOGICAL_OR token
const T_LOGICAL_OR = 57429
// T_LOGICAL_XOR token
const T_LOGICAL_XOR = 57430
// T_LOGICAL_AND token
const T_LOGICAL_AND = 57431
// T_INSTANCEOF token
const T_INSTANCEOF = 57432
// T_NEW token
const T_NEW = 57433
// T_CLONE token
const T_CLONE = 57434
// T_ELSEIF token
const T_ELSEIF = 57435
// T_ELSE token
const T_ELSE = 57436
// T_ENDIF token
const T_ENDIF = 57437
// T_PRINT token
const T_PRINT = 57438
// T_YIELD token
const T_YIELD = 57439
// T_STATIC token
const T_STATIC = 57440
// T_ABSTRACT token
const T_ABSTRACT = 57441
// T_FINAL token
const T_FINAL = 57442
// T_PRIVATE token
const T_PRIVATE = 57443
// T_PROTECTED token
const T_PROTECTED = 57444
// T_PUBLIC token
const T_PUBLIC = 57445
// T_INC token
const T_INC = 57446
// T_DEC token
const T_DEC = 57447
// T_YIELD_FROM token
const T_YIELD_FROM = 57448
// T_INT_CAST token
const T_INT_CAST = 57449
// T_DOUBLE_CAST token
const T_DOUBLE_CAST = 57450
// T_STRING_CAST token
const T_STRING_CAST = 57451
// T_ARRAY_CAST token
const T_ARRAY_CAST = 57452
// T_OBJECT_CAST token
const T_OBJECT_CAST = 57453
// T_BOOL_CAST token
const T_BOOL_CAST = 57454
// T_UNSET_CAST token
const T_UNSET_CAST = 57455
// T_COALESCE token
const T_COALESCE = 57456
// T_SPACESHIP token
const T_SPACESHIP = 57457
// T_NOELSE token
const T_NOELSE = 57458
// T_PLUS_EQUAL token
const T_PLUS_EQUAL = 57459
// T_MINUS_EQUAL token
const T_MINUS_EQUAL = 57460
// T_MUL_EQUAL token
const T_MUL_EQUAL = 57461
// T_POW_EQUAL token
const T_POW_EQUAL = 57462
// T_DIV_EQUAL token
const T_DIV_EQUAL = 57463
// T_CONCAT_EQUAL token
const T_CONCAT_EQUAL = 57464
// T_MOD_EQUAL token
const T_MOD_EQUAL = 57465
// T_AND_EQUAL token
const T_AND_EQUAL = 57466
// T_OR_EQUAL token
const T_OR_EQUAL = 57467
// T_XOR_EQUAL token
const T_XOR_EQUAL = 57468
// T_SL_EQUAL token
const T_SL_EQUAL = 57469
// T_SR_EQUAL token
const T_SR_EQUAL = 57470
// T_BOOLEAN_OR token
const T_BOOLEAN_OR = 57471
// T_BOOLEAN_AND token
const T_BOOLEAN_AND = 57472
// T_POW token
const T_POW = 57473
// T_SL token
const T_SL = 57474
// T_SR token
const T_SR = 57475
// T_IS_IDENTICAL token
const T_IS_IDENTICAL = 57476
// T_IS_NOT_IDENTICAL token
const T_IS_NOT_IDENTICAL = 57477
// T_IS_EQUAL token
const T_IS_EQUAL = 57478
// T_IS_NOT_EQUAL token
const T_IS_NOT_EQUAL = 57479
// T_IS_SMALLER_OR_EQUAL token
const T_IS_SMALLER_OR_EQUAL = 57480
// T_IS_GREATER_OR_EQUAL token
const T_IS_GREATER_OR_EQUAL = 57481

// Lval is the interface the parser's yySymType value must implement so
// the lexer can hand the scanned token back to the parser.
type Lval interface {
	Token(tkn *Token)
}
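
// For illustration only: a goyacc-generated yySymType could satisfy Lval
// with a sketch like the one below (the field name is hypothetical; the
// real type lives in the generated parser):
//
//	type yySymType struct {
//		token *Token
//	}
//
//	func (s *yySymType) Token(tkn *Token) { s.token = tkn }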

// Lexer is the PHP lexer. It embeds a golex lexer and carries the
// scanner state stack, accumulated comments, and pools of reusable
// Token and Position objects.
type Lexer struct {
	*lex.Lexer
	StateStack    []int
	PhpDocComment string
	Comments      []*comment.Comment
	heredocLabel  string
	tokenBytesBuf *bytes.Buffer
	TokenPool     sync.Pool
	PositionPool  sync.Pool
}
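
// The pools amortize allocations across a parse. A consumer that is
// finished with a token produced by createToken would typically return
// both objects (a sketch, assuming the caller owns tkn at that point):
//
//	l.PositionPool.Put(tkn.Position)
//	l.TokenPool.Put(tkn)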

// Rune2Class maps a rune to the character-class ID consumed by the
// lexer tables: ASCII runes map to themselves, and every other rune
// collapses into one of the Unicode classes declared above.
func Rune2Class(r rune) int {
	if r >= 0 && r < 0x80 { // Keep ASCII as it is.
		return int(r)
	}
	if unicode.IsLetter(r) {
		return classUnicodeLetter
	}
	if unicode.IsDigit(r) {
		return classUnicodeDigit
	}
	if unicode.IsGraphic(r) {
		return classUnicodeGraphic
	}
	// Anything else (e.g. unassigned code points) is rejected rather
	// than being mapped to classOther.
	return -1
}
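
// A few illustrative mappings (class names refer to the constants above):
//
//	Rune2Class('a') // 97: ASCII maps to itself
//	Rune2Class('я') // classUnicodeLetter
//	Rune2Class('٣') // classUnicodeDigit
//	Rune2Class('∑') // classUnicodeGraphic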

// NewLexer is the Lexer constructor. It wraps src in a bufio.Reader and
// classifies its runes with Rune2Class; it panics if the underlying
// golex lexer cannot be built.
func NewLexer(src io.Reader, fName string) *Lexer {
	file := token.NewFileSet().AddFile(fName, -1, 1<<31-3)
	lx, err := lex.New(file, bufio.NewReader(src), lex.RuneClass(Rune2Class))
	if err != nil {
		panic(err)
	}

	return &Lexer{
		Lexer:         lx,
		StateStack:    []int{0},
		PhpDocComment: "",
		Comments:      nil,
		heredocLabel:  "",
		tokenBytesBuf: &bytes.Buffer{},
		TokenPool: sync.Pool{
			New: func() interface{} { return &Token{} },
		},
		PositionPool: sync.Pool{
			New: func() interface{} { return &position.Position{} },
		},
	}
}
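
// A minimal usage sketch. It assumes the goyacc-style Lex method that
// the generated scanner adds to *Lexer, and lval may be any Lval
// implementation:
//
//	lexer := NewLexer(strings.NewReader("<?php echo 1;"), "example.php")
//	for {
//		if tokenID := lexer.Lex(lval); tokenID == 0 {
//			break // end of input
//		}
//	}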

// ungetChars returns the current token's characters with the last n
// pushed back into the input; the lookahead character is pushed back
// first so it will be re-scanned as well.
func (l *Lexer) ungetChars(n int) []lex.Char {
	l.Unget(l.Lookahead())

	chars := l.Token()
	for i := 1; i <= n; i++ {
		char := chars[len(chars)-i]
		l.Unget(char)
	}

	buf := l.Token()
	buf = buf[:len(buf)-n]

	return buf
}

// pushState enters a new scanner state, keeping the current one on the stack.
func (l *Lexer) pushState(state int) {
	l.StateStack = append(l.StateStack, state)
}

// popState returns to the previous scanner state; the initial state is
// never popped.
func (l *Lexer) popState() {
	n := len(l.StateStack)
	if n <= 1 {
		return
	}
	l.StateStack = l.StateStack[:n-1]
}

// begin replaces the current scanner state without growing the stack.
func (l *Lexer) begin(state int) {
	l.StateStack[len(l.StateStack)-1] = state
}

// getCurrentState reports the scanner state on top of the stack.
func (l *Lexer) getCurrentState() int {
	return l.StateStack[len(l.StateStack)-1]
}
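
// How the state stack evolves, for illustration (the state IDs here are
// hypothetical):
//
//	l.pushState(2) // stack: [0 2]  e.g. entering a nested context
//	l.begin(3)     // stack: [0 3]  switch state without nesting
//	l.popState()   // stack: [0]    back to the initial state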

// createToken builds a Token from the pool for the given characters,
// filling in its position and attaching the comments collected so far.
func (l *Lexer) createToken(chars []lex.Char) *Token {
	firstChar := chars[0]
	lastChar := chars[len(chars)-1]

	pos := l.PositionPool.Get().(*position.Position)
	pos.StartLine = l.File.Line(firstChar.Pos())
	pos.EndLine = l.File.Line(lastChar.Pos())
	pos.StartPos = int(firstChar.Pos())
	pos.EndPos = int(lastChar.Pos())

	tkn := l.TokenPool.Get().(*Token)
	tkn.Position = pos
	tkn.Comments = l.Comments
	tkn.Value = l.tokenString(chars)

	return tkn
}

// addComment records a comment with its position so it can be attached
// to subsequent tokens.
func (l *Lexer) addComment(chars []lex.Char) {
	firstChar := chars[0]
	lastChar := chars[len(chars)-1]

	pos := position.NewPosition(
		l.File.Line(firstChar.Pos()),
		l.File.Line(lastChar.Pos()),
		int(firstChar.Pos()),
		int(lastChar.Pos()),
	)

	c := comment.NewComment(l.tokenString(chars), pos)
	l.Comments = append(l.Comments, c)
}

// tokenString converts scanned characters back into a string, reusing
// one buffer across calls to limit allocations.
func (l *Lexer) tokenString(chars []lex.Char) string {
	l.tokenBytesBuf.Reset()

	for _, c := range chars {
		l.tokenBytesBuf.WriteRune(c.Rune)
	}

	return l.tokenBytesBuf.String()
}