php-parser/scanner/lexer.go

539 lines
8.8 KiB
Go
Raw Normal View History

2018-02-20 18:22:15 +00:00
// Package scanner transforms PHP source read from an io.Reader into a stream of PHP tokens.
package scanner
import (
"bufio"
"bytes"
"go/token"
"io"
"unicode"
"github.com/cznic/golex/lex"
"github.com/z7zmey/php-parser/comment"
t "github.com/z7zmey/php-parser/token"
)
// Character classes reported by Rune2Class for non-ASCII runes. ASCII runes
// are their own class, so these ids are allocated in [0x80, 0xFF].
const (
	classUnicodeLeter   = 0x80 // non-ASCII rune for which unicode.IsLetter holds
	classUnicodeDigit   = 0x81 // non-ASCII rune for which unicode.IsDigit holds
	classUnicodeGraphic = 0x82 // non-ASCII rune for which unicode.IsGraphic holds
	classOther          = 0x83 // everything else; Rune2Class currently returns -1 instead
)
// PHP token identifiers produced by the lexer. The values form one contiguous
// run from T_INCLUDE (57346) to T_POW (57481).
//
// NOTE(review): the numbering presumably has to stay in sync with the token
// constants of the generated parser - confirm before changing any value.
const (
	T_INCLUDE                  = 57346
	T_INCLUDE_ONCE             = 57347
	T_EXIT                     = 57348
	T_IF                       = 57349
	T_LNUMBER                  = 57350
	T_DNUMBER                  = 57351
	T_STRING                   = 57352
	T_STRING_VARNAME           = 57353
	T_VARIABLE                 = 57354
	T_NUM_STRING               = 57355
	T_INLINE_HTML              = 57356
	T_CHARACTER                = 57357
	T_BAD_CHARACTER            = 57358
	T_ENCAPSED_AND_WHITESPACE  = 57359
	T_CONSTANT_ENCAPSED_STRING = 57360
	T_ECHO                     = 57361
	T_DO                       = 57362
	T_WHILE                    = 57363
	T_ENDWHILE                 = 57364
	T_FOR                      = 57365
	T_ENDFOR                   = 57366
	T_FOREACH                  = 57367
	T_ENDFOREACH               = 57368
	T_DECLARE                  = 57369
	T_ENDDECLARE               = 57370
	T_AS                       = 57371
	T_SWITCH                   = 57372
	T_ENDSWITCH                = 57373
	T_CASE                     = 57374
	T_DEFAULT                  = 57375
	T_BREAK                    = 57376
	T_CONTINUE                 = 57377
	T_GOTO                     = 57378
	T_FUNCTION                 = 57379
	T_CONST                    = 57380
	T_RETURN                   = 57381
	T_TRY                      = 57382
	T_CATCH                    = 57383
	T_FINALLY                  = 57384
	T_THROW                    = 57385
	T_USE                      = 57386
	T_INSTEADOF                = 57387
	T_GLOBAL                   = 57388
	T_VAR                      = 57389
	T_UNSET                    = 57390
	T_ISSET                    = 57391
	T_EMPTY                    = 57392
	T_HALT_COMPILER            = 57393
	T_CLASS                    = 57394
	T_TRAIT                    = 57395
	T_INTERFACE                = 57396
	T_EXTENDS                  = 57397
	T_IMPLEMENTS               = 57398
	T_OBJECT_OPERATOR          = 57399
	T_DOUBLE_ARROW             = 57400
	T_LIST                     = 57401
	T_ARRAY                    = 57402
	T_CALLABLE                 = 57403
	T_CLASS_C                  = 57404
	T_TRAIT_C                  = 57405
	T_METHOD_C                 = 57406
	T_FUNC_C                   = 57407
	T_LINE                     = 57408
	T_FILE                     = 57409
	T_COMMENT                  = 57410
	T_DOC_COMMENT              = 57411
	T_OPEN_TAG                 = 57412
	T_OPEN_TAG_WITH_ECHO       = 57413
	T_CLOSE_TAG                = 57414
	T_WHITESPACE               = 57415
	T_START_HEREDOC            = 57416
	T_END_HEREDOC              = 57417
	T_DOLLAR_OPEN_CURLY_BRACES = 57418
	T_CURLY_OPEN               = 57419
	T_PAAMAYIM_NEKUDOTAYIM     = 57420
	T_NAMESPACE                = 57421
	T_NS_C                     = 57422
	T_DIR                      = 57423
	T_NS_SEPARATOR             = 57424
	T_ELLIPSIS                 = 57425
	T_EVAL                     = 57426
	T_REQUIRE                  = 57427
	T_REQUIRE_ONCE             = 57428
	T_LOGICAL_OR               = 57429
	T_LOGICAL_XOR              = 57430
	T_LOGICAL_AND              = 57431
	T_INSTANCEOF               = 57432
	T_NEW                      = 57433
	T_CLONE                    = 57434
	T_ELSEIF                   = 57435
	T_ELSE                     = 57436
	T_ENDIF                    = 57437
	T_PRINT                    = 57438
	T_YIELD                    = 57439
	T_STATIC                   = 57440
	T_ABSTRACT                 = 57441
	T_FINAL                    = 57442
	T_PRIVATE                  = 57443
	T_PROTECTED                = 57444
	T_PUBLIC                   = 57445
	T_INC                      = 57446
	T_DEC                      = 57447
	T_YIELD_FROM               = 57448
	T_INT_CAST                 = 57449
	T_DOUBLE_CAST              = 57450
	T_STRING_CAST              = 57451
	T_ARRAY_CAST               = 57452
	T_OBJECT_CAST              = 57453
	T_BOOL_CAST                = 57454
	T_UNSET_CAST               = 57455
	T_COALESCE                 = 57456
	T_SPACESHIP                = 57457
	T_NOELSE                   = 57458
	T_PLUS_EQUAL               = 57459
	T_MINUS_EQUAL              = 57460
	T_MUL_EQUAL                = 57461
	T_DIV_EQUAL                = 57462
	T_CONCAT_EQUAL             = 57463
	T_MOD_EQUAL                = 57464
	T_AND_EQUAL                = 57465
	T_OR_EQUAL                 = 57466
	T_XOR_EQUAL                = 57467
	T_SL_EQUAL                 = 57468
	T_SR_EQUAL                 = 57469
	T_POW_EQUAL                = 57470
	T_BOOLEAN_OR               = 57471
	T_BOOLEAN_AND              = 57472
	T_IS_EQUAL                 = 57473
	T_IS_NOT_EQUAL             = 57474
	T_IS_IDENTICAL             = 57475
	T_IS_NOT_IDENTICAL         = 57476
	T_IS_SMALLER_OR_EQUAL      = 57477
	T_IS_GREATER_OR_EQUAL      = 57478
	T_SL                       = 57479
	T_SR                       = 57480
	T_POW                      = 57481
)
2018-01-24 16:42:23 +00:00
// Lval is the interface that the parser's yySymType must implement.
type Lval interface {
// Token hands the token tkn to the implementation.
// NOTE(review): presumably called once per token produced by the lexer; the
// call site is not part of this file section - confirm.
Token(tkn t.Token)
}
// Lexer php lexer
2018-01-24 16:42:23 +00:00
type Lexer struct {
*lex.Lexer
StateStack []int
PhpDocComment string
Comments []comment.Comment
2018-06-05 12:20:23 +00:00
heredocLabel string
tokenBytesBuf *bytes.Buffer
2018-01-24 16:42:23 +00:00
}
// Rune2Class maps a rune to the character class id used by the lexer: ASCII
// runes map to themselves, other runes to one of the classUnicode* ids, or -1.
func Rune2Class(r rune) int {
	switch {
	case r >= 0 && r < 0x80:
		// ASCII runes act as their own class.
		return int(r)
	case unicode.IsLetter(r):
		return classUnicodeLeter
	case unicode.IsDigit(r):
		return classUnicodeDigit
	case unicode.IsGraphic(r):
		return classUnicodeGraphic
	default:
		// Anything else is reported as -1 rather than classOther.
		return -1
	}
}
// NewLexer the Lexer constructor
2018-01-24 16:42:23 +00:00
func NewLexer(src io.Reader, fName string) *Lexer {
file := token.NewFileSet().AddFile(fName, -1, 1<<31-3)
2018-01-24 16:42:23 +00:00
lx, err := lex.New(file, bufio.NewReader(src), lex.RuneClass(Rune2Class))
if err != nil {
panic(err)
}
2018-06-05 12:20:23 +00:00
return &Lexer{lx, []int{0}, "", nil, "", &bytes.Buffer{}}
2018-01-24 16:42:23 +00:00
}
// ungetChars calls Unget on the lookahead char and then on the last n chars of
// the current token, last char first. It returns the token chars without those
// n trailing chars.
func (l *Lexer) ungetChars(n int) []lex.Char {
	l.Unget(l.Lookahead())

	tkn := l.Token()
	last := len(tkn) - 1
	for back := 0; back < n; back++ {
		l.Unget(tkn[last-back])
	}

	rest := l.Token()
	return rest[:len(rest)-n]
}
// pushState puts state on top of the state stack, making it the current state.
func (l *Lexer) pushState(state int) {
	stack := append(l.StateStack, state)
	l.StateStack = stack
}
// popState removes the current state from the state stack. The bottom-most
// entry is never removed: with one or zero entries the call is a no-op.
func (l *Lexer) popState() {
	if depth := len(l.StateStack); depth > 1 {
		l.StateStack = l.StateStack[:depth-1]
	}
}
// begin replaces the current (top) state of the state stack with state.
//
// If the stack is empty there is nothing to replace, so state is pushed
// instead; previously this case panicked with a slice-bounds error.
func (l *Lexer) begin(state int) {
	if n := len(l.StateStack); n > 0 {
		// Overwrite the top entry in place; the stack depth does not change.
		l.StateStack[n-1] = state
		return
	}
	l.StateStack = append(l.StateStack, state)
}
// getCurrentState returns the state on top of the state stack. The stack must
// not be empty.
func (l *Lexer) getCurrentState() int {
	top := len(l.StateStack) - 1
	return l.StateStack[top]
}
func (l *Lexer) newToken(chars []lex.Char) t.Token {
firstChar := chars[0]
lastChar := chars[len(chars)-1]
startLine := l.File.Line(firstChar.Pos())
endLine := l.File.Line(lastChar.Pos())
startPos := int(firstChar.Pos())
endPos := int(lastChar.Pos())
2018-06-05 12:20:23 +00:00
return t.NewToken(l.tokenString(chars), startLine, endLine, startPos, endPos).SetComments(l.Comments)
2018-01-24 16:42:23 +00:00
}
// addComment appends c to the lexer's list of collected comments.
func (l *Lexer) addComment(c comment.Comment) {
	collected := append(l.Comments, c)
	l.Comments = collected
}
2018-06-05 12:20:23 +00:00
func (l *Lexer) tokenString(chars []lex.Char) string {
l.tokenBytesBuf.Reset()
2018-01-24 16:42:23 +00:00
for _, c := range chars {
l.tokenBytesBuf.WriteRune(c.Rune)
2018-01-24 16:42:23 +00:00
}
2018-06-05 12:20:23 +00:00
return string(l.tokenBytesBuf.Bytes())
2018-01-24 16:42:23 +00:00
}