Documentation ¶
Overview ¶
A simple lexical analyzer for Go.
Example ¶
package main import ( "fmt" "strings" "github.com/macrat/simplexer" ) func main() { input := "hello_world = \"hello world\"\nnumber = 1" lexer := simplexer.NewLexer(strings.NewReader(input)) fmt.Println(input) fmt.Println("==========") for { token, err := lexer.Scan() if err != nil { panic(err.Error()) } if token == nil { fmt.Println("==========") return } fmt.Printf("line %2d, column %2d: %s: %s\n", token.Position.Line, token.Position.Column, token.Type, token.Literal) } }
Output: hello_world = "hello world" number = 1 ========== line 0, column 0: IDENT: hello_world line 0, column 12: OTHER: = line 0, column 14: STRING: "hello world" line 1, column 0: IDENT: number line 1, column 7: OTHER: = line 1, column 9: NUMBER: 1 ==========
Example (AddOriginalTokenType) ¶
package main import ( "fmt" "strings" "github.com/macrat/simplexer" ) func main() { const ( SUBSTITUTION simplexer.TokenID = iota NEWLINE ) input := "hello_world = \"hello world\"\nnumber = 1" lexer := simplexer.NewLexer(strings.NewReader(input)) lexer.Whitespace = simplexer.NewPatternTokenType(-1, []string{"\t", " "}) // lexer.Whitespace = simplexer.NewRegexpTokenType(-1, `[\t ]`) // same meaning as above lexer.TokenTypes = append([]simplexer.TokenType{ simplexer.NewPatternTokenType(SUBSTITUTION, []string{"="}), simplexer.NewRegexpTokenType(NEWLINE, `^[\n\r]+`), }, lexer.TokenTypes...) fmt.Println(input) fmt.Println("==========") for { token, err := lexer.Scan() if err != nil { panic(err.Error()) } if token == nil { fmt.Println("==========") return } fmt.Printf("%s: %#v\n", token.Type, token.Literal) } }
Output: hello_world = "hello world" number = 1 ========== IDENT: "hello_world" UNKNOWN(0): "=" STRING: "\"hello world\"" UNKNOWN(1): "\n" IDENT: "number" UNKNOWN(0): "=" NUMBER: "1" ==========
Example (PositionInformation) ¶
package main import ( "fmt" "strings" "github.com/macrat/simplexer" ) func main() { input := "this is a\ntest string\n" lexer := simplexer.NewLexer(strings.NewReader(input)) for { token, err := lexer.Scan() if err != nil { panic(err.Error()) } if token == nil { break } fmt.Printf("%d: %s\n", token.Position.Line, lexer.GetLastLine()) fmt.Printf(" | %s%s\n\n", strings.Repeat(" ", token.Position.Column), strings.Repeat("=", len(token.Literal))) } }
Output: 0: this is a | ==== 0: this is a | == 0: this is a | = 1: test string | ==== 1: test string | ======
Index ¶
Examples ¶
Constants ¶
This section is empty.
Variables ¶
var ( DefaultWhitespace = NewPatternTokenType(-1, []string{" ", "\t", "\r", "\n"}) DefaultTokenTypes = []TokenType{ NewRegexpTokenType(IDENT, `[a-zA-Z_][a-zA-Z0-9_]*`), NewRegexpTokenType(NUMBER, `[0-9]+(?:\.[0-9]+)?`), NewRegexpTokenType(STRING, `\"([^"]*)\"`), NewRegexpTokenType(OTHER, `.`), } )
Defined default values for properties of Lexer as a package value.
Functions ¶
This section is empty.
Types ¶
type Lexer ¶
type Lexer struct { Whitespace TokenType TokenTypes []TokenType // contains filtered or unexported fields }
The lexical analyzer.
Whitespace is a TokenType for skipping characters like whitespaces. The default value is simplexer.DefaultWhitespace. Won't skip any characters if Whitespace is nil.
TokenTypes is an array of TokenType. Lexer will sequentially check the TokenTypes and return the first matched token. Default is simplexer.DefaultTokenTypes.
Please be careful: Lexer will never use a TokenType that is appended after OTHER, because OTHER accepts any single character.
func (*Lexer) GetLastLine ¶
GetLastLine returns the line of the last scanned token.
type PatternTokenType ¶
PatternTokenType is dictionary token type.
PatternTokenType holds a list of strings and finds tokens that exactly match one of them.
func NewPatternTokenType ¶
func NewPatternTokenType(id TokenID, patterns []string) *PatternTokenType
Make new PatternTokenType.
id is a TokenID of new PatternTokenType.
patterns is array of patterns.
Example ¶
package main import ( "fmt" "strings" "github.com/macrat/simplexer" ) func main() { const ( HOGE simplexer.TokenID = iota OTHERS ) lexer := simplexer.NewLexer(strings.NewReader("this is hoge and HOGE or Hoge")) lexer.TokenTypes = []simplexer.TokenType{ simplexer.NewPatternTokenType(HOGE, []string{"hoge", "HOGE"}), simplexer.NewRegexpTokenType(OTHERS, `[^ ]+`), } for { token, _ := lexer.Scan() if token == nil { break } if token.Type.GetID() == HOGE { fmt.Printf("!!! %s !!!\n", token.Literal) } if token.Type.GetID() == OTHERS { fmt.Println(token.Literal) } } }
Output: this is !!! hoge !!! and !!! HOGE !!! or Hoge
func (*PatternTokenType) FindToken ¶
func (ptt *PatternTokenType) FindToken(s string, p Position) *Token
FindToken returns a new Token if s starts with this token.
func (*PatternTokenType) GetID ¶
func (ptt *PatternTokenType) GetID() TokenID
GetID returns id of token type.
func (*PatternTokenType) String ¶
func (ptt *PatternTokenType) String() string
Get readable string of TokenID.
type Position ¶
Position in the file.
type RegexpTokenType ¶
RegexpTokenType is a TokenType implement with regexp.
ID is TokenID for this token type.
Re is the regular expression of the token. It has to start with "^".
func NewRegexpTokenType ¶
func NewRegexpTokenType(id TokenID, re string) *RegexpTokenType
Make new RegexpTokenType.
id is a TokenID of new RegexpTokenType.
re is a regular expression of token.
Example ¶
package main import ( "fmt" "strings" "github.com/macrat/simplexer" ) func main() { const ( NUMBER simplexer.TokenID = iota OTHERS ) lexer := simplexer.NewLexer(strings.NewReader("123this is test456")) lexer.TokenTypes = []simplexer.TokenType{ simplexer.NewRegexpTokenType(NUMBER, `[0-9]+`), simplexer.NewRegexpTokenType(OTHERS, `[^0-9]+`), } for { token, _ := lexer.Scan() if token == nil { break } if token.Type.GetID() == NUMBER { fmt.Printf("%s is number\n", token.Literal) } if token.Type.GetID() == OTHERS { fmt.Printf("%s is not number\n", token.Literal) } } }
Output: 123 is number this is test is not number 456 is number
func (*RegexpTokenType) FindToken ¶
func (rtt *RegexpTokenType) FindToken(s string, p Position) *Token
FindToken returns a new Token if s starts with this token.
func (*RegexpTokenType) GetID ¶
func (rtt *RegexpTokenType) GetID() TokenID
GetID returns id of this token type.
func (*RegexpTokenType) String ¶
func (rtt *RegexpTokenType) String() string
Get readable string of TokenID.
type Token ¶
type Token struct { Type TokenType Literal string // The string of matched. Submatches []string // Submatches of regular expression. Position Position // Position of token. }
A data of found Token.
type TokenType ¶
TokenType is a rule for making Token.
GetID returns the TokenID of this TokenType. A TokenID may be shared with another TokenType.
FindToken returns a new Token if the head of the first argument matches the pattern of this TokenType. The second argument is the position of the token in the buffer. In most implementations, the Position is passed into the resulting Token directly.
type UnknownTokenError ¶
The error that is returned when an unknown token is found.
func (UnknownTokenError) Error ¶
func (se UnknownTokenError) Error() string
Get error message as string.