simplexer

package module
v0.0.0-...-bce8e06 Latest
Published: Jan 10, 2018 License: MIT Imports: 5 Imported by: 1

README

simplexer


A simple lexical analyzer for Go.

example

simplest usage
package main

import (
	"fmt"
	"strings"

	"github.com/macrat/simplexer"
)

func Example() {
	input := "hello_world = \"hello world\"\nnumber = 1"
	lexer := simplexer.NewLexer(strings.NewReader(input))

	fmt.Println(input)
	fmt.Println("==========")

	for {
		token, err := lexer.Scan()
		if err != nil {
			panic(err.Error())
		}
		if token == nil {
			fmt.Println("==========")
			return
		}

		fmt.Printf("line %2d, column %2d: %s: %s\n",
			token.Position.Line,
			token.Position.Column,
			token.Type,
			token.Literal)
	}
}

The output is as follows.

hello_world = "hello world"
number = 1
==========
line  0, column  0: IDENT: hello_world
line  0, column 12: OTHER: =
line  0, column 14: STRING: "hello world"
line  1, column  0: IDENT: number
line  1, column  7: OTHER: =
line  1, column  9: NUMBER: 1
==========
more examples

Please see godoc.

Documentation

Overview

A simple lexical analyzer for Go.

Example
package main

import (
	"fmt"
	"strings"

	"github.com/macrat/simplexer"
)

func main() {
	input := "hello_world = \"hello world\"\nnumber = 1"
	lexer := simplexer.NewLexer(strings.NewReader(input))

	fmt.Println(input)
	fmt.Println("==========")

	for {
		token, err := lexer.Scan()
		if err != nil {
			panic(err.Error())
		}
		if token == nil {
			fmt.Println("==========")
			return
		}

		fmt.Printf("line %2d, column %2d: %s: %s\n",
			token.Position.Line,
			token.Position.Column,
			token.Type,
			token.Literal)
	}

}
Output:

hello_world = "hello world"
number = 1
==========
line  0, column  0: IDENT: hello_world
line  0, column 12: OTHER: =
line  0, column 14: STRING: "hello world"
line  1, column  0: IDENT: number
line  1, column  7: OTHER: =
line  1, column  9: NUMBER: 1
==========
Example (AddOriginalTokenType)
package main

import (
	"fmt"
	"strings"

	"github.com/macrat/simplexer"
)

func main() {
	const (
		SUBSTITUTION simplexer.TokenID = iota
		NEWLINE
	)

	input := "hello_world = \"hello world\"\nnumber = 1"
	lexer := simplexer.NewLexer(strings.NewReader(input))

	lexer.Whitespace = simplexer.NewPatternTokenType(-1, []string{"\t", " "})
	// lexer.Whitespace = simplexer.NewRegexpTokenType(-1, `[\t ]`)  // equivalent to the line above

	lexer.TokenTypes = append([]simplexer.TokenType{
		simplexer.NewPatternTokenType(SUBSTITUTION, []string{"="}),
		simplexer.NewRegexpTokenType(NEWLINE, `^[\n\r]+`),
	}, lexer.TokenTypes...)

	fmt.Println(input)
	fmt.Println("==========")

	for {
		token, err := lexer.Scan()
		if err != nil {
			panic(err.Error())
		}
		if token == nil {
			fmt.Println("==========")
			return
		}

		fmt.Printf("%s: %#v\n", token.Type, token.Literal)
	}

}
Output:

hello_world = "hello world"
number = 1
==========
IDENT: "hello_world"
UNKNOWN(0): "="
STRING: "\"hello world\""
UNKNOWN(1): "\n"
IDENT: "number"
UNKNOWN(0): "="
NUMBER: "1"
==========
Example (PositionInformation)
package main

import (
	"fmt"
	"strings"

	"github.com/macrat/simplexer"
)

func main() {
	input := "this is a\ntest string\n"
	lexer := simplexer.NewLexer(strings.NewReader(input))

	for {
		token, err := lexer.Scan()
		if err != nil {
			panic(err.Error())
		}
		if token == nil {
			break
		}

		fmt.Printf("%d: %s\n", token.Position.Line, lexer.GetLastLine())
		fmt.Printf(" | %s%s\n\n",
			strings.Repeat(" ", token.Position.Column),
			strings.Repeat("=", len(token.Literal)))
	}

}
Output:

0: this is a
 | ====

0: this is a
 |      ==

0: this is a
 |         =

1: test string
 | ====

1: test string
 |      ======


Constants

This section is empty.

Variables

View Source
var (
	DefaultWhitespace = NewPatternTokenType(-1, []string{" ", "\t", "\r", "\n"})

	DefaultTokenTypes = []TokenType{
		NewRegexpTokenType(IDENT, `[a-zA-Z_][a-zA-Z0-9_]*`),
		NewRegexpTokenType(NUMBER, `[0-9]+(?:\.[0-9]+)?`),
		NewRegexpTokenType(STRING, `\"([^"]*)\"`),
		NewRegexpTokenType(OTHER, `.`),
	}
)

Default values for properties of Lexer, defined as package variables.

Functions

This section is empty.

Types

type Lexer

type Lexer struct {
	Whitespace TokenType
	TokenTypes []TokenType
	// contains filtered or unexported fields
}

The lexical analyzer.

Whitespace is a TokenType for characters to skip, such as whitespace. The default value is simplexer.DefaultWhitespace. No characters are skipped if Whitespace is nil.

TokenTypes is an array of TokenType. Lexer checks the TokenTypes sequentially and returns the first matched token. The default is simplexer.DefaultTokenTypes.

Be careful: a TokenType appended after OTHER will never be used, because OTHER accepts any single character. Prepend custom types instead, as in the sketch below.
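A minimal sketch of safe ordering, following the same prepend pattern as Example (AddOriginalTokenType) above; the ARROW token ID is a hypothetical name invented for this illustration:

package main

import (
	"fmt"
	"strings"

	"github.com/macrat/simplexer"
)

// ARROW is a hypothetical token ID used only for this sketch.
const ARROW simplexer.TokenID = iota

func main() {
	lexer := simplexer.NewLexer(strings.NewReader("x -> y"))

	// Prepend the custom TokenType so it is checked before the
	// defaults; appended after OTHER it would never match, since
	// OTHER already accepts any single character.
	lexer.TokenTypes = append([]simplexer.TokenType{
		simplexer.NewPatternTokenType(ARROW, []string{"->"}),
	}, lexer.TokenTypes...)

	for {
		token, err := lexer.Scan()
		if err != nil {
			panic(err.Error())
		}
		if token == nil {
			break
		}
		fmt.Printf("%s: %s\n", token.Type, token.Literal)
	}
}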

func NewLexer

func NewLexer(reader io.Reader) *Lexer

Make a new Lexer.

func (*Lexer) GetLastLine

func (l *Lexer) GetLastLine() string

GetLastLine returns the line of the last scanned token.

func (*Lexer) Peek

func (l *Lexer) Peek() (*Token, error)

Peek returns the first token in the buffer without removing it.

Returns nil as *Token if the buffer is empty.

func (*Lexer) Scan

func (l *Lexer) Scan() (*Token, error)

Scan gets the first token in the buffer and removes it from the buffer.

This function uses Lexer.Peek, so please read the documentation of Peek as well.
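A minimal sketch of the relationship between the two calls, using only the documented Peek and Scan signatures: Peek leaves the token in place, so the following Scan returns the same token.

package main

import (
	"fmt"
	"strings"

	"github.com/macrat/simplexer"
)

func main() {
	lexer := simplexer.NewLexer(strings.NewReader("a b"))

	// Peek reports the next token without consuming it...
	peeked, err := lexer.Peek()
	if err != nil {
		panic(err.Error())
	}

	// ...so the next Scan should return the same token.
	scanned, err := lexer.Scan()
	if err != nil {
		panic(err.Error())
	}

	fmt.Println(peeked.Literal, scanned.Literal) // both print "a"
}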

type PatternTokenType

type PatternTokenType struct {
	ID       TokenID
	Patterns []string
}

PatternTokenType is a dictionary-style token type.

PatternTokenType holds a list of strings and finds tokens that exactly match one of them.

func NewPatternTokenType

func NewPatternTokenType(id TokenID, patterns []string) *PatternTokenType

Make a new PatternTokenType.

id is the TokenID of the new PatternTokenType.

patterns is an array of patterns to match.

Example
package main

import (
	"fmt"
	"strings"

	"github.com/macrat/simplexer"
)

func main() {
	const (
		HOGE simplexer.TokenID = iota
		OTHERS
	)

	lexer := simplexer.NewLexer(strings.NewReader("this is hoge and HOGE or Hoge"))

	lexer.TokenTypes = []simplexer.TokenType{
		simplexer.NewPatternTokenType(HOGE, []string{"hoge", "HOGE"}),
		simplexer.NewRegexpTokenType(OTHERS, `[^ ]+`),
	}

	for {
		token, _ := lexer.Scan()
		if token == nil {
			break
		}

		if token.Type.GetID() == HOGE {
			fmt.Printf("!!! %s !!!\n", token.Literal)
		}

		if token.Type.GetID() == OTHERS {
			fmt.Println(token.Literal)
		}
	}

}
Output:

this
is
!!! hoge !!!
and
!!! HOGE !!!
or
Hoge

func (*PatternTokenType) FindToken

func (ptt *PatternTokenType) FindToken(s string, p Position) *Token

FindToken returns a new Token if s starts with one of the patterns.

func (*PatternTokenType) GetID

func (ptt *PatternTokenType) GetID() TokenID

GetID returns the ID of this token type.

func (*PatternTokenType) String

func (ptt *PatternTokenType) String() string

Get a readable string of the TokenID.

type Position

type Position struct {
	Line   int
	Column int
}

Position in the file.

func (Position) After

func (p Position) After(x Position) bool

After checks whether p is after x.

func (Position) Before

func (p Position) Before(x Position) bool

Before checks whether p is before x.

func (Position) String

func (p Position) String() string

Convert to string.
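A small sketch of the comparison methods; the exact format produced by String is not documented here, so it is printed rather than asserted:

package main

import (
	"fmt"

	"github.com/macrat/simplexer"
)

func main() {
	p := simplexer.Position{Line: 0, Column: 5}
	x := simplexer.Position{Line: 1, Column: 0}

	// x is on a later line, so it comes after p in the file.
	fmt.Println(x.After(p))  // true
	fmt.Println(p.Before(x)) // true

	// String gives a readable form of the position.
	fmt.Println(p.String())
}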

type RegexpTokenType

type RegexpTokenType struct {
	ID TokenID
	Re *regexp.Regexp
}

RegexpTokenType is a TokenType implemented with a regular expression.

ID is the TokenID for this token type.

Re is the regular expression of the token. It has to start with "^".

func NewRegexpTokenType

func NewRegexpTokenType(id TokenID, re string) *RegexpTokenType

Make a new RegexpTokenType.

id is the TokenID of the new RegexpTokenType.

re is the regular expression of the token.

Example
package main

import (
	"fmt"
	"strings"

	"github.com/macrat/simplexer"
)

func main() {
	const (
		NUMBER simplexer.TokenID = iota
		OTHERS
	)

	lexer := simplexer.NewLexer(strings.NewReader("123this is test456"))

	lexer.TokenTypes = []simplexer.TokenType{
		simplexer.NewRegexpTokenType(NUMBER, `[0-9]+`),
		simplexer.NewRegexpTokenType(OTHERS, `[^0-9]+`),
	}

	for {
		token, _ := lexer.Scan()
		if token == nil {
			break
		}

		if token.Type.GetID() == NUMBER {
			fmt.Printf("%s is number\n", token.Literal)
		}

		if token.Type.GetID() == OTHERS {
			fmt.Printf("%s is not number\n", token.Literal)
		}
	}

}
Output:

123 is number
this is test is not number
456 is number

func (*RegexpTokenType) FindToken

func (rtt *RegexpTokenType) FindToken(s string, p Position) *Token

FindToken returns a new Token if s starts with a match of this regular expression.

func (*RegexpTokenType) GetID

func (rtt *RegexpTokenType) GetID() TokenID

GetID returns the ID of this token type.

func (*RegexpTokenType) String

func (rtt *RegexpTokenType) String() string

Get a readable string of the TokenID.

type Token

type Token struct {
	Type       TokenType
	Literal    string   // The matched string.
	Submatches []string // Submatches of the regular expression.
	Position   Position // Position of token.
}

The data of a found token.
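A minimal sketch of the Submatches field, using the default STRING pattern `\"([^"]*)\"` from DefaultTokenTypes, which has one capture group. Whether the full match appears alongside the captured group is not documented, so the slice is printed rather than indexed:

package main

import (
	"fmt"
	"strings"

	"github.com/macrat/simplexer"
)

func main() {
	lexer := simplexer.NewLexer(strings.NewReader(`"hello world"`))

	token, err := lexer.Scan()
	if err != nil {
		panic(err.Error())
	}

	// The default STRING pattern has one capture group, so the
	// unquoted content should appear among the Submatches.
	fmt.Printf("literal:    %q\n", token.Literal)
	fmt.Printf("submatches: %q\n", token.Submatches)
}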

type TokenID

type TokenID int

TokenID is an identifier for a TokenType.

const (
	OTHER TokenID = -(iota + 1)
	IDENT
	NUMBER
	STRING
)

Default token IDs.

func (TokenID) String

func (id TokenID) String() string

Convert to readable string.

Be careful: user-added token IDs will be converted to UNKNOWN.

type TokenType

type TokenType interface {
	GetID() TokenID
	FindToken(string, Position) *Token
}

TokenType is a rule for making a Token.

GetID returns the TokenID of this TokenType. A TokenID may be shared by multiple TokenTypes.

FindToken returns a new Token if the head of the first argument matches the pattern of this TokenType. The second argument is the position of the token in the buffer. In most implementations, the Position is passed into the resulting Token directly.
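As a concrete illustration, a hand-written implementation might look like the sketch below. CommentTokenType and COMMENT are hypothetical names invented for this sketch, not part of the package:

package main

import (
	"fmt"
	"strings"

	"github.com/macrat/simplexer"
)

// COMMENT is a hypothetical token ID used only for this sketch.
const COMMENT simplexer.TokenID = iota

// CommentTokenType matches a "#" line comment at the head of the input.
type CommentTokenType struct{}

func (c CommentTokenType) GetID() simplexer.TokenID {
	return COMMENT
}

// FindToken returns a Token only if s starts with "#", consuming
// everything up to (but not including) the next newline.
func (c CommentTokenType) FindToken(s string, p simplexer.Position) *simplexer.Token {
	if !strings.HasPrefix(s, "#") {
		return nil
	}
	literal := s
	if i := strings.IndexByte(s, '\n'); i >= 0 {
		literal = s[:i]
	}
	return &simplexer.Token{
		Type:     c,
		Literal:  literal,
		Position: p, // pass the given position through, as the interface suggests
	}
}

func main() {
	lexer := simplexer.NewLexer(strings.NewReader("# a comment\nx = 1"))
	lexer.TokenTypes = append([]simplexer.TokenType{CommentTokenType{}}, lexer.TokenTypes...)

	for {
		token, err := lexer.Scan()
		if err != nil {
			panic(err.Error())
		}
		if token == nil {
			break
		}
		fmt.Printf("%v: %q\n", token.Type, token.Literal)
	}
}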

type UnknownTokenError

type UnknownTokenError struct {
	Literal  string
	Position Position
}

The error that is returned when an unknown token is found.

func (UnknownTokenError) Error

func (se UnknownTokenError) Error() string

Get the error message as a string.
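A hedged sketch of recovering the offending literal and position when Scan fails. It assumes Scan returns UnknownTokenError by value (the value receiver on Error suggests so, but that is an assumption), and the restricted TokenTypes setup exists only to force the error:

package main

import (
	"fmt"
	"strings"

	"github.com/macrat/simplexer"
)

func main() {
	lexer := simplexer.NewLexer(strings.NewReader("abc ! def"))

	// Only IDENT is recognized, so the "!" can't match and Scan
	// should fail with an UnknownTokenError.
	lexer.TokenTypes = []simplexer.TokenType{
		simplexer.NewRegexpTokenType(simplexer.IDENT, `[a-z]+`),
	}

	for {
		token, err := lexer.Scan()
		if err != nil {
			// Assumes the error is returned by value; adjust the
			// assertion if the package wraps it or uses a pointer.
			if ute, ok := err.(simplexer.UnknownTokenError); ok {
				fmt.Printf("unknown token %q at %v\n", ute.Literal, ute.Position)
				return
			}
			panic(err.Error())
		}
		if token == nil {
			return
		}
		fmt.Println(token.Literal)
	}
}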
