leaves: github.com/dmitryikh/leaves Index | Files | Directories

package leaves

import "github.com/dmitryikh/leaves"

Package leaves is a pure Go implementation of the prediction part for GBRT (Gradient Boosting Regression Trees) models from popular frameworks.

General

All loaded models exhibit the same interface from `Ensemble struct`. One can use the method `Name` to get a string representation of the model origin. Possible name values are "lightgbm.gbdt", "lightgbm.rf", "xgboost.gbtree", "xgboost.gblinear", etc.

LightGBM model

Example: binary classification

build_breast_cancer_model.py:

import lightgbm as lgb
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Load the breast cancer dataset and hold out 20% of the rows for testing.
features, labels = datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=0
)

# Train a LightGBM binary classifier on the training split.
num_rounds = 30
train_set = lgb.Dataset(X_train, label=y_train)
booster = lgb.train(
    {'boosting_type': 'gbdt', 'objective': 'binary'},
    train_set,
    num_rounds,
)

# Reference predictions on the held-out rows: transformed output and raw scores.
expected = booster.predict(X_test)
expected_raw = booster.predict(X_test, raw_score=True)

# Persist the model (txt format), both sets of reference predictions, and the
# tab-separated test features.
booster.save_model('lg_breast_cancer.model')
np.savetxt('lg_breast_cancer_true_predictions.txt', expected)
np.savetxt('lg_breast_cancer_true_predictions_raw.txt', expected_raw)
np.savetxt('breast_cancer_test.tsv', X_test, delimiter='\t')

predict_breast_cancer_model.go:

package main

import (
	"fmt"

	"github.com/dmitryikh/leaves"
	"github.com/dmitryikh/leaves/mat"
	"github.com/dmitryikh/leaves/util"
)

// main loads the test matrix, the LightGBM model and the reference
// predictions from files, runs the model on the test data and panics if the
// results (both transformed and raw) differ from the reference values by more
// than the tolerance.
func main() {
	// loading test data
	test, err := mat.DenseMatFromCsvFile("breast_cancer_test.tsv", 0, false, "\t", 0.0)
	if err != nil {
		panic(err)
	}

	// loading model (second argument is loadTransformation)
	model, err := leaves.LGEnsembleFromFile("lg_breast_cancer.model", true)
	if err != nil {
		panic(err)
	}
	fmt.Printf("Name: %s\n", model.Name())
	fmt.Printf("NFeatures: %d\n", model.NFeatures())
	fmt.Printf("NOutputGroups: %d\n", model.NOutputGroups())
	fmt.Printf("NEstimators: %d\n", model.NEstimators())
	fmt.Printf("Transformation: %s\n", model.Transformation().Name())

	// loading true predictions as DenseMat
	truePredictions, err := mat.DenseMatFromCsvFile("lg_breast_cancer_true_predictions.txt", 0, false, "\t", 0.0)
	if err != nil {
		panic(err)
	}
	truePredictionsRaw, err := mat.DenseMatFromCsvFile("lg_breast_cancer_true_predictions_raw.txt", 0, false, "\t", 0.0)
	if err != nil {
		panic(err)
	}

	// preallocate slice to store model predictions
	predictions := make([]float64, test.Rows*model.NOutputGroups())
	// do predictions (nEstimators = 0, nThreads = 1). PredictDense returns an
	// error, so check it instead of comparing against an unfilled slice.
	if err := model.PredictDense(test.Values, test.Rows, test.Cols, predictions, 0, 1); err != nil {
		panic(err)
	}
	// compare results
	const tolerance = 1e-6
	if err := util.AlmostEqualFloat64Slices(truePredictions.Values, predictions, tolerance); err != nil {
		panic(fmt.Errorf("different predictions: %s", err.Error()))
	}

	// compare raw predictions (before transformation function)
	rawModel := model.EnsembleWithRawPredictions()
	if err := rawModel.PredictDense(test.Values, test.Rows, test.Cols, predictions, 0, 1); err != nil {
		panic(err)
	}
	if err := util.AlmostEqualFloat64Slices(truePredictionsRaw.Values, predictions, tolerance); err != nil {
		panic(fmt.Errorf("different raw predictions: %s", err.Error()))
	}
	fmt.Println("Predictions the same!")
}

Output:

Name: lightgbm.gbdt
NFeatures: 30
NOutputGroups: 1
NEstimators: 30
Transformation: logistic
Predictions the same!

XGBoost Model

Example: multiclass classification

build_iris_model.py:

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Load the iris dataset and hold out 20% of the rows for testing.
features, labels = datasets.load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=0
)

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Train a 3-class softmax model.
num_rounds = 5
booster = xgb.train(
    {'objective': 'multi:softmax', 'num_class': 3},
    dtrain,
    num_rounds,
)

# `leaves` predictions are raw scores (before the transformation function),
# so request the output margin to get comparable reference values.
expected = booster.predict(dtest, output_margin=True)

# Save the model in binary format, the reference predictions (tab-separated)
# and the test split in libsvm format.
booster.save_model('xg_iris.model')
np.savetxt('xg_iris_true_predictions.txt', expected, delimiter='\t')
datasets.dump_svmlight_file(X_test, y_test, 'iris_test.libsvm')

predict_iris_model.go:

package main

import (
	"fmt"

	"github.com/dmitryikh/leaves"
	"github.com/dmitryikh/leaves/mat"
	"github.com/dmitryikh/leaves/util"
)

// main loads the test data (libsvm format), the XGBoost model and the
// reference predictions from files, runs the model on the test data and
// panics if more than a small number of predicted values differ from the
// reference values by more than the tolerance.
func main() {
	// loading test data
	csr, err := mat.CSRMatFromLibsvmFile("iris_test.libsvm", 0, true)
	if err != nil {
		panic(err)
	}

	// loading model (second argument is loadTransformation)
	model, err := leaves.XGEnsembleFromFile("xg_iris.model", false)
	if err != nil {
		panic(err)
	}
	fmt.Printf("Name: %s\n", model.Name())
	fmt.Printf("NFeatures: %d\n", model.NFeatures())
	fmt.Printf("NOutputGroups: %d\n", model.NOutputGroups())
	fmt.Printf("NEstimators: %d\n", model.NEstimators())

	// loading true predictions as DenseMat
	truePredictions, err := mat.DenseMatFromCsvFile("xg_iris_true_predictions.txt", 0, false, "\t", 0.0)
	if err != nil {
		panic(err)
	}

	// preallocate slice to store model predictions
	predictions := make([]float64, csr.Rows()*model.NOutputGroups())
	// do predictions (nEstimators = 0, nThreads = 1). PredictCSR returns an
	// error, so check it instead of comparing against an unfilled slice.
	if err := model.PredictCSR(csr.RowHeaders, csr.ColIndexes, csr.Values, predictions, 0, 1); err != nil {
		panic(err)
	}

	const (
		tolerance = 1e-6
		// maxMismatch is the number of mismatched values that is still accepted
		maxMismatch = 2
	)
	// compare results. Count number of mismatched values because of floating point
	// tolerances in decision rule
	mismatch, err := util.NumMismatchedFloat64Slices(truePredictions.Values, predictions, tolerance)
	if err != nil {
		panic(err)
	}
	if mismatch > maxMismatch {
		// report the actual count together with the allowed threshold
		panic(fmt.Errorf("mismatched %d predictions (more than %d allowed)", mismatch, maxMismatch))
	}
	fmt.Printf("Predictions the same! (mismatch = %d)\n", mismatch)
}

Output:

Name: xgboost.gbtree
NFeatures: 4
NOutputGroups: 3
NEstimators: 5
Predictions the same! (mismatch = 0)

Notes on XGBoost DART support

Please note that one must not provide nEstimators = 0 when predicting with DART models from xgboost. For more details see xgboost's documentation.

Notes on LightGBM DART support

Models trained with 'boosting_type': 'dart' options can be loaded with func `leaves.LGEnsembleFromFile`. But the name of the model (given by `Name()` method) will be 'lightgbm.gbdt', because LightGBM model format doesn't distinguish 'gbdt' and 'dart' models.

Index

Package Files

doc.go leaves.go lgensemble.go lgensemble_io.go lgtree.go skensemble_io.go xgblinear.go xgblinear_io.go xgensemble.go xgensemble_io.go

Constants

const BatchSize = 16

BatchSize for parallel task

type Ensemble Uses

type Ensemble struct {
    // contains filtered or unexported fields
}

Ensemble is a common wrapper for all models

func LGEnsembleFromFile Uses

func LGEnsembleFromFile(filename string, loadTransformation bool) (*Ensemble, error)

LGEnsembleFromFile reads LightGBM model from a file in LightGBM's text model format (as saved by `save_model` in the example above)

func LGEnsembleFromJSON Uses

func LGEnsembleFromJSON(reader io.Reader, loadTransformation bool) (*Ensemble, error)

LGEnsembleFromJSON reads LightGBM model from stream with JSON data

func LGEnsembleFromReader Uses

func LGEnsembleFromReader(reader *bufio.Reader, loadTransformation bool) (*Ensemble, error)

LGEnsembleFromReader reads LightGBM model from `reader`

func SKEnsembleFromFile Uses

func SKEnsembleFromFile(filename string, loadTransformation bool) (*Ensemble, error)

SKEnsembleFromFile reads sklearn tree ensemble model from pickle file

func SKEnsembleFromReader Uses

func SKEnsembleFromReader(reader *bufio.Reader, loadTransformation bool) (*Ensemble, error)

SKEnsembleFromReader reads sklearn tree ensemble model from `reader`

func XGBLinearFromFile Uses

func XGBLinearFromFile(filename string, loadTransformation bool) (*Ensemble, error)

XGBLinearFromFile reads XGBoost's 'gblinear' model from binary file

func XGBLinearFromReader Uses

func XGBLinearFromReader(reader *bufio.Reader, loadTransformation bool) (*Ensemble, error)

XGBLinearFromReader reads XGBoost's 'gblinear' model from `reader`

func XGEnsembleFromFile Uses

func XGEnsembleFromFile(filename string, loadTransformation bool) (*Ensemble, error)

XGEnsembleFromFile reads XGBoost model from binary file. Works with 'gbtree' and 'dart' models

func XGEnsembleFromReader Uses

func XGEnsembleFromReader(reader *bufio.Reader, loadTransformation bool) (*Ensemble, error)

XGEnsembleFromReader reads XGBoost model from `reader`. Works with 'gbtree' and 'dart' models

func (*Ensemble) EnsembleWithRawPredictions Uses

func (e *Ensemble) EnsembleWithRawPredictions() *Ensemble

EnsembleWithRawPredictions returns ensemble instance with TransformRaw (no transformation functions will be applied to the model results)

func (*Ensemble) NEstimators Uses

func (e *Ensemble) NEstimators() int

NEstimators returns number of estimators (trees) in ensemble (per group)

func (*Ensemble) NFeatures Uses

func (e *Ensemble) NFeatures() int

NFeatures returns number of features in the model

func (*Ensemble) NOutputGroups Uses

func (e *Ensemble) NOutputGroups() int

NOutputGroups returns number of groups (numbers) in every object predictions. For example binary logistic model will give 1, but 4-class prediction model will give 4 numbers per object. This value is usually used to preallocate slice for prediction values

func (*Ensemble) NRawOutputGroups Uses

func (e *Ensemble) NRawOutputGroups() int

NRawOutputGroups returns number of groups (numbers) in every object predictions before transformation function applied. This value is provided mainly for information purpose

func (*Ensemble) Name Uses

func (e *Ensemble) Name() string

Name returns name of the estimator

func (*Ensemble) Predict Uses

func (e *Ensemble) Predict(fvals []float64, nEstimators int, predictions []float64) error

Predict calculates single prediction for one or multiclass ensembles. Only `nEstimators` first estimators (trees in most cases) will be used. NOTE: for single class predictions one can use simplified function PredictSingle

func (*Ensemble) PredictCSR Uses

func (e *Ensemble) PredictCSR(indptr []int, cols []int, vals []float64, predictions []float64, nEstimators int, nThreads int) error

PredictCSR calculates predictions from ensemble. `indptr`, `cols`, `vals` represent data structures from Compressed Sparse Row Matrix format (see CSRMat). Only `nEstimators` first estimators (trees) will be used. `nThreads` points to number of threads that will be utilized (maximum is GOMAXPROCS). Note, `predictions` slice should be properly allocated on the caller side

func (*Ensemble) PredictDense Uses

func (e *Ensemble) PredictDense(
    vals []float64,
    nrows int,
    ncols int,
    predictions []float64,
    nEstimators int,
    nThreads int,
) error

PredictDense calculates predictions from ensemble. `vals`, `nrows`, `ncols` represent data structures from Row Major Matrix format (see DenseMat). Only `nEstimators` first estimators (trees in most cases) will be used. `nThreads` points to number of threads that will be utilized (maximum is GOMAXPROCS). Note, `predictions` slice should be properly allocated on the caller side

func (*Ensemble) PredictSingle Uses

func (e *Ensemble) PredictSingle(fvals []float64, nEstimators int) float64

PredictSingle calculates prediction for single class model. If the ensemble is multiclass, it will quietly return 0.0. Only `nEstimators` first estimators (trees in most cases) will be used. If `len(fvals)` is not enough, the function will quietly return 0.0. NOTE: for multiclass prediction use Predict

func (*Ensemble) Transformation Uses

func (e *Ensemble) Transformation() transformation.Transform

Transformation returns the transformation object which is applied to model outputs.

Directories

Path | Synopsis
internal/pickle
internal/xgbin
mat | Package mat provides matrix structures and loaders for `leaves` tests.
transformation
util

Package leaves imports 14 packages (graph). Updated 2019-03-31. Refresh now. Tools for package owners.