leaves

package module
v1.0.4 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 5, 2023 License: MIT Imports: 15 Imported by: 0

Documentation

Overview

Package leaves is a pure Go implementation of the prediction part for GBRT (Gradient Boosting Regression Trees) models from popular frameworks.

General: all loaded models exhibit the same interface through the `Ensemble` struct. One can use the method `Name` to get a string representation of the model origin. Possible name values are "lightgbm.gbdt", "lightgbm.rf", "xgboost.gbtree", "xgboost.gblinear", etc.

LightGBM model

Example: binary classification

build_breast_cancer_model.py:

import lightgbm as lgb
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Load the breast-cancer dataset and hold out 20% of it as the test split.
features, labels = datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=0
)

# Train a binary-objective GBDT booster for a fixed number of boosting rounds.
num_rounds = 30
train_set = lgb.Dataset(X_train, label=y_train)
booster = lgb.train(
    {
        'boosting_type': 'gbdt',
        'objective': 'binary',
    },
    train_set,
    num_rounds,
)

# Reference predictions on the test split: default output and raw scores.
pred = booster.predict(X_test)
pred_raw = booster.predict(X_test, raw_score=True)

# Persist the model, both sets of reference predictions and the test features
# so that the Go example can load them.
booster.save_model('lg_breast_cancer.model')  # save the model in txt format
np.savetxt('lg_breast_cancer_true_predictions.txt', pred)
np.savetxt('lg_breast_cancer_true_predictions_raw.txt', pred_raw)
np.savetxt('breast_cancer_test.tsv', X_test, delimiter='\t')

predict_breast_cancer_model.go:

package main

import (
	"fmt"

	"github.com/DevClusterRu/leaves"
	"github.com/DevClusterRu/leaves/mat"
	"github.com/DevClusterRu/leaves/util"
)

// main loads the test data, the LightGBM model and the reference predictions
// produced by build_breast_cancer_model.py, then checks that `leaves`
// reproduces both the transformed and the raw predictions within a tolerance.
// Any failure aborts the program with a panic.
func main() {
	// loading test data
	test, err := mat.DenseMatFromCsvFile("breast_cancer_test.tsv", 0, false, "\t", 0.0)
	if err != nil {
		panic(err)
	}

	// loading model (second argument is loadTransformation)
	model, err := leaves.LGEnsembleFromFile("lg_breast_cancer.model", true)
	if err != nil {
		panic(err)
	}
	fmt.Printf("Name: %s\n", model.Name())
	fmt.Printf("NFeatures: %d\n", model.NFeatures())
	fmt.Printf("NOutputGroups: %d\n", model.NOutputGroups())
	fmt.Printf("NEstimators: %d\n", model.NEstimators())
	fmt.Printf("Transformation: %s\n", model.Transformation().Name())

	// loading true predictions as DenseMat
	truePredictions, err := mat.DenseMatFromCsvFile("lg_breast_cancer_true_predictions.txt", 0, false, "\t", 0.0)
	if err != nil {
		panic(err)
	}
	truePredictionsRaw, err := mat.DenseMatFromCsvFile("lg_breast_cancer_true_predictions_raw.txt", 0, false, "\t", 0.0)
	if err != nil {
		panic(err)
	}

	// preallocate slice to store model predictions
	predictions := make([]float64, test.Rows*model.NOutputGroups())
	// do predictions (nEstimators = 0, nThreads = 1); PredictDense reports
	// failures through its error result, so it must not be ignored
	if err := model.PredictDense(test.Values, test.Rows, test.Cols, predictions, 0, 1); err != nil {
		panic(err)
	}
	// compare results
	const tolerance = 1e-6
	if err := util.AlmostEqualFloat64Slices(truePredictions.Values, predictions, tolerance); err != nil {
		panic(fmt.Errorf("different predictions: %w", err))
	}

	// compare raw predictions (before transformation function); the same
	// `predictions` slice is reused as the output buffer
	rawModel := model.EnsembleWithRawPredictions()
	if err := rawModel.PredictDense(test.Values, test.Rows, test.Cols, predictions, 0, 1); err != nil {
		panic(err)
	}
	if err := util.AlmostEqualFloat64Slices(truePredictionsRaw.Values, predictions, tolerance); err != nil {
		panic(fmt.Errorf("different raw predictions: %w", err))
	}
	fmt.Println("Predictions the same!")
}

Output:

Name: lightgbm.gbdt
NFeatures: 30
NOutputGroups: 1
NEstimators: 30
Transformation: logistic
Predictions the same!

XGBoost Model

example: Multiclass Classification

build_iris_model.py

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Load the iris dataset and hold out 20% of it as the test split.
features, labels = datasets.load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=0
)

# Wrap both splits in DMatrix containers.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Train a 3-class booster for a fixed number of boosting rounds.
num_rounds = 5
booster = xgb.train(
    {
        'objective': 'multi:softmax',
        'num_class': 3,
    },
    dtrain,
    num_rounds,
)

# Request output_margin=True because the `leaves` predictions these values are
# compared against are raw scores (before the transformation function).
margins = booster.predict(dtest, output_margin=True)

# save the model in binary format
booster.save_model('xg_iris.model')
# Store the reference predictions and the test split for the Go example.
np.savetxt('xg_iris_true_predictions.txt', margins, delimiter='\t')
datasets.dump_svmlight_file(X_test, y_test, 'iris_test.libsvm')

predict_iris_model.go:

package main

import (
	"fmt"

	"github.com/DevClusterRu/leaves"
	"github.com/DevClusterRu/leaves/mat"
	"github.com/DevClusterRu/leaves/util"
)

// main loads the test data, the XGBoost model and the reference predictions
// produced by build_iris_model.py, then checks that the predictions made by
// `leaves` match the reference ones, allowing a small number of mismatches.
// Any failure aborts the program with a panic.
func main() {
	// loading test data
	csr, err := mat.CSRMatFromLibsvmFile("iris_test.libsvm", 0, true)
	if err != nil {
		panic(err)
	}

	// loading model (second argument is loadTransformation)
	model, err := leaves.XGEnsembleFromFile("xg_iris.model", false)
	if err != nil {
		panic(err)
	}
	fmt.Printf("Name: %s\n", model.Name())
	fmt.Printf("NFeatures: %d\n", model.NFeatures())
	fmt.Printf("NOutputGroups: %d\n", model.NOutputGroups())
	fmt.Printf("NEstimators: %d\n", model.NEstimators())

	// loading true predictions as DenseMat
	truePredictions, err := mat.DenseMatFromCsvFile("xg_iris_true_predictions.txt", 0, false, "\t", 0.0)
	if err != nil {
		panic(err)
	}

	// preallocate slice to store model predictions
	predictions := make([]float64, csr.Rows()*model.NOutputGroups())
	// do predictions (nEstimators = 0, nThreads = 1); PredictCSR reports
	// failures through its error result, so it must not be ignored
	if err := model.PredictCSR(csr.RowHeaders, csr.ColIndexes, csr.Values, predictions, 0, 1); err != nil {
		panic(err)
	}

	const (
		tolerance   = 1e-6
		maxMismatch = 2 // number of differing values that is still accepted
	)
	// compare results. Count number of mismatched values because of floating point
	// tolerances in decision rule
	mismatch, err := util.NumMismatchedFloat64Slices(truePredictions.Values, predictions, tolerance)
	if err != nil {
		panic(err)
	}
	if mismatch > maxMismatch {
		// report the actual count and the threshold separately
		panic(fmt.Errorf("mismatched %d predictions (more than %d allowed)", mismatch, maxMismatch))
	}
	fmt.Printf("Predictions the same! (mismatch = %d)\n", mismatch)
}

Output:

Name: xgboost.gbtree
NFeatures: 4
NOutputGroups: 3
NEstimators: 5
Predictions the same! (mismatch = 0)

Notes on XGBoost DART support

Please note that one must not provide nEstimators = 0 when predicting with DART models from xgboost. For more details see xgboost's documentation.

Notes on LightGBM DART support

Models trained with 'boosting_type': 'dart' options can be loaded with func `leaves.LGEnsembleFromFile`. But the name of the model (given by `Name()` method) will be 'lightgbm.gbdt', because LightGBM model format doesn't distinguish 'gbdt' and 'dart' models.

Index

Constants

View Source
const BatchSize = 16

BatchSize for parallel task

Variables

This section is empty.

Functions

func GetNLeaves

func GetNLeaves(trees []lgTree) []int

Types

type Ensemble

type Ensemble struct {
	// contains filtered or unexported fields
}

Ensemble is a common wrapper for all models

func LGEnsembleFromFile

func LGEnsembleFromFile(filename string, loadTransformation bool) (*Ensemble, error)

LGEnsembleFromFile reads LightGBM model from binary file

func LGEnsembleFromJSON

func LGEnsembleFromJSON(reader io.Reader, loadTransformation bool) (*Ensemble, error)

LGEnsembleFromJSON reads LightGBM model from stream with JSON data

func LGEnsembleFromReader

func LGEnsembleFromReader(reader *bufio.Reader, loadTransformation bool) (*Ensemble, error)

LGEnsembleFromReader reads LightGBM model from `reader`

func SKEnsembleFromFile

func SKEnsembleFromFile(filename string, loadTransformation bool) (*Ensemble, error)

SKEnsembleFromFile reads sklearn tree ensemble model from pickle file

func SKEnsembleFromReader

func SKEnsembleFromReader(reader *bufio.Reader, loadTransformation bool) (*Ensemble, error)

SKEnsembleFromReader reads sklearn tree ensemble model from `reader`

func XGBLinearFromFile

func XGBLinearFromFile(filename string, loadTransformation bool) (*Ensemble, error)

XGBLinearFromFile reads XGBoost's 'gblinear' model from binary file

func XGBLinearFromReader

func XGBLinearFromReader(reader *bufio.Reader, loadTransformation bool) (*Ensemble, error)

XGBLinearFromReader reads XGBoost's 'gblinear' model from `reader`

func XGEnsembleFromFile

func XGEnsembleFromFile(filename string, loadTransformation bool) (*Ensemble, error)

XGEnsembleFromFile reads XGBoost model from binary file or json file. Works with 'gbtree' and 'dart' models

func XGEnsembleFromReader

func XGEnsembleFromReader(reader *bufio.Reader, loadTransformation bool) (*Ensemble, error)

XGEnsembleFromReader reads XGBoost model from `reader`. Works with 'gbtree' and 'dart' models

func (*Ensemble) EnsembleWithLeafPredictions

func (e *Ensemble) EnsembleWithLeafPredictions() *Ensemble

EnsembleWithLeafPredictions returns an ensemble instance with TransformLeafIndex (predictions are tree leaf indices instead of numerical values)

func (*Ensemble) EnsembleWithRawPredictions

func (e *Ensemble) EnsembleWithRawPredictions() *Ensemble

EnsembleWithRawPredictions returns an ensemble instance with TransformRaw (no transformation function will be applied to the model results)

func (*Ensemble) NEstimators

func (e *Ensemble) NEstimators() int

NEstimators returns number of estimators (trees) in ensemble (per group)

func (*Ensemble) NFeatures

func (e *Ensemble) NFeatures() int

NFeatures returns number of features in the model

func (*Ensemble) NLeaves

func (e *Ensemble) NLeaves() []int

NLeaves returns number of leaves in each tree of the ensemble. Returned vector has size NRawOutputGroups() * NEstimators(). For example to get number of leaves in group groupID for estimator estimatorID:

NLeaves()[groupID*NEstimators() + estimatorID].

In case of NRawOutputGroups() == 1 (binary classification or regression):

NLeaves()[estimatorID]

func (*Ensemble) NOutputGroups

func (e *Ensemble) NOutputGroups() int

NOutputGroups returns the number of groups (numbers) in every object's predictions. For example, a binary logistic model will give 1, but a 4-class prediction model will give 4 numbers per object. This value is usually used to preallocate the slice for prediction values

func (*Ensemble) NRawOutputGroups

func (e *Ensemble) NRawOutputGroups() int

NRawOutputGroups returns the number of groups (numbers) in every object's predictions before the transformation function is applied. This value is provided mainly for informational purposes

func (*Ensemble) Name

func (e *Ensemble) Name() string

Name returns name of the estimator

func (*Ensemble) Predict

func (e *Ensemble) Predict(fvals []float64, nEstimators int, predictions []float64) error

Predict calculates single prediction for one or multiclass ensembles. Only `nEstimators` first estimators (trees in most cases) will be used. NOTE: for single class predictions one can use simplified function PredictSingle

func (*Ensemble) PredictCSR

func (e *Ensemble) PredictCSR(indptr []int, cols []int, vals []float64, predictions []float64, nEstimators int, nThreads int) error

PredictCSR calculates predictions from ensemble. `indptr`, `cols`, `vals` represent data structures from Compressed Sparse Row Matrix format (see CSRMat). Only `nEstimators` first estimators (trees) will be used. `nThreads` points to number of threads that will be utilized (maximum is GOMAXPROCS). Note: the `predictions` slice should be properly allocated on the caller side

func (*Ensemble) PredictDense

func (e *Ensemble) PredictDense(
	vals []float64,
	nrows int,
	ncols int,
	predictions []float64,
	nEstimators int,
	nThreads int,
) error

PredictDense calculates predictions from ensemble. `vals`, `nrows`, `ncols` represent data structures from Row Major Matrix format (see DenseMat). Only `nEstimators` first estimators (trees in most cases) will be used. `nThreads` points to number of threads that will be utilized (maximum is GOMAXPROCS). Note: the `predictions` slice should be properly allocated on the caller side

func (*Ensemble) PredictSingle

func (e *Ensemble) PredictSingle(fvals []float64, nEstimators int) float64

PredictSingle calculates prediction for single class model. If ensemble is multiclass, it will quietly return 0.0. Only `nEstimators` first estimators (trees in most cases) will be used. If `len(fvals)` is not enough, the function will quietly return 0.0. NOTE: for multiclass or leaf indices predictions use Predict

func (*Ensemble) Transformation

func (e *Ensemble) Transformation() transformation.Transform

Transformation returns the transformation object which is applied to model outputs.

Directories

Path Synopsis
internal
Package mat provides matrix structures and loaders for `leaves` tests.
Package mat provides matrix structures and loaders for `leaves` tests.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL