Documentation ¶
Index ¶
- Constants
- Variables
- func DetectGGMLType(b []byte) string
- func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64)
- func Init() error
- func NewGGUFV3(bo binary.ByteOrder) *gguf
- func ParseFileType(s string) (fileType, error)
- func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, ...) (bool, uint64)
- func Quantize(infile, outfile string, ftype fileType) error
- func SystemInfo() string
- type CompletionRequest
- type CompletionResponse
- type DetokenizeRequest
- type DetokenizeResponse
- type EmbeddingRequest
- type EmbeddingResponse
- type GGML
- type ImageData
- type KV
- func (kv KV) Architecture() string
- func (kv KV) BlockCount() uint64
- func (kv KV) ContextLength() uint64
- func (kv KV) EmbeddingLength() uint64
- func (kv KV) FileType() fileType
- func (kv KV) GQA() uint64
- func (kv KV) HeadCount() uint64
- func (kv KV) HeadCountKV() uint64
- func (kv KV) ParameterCount() uint64
- type Layer
- type LlamaServer
- type ServerStatus
- type ServerStatusResp
- type StatusWriter
- type Tensor
- type Tensors
- type TokenizeRequest
- type TokenizeResponse
Constants ¶
const (
	// Magic constant for `ggml` files (unversioned).
	FILE_MAGIC_GGML = 0x67676d6c

	// Magic constant for `ggml` files (versioned, ggmf).
	FILE_MAGIC_GGMF = 0x67676d66

	// Magic constant for `ggml` files (versioned, ggjt).
	FILE_MAGIC_GGJT = 0x67676a74

	// Magic constant for `ggla` files (LoRA adapter).
	FILE_MAGIC_GGLA = 0x67676C61

	// Magic constant for `gguf` files (versioned, gguf)
	FILE_MAGIC_GGUF_LE = 0x46554747
	FILE_MAGIC_GGUF_BE = 0x47475546
)
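These magics occupy the first four bytes of a model file, which is presumably how DetectGGMLType tells container formats apart. A minimal sniffing sketch; the file name is a placeholder and the import path assumes the ollama module layout:

package main

import (
	"fmt"
	"io"
	"os"

	"github.com/ollama/ollama/llm"
)

func main() {
	// The magic number occupies the first four bytes of the file.
	f, err := os.Open("model.gguf") // hypothetical path
	if err != nil {
		panic(err)
	}
	defer f.Close()

	b := make([]byte, 4)
	if _, err := io.ReadFull(f, b); err != nil {
		panic(err)
	}

	// DetectGGMLType presumably matches b against the FILE_MAGIC_*
	// constants above and returns a short name for the format.
	fmt.Println(llm.DetectGGMLType(b))
}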
const (
	GGUFTokenNormal uint32 = iota
	GGUFTokenUnknown
	GGUFTokenControl
	GGUFTokenUserDefined
	GGUFTokenUnused
	GGUFTokenByte
)
Variables ¶
var ErrUnsupportedFormat = errors.New("unsupported model format")
Functions ¶
func DetectGGMLType ¶ added in v0.1.35
func DetectGGMLType(b []byte) string
func EstimateGPULayers ¶ added in v0.1.33
func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64)
Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size. The GPUs provided must all be the same Library.
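A hedged sketch of calling the estimator. gpu.GetGPUInfo and api.DefaultOptions are assumptions about the sibling packages, and the meaning of the three return values is inferred from the doc comment above:

package main

import (
	"fmt"
	"os"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/gpu"
	"github.com/ollama/ollama/llm"
)

func main() {
	f, err := os.Open("model.gguf") // hypothetical path
	if err != nil {
		panic(err)
	}
	defer f.Close()

	ggml, _, err := llm.DecodeGGML(f)
	if err != nil {
		panic(err)
	}

	// All GPUs passed in must share the same Library (e.g. all CUDA).
	gpus := gpu.GetGPUInfo() // assumed accessor from the gpu package

	layers, vram, total := llm.EstimateGPULayers(gpus, ggml, nil, api.DefaultOptions())
	fmt.Printf("offloadable layers: %d, VRAM needed: %d bytes, total size: %d bytes\n",
		layers, vram, total)
}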
func Init ¶
func Init() error
func NewGGUFV3 ¶
func NewGGUFV3(bo binary.ByteOrder) *gguf
func ParseFileType ¶ added in v0.1.35
func ParseFileType(s string) (fileType, error)
func PredictServerFit ¶ added in v0.1.33
func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64)
This algorithm looks for a complete fit to determine if we need to unload other models.
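Continuing the sketch above, a caller might test for a complete fit before deciding whether to evict an already-loaded model. The uint64 return is presumably the estimated VRAM requirement in bytes:

// fits reports whether every layer fits across the provided GPUs.
fits, estimatedVRAM := llm.PredictServerFit(gpus, ggml, nil, nil, api.DefaultOptions())
if !fits {
	fmt.Printf("no complete fit: ~%d bytes of VRAM required; consider unloading other models\n",
		estimatedVRAM)
}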
func Quantize ¶
func Quantize(infile, outfile string, ftype fileType) error
func SystemInfo ¶ added in v0.1.32
func SystemInfo() string
SystemInfo is an unused example of calling llama.cpp functions using CGo.
Types ¶
type CompletionRequest ¶ added in v0.1.32
type CompletionResponse ¶ added in v0.1.32
type DetokenizeRequest ¶
type DetokenizeRequest struct {
Tokens []int `json:"tokens"`
}
type DetokenizeResponse ¶
type DetokenizeResponse struct {
Content string `json:"content"`
}
type EmbeddingRequest ¶
type EmbeddingRequest struct {
Content string `json:"content"`
}
type EmbeddingResponse ¶
type EmbeddingResponse struct {
Embedding []float64 `json:"embedding"`
}
type GGML ¶
type GGML struct {
// contains filtered or unexported fields
}
func DecodeGGML ¶
func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error)
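A minimal decoding sketch. The interpretation of the int64 return as the number of bytes consumed is an assumption, since the listing does not document it:

package main

import (
	"errors"
	"fmt"
	"os"

	"github.com/ollama/ollama/llm"
)

func main() {
	f, err := os.Open("model.gguf") // hypothetical path
	if err != nil {
		panic(err)
	}
	defer f.Close()

	ggml, n, err := llm.DecodeGGML(f)
	if errors.Is(err, llm.ErrUnsupportedFormat) {
		panic("not a recognized GGML/GGUF container")
	} else if err != nil {
		panic(err)
	}
	fmt.Printf("decoded container metadata (%d bytes read)\n", n)
	_ = ggml
}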
type KV ¶
func (KV) Architecture ¶ added in v0.1.32
func (KV) BlockCount ¶ added in v0.1.32
func (KV) ContextLength ¶ added in v0.1.32
func (KV) EmbeddingLength ¶ added in v0.1.32
func (KV) FileType ¶
func (KV) GQA ¶
func (KV) HeadCount ¶
func (KV) HeadCountKV ¶ added in v0.1.32
func (KV) ParameterCount ¶ added in v0.1.32
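A hedged sketch of reading model metadata through these accessors, continuing from the DecodeGGML example above. The listing never shows how a KV is obtained from a decoded model; the ggml.KV() accessor below is a guess, not a documented API:

kv := ggml.KV() // hypothetical accessor; not shown in this listing
fmt.Println("architecture:  ", kv.Architecture())
fmt.Println("context length:", kv.ContextLength())
fmt.Println("embedding len: ", kv.EmbeddingLength())
fmt.Println("block count:   ", kv.BlockCount())
fmt.Println("parameters:    ", kv.ParameterCount())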
type LlamaServer ¶ added in v0.1.32
type LlamaServer interface {
	Ping(ctx context.Context) error
	WaitUntilRunning(ctx context.Context) error
	Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
	Embedding(ctx context.Context, prompt string) ([]float64, error)
	Tokenize(ctx context.Context, content string) ([]int, error)
	Detokenize(ctx context.Context, tokens []int) (string, error)
	Close() error
	EstimatedVRAM() uint64
	EstimatedTotal() uint64
}
func NewLlamaServer ¶ added in v0.1.32
func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error)
NewLlamaServer will run a server for the given GPUs. The GPU list must be a single family.
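A sketch of the full lifecycle: decode the model, start the server, wait until it is running, then stream a completion. The CompletionRequest and CompletionResponse field names (Prompt, Content) are assumptions, since the listing does not show those struct bodies:

package main

import (
	"context"
	"fmt"
	"os"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/gpu"
	"github.com/ollama/ollama/llm"
)

func main() {
	model := "model.gguf" // hypothetical path
	f, err := os.Open(model)
	if err != nil {
		panic(err)
	}
	ggml, _, err := llm.DecodeGGML(f)
	f.Close()
	if err != nil {
		panic(err)
	}

	// The GPU list must be a single family.
	server, err := llm.NewLlamaServer(gpu.GetGPUInfo(), model, ggml, nil, nil, api.DefaultOptions())
	if err != nil {
		panic(err)
	}
	defer server.Close()

	ctx := context.Background()
	if err := server.WaitUntilRunning(ctx); err != nil {
		panic(err)
	}

	// fn is invoked once per streamed chunk; Prompt/Content are assumed field names.
	err = server.Completion(ctx, llm.CompletionRequest{Prompt: "Why is the sky blue?"},
		func(r llm.CompletionResponse) { fmt.Print(r.Content) })
	if err != nil {
		panic(err)
	}
}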
type ServerStatus ¶ added in v0.1.32
type ServerStatus int
const (
	ServerStatusReady ServerStatus = iota
	ServerStatusNoSlotsAvailable
	ServerStatusLoadingModel
	ServerStatusNotResponding
	ServerStatusError
)
func (ServerStatus) ToString ¶ added in v0.1.33
func (s ServerStatus) ToString() string
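A trivial usage sketch; the exact strings ToString returns are not shown in this listing:

st := llm.ServerStatusLoadingModel
fmt.Println(st.ToString()) // human-readable status; exact wording not documented above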
type ServerStatusResp ¶ added in v0.1.32
type StatusWriter ¶ added in v0.1.32
type StatusWriter struct {
	LastErrMsg string
	// contains filtered or unexported fields
}
StatusWriter is a writer that captures error messages from the llama runner process.
func NewStatusWriter ¶ added in v0.1.32
func NewStatusWriter(out *os.File) *StatusWriter
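A sketch of wiring a StatusWriter to a subprocess's stderr. The runner binary name is hypothetical, and the assumption that StatusWriter implements io.Writer follows from its description as a writer:

package main

import (
	"fmt"
	"os"
	"os/exec"

	"github.com/ollama/ollama/llm"
)

func main() {
	// Forward the child's stderr to ours while capturing the last error line.
	sw := llm.NewStatusWriter(os.Stderr)

	cmd := exec.Command("llama-runner") // hypothetical binary
	cmd.Stderr = sw                     // StatusWriter used as an io.Writer

	if err := cmd.Run(); err != nil {
		// LastErrMsg holds the most recent error message seen on stderr.
		fmt.Printf("runner exited: %v; last error: %q\n", err, sw.LastErrMsg)
	}
}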
type TokenizeRequest ¶
type TokenizeRequest struct {
Content string `json:"content"`
}
type TokenizeResponse ¶
type TokenizeResponse struct {
Tokens []int `json:"tokens"`
}
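These request/response structs mirror the JSON bodies exchanged with the runner; through the LlamaServer interface the encoding is handled for you. A round-trip sketch, reusing server and ctx from the NewLlamaServer example above:

tokens, err := server.Tokenize(ctx, "Hello, world!")
if err != nil {
	panic(err)
}
text, err := server.Detokenize(ctx, tokens)
if err != nil {
	panic(err)
}
fmt.Printf("%d tokens round-tripped to %q\n", len(tokens), text)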