libcontainer

package
v1.1.12 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 23, 2024 License: Apache-2.0 Imports: 52 Imported by: 527

README

libcontainer

Go Reference

Libcontainer provides a native Go implementation for creating containers with namespaces, cgroups, capabilities, and filesystem access controls. It allows you to manage the lifecycle of the container performing additional operations after the container is created.

Container

A container is a self contained execution environment that shares the kernel of the host system and which is (optionally) isolated from other containers in the system.

Using libcontainer

Because containers are spawned in a two step process you will need a binary that will be executed as the init process for the container. In libcontainer, we use the current binary (/proc/self/exe) to be executed as the init process, and use arg "init", we call the first step process "bootstrap", so you always need a "init" function as the entry of "bootstrap".

In addition to the go init function the early stage bootstrap is handled by importing nsenter.

import (
	_ "github.com/opencontainers/runc/libcontainer/nsenter"
)

func init() {
	if len(os.Args) > 1 && os.Args[1] == "init" {
		runtime.GOMAXPROCS(1)
		runtime.LockOSThread()
		factory, _ := libcontainer.New("")
		if err := factory.StartInitialization(); err != nil {
			logrus.Fatal(err)
		}
		panic("--this line should have never been executed, congratulations--")
	}
}

Then to create a container you first have to initialize an instance of a factory that will handle the creation and initialization for a container.

factory, err := libcontainer.New("/var/lib/container", libcontainer.Cgroupfs, libcontainer.InitArgs(os.Args[0], "init"))
if err != nil {
	logrus.Fatal(err)
	return
}

Once you have an instance of the factory created we can create a configuration struct describing how the container is to be created. A sample would look similar to this:

defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
var devices []*configs.DeviceRule
for _, device := range specconv.AllowedDevices {
	devices = append(devices, &device.Rule)
}
config := &configs.Config{
	Rootfs: "/your/path/to/rootfs",
	Capabilities: &configs.Capabilities{
		Bounding: []string{
			"CAP_CHOWN",
			"CAP_DAC_OVERRIDE",
			"CAP_FSETID",
			"CAP_FOWNER",
			"CAP_MKNOD",
			"CAP_NET_RAW",
			"CAP_SETGID",
			"CAP_SETUID",
			"CAP_SETFCAP",
			"CAP_SETPCAP",
			"CAP_NET_BIND_SERVICE",
			"CAP_SYS_CHROOT",
			"CAP_KILL",
			"CAP_AUDIT_WRITE",
		},
		Effective: []string{
			"CAP_CHOWN",
			"CAP_DAC_OVERRIDE",
			"CAP_FSETID",
			"CAP_FOWNER",
			"CAP_MKNOD",
			"CAP_NET_RAW",
			"CAP_SETGID",
			"CAP_SETUID",
			"CAP_SETFCAP",
			"CAP_SETPCAP",
			"CAP_NET_BIND_SERVICE",
			"CAP_SYS_CHROOT",
			"CAP_KILL",
			"CAP_AUDIT_WRITE",
		},
		Permitted: []string{
			"CAP_CHOWN",
			"CAP_DAC_OVERRIDE",
			"CAP_FSETID",
			"CAP_FOWNER",
			"CAP_MKNOD",
			"CAP_NET_RAW",
			"CAP_SETGID",
			"CAP_SETUID",
			"CAP_SETFCAP",
			"CAP_SETPCAP",
			"CAP_NET_BIND_SERVICE",
			"CAP_SYS_CHROOT",
			"CAP_KILL",
			"CAP_AUDIT_WRITE",
		},
		Ambient: []string{
			"CAP_CHOWN",
			"CAP_DAC_OVERRIDE",
			"CAP_FSETID",
			"CAP_FOWNER",
			"CAP_MKNOD",
			"CAP_NET_RAW",
			"CAP_SETGID",
			"CAP_SETUID",
			"CAP_SETFCAP",
			"CAP_SETPCAP",
			"CAP_NET_BIND_SERVICE",
			"CAP_SYS_CHROOT",
			"CAP_KILL",
			"CAP_AUDIT_WRITE",
		},
	},
	Namespaces: configs.Namespaces([]configs.Namespace{
		{Type: configs.NEWNS},
		{Type: configs.NEWUTS},
		{Type: configs.NEWIPC},
		{Type: configs.NEWPID},
		{Type: configs.NEWUSER},
		{Type: configs.NEWNET},
		{Type: configs.NEWCGROUP},
	}),
	Cgroups: &configs.Cgroup{
		Name:   "test-container",
		Parent: "system",
		Resources: &configs.Resources{
			MemorySwappiness: nil,
			Devices:          devices,
		},
	},
	MaskPaths: []string{
		"/proc/kcore",
		"/sys/firmware",
	},
	ReadonlyPaths: []string{
		"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
	},
	Devices:  specconv.AllowedDevices,
	Hostname: "testing",
	Mounts: []*configs.Mount{
		{
			Source:      "proc",
			Destination: "/proc",
			Device:      "proc",
			Flags:       defaultMountFlags,
		},
		{
			Source:      "tmpfs",
			Destination: "/dev",
			Device:      "tmpfs",
			Flags:       unix.MS_NOSUID | unix.MS_STRICTATIME,
			Data:        "mode=755",
		},
		{
			Source:      "devpts",
			Destination: "/dev/pts",
			Device:      "devpts",
			Flags:       unix.MS_NOSUID | unix.MS_NOEXEC,
			Data:        "newinstance,ptmxmode=0666,mode=0620,gid=5",
		},
		{
			Device:      "tmpfs",
			Source:      "shm",
			Destination: "/dev/shm",
			Data:        "mode=1777,size=65536k",
			Flags:       defaultMountFlags,
		},
		{
			Source:      "mqueue",
			Destination: "/dev/mqueue",
			Device:      "mqueue",
			Flags:       defaultMountFlags,
		},
		{
			Source:      "sysfs",
			Destination: "/sys",
			Device:      "sysfs",
			Flags:       defaultMountFlags | unix.MS_RDONLY,
		},
	},
	UidMappings: []configs.IDMap{
		{
			ContainerID: 0,
			HostID: 1000,
			Size: 65536,
		},
	},
	GidMappings: []configs.IDMap{
		{
			ContainerID: 0,
			HostID: 1000,
			Size: 65536,
		},
	},
	Networks: []*configs.Network{
		{
			Type:    "loopback",
			Address: "127.0.0.1/0",
			Gateway: "localhost",
		},
	},
	Rlimits: []configs.Rlimit{
		{
			Type: unix.RLIMIT_NOFILE,
			Hard: uint64(1025),
			Soft: uint64(1025),
		},
	},
}

Once you have the configuration populated you can create a container:

container, err := factory.Create("container-id", config)
if err != nil {
	logrus.Fatal(err)
	return
}

To spawn bash as the initial process inside the container and have the processes pid returned in order to wait, signal, or kill the process:

process := &libcontainer.Process{
	Args:   []string{"/bin/bash"},
	Env:    []string{"PATH=/bin"},
	User:   "daemon",
	Stdin:  os.Stdin,
	Stdout: os.Stdout,
	Stderr: os.Stderr,
	Init:   true,
}

err := container.Run(process)
if err != nil {
	container.Destroy()
	logrus.Fatal(err)
	return
}

// wait for the process to finish.
_, err := process.Wait()
if err != nil {
	logrus.Fatal(err)
}

// destroy the container.
container.Destroy()

Additional ways to interact with a running container are:

// return all the pids for all processes running inside the container.
processes, err := container.Processes()

// get detailed cpu, memory, io, and network statistics for the container and
// it's processes.
stats, err := container.Stats()

// pause all processes inside the container.
container.Pause()

// resume all paused processes.
container.Resume()

// send signal to container's init process.
container.Signal(signal)

// update container resource constraints.
container.Set(config)

// get current status of the container.
status, err := container.Status()

// get current container's state information.
state, err := container.State()
Checkpoint & Restore

libcontainer now integrates CRIU for checkpointing and restoring containers. This lets you save the state of a process running inside a container to disk, and then restore that state into a new process, on the same machine or on another machine.

criu version 1.5.2 or higher is required to use checkpoint and restore. If you don't already have criu installed, you can build it from source, following the online instructions. criu is also installed in the docker image generated when building libcontainer with docker.

Code and documentation copyright 2014 Docker, inc. The code and documentation are released under the Apache 2.0 license. The documentation is also released under Creative Commons Attribution 4.0 International License. You may obtain a copy of the license, titled CC-BY-4.0, at http://creativecommons.org/licenses/by/4.0/.

Documentation

Overview

Package libcontainer provides a native Go implementation for creating containers with namespaces, cgroups, capabilities, and filesystem access controls. It allows you to manage the lifecycle of the container performing additional operations after the container is created.

Index

Constants

View Source
const (
	InitMsg          uint16 = 62000
	CloneFlagsAttr   uint16 = 27281
	NsPathsAttr      uint16 = 27282
	UidmapAttr       uint16 = 27283
	GidmapAttr       uint16 = 27284
	SetgroupAttr     uint16 = 27285
	OomScoreAdjAttr  uint16 = 27286
	RootlessEUIDAttr uint16 = 27287
	UidmapPathAttr   uint16 = 27288
	GidmapPathAttr   uint16 = 27289
	MountSourcesAttr uint16 = 27290
)

list of known message types we want to send to bootstrap program The number is randomly chosen to not conflict with known netlink types

Variables

View Source
var (
	ErrExist      = errors.New("container with given ID already exists")
	ErrInvalidID  = errors.New("invalid container ID format")
	ErrNotExist   = errors.New("container does not exist")
	ErrPaused     = errors.New("container paused")
	ErrRunning    = errors.New("container still running")
	ErrNotRunning = errors.New("container not running")
	ErrNotPaused  = errors.New("container not paused")
)

Functions

func CriuPath added in v1.0.0

func CriuPath(criupath string) func(*LinuxFactory) error

CriuPath returns an option func to configure a LinuxFactory with the provided criupath

func InitArgs

func InitArgs(args ...string) func(*LinuxFactory) error

InitArgs returns an options func to configure a LinuxFactory with the provided init binary path and arguments.

func NewgidmapPath added in v1.0.0

func NewgidmapPath(newgidmapPath string) func(*LinuxFactory) error

NewgidmapPath returns an option func to configure a LinuxFactory with the provided ..

func NewuidmapPath added in v1.0.0

func NewuidmapPath(newuidmapPath string) func(*LinuxFactory) error

NewuidmapPath returns an option func to configure a LinuxFactory with the provided ..

func TmpfsRoot

func TmpfsRoot(l *LinuxFactory) error

TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs.

Types

type BaseContainer added in v0.0.5

type BaseContainer interface {
	// Returns the ID of the container
	ID() string

	// Returns the current status of the container.
	Status() (Status, error)

	// State returns the current container's state information.
	State() (*State, error)

	// OCIState returns the current container's state information.
	OCIState() (*specs.State, error)

	// Returns the current config of the container.
	Config() configs.Config

	// Returns the PIDs inside this container. The PIDs are in the namespace of the calling process.
	//
	// Some of the returned PIDs may no longer refer to processes in the Container, unless
	// the Container state is PAUSED in which case every PID in the slice is valid.
	Processes() ([]int, error)

	// Returns statistics for the container.
	Stats() (*Stats, error)

	// Set resources of container as configured
	//
	// We can use this to change resources when containers are running.
	//
	Set(config configs.Config) error

	// Start a process inside the container. Returns error if process fails to
	// start. You can track process lifecycle with passed Process structure.
	Start(process *Process) (err error)

	// Run immediately starts the process inside the container.  Returns error if process
	// fails to start.  It does not block waiting for the exec fifo  after start returns but
	// opens the fifo after start returns.
	Run(process *Process) (err error)

	// Destroys the container, if its in a valid state, after killing any
	// remaining running processes.
	//
	// Any event registrations are removed before the container is destroyed.
	// No error is returned if the container is already destroyed.
	//
	// Running containers must first be stopped using Signal(..).
	// Paused containers must first be resumed using Resume(..).
	Destroy() error

	// Signal sends the provided signal code to the container's initial process.
	//
	// If all is specified the signal is sent to all processes in the container
	// including the initial process.
	Signal(s os.Signal, all bool) error

	// Exec signals the container to exec the users process at the end of the init.
	Exec() error
}

BaseContainer is a libcontainer container object.

Each container is thread-safe within the same process. Since a container can be destroyed by a separate process, any function may return that the container was not found. BaseContainer includes methods that are platform agnostic.

type BaseState added in v0.0.5

type BaseState struct {
	// ID is the container ID.
	ID string `json:"id"`

	// InitProcessPid is the init process id in the parent namespace.
	InitProcessPid int `json:"init_process_pid"`

	// InitProcessStartTime is the init process start time in clock cycles since boot time.
	InitProcessStartTime uint64 `json:"init_process_start"`

	// Created is the unix timestamp for the creation time of the container in UTC
	Created time.Time `json:"created"`

	// Config is the container's configuration.
	Config configs.Config `json:"config"`
}

BaseState represents the platform agnostic pieces relating to a running container's state

type Boolmsg added in v0.0.9

type Boolmsg struct {
	Type  uint16
	Value bool
}

func (*Boolmsg) Len added in v0.0.9

func (msg *Boolmsg) Len() int

func (*Boolmsg) Serialize added in v0.0.9

func (msg *Boolmsg) Serialize() []byte

type Bytemsg added in v0.0.6

type Bytemsg struct {
	Type  uint16
	Value []byte
}

Bytemsg has the following representation | nlattr len | nlattr type | | value | pad |

func (*Bytemsg) Len added in v0.0.6

func (msg *Bytemsg) Len() int

func (*Bytemsg) Serialize added in v0.0.6

func (msg *Bytemsg) Serialize() []byte

type Container

type Container interface {
	BaseContainer

	// Checkpoint checkpoints the running container's state to disk using the criu(8) utility.
	Checkpoint(criuOpts *CriuOpts) error

	// Restore restores the checkpointed container to a running state using the criu(8) utility.
	Restore(process *Process, criuOpts *CriuOpts) error

	// If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses
	// the execution of any user processes. Asynchronously, when the container finished being paused the
	// state is changed to PAUSED.
	// If the Container state is PAUSED, do nothing.
	Pause() error

	// If the Container state is PAUSED, resumes the execution of any user processes in the
	// Container before setting the Container state to RUNNING.
	// If the Container state is RUNNING, do nothing.
	Resume() error

	// NotifyOOM returns a read-only channel signaling when the container receives an OOM notification.
	NotifyOOM() (<-chan struct{}, error)

	// NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level
	NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
}

Container is a libcontainer container object.

Each container is thread-safe within the same process. Since a container can be destroyed by a separate process, any function may return that the container was not found.

type CriuOpts

type CriuOpts struct {
	ImagesDirectory         string             // directory for storing image files
	WorkDirectory           string             // directory to cd and write logs/pidfiles/stats to
	ParentImage             string             // directory for storing parent image files in pre-dump and dump
	LeaveRunning            bool               // leave container in running state after checkpoint
	TcpEstablished          bool               // checkpoint/restore established TCP connections
	ExternalUnixConnections bool               // allow external unix connections
	ShellJob                bool               // allow to dump and restore shell jobs
	FileLocks               bool               // handle file locks, for safety
	PreDump                 bool               // call criu predump to perform iterative checkpoint
	PageServer              CriuPageServerInfo // allow to dump to criu page server
	VethPairs               []VethPairName     // pass the veth to criu when restore
	ManageCgroupsMode       criu.CriuCgMode    // dump or restore cgroup mode
	EmptyNs                 uint32             // don't c/r properties for namespace from this mask
	AutoDedup               bool               // auto deduplication for incremental dumps
	LazyPages               bool               // restore memory pages lazily using userfaultfd
	StatusFd                int                // fd for feedback when lazy server is ready
	LsmProfile              string             // LSM profile used to restore the container
	LsmMountContext         string             // LSM mount context value to use during restore
}

type CriuPageServerInfo

type CriuPageServerInfo struct {
	Address string // IP address of CRIU page server
	Port    int32  // port number of CRIU page server
}

type Factory

type Factory interface {
	// Creates a new container with the given id and starts the initial process inside it.
	// id must be a string containing only letters, digits and underscores and must contain
	// between 1 and 1024 characters, inclusive.
	//
	// The id must not already be in use by an existing container. Containers created using
	// a factory with the same path (and filesystem) must have distinct ids.
	//
	// Returns the new container with a running process.
	//
	// On error, any partially created container parts are cleaned up (the operation is atomic).
	Create(id string, config *configs.Config) (Container, error)

	// Load takes an ID for an existing container and returns the container information
	// from the state.  This presents a read only view of the container.
	Load(id string) (Container, error)

	// StartInitialization is an internal API to libcontainer used during the reexec of the
	// container.
	StartInitialization() error

	// Type returns info string about factory type (e.g. lxc, libcontainer...)
	Type() string
}

func New

func New(root string, options ...func(*LinuxFactory) error) (Factory, error)

New returns a linux based container factory based in the root directory and configures the factory with the provided option funcs.

type IO added in v0.0.7

type IO struct {
	Stdin  io.WriteCloser
	Stdout io.ReadCloser
	Stderr io.ReadCloser
}

IO holds the process's STDIO

type Int32msg added in v0.0.6

type Int32msg struct {
	Type  uint16
	Value uint32
}

func (*Int32msg) Len added in v0.0.6

func (msg *Int32msg) Len() int

func (*Int32msg) Serialize added in v0.0.6

func (msg *Int32msg) Serialize() []byte

Serialize serializes the message. Int32msg has the following representation | nlattr len | nlattr type | | uint32 value |

type LinuxFactory

type LinuxFactory struct {
	// Root directory for the factory to store state.
	Root string

	// InitPath is the path for calling the init responsibilities for spawning
	// a container.
	InitPath string

	// InitArgs are arguments for calling the init responsibilities for spawning
	// a container.
	InitArgs []string

	// CriuPath is the path to the criu binary used for checkpoint and restore of
	// containers.
	CriuPath string

	// New{u,g}idmapPath is the path to the binaries used for mapping with
	// rootless containers.
	NewuidmapPath string
	NewgidmapPath string

	// Validator provides validation to container configurations.
	Validator validate.Validator
}

LinuxFactory implements the default factory interface for linux based systems.

func (*LinuxFactory) Create

func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error)

func (*LinuxFactory) Load

func (l *LinuxFactory) Load(id string) (Container, error)

func (*LinuxFactory) StartInitialization

func (l *LinuxFactory) StartInitialization() (err error)

StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state This is a low level implementation detail of the reexec and should not be consumed externally

func (*LinuxFactory) Type

func (l *LinuxFactory) Type() string

type PressureLevel added in v0.0.7

type PressureLevel uint
const (
	LowPressure PressureLevel = iota
	MediumPressure
	CriticalPressure
)

type Process

type Process struct {
	// The command to be run followed by any arguments.
	Args []string

	// Env specifies the environment variables for the process.
	Env []string

	// User will set the uid and gid of the executing process running inside the container
	// local to the container's user and group configuration.
	User string

	// AdditionalGroups specifies the gids that should be added to supplementary groups
	// in addition to those that the user belongs to.
	AdditionalGroups []string

	// Cwd will change the processes current working directory inside the container's rootfs.
	Cwd string

	// Stdin is a pointer to a reader which provides the standard input stream.
	Stdin io.Reader

	// Stdout is a pointer to a writer which receives the standard output stream.
	Stdout io.Writer

	// Stderr is a pointer to a writer which receives the standard error stream.
	Stderr io.Writer

	// ExtraFiles specifies additional open files to be inherited by the container
	ExtraFiles []*os.File

	// Initial sizings for the console
	ConsoleWidth  uint16
	ConsoleHeight uint16

	// Capabilities specify the capabilities to keep when executing the process inside the container
	// All capabilities not specified will be dropped from the processes capability mask
	Capabilities *configs.Capabilities

	// AppArmorProfile specifies the profile to apply to the process and is
	// changed at the time the process is execed
	AppArmorProfile string

	// Label specifies the label to apply to the process.  It is commonly used by selinux
	Label string

	// NoNewPrivileges controls whether processes can gain additional privileges.
	NoNewPrivileges *bool

	// Rlimits specifies the resource limits, such as max open files, to set in the container
	// If Rlimits are not set, the container will inherit rlimits from the parent process
	Rlimits []configs.Rlimit

	// ConsoleSocket provides the masterfd console.
	ConsoleSocket *os.File

	// Init specifies whether the process is the first process in the container.
	Init bool

	LogLevel string

	// SubCgroupPaths specifies sub-cgroups to run the process in.
	// Map keys are controller names, map values are paths (relative to
	// container's top-level cgroup).
	//
	// If empty, the default top-level container's cgroup is used.
	//
	// For cgroup v2, the only key allowed is "".
	SubCgroupPaths map[string]string
	// contains filtered or unexported fields
}

Process specifies the configuration and IO for a process inside a container.

func (*Process) InitializeIO added in v0.0.7

func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error)

InitializeIO creates pipes for use with the process's stdio and returns the opposite side for each. Do not use this if you want to have a pseudoterminal set up for you by libcontainer (TODO: fix that too). TODO: This is mostly unnecessary, and should be handled by clients.

func (Process) Pid

func (p Process) Pid() (int, error)

Pid returns the process ID

func (Process) Signal

func (p Process) Signal(sig os.Signal) error

Signal sends a signal to the Process.

func (Process) Wait

func (p Process) Wait() (*os.ProcessState, error)

Wait waits for the process to exit. Wait releases any resources associated with the Process

type State

type State struct {
	BaseState

	// Specified if the container was started under the rootless mode.
	// Set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups
	Rootless bool `json:"rootless"`

	// Paths to all the container's cgroups, as returned by (*cgroups.Manager).GetPaths
	//
	// For cgroup v1, a key is cgroup subsystem name, and the value is the path
	// to the cgroup for this subsystem.
	//
	// For cgroup v2 unified hierarchy, a key is "", and the value is the unified path.
	CgroupPaths map[string]string `json:"cgroup_paths"`

	// NamespacePaths are filepaths to the container's namespaces. Key is the namespace type
	// with the value as the path.
	NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`

	// Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore
	ExternalDescriptors []string `json:"external_descriptors,omitempty"`

	// Intel RDT "resource control" filesystem path
	IntelRdtPath string `json:"intel_rdt_path"`
}

State represents a running container's state

type Stats

type Stats struct {
	Interfaces    []*types.NetworkInterface
	CgroupStats   *cgroups.Stats
	IntelRdtStats *intelrdt.Stats
}

type Status

type Status int

Status is the status of a container.

const (
	// Created is the status that denotes the container exists but has not been run yet.
	Created Status = iota
	// Running is the status that denotes the container exists and is running.
	Running
	// Pausing is the status that denotes the container exists, it is in the process of being paused.
	Pausing
	// Paused is the status that denotes the container exists, but all its processes are paused.
	Paused
	// Stopped is the status that denotes the container does not have a created or running process.
	Stopped
)

func (Status) String added in v0.0.7

func (s Status) String() string

type VethPairName added in v0.0.4

type VethPairName struct {
	ContainerInterfaceName string
	HostInterfaceName      string
}

Directories

Path Synopsis
ebpf/devicefilter
Package devicefilter contains eBPF device filter program
Package devicefilter contains eBPF device filter program
fs
fs2
integration is used for integration testing of libcontainer
integration is used for integration testing of libcontainer
Package specconv implements conversion of specifications to libcontainer configurations
Package specconv implements conversion of specifications to libcontainer configurations

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL