simd

package module
v1.1.2 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 5, 2022 License: MIT Imports: 2 Imported by: 4

README

kelindar/simd
Go Version PkgGoDev License Coverage

Vectorized Math Functions

This library contains a set of vectorized mathematical functions which were auto-vectorized using the clang compiler and translated into Plan 9 assembly code for Go. A generic version is also provided for CPUs where vectorization is not available, or for which this library doesn't have generated code.

It currently supports only AVX2, but AVX512 and SVE (for ARM) should be easy enough to generate. Most of the code in this library is auto-generated, which helps with maintenance.

Usage

Using this library is straightforward: it simply exposes a set of non-opinionated functions. For example, if you want to sum up a slice of floating point numbers, you can use the SumFloat32s([]float32) float32 function to do so.

sum := simd.SumFloat32s([]float32{1, 2, 3, 4, 5})

Benchmarks

goos: windows
goarch: amd64
pkg: github.com/kelindar/simd
cpu: Intel(R) Core(TM) i7-9700K CPU @ 3.60GHz

   TYPE    OP    SIZE     RATE        SPEEDUP
float32   sum     256     8.96 ns/op   20.51x
float32   min     256    27.78 ns/op    4.15x
float32   max     256    25.94 ns/op    4.43x
float32   add     256    15.31 ns/op    7.46x
float32   sub     256    15.48 ns/op    7.34x
float32   mul     256    15.36 ns/op    7.54x
float32   div     256    27.88 ns/op    5.79x
float32   sum    4096   107.92 ns/op   31.60x
float32   min    4096   134.06 ns/op   13.01x
float32   max    4096   133.90 ns/op   12.94x
float32   add    4096   353.01 ns/op    4.90x
float32   sub    4096   353.62 ns/op    4.92x
float32   mul    4096   353.64 ns/op    4.87x
float32   div    4096   455.60 ns/op    5.68x
float32   sum   16384   469.18 ns/op   29.44x
float32   min   16384   497.91 ns/op   13.94x
float32   max   16384   500.43 ns/op   13.84x
float32   add   16384  1442.28 ns/op    4.82x
float32   sub   16384  1366.98 ns/op    5.07x
float32   mul   16384  1382.20 ns/op    4.97x
float32   div   16384  1821.82 ns/op    5.71x

   TYPE    OP    SIZE     RATE        SPEEDUP
float64   sum     256    14.56 ns/op   12.61x
float64   min     256    24.51 ns/op    4.73x
float64   max     256    24.61 ns/op    3.71x
float64   add     256    25.81 ns/op    4.42x
float64   sub     256    25.84 ns/op    4.41x
float64   mul     256    25.66 ns/op    4.45x
float64   div     256   109.60 ns/op    1.97x
float64   sum    4096   220.98 ns/op   15.51x
float64   min    4096   229.76 ns/op    7.62x
float64   max    4096   227.68 ns/op    5.86x
float64   add    4096   707.07 ns/op    2.44x
float64   sub    4096   716.65 ns/op    2.45x
float64   mul    4096   699.45 ns/op    2.50x
float64   div    4096  1726.40 ns/op    2.01x
float64   sum   16384   930.43 ns/op   14.77x
float64   min   16384   930.08 ns/op    7.45x
float64   max   16384   938.31 ns/op    5.81x
float64   add   16384  5522.65 ns/op    1.45x
float64   sub   16384  5433.96 ns/op    1.55x
float64   mul   16384  5564.75 ns/op    1.51x
float64   div   16384  6913.10 ns/op    2.01x

   TYPE    OP    SIZE     RATE        SPEEDUP
  uint8   sum     100     5.94 ns/op    6.34x
  uint8   min     100     7.09 ns/op    7.31x
  uint8   max     100     7.82 ns/op    6.25x
  uint8   add     100     9.13 ns/op    5.76x
  uint8   sub     100     9.11 ns/op    5.73x
  uint8   mul     100    11.15 ns/op    6.37x
  uint8   div     100   186.89 ns/op    1.17x

   TYPE    OP    SIZE     RATE        SPEEDUP
 uint16   sum     256     6.13 ns/op   13.19x
 uint16   min     256    10.00 ns/op   15.94x
 uint16   max     256    11.11 ns/op    9.58x
 uint16   add     256    10.04 ns/op   16.83x
 uint16   sub     256    10.07 ns/op   16.77x
 uint16   mul     256    10.45 ns/op   16.26x
 uint16   div     256   438.34 ns/op    1.26x
 uint16   sum    4096    38.15 ns/op   32.42x
 uint16   min    4096    57.84 ns/op   44.50x
 uint16   max    4096    58.61 ns/op   29.17x
 uint16   add    4096    86.01 ns/op   30.29x
 uint16   sub    4096    91.34 ns/op   30.54x
 uint16   mul    4096    89.25 ns/op   29.65x
 uint16   div    4096  7203.14 ns/op    1.26x
 uint16   sum   16384   148.55 ns/op   34.66x
 uint16   min   16384   231.87 ns/op   44.76x
 uint16   max   16384   227.01 ns/op   31.32x
 uint16   add   16384   722.47 ns/op   14.60x
 uint16   sub   16384   737.93 ns/op   14.21x
 uint16   mul   16384   711.59 ns/op   14.84x
 uint16   div   16384 28084.66 ns/op    1.24x

   TYPE    OP    SIZE     RATE        SPEEDUP
 uint32   sum     256     8.06 ns/op   10.39x
 uint32   min     256    22.96 ns/op    6.96x
 uint32   max     256    26.21 ns/op    4.12x
 uint32   add     256    15.91 ns/op    8.18x
 uint32   sub     256    15.49 ns/op    7.98x
 uint32   mul     256    16.81 ns/op    7.40x
 uint32   div     256   436.08 ns/op    1.29x
 uint32   sum    4096    75.90 ns/op   16.47x
 uint32   min    4096   120.89 ns/op   21.54x
 uint32   max    4096   125.33 ns/op   13.76x
 uint32   add    4096   380.00 ns/op    5.16x
 uint32   sub    4096   345.19 ns/op    5.69x
 uint32   mul    4096   367.35 ns/op    4.91x
 uint32   div    4096  6695.04 ns/op    1.29x
 uint32   sum   16384   462.89 ns/op   10.85x
 uint32   min   16384   475.29 ns/op   21.98x
 uint32   max   16384   481.76 ns/op   14.43x
 uint32   add   16384  1437.51 ns/op    5.49x
 uint32   sub   16384  1427.22 ns/op    5.57x
 uint32   mul   16384  1367.51 ns/op    5.71x
 uint32   div   16384 26786.72 ns/op    1.30x

   TYPE    OP    SIZE     RATE        SPEEDUP
 uint64   sum     256    12.35 ns/op    9.18x
 uint64   min     256    41.42 ns/op    3.84x
 uint64   max     256    44.52 ns/op    2.55x
 uint64   add     256    24.86 ns/op    4.92x
 uint64   sub     256    25.35 ns/op    4.82x
 uint64   mul     256    50.52 ns/op    2.37x
 uint64   div     256  1250.57 ns/op    1.18x
 uint64   sum    4096   145.98 ns/op   11.78x
 uint64   min    4096   507.57 ns/op    5.11x
 uint64   max    4096   508.87 ns/op    3.42x
 uint64   add    4096   701.61 ns/op    2.79x
 uint64   sub    4096   719.04 ns/op    2.74x
 uint64   mul    4096   881.98 ns/op    2.18x
 uint64   div    4096 20114.02 ns/op    1.18x
 uint64   sum   16384   930.65 ns/op    7.44x
 uint64   min   16384  2059.33 ns/op    5.08x
 uint64   max   16384  2071.32 ns/op    3.34x
 uint64   add   16384  6001.34 ns/op    1.45x
 uint64   sub   16384  5713.91 ns/op    1.52x
 uint64   mul   16384  6147.43 ns/op    1.44x
 uint64   div   16384 80476.33 ns/op    1.17x

   TYPE    OP    SIZE     RATE        SPEEDUP
   int8   sum     100     5.89 ns/op    6.18x
   int8   min     100     7.51 ns/op    6.81x
   int8   max     100     7.46 ns/op    6.33x
   int8   add     100     9.10 ns/op    5.74x
   int8   sub     100     9.13 ns/op    5.75x
   int8   mul     100    10.97 ns/op    6.46x
   int8   div     100   480.25 ns/op    0.63x

   TYPE    OP    SIZE     RATE        SPEEDUP
  int16   sum     256     6.20 ns/op   13.04x
  int16   min     256    10.80 ns/op    9.77x
  int16   max     256    10.13 ns/op   10.53x
  int16   add     256     9.99 ns/op   16.97x
  int16   sub     256    10.00 ns/op   17.03x
  int16   mul     256    10.47 ns/op   16.24x
  int16   div     256   499.02 ns/op    1.53x
  int16   sum    4096    37.97 ns/op   32.87x
  int16   min    4096    59.55 ns/op   29.15x
  int16   max    4096    58.42 ns/op   29.54x
  int16   add    4096    85.73 ns/op   30.27x
  int16   sub    4096    86.22 ns/op   30.07x
  int16   mul    4096    86.61 ns/op   30.00x
  int16   div    4096  7936.80 ns/op    1.52x
  int16   sum   16384   144.40 ns/op   34.61x
  int16   min   16384   226.17 ns/op   30.59x
  int16   max   16384   222.39 ns/op   30.90x
  int16   add   16384   708.78 ns/op   14.55x
  int16   sub   16384   722.46 ns/op   14.33x
  int16   mul   16384   715.68 ns/op   14.49x
  int16   div   16384 31537.05 ns/op    1.52x

   TYPE    OP    SIZE     RATE        SPEEDUP
  int32   sum     256     8.04 ns/op   10.35x
  int32   min     256    22.58 ns/op    4.72x
  int32   max     256    22.80 ns/op    4.67x
  int32   add     256    14.89 ns/op    8.29x
  int32   sub     256    15.08 ns/op    8.20x
  int32   mul     256    15.86 ns/op    7.61x
  int32   div     256   490.90 ns/op    1.37x
  int32   sum    4096    76.31 ns/op   16.23x
  int32   min    4096   121.09 ns/op   14.37x
  int32   max    4096   120.26 ns/op   14.35x
  int32   add    4096   347.35 ns/op    5.64x
  int32   sub    4096   351.04 ns/op    5.58x
  int32   mul    4096   341.38 ns/op    5.34x
  int32   div    4096  7849.51 ns/op    1.37x
  int32   sum   16384   467.90 ns/op   10.59x
  int32   min   16384   486.13 ns/op   14.18x
  int32   max   16384   480.85 ns/op   14.48x
  int32   add   16384  1426.40 ns/op    5.53x
  int32   sub   16384  1421.44 ns/op    5.53x
  int32   mul   16384  1448.81 ns/op    5.41x
  int32   div   16384 31078.60 ns/op    1.38x

   TYPE    OP    SIZE     RATE        SPEEDUP
  int64   sum     256    12.13 ns/op    6.71x
  int64   min     256    30.63 ns/op    3.47x
  int64   max     256    31.69 ns/op    3.84x
  int64   add     256    25.72 ns/op    5.17x
  int64   sub     256    25.66 ns/op    5.45x
  int64   mul     256    51.14 ns/op    2.38x
  int64   div     256  1524.32 ns/op    1.18x
  int64   sum    4096   145.87 ns/op    8.59x
  int64   min    4096   327.28 ns/op    5.26x
  int64   max    4096   331.75 ns/op    5.24x
  int64   add    4096   696.99 ns/op    2.83x
  int64   sub    4096   711.28 ns/op    2.76x
  int64   mul    4096   890.62 ns/op    2.13x
  int64   div    4096 24268.84 ns/op    1.18x
  int64   sum   16384   933.71 ns/op    5.59x
  int64   min   16384  1415.62 ns/op    4.90x
  int64   max   16384  1419.97 ns/op    4.88x
  int64   add   16384  5541.14 ns/op    1.53x
  int64   sub   16384  5183.70 ns/op    1.62x
  int64   mul   16384  6164.29 ns/op    1.36x
  int64   div   16384 96899.48 ns/op    1.18x

Acknowledgements

This library was originally inspired by the work of Valery Carey & Adrian Witas in the viant/vec package, but instead of hand-rolled assembly and intrinsics I opted for auto-vectorization for maintainability reasons.

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func AddFloat32s

func AddFloat32s(dst, input1, input2 []float32) []float32

AddFloat32s adds input1 to input2 and writes back the result into dst slice

func AddFloat64s

func AddFloat64s(dst, input1, input2 []float64) []float64

AddFloat64s adds input1 to input2 and writes back the result into dst slice

func AddInt16s

func AddInt16s(dst, input1, input2 []int16) []int16

AddInt16s adds input1 to input2 and writes back the result into dst slice

func AddInt32s

func AddInt32s(dst, input1, input2 []int32) []int32

AddInt32s adds input1 to input2 and writes back the result into dst slice

func AddInt64s

func AddInt64s(dst, input1, input2 []int64) []int64

AddInt64s adds input1 to input2 and writes back the result into dst slice

func AddInt8s

func AddInt8s(dst, input1, input2 []int8) []int8

AddInt8s adds input1 to input2 and writes back the result into dst slice

func AddUint16s

func AddUint16s(dst, input1, input2 []uint16) []uint16

AddUint16s adds input1 to input2 and writes back the result into dst slice

func AddUint32s

func AddUint32s(dst, input1, input2 []uint32) []uint32

AddUint32s adds input1 to input2 and writes back the result into dst slice

func AddUint64s

func AddUint64s(dst, input1, input2 []uint64) []uint64

AddUint64s adds input1 to input2 and writes back the result into dst slice

func AddUint8s

func AddUint8s(dst, input1, input2 []uint8) []uint8

AddUint8s adds input1 to input2 and writes back the result into dst slice

func DivFloat32s

func DivFloat32s(dst, input1, input2 []float32) []float32

DivFloat32s divides input1 by input2 and writes back the result into dst slice

func DivFloat64s

func DivFloat64s(dst, input1, input2 []float64) []float64

DivFloat64s divides input1 by input2 and writes back the result into dst slice

func DivInt16s

func DivInt16s(dst, input1, input2 []int16) []int16

DivInt16s divides input1 by input2 and writes back the result into dst slice

func DivInt32s

func DivInt32s(dst, input1, input2 []int32) []int32

DivInt32s divides input1 by input2 and writes back the result into dst slice

func DivInt64s

func DivInt64s(dst, input1, input2 []int64) []int64

DivInt64s divides input1 by input2 and writes back the result into dst slice

func DivInt8s

func DivInt8s(dst, input1, input2 []int8) []int8

DivInt8s divides input1 by input2 and writes back the result into dst slice

func DivUint16s

func DivUint16s(dst, input1, input2 []uint16) []uint16

DivUint16s divides input1 by input2 and writes back the result into dst slice

func DivUint32s

func DivUint32s(dst, input1, input2 []uint32) []uint32

DivUint32s divides input1 by input2 and writes back the result into dst slice

func DivUint64s

func DivUint64s(dst, input1, input2 []uint64) []uint64

DivUint64s divides input1 by input2 and writes back the result into dst slice

func DivUint8s

func DivUint8s(dst, input1, input2 []uint8) []uint8

DivUint8s divides input1 by input2 and writes back the result into dst slice

func Max added in v1.1.0

func Max[T Number](input []T) T

Max returns the largest element value in the slice

func MaxFloat32s

func MaxFloat32s(input []float32) (out float32)

MaxFloat32s returns the largest element value in the slice

func MaxFloat64s

func MaxFloat64s(input []float64) (out float64)

MaxFloat64s returns the largest element value in the slice

func MaxInt16s

func MaxInt16s(input []int16) (out int16)

MaxInt16s returns the largest element value in the slice

func MaxInt32s

func MaxInt32s(input []int32) (out int32)

MaxInt32s returns the largest element value in the slice

func MaxInt64s

func MaxInt64s(input []int64) (out int64)

MaxInt64s returns the largest element value in the slice

func MaxInt8s

func MaxInt8s(input []int8) (out int8)

MaxInt8s returns the largest element value in the slice

func MaxUint16s

func MaxUint16s(input []uint16) (out uint16)

MaxUint16s returns the largest element value in the slice

func MaxUint32s

func MaxUint32s(input []uint32) (out uint32)

MaxUint32s returns the largest element value in the slice

func MaxUint64s

func MaxUint64s(input []uint64) (out uint64)

MaxUint64s returns the largest element value in the slice

func MaxUint8s

func MaxUint8s(input []uint8) (out uint8)

MaxUint8s returns the largest element value in the slice

func Min added in v1.1.0

func Min[T Number](input []T) T

Min returns the smallest element value in the slice

func MinFloat32s

func MinFloat32s(input []float32) (out float32)

MinFloat32s returns the smallest element value in the slice

func MinFloat64s

func MinFloat64s(input []float64) (out float64)

MinFloat64s returns the smallest element value in the slice

func MinInt16s

func MinInt16s(input []int16) (out int16)

MinInt16s returns the smallest element value in the slice

func MinInt32s

func MinInt32s(input []int32) (out int32)

MinInt32s returns the smallest element value in the slice

func MinInt64s

func MinInt64s(input []int64) (out int64)

MinInt64s returns the smallest element value in the slice

func MinInt8s

func MinInt8s(input []int8) (out int8)

MinInt8s returns the smallest element value in the slice

func MinUint16s

func MinUint16s(input []uint16) (out uint16)

MinUint16s returns the smallest element value in the slice

func MinUint32s

func MinUint32s(input []uint32) (out uint32)

MinUint32s returns the smallest element value in the slice

func MinUint64s

func MinUint64s(input []uint64) (out uint64)

MinUint64s returns the smallest element value in the slice

func MinUint8s

func MinUint8s(input []uint8) (out uint8)

MinUint8s returns the smallest element value in the slice

func MulFloat32s

func MulFloat32s(dst, input1, input2 []float32) []float32

MulFloat32s multiplies input1 by input2 and writes back the result into dst slice

func MulFloat64s

func MulFloat64s(dst, input1, input2 []float64) []float64

MulFloat64s multiplies input1 by input2 and writes back the result into dst slice

func MulInt16s

func MulInt16s(dst, input1, input2 []int16) []int16

MulInt16s multiplies input1 by input2 and writes back the result into dst slice

func MulInt32s

func MulInt32s(dst, input1, input2 []int32) []int32

MulInt32s multiplies input1 by input2 and writes back the result into dst slice

func MulInt64s

func MulInt64s(dst, input1, input2 []int64) []int64

MulInt64s multiplies input1 by input2 and writes back the result into dst slice

func MulInt8s

func MulInt8s(dst, input1, input2 []int8) []int8

MulInt8s multiplies input1 by input2 and writes back the result into dst slice

func MulUint16s

func MulUint16s(dst, input1, input2 []uint16) []uint16

MulUint16s multiplies input1 by input2 and writes back the result into dst slice

func MulUint32s

func MulUint32s(dst, input1, input2 []uint32) []uint32

MulUint32s multiplies input1 by input2 and writes back the result into dst slice

func MulUint64s

func MulUint64s(dst, input1, input2 []uint64) []uint64

MulUint64s multiplies input1 by input2 and writes back the result into dst slice

func MulUint8s

func MulUint8s(dst, input1, input2 []uint8) []uint8

MulUint8s multiplies input1 by input2 and writes back the result into dst slice

func SubFloat32s

func SubFloat32s(dst, input1, input2 []float32) []float32

SubFloat32s subtracts input2 from input1 and writes back the result into dst slice

func SubFloat64s

func SubFloat64s(dst, input1, input2 []float64) []float64

SubFloat64s subtracts input2 from input1 and writes back the result into dst slice

func SubInt16s

func SubInt16s(dst, input1, input2 []int16) []int16

SubInt16s subtracts input2 from input1 and writes back the result into dst slice

func SubInt32s

func SubInt32s(dst, input1, input2 []int32) []int32

SubInt32s subtracts input2 from input1 and writes back the result into dst slice

func SubInt64s

func SubInt64s(dst, input1, input2 []int64) []int64

SubInt64s subtracts input2 from input1 and writes back the result into dst slice

func SubInt8s

func SubInt8s(dst, input1, input2 []int8) []int8

SubInt8s subtracts input2 from input1 and writes back the result into dst slice

func SubUint16s

func SubUint16s(dst, input1, input2 []uint16) []uint16

SubUint16s subtracts input2 from input1 and writes back the result into dst slice

func SubUint32s

func SubUint32s(dst, input1, input2 []uint32) []uint32

SubUint32s subtracts input2 from input1 and writes back the result into dst slice

func SubUint64s

func SubUint64s(dst, input1, input2 []uint64) []uint64

SubUint64s subtracts input2 from input1 and writes back the result into dst slice

func SubUint8s

func SubUint8s(dst, input1, input2 []uint8) []uint8

SubUint8s subtracts input2 from input1 and writes back the result into dst slice

func Sum added in v1.1.0

func Sum[T Number](input []T) T

Sum sums up all of the elements of the slice and returns the value

func SumFloat32s

func SumFloat32s(input []float32) (out float32)

SumFloat32s sums up all of the elements of the slice and returns the value

func SumFloat64s

func SumFloat64s(input []float64) (out float64)

SumFloat64s sums up all of the elements of the slice and returns the value

func SumInt16s

func SumInt16s(input []int16) (out int16)

SumInt16s sums up all of the elements of the slice and returns the value

func SumInt32s

func SumInt32s(input []int32) (out int32)

SumInt32s sums up all of the elements of the slice and returns the value

func SumInt64s

func SumInt64s(input []int64) (out int64)

SumInt64s sums up all of the elements of the slice and returns the value

func SumInt8s

func SumInt8s(input []int8) (out int8)

SumInt8s sums up all of the elements of the slice and returns the value

func SumUint16s

func SumUint16s(input []uint16) (out uint16)

SumUint16s sums up all of the elements of the slice and returns the value

func SumUint32s

func SumUint32s(input []uint32) (out uint32)

SumUint32s sums up all of the elements of the slice and returns the value

func SumUint64s

func SumUint64s(input []uint64) (out uint64)

SumUint64s sums up all of the elements of the slice and returns the value

func SumUint8s

func SumUint8s(input []uint8) (out uint8)

SumUint8s sums up all of the elements of the slice and returns the value

Types

type Number added in v1.1.2

type Number interface {
	~int | ~int8 | ~int16 | ~int32 | ~int64 | uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~float32 | ~float64
}

Number represents a number constraint for SIMD operations

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL