parquet

package module

v0.0.0-...-9663103 Latest Latest Go to latest Published: May 6, 2022 License: Apache-2.0 Imports: 15 Imported by: 0

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

github.com/viant/parquet

Links

Open Source Insights

README ¶

Parquet

Work in progress.

Motivation

The existing Go parquet implementations suffer from excessive memory usage or poor performance. For our use case, Go struct to parquet conversion has to come with a small memory footprint, good compression, and fast performance. In addition, conversion has to support an omit-empty style setting, where an empty string, zero, or false shall not produce any value. This is especially important when ingesting data into BigQuery.

This library has been forked from Parsyl and modified to meet our goals.

Usage

Documentation ¶

Overview ¶

Package parquet is not intended to be used as a general library. The code generated by the 'parquetgen' command is what actually uses it for reading and writing parquet files.

Index ¶

func BoolType(se *sch.SchemaElement)
func ConvertedTypeDate(se *sch.SchemaElement)
func ConvertedTypeEmpty(se *sch.SchemaElement)
func ConvertedTypeTimestampMillis(se *sch.SchemaElement)
func ConvertedTypeUTF8(se *sch.SchemaElement)
func DateToString(date int32) string
func Float32Type(se *sch.SchemaElement)
func Float64Type(se *sch.SchemaElement)
func GetBools(r io.Reader, n int, pageSizes []int) ([]bool, error)
func Int32Type(se *sch.SchemaElement)
func Int64Type(se *sch.SchemaElement)
func LogicalTypeDate(se *sch.SchemaElement)
func LogicalTypeString(se *sch.SchemaElement)
func LogicalTypeTimestampMillis(se *sch.SchemaElement)
func OptionalFieldGZIP(r *OptionalField)
func OptionalFieldSnappy(r *OptionalField)
func OptionalFieldUncompressed(o *OptionalField)
func OptionalSchemaOption(opts ...SchemeOption) func(f *OptionalField)
func PageHeader(r io.Reader) (*sch.PageHeader, error)
func PageHeaders(footer *sch.FileMetaData, r io.ReadSeeker) ([]sch.PageHeader, error)
func PageHeadersAtOffset(r io.ReadSeeker, o, n int64) ([]sch.PageHeader, error)
func ReadMetaData(r io.ReadSeeker) (*sch.FileMetaData, error)
func RepetitionOptional(se *sch.SchemaElement)
func RepetitionRepeated(se *sch.SchemaElement)
func RepetitionRequired(se *sch.SchemaElement)
func RequiredFieldGZIP(r *RequiredField)
func RequiredFieldSnappy(r *RequiredField)
func RequiredFieldUncompressed(r *RequiredField)
func SchemaOption(opts ...SchemeOption) func(f *RequiredField)
func StringToDate(ts string) int32
func StringToTime(ts string) *time.Time
func StringType(se *sch.SchemaElement)
func TimeToString(time time.Time) string
func Uint32Type(se *sch.SchemaElement)
func Uint64Type(se *sch.SchemaElement)
type Field
type MaxLevel
type Metadata
- func New(fields ...Field) *Metadata
- func (m *Metadata) Footer(w io.Writer) error
- func (m *Metadata) NextDoc()
- func (m *Metadata) Pages() (map[string][]Page, error)
- func (m *Metadata) ReadFooter(r io.ReadSeeker) error
- func (m *Metadata) RowGroups() []RowGroup
- func (m *Metadata) Rows() int64
- func (m *Metadata) StartRowGroup(fields ...Field)
- func (m *Metadata) WritePageHeader(w io.Writer, pth []string, dataLen, compressedLen, defCount, count int, ...) error
type OptionalField
- func NewOptionalField(pth []string, types []int, opts ...func(*OptionalField)) OptionalField
- func (f *OptionalField) DoRead(r io.ReadSeeker, pg Page) (io.Reader, []int, error)
- func (f *OptionalField) DoWrite(w io.Writer, meta *Metadata, vals []byte, count int, stats Stats) error
- func (f *OptionalField) Name() string
- func (f *OptionalField) Options() []SchemeOption
- func (f *OptionalField) Path() []string
- func (f *OptionalField) Values() int
type Page
type RepetitionType
type RepetitionTypes
- func (r RepetitionTypes) MaxDef() uint8
- func (r RepetitionTypes) MaxRep() uint8
- func (r RepetitionTypes) Optional() bool
- func (r RepetitionTypes) Repeated() bool
- func (r RepetitionTypes) Required() bool
type RequiredField
- func NewRequiredField(pth []string, opts ...func(*RequiredField)) RequiredField
- func (f *RequiredField) DoRead(r io.ReadSeeker, pg Page) (io.Reader, []int, error)
- func (f *RequiredField) DoWrite(w io.Writer, meta *Metadata, vals []byte, count int, stats Stats) error
- func (f *RequiredField) Name() string
- func (f *RequiredField) Options() []SchemeOption
- func (f *RequiredField) Path() []string
type RowGroup
- func (r *RowGroup) Columns() []*sch.ColumnChunk
type SchemeOption
type Stats

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

func BoolType ¶

func BoolType(se *sch.SchemaElement)

func ConvertedTypeDate ¶

func ConvertedTypeDate(se *sch.SchemaElement)

func ConvertedTypeEmpty ¶

func ConvertedTypeEmpty(se *sch.SchemaElement)

func ConvertedTypeTimestampMillis ¶

func ConvertedTypeTimestampMillis(se *sch.SchemaElement)

func ConvertedTypeUTF8 ¶

func ConvertedTypeUTF8(se *sch.SchemaElement)

func DateToString ¶

func DateToString(date int32) string

func Float32Type ¶

func Float32Type(se *sch.SchemaElement)

func Float64Type ¶

func Float64Type(se *sch.SchemaElement)

func GetBools ¶

func GetBools(r io.Reader, n int, pageSizes []int) ([]bool, error)

GetBools reads a byte array and turns each bit into a bool

func Int32Type ¶

func Int32Type(se *sch.SchemaElement)

func Int64Type ¶

func Int64Type(se *sch.SchemaElement)

func LogicalTypeDate ¶

func LogicalTypeDate(se *sch.SchemaElement)

func LogicalTypeString ¶

func LogicalTypeString(se *sch.SchemaElement)

func LogicalTypeTimestampMillis ¶

func LogicalTypeTimestampMillis(se *sch.SchemaElement)

func OptionalFieldGZIP ¶

func OptionalFieldGZIP(r *OptionalField)

OptionalFieldGZIP sets the compression for a column to gzip. It is an optional arg to NewOptionalField.

func OptionalFieldSnappy ¶

func OptionalFieldSnappy(r *OptionalField)

OptionalFieldSnappy sets the compression for a column to snappy. It is an optional arg to NewOptionalField.

func OptionalFieldUncompressed ¶

func OptionalFieldUncompressed(o *OptionalField)

OptionalFieldUncompressed sets the compression to none. It is an optional arg to NewOptionalField.

func OptionalSchemaOption ¶

func OptionalSchemaOption(opts ...SchemeOption) func(f *OptionalField)

func PageHeader ¶

func PageHeader(r io.Reader) (*sch.PageHeader, error)

PageHeader reads the page header from a column page

func PageHeaders ¶

func PageHeaders(footer *sch.FileMetaData, r io.ReadSeeker) ([]sch.PageHeader, error)

PageHeaders reads all the page headers without reading the actual data. It is used by parquetgen to print the page headers.

func PageHeadersAtOffset ¶

func PageHeadersAtOffset(r io.ReadSeeker, o, n int64) ([]sch.PageHeader, error)

PageHeadersAtOffset seeks to the given offset, then reads the PageHeader without reading the data.

func ReadMetaData ¶

func ReadMetaData(r io.ReadSeeker) (*sch.FileMetaData, error)

ReadMetaData reads the FileMetaData from the end of a parquet file

func RepetitionOptional ¶

func RepetitionOptional(se *sch.SchemaElement)

RepetitionOptional sets the repetition type to optional

func RepetitionRepeated ¶

func RepetitionRepeated(se *sch.SchemaElement)

RepetitionRepeated sets the repetition type to repeated

func RepetitionRequired ¶

func RepetitionRequired(se *sch.SchemaElement)

RepetitionRequired sets the repetition type to required

func RequiredFieldGZIP ¶

func RequiredFieldGZIP(r *RequiredField)

RequiredFieldGZIP sets the compression for a column to gzip. It is an optional arg to NewRequiredField.

func RequiredFieldSnappy ¶

func RequiredFieldSnappy(r *RequiredField)

RequiredFieldSnappy sets the compression for a column to snappy. It is an optional arg to NewRequiredField.

func RequiredFieldUncompressed ¶

func RequiredFieldUncompressed(r *RequiredField)

RequiredFieldUncompressed sets the compression to none. It is an optional arg to NewRequiredField.

func SchemaOption ¶

func SchemaOption(opts ...SchemeOption) func(f *RequiredField)

func StringToDate ¶

func StringToDate(ts string) int32

func StringToTime ¶

func StringToTime(ts string) *time.Time

func StringType ¶

func StringType(se *sch.SchemaElement)

func TimeToString ¶

func TimeToString(time time.Time) string

func Uint32Type ¶

func Uint32Type(se *sch.SchemaElement)

func Uint64Type ¶

func Uint64Type(se *sch.SchemaElement)

Types ¶

type Field ¶

type Field struct {
	Name    string
	Path    []string
	Types   []int
	Options []SchemeOption
}

Field holds the type information for a parquet column

type MaxLevel ¶

type MaxLevel struct {
	Def uint8
	Rep uint8
}

MaxLevel holds the maximum definition and repetition level for a given field.

type Metadata ¶

type Metadata struct {
	// contains filtered or unexported fields
}

Metadata keeps track of the things that need to be kept track of in order to write the FileMetaData at the end of the parquet file.

func New ¶

func New(fields ...Field) *Metadata

New returns a Metadata struct and reads the first row group into memory.

func (m *Metadata) Footer(w io.Writer) error

Footer writes the FileMetaData at the end of the file.

func (*Metadata) NextDoc ¶

func (m *Metadata) NextDoc()

NextDoc keeps track of how many documents have been added to this parquet file. The final value of m.docs is used for the FileMetaData.NumRows

func (*Metadata) Pages ¶

func (m *Metadata) Pages() (map[string][]Page, error)

Pages maps each column name to its Pages

func (*Metadata) ReadFooter ¶

func (m *Metadata) ReadFooter(r io.ReadSeeker) error

ReadFooter reads the parquet metadata

func (*Metadata) RowGroups ¶

func (m *Metadata) RowGroups() []RowGroup

RowGroups returns a summary of each schema.RowGroup

func (*Metadata) Rows ¶

func (m *Metadata) Rows() int64

Rows returns the total number of rows that are being written into a parquet file.

func (*Metadata) StartRowGroup ¶

func (m *Metadata) StartRowGroup(fields ...Field)

StartRowGroup is called when starting a new row group

func (*Metadata) WritePageHeader ¶

func (m *Metadata) WritePageHeader(w io.Writer, pth []string, dataLen, compressedLen, defCount, count int, defLen, repLen int64, comp sch.CompressionCodec, stats Stats) error

WritePageHeader is called in order to finish writing to a column chunk.

type OptionalField ¶

type OptionalField struct {
	Defs []uint8
	Reps []uint8

	MaxLevels MaxLevel

	RepetitionType SchemeOption
	Types          []int
	// contains filtered or unexported fields
}

OptionalField is any exported field in a struct that is a pointer.

func NewOptionalField ¶

func NewOptionalField(pth []string, types []int, opts ...func(*OptionalField)) OptionalField

NewOptionalField creates an optional field

func (*OptionalField) DoRead ¶

func (f *OptionalField) DoRead(r io.ReadSeeker, pg Page) (io.Reader, []int, error)

DoRead is called by all optional fields. It reads the definition levels and uses them to interpret the raw data.

func (*OptionalField) DoWrite ¶

func (f *OptionalField) DoWrite(w io.Writer, meta *Metadata, vals []byte, count int, stats Stats) error

DoWrite is called by all optional field types to write the definition levels and raw data to the io.Writer

func (*OptionalField) Name ¶

func (f *OptionalField) Name() string

Name returns the column name of this field

func (*OptionalField) Options ¶

func (f *OptionalField) Options() []SchemeOption

func (*OptionalField) Path ¶

func (f *OptionalField) Path() []string

Path returns the path of this field

func (*OptionalField) Values ¶

func (f *OptionalField) Values() int

Values reads the definition levels and uses them to return the values from the page data.

type Page ¶

type Page struct {
	// N is the number of values in the ColumnChunk
	N      int
	Size   int
	Offset int64
	Codec  sch.CompressionCodec
}

Page keeps track of metadata for each ColumnChunk

type RepetitionType ¶

type RepetitionType int

RepetitionType is an enum of the possible parquet repetition types

const (
	Required RepetitionType = 0
	Optional RepetitionType = 1
	Repeated RepetitionType = 2
)

type RepetitionTypes ¶

type RepetitionTypes []RepetitionType

RepetitionTypes provides several functions used by parquetgen's go templates to generate code.

func (RepetitionTypes) MaxDef ¶

func (r RepetitionTypes) MaxDef() uint8

MaxDef returns the largest definition level

func (RepetitionTypes) MaxRep ¶

func (r RepetitionTypes) MaxRep() uint8

MaxRep returns the largest repetition level

func (RepetitionTypes) Optional ¶

func (r RepetitionTypes) Optional() bool

Optional figures out if there is an optional field

func (RepetitionTypes) Repeated ¶

func (r RepetitionTypes) Repeated() bool

Repeated figures out if there is a repeated field

func (RepetitionTypes) Required ¶

func (r RepetitionTypes) Required() bool

Required figures out if there are no optional or repeated fields

type RequiredField ¶

type RequiredField struct {
	// contains filtered or unexported fields
}

RequiredField writes the raw data for required columns

func NewRequiredField ¶

func NewRequiredField(pth []string, opts ...func(*RequiredField)) RequiredField

NewRequiredField creates a required field.

func (*RequiredField) DoRead ¶

func (f *RequiredField) DoRead(r io.ReadSeeker, pg Page) (io.Reader, []int, error)

DoRead reads the actual raw data.

func (*RequiredField) DoWrite ¶

func (f *RequiredField) DoWrite(w io.Writer, meta *Metadata, vals []byte, count int, stats Stats) error

DoWrite writes the actual raw data.

func (*RequiredField) Name ¶

func (f *RequiredField) Name() string