Documentation
¶
Index ¶
- Constants
- func BigEndianToDecimal(buf []byte) (decimal.Decimal128, error)
- func MakeSequencedChan[T any](bufferSize uint, source <-chan T, comesAfter, isNext func(a, b *T) bool, ...) <-chan T
- func MapExec[T, S any](nWorkers int, slice iter.Seq[T], fn func(T) (S, error)) iter.Seq2[S, error]
- func PartitionRecordValue(field iceberg.PartitionField, val iceberg.Literal, schema *iceberg.Schema) (iceberg.Optional[iceberg.Literal], error)
- func TruncateUpperBoundBinary(val []byte, trunc int) []byte
- func TruncateUpperBoundText(s string, trunc int) string
- type DataFileStatistics
- type Enumerated
- type FileFormat
- type FileReader
- type FileSource
- type Metadata
- type MetricModeType
- type MetricsMode
- type ParquetFileSource
- type StatisticsCollector
- type StatsAgg
- type TypedStats
- type WriteFileInfo
Constants ¶
View Source
const (
	ParquetRowGroupSizeBytesKey              = "write.parquet.row-group-size-bytes"
	ParquetRowGroupSizeBytesDefault          = 128 * 1024 * 1024 // 128 MB
	ParquetRowGroupLimitKey                  = "write.parquet.row-group-limit"
	ParquetRowGroupLimitDefault              = 1048576
	ParquetPageSizeBytesKey                  = "write.parquet.page-size-bytes"
	ParquetPageSizeBytesDefault              = 1024 * 1024 // 1 MB
	ParquetPageRowLimitKey                   = "write.parquet.page-row-limit"
	ParquetPageRowLimitDefault               = 20000
	ParquetDictSizeBytesKey                  = "write.parquet.dict-size-bytes"
	ParquetDictSizeBytesDefault              = 2 * 1024 * 1024 // 2 MB
	ParquetCompressionKey                    = "write.parquet.compression-codec"
	ParquetCompressionDefault                = "zstd"
	ParquetCompressionLevelKey               = "write.parquet.compression-level"
	ParquetCompressionLevelDefault           = -1
	ParquetBloomFilterMaxBytesKey            = "write.parquet.bloom-filter-max-bytes"
	ParquetBloomFilterMaxBytesDefault        = 1024 * 1024
	ParquetBloomFilterColumnEnabledKeyPrefix = "write.parquet.bloom-filter-enabled.column"
)
Variables ¶
This section is empty.
Functions ¶
func BigEndianToDecimal ¶
func BigEndianToDecimal(buf []byte) (decimal.Decimal128, error)
func MakeSequencedChan ¶
func MakeSequencedChan[T any](bufferSize uint, source <-chan T, comesAfter, isNext func(a, b *T) bool, initial T) <-chan T
MakeSequencedChan creates a channel that outputs values in a given order based on the comesAfter and isNext functions. The values are read in from the provided source and then re-ordered before being sent to the output.
func PartitionRecordValue ¶
func TruncateUpperBoundText ¶
Types ¶
type DataFileStatistics ¶
type DataFileStatistics struct {
RecordCount int64
ColSizes map[int]int64
ValueCounts map[int]int64
NullValueCounts map[int]int64
NanValueCounts map[int]int64
ColAggs map[int]StatsAgg
SplitOffsets []int64
}
func (*DataFileStatistics) PartitionValue ¶
func (d *DataFileStatistics) PartitionValue(field iceberg.PartitionField, sc *iceberg.Schema) any
func (*DataFileStatistics) ToDataFile ¶
func (d *DataFileStatistics) ToDataFile(schema *iceberg.Schema, spec iceberg.PartitionSpec, path string, format iceberg.FileFormat, content iceberg.ManifestEntryContent, filesize int64, partitionValues map[int]any) iceberg.DataFile
type Enumerated ¶
Enumerated is a quick way to represent a sequenced value that can be processed in parallel and then needs to be reordered.
type FileFormat ¶
type FileFormat interface {
Open(context.Context, iceio.IO, string) (FileReader, error)
PathToIDMapping(*iceberg.Schema) (map[string]int, error)
DataFileStatsFromMeta(rdr Metadata, statsCols map[int]StatisticsCollector, colMapping map[string]int) *DataFileStatistics
GetWriteProperties(iceberg.Properties) any
WriteDataFile(ctx context.Context, fs iceio.WriteFileIO, partitionValues map[int]any, info WriteFileInfo, batches []arrow.RecordBatch) (iceberg.DataFile, error)
}
func FormatFromFileName ¶
func FormatFromFileName(fileName string) FileFormat
func GetFileFormat ¶
func GetFileFormat(format iceberg.FileFormat) FileFormat
type FileReader ¶
type FileReader interface {
io.Closer
Metadata() Metadata
SourceFileSize() int64
Schema() (*arrow.Schema, error)
// PrunedSchema takes in the list of projected field IDs and returns the arrow schema
// that represents the underlying file schema with only the projected fields. It also
// returns the indexes of the projected columns to allow reading *only* the needed
// columns.
PrunedSchema(projectedIDs map[int]struct{}, mapping iceberg.NameMapping) (*arrow.Schema, []int, error)
// GetRecords returns a record reader for only the provided columns (using nil will read
// all of the columns of the underlying file.) The `tester` is a function that can be used,
// if non-nil, to filter aspects of the file such as skipping row groups in a parquet file.
GetRecords(ctx context.Context, cols []int, tester any) (array.RecordReader, error)
// ReadTable reads the entire file and returns it as an arrow table.
ReadTable(context.Context) (arrow.Table, error)
}
type FileSource ¶
type FileSource interface {
GetReader(context.Context) (FileReader, error)
}
func GetFile ¶
func GetFile(ctx context.Context, fs iceio.IO, dataFile iceberg.DataFile, isPosDeletes bool) (FileSource, error)
GetFile opens the given file using the provided file system.
The FileSource interface allows abstracting away the underlying file format while providing utilities to read the file as Arrow record batches.
type MetricModeType ¶
type MetricModeType string
const (
	MetricModeTruncate MetricModeType = "truncate"
	MetricModeNone     MetricModeType = "none"
	MetricModeCounts   MetricModeType = "counts"
	MetricModeFull     MetricModeType = "full"
)
type MetricsMode ¶
type MetricsMode struct {
Typ MetricModeType
Len int
}
func MatchMetricsMode ¶
func MatchMetricsMode(mode string) (MetricsMode, error)
type ParquetFileSource ¶
type ParquetFileSource struct {
// contains filtered or unexported fields
}
func (*ParquetFileSource) GetReader ¶
func (pfs *ParquetFileSource) GetReader(ctx context.Context) (FileReader, error)
type StatisticsCollector ¶
type StatisticsCollector struct {
FieldID int
IcebergTyp iceberg.PrimitiveType
Mode MetricsMode
ColName string
}
type TypedStats ¶
type TypedStats[T iceberg.LiteralType] interface {
	Min() T
	Max() T
}
type WriteFileInfo ¶
type WriteFileInfo struct {
FileSchema *iceberg.Schema
Spec iceberg.PartitionSpec
FileName string
StatsCols map[int]StatisticsCollector
WriteProps any
Content iceberg.ManifestEntryContent
}
Click to show internal directories.
Click to hide internal directories.