Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.
Class ColumnWriterUtils, method buildRowGroupIndexes.
/**
 * Produces one {@link RowGroupIndex} for every entry of {@code rowGroupColumnStatistics}.
 * <p>
 * The position list of each index is the concatenation, in this order, of:
 * the prepend checkpoint for the row group (when {@code prependCheckpoints} is present),
 * the present-stream checkpoint for the row group (when the present stream reports checkpoints),
 * and the checkpoint of every data stream, in the order the streams were passed.
 *
 * @param compressed forwarded to {@code toPositionList} of every checkpoint
 * @param rowGroupColumnStatistics statistics per row group; its size determines how many indexes are built
 * @param prependCheckpoints optional checkpoints, indexed by row group, placed before all others
 * @param presentStream stream whose optional checkpoints follow the prepend checkpoints
 * @param dataStreams streams whose checkpoints, indexed by row group, are appended last
 * @return the row group indexes, in row group order
 */
@SafeVarargs
public static List<RowGroupIndex> buildRowGroupIndexes(boolean compressed, List<ColumnStatistics> rowGroupColumnStatistics, Optional<List<? extends StreamCheckpoint>> prependCheckpoints, PresentOutputStream presentStream, ValueOutputStream<? extends StreamCheckpoint>... dataStreams) {
    Optional<List<BooleanStreamCheckpoint>> presentCheckpoints = presentStream.getCheckpoints();
    List<List<? extends StreamCheckpoint>> checkpointsByStream = Arrays.stream(dataStreams)
            .map(ValueOutputStream::getCheckpoints)
            .collect(Collectors.toList());

    ImmutableList.Builder<RowGroupIndex> indexes = ImmutableList.builder();
    for (int rowGroup = 0; rowGroup < rowGroupColumnStatistics.size(); rowGroup++) {
        // Look up both leading checkpoints before emitting any positions; a missing
        // (or null) checkpoint simply contributes nothing.
        StreamCheckpoint prepend = prependCheckpoints.isPresent() ? prependCheckpoints.get().get(rowGroup) : null;
        StreamCheckpoint present = presentCheckpoints.isPresent() ? presentCheckpoints.get().get(rowGroup) : null;

        ImmutableList.Builder<Integer> positions = ImmutableList.builder();
        if (prepend != null) {
            positions.addAll(prepend.toPositionList(compressed));
        }
        if (present != null) {
            positions.addAll(present.toPositionList(compressed));
        }

        // Data stream positions follow, one stream after another.
        for (List<? extends StreamCheckpoint> streamCheckpoints : checkpointsByStream) {
            positions.addAll(streamCheckpoints.get(rowGroup).toPositionList(compressed));
        }

        indexes.add(new RowGroupIndex(positions.build(), rowGroupColumnStatistics.get(rowGroup)));
    }
    return indexes.build();
}
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.
Class ColumnWriters, method createColumnWriter.
/**
* Creates a column writer for the ORC node at {@code nodeIndex}, dispatching on that node's ORC type kind.
* LIST, MAP and STRUCT nodes obtain the writers for their child nodes by calling this method recursively.
*
* @param nodeIndex - index of the node in the orcTypes
* @param sequence - sequence id handed to the created writers and to recursive calls; DATE and DECIMAL writers
* do not receive it, and flat map value writers receive the sequence given to the value writer supplier instead
* @param orcTypes - ORC types, looked up by node index
* @param type - type of the column, must not be null; its type parameters supply the element, key, value and field
* types for nested writers
* @param columnWriterOptions - options passed to the writers; also consulted for the dictionary encoding flags,
* the string statistics limit and the set of flattened map nodes
* @param orcEncoding - target encoding; DATE, DECIMAL and CHAR are rejected for DWRF, flat maps are rejected for
* anything other than DWRF
* @param hiveStorageTimeZone - passed to the timestamp writer and to recursive calls
* @param dwrfEncryptors - source of the optional encryptor for this node; also passed to recursive calls
* @param metadataWriter - passed to every created writer
* @return the column writer for the node
* @throws IllegalArgumentException if the node's type kind is not handled by the switch (and, presumably, when one
* of the checkArgument preconditions fails - confirm against the checkArgument implementation in use)
*/
public static ColumnWriter createColumnWriter(int nodeIndex, int sequence, List<OrcType> orcTypes, Type type, ColumnWriterOptions columnWriterOptions, OrcEncoding orcEncoding, DateTimeZone hiveStorageTimeZone, DwrfEncryptionInfo dwrfEncryptors, MetadataWriter metadataWriter) {
requireNonNull(type, "type is null");
OrcType orcType = orcTypes.get(nodeIndex);
// Encryptor registered for this node, if any; handed to most of the writers below.
Optional<DwrfDataEncryptor> dwrfEncryptor = dwrfEncryptors.getEncryptorByNodeId(nodeIndex);
switch(orcType.getOrcTypeKind()) {
case BOOLEAN:
return new BooleanColumnWriter(nodeIndex, sequence, type, columnWriterOptions, dwrfEncryptor, metadataWriter);
case FLOAT:
return new FloatColumnWriter(nodeIndex, sequence, type, columnWriterOptions, dwrfEncryptor, metadataWriter);
case DOUBLE:
return new DoubleColumnWriter(nodeIndex, sequence, type, columnWriterOptions, dwrfEncryptor, metadataWriter);
case BYTE:
return new ByteColumnWriter(nodeIndex, sequence, type, columnWriterOptions, dwrfEncryptor, metadataWriter);
case DATE:
checkArgument(orcEncoding != DWRF, "DWRF does not support %s type", type);
// NOTE: the DATE writer is created with DEFAULT_SEQUENCE_ID rather than the sequence argument.
return new LongColumnWriter(nodeIndex, DEFAULT_SEQUENCE_ID, type, columnWriterOptions, dwrfEncryptor, orcEncoding, DateStatisticsBuilder::new, metadataWriter);
case SHORT:
return new LongColumnWriter(nodeIndex, sequence, type, columnWriterOptions, dwrfEncryptor, orcEncoding, IntegerStatisticsBuilder::new, metadataWriter);
case INT:
case LONG:
if (columnWriterOptions.isIntegerDictionaryEncodingEnabled() && orcEncoding == DWRF) {
// ORC V1 does not support Integer Dictionary encoding. DWRF supports Integer dictionary encoding.
return new LongDictionaryColumnWriter(nodeIndex, sequence, type, columnWriterOptions, dwrfEncryptor, orcEncoding, metadataWriter);
}
return new LongColumnWriter(nodeIndex, sequence, type, columnWriterOptions, dwrfEncryptor, orcEncoding, IntegerStatisticsBuilder::new, metadataWriter);
case DECIMAL:
checkArgument(orcEncoding != DWRF, "DWRF does not support %s type", type);
// NOTE: the decimal writer is constructed without the sequence and encryptor arguments.
return new DecimalColumnWriter(nodeIndex, type, columnWriterOptions, orcEncoding, metadataWriter);
case TIMESTAMP:
case TIMESTAMP_MICROSECONDS:
return new TimestampColumnWriter(nodeIndex, sequence, type, columnWriterOptions, dwrfEncryptor, orcEncoding, hiveStorageTimeZone, metadataWriter);
case BINARY:
return new SliceDirectColumnWriter(nodeIndex, sequence, type, columnWriterOptions, dwrfEncryptor, orcEncoding, BinaryStatisticsBuilder::new, metadataWriter);
case CHAR:
// CHAR is rejected for DWRF; otherwise it is written exactly like VARCHAR and STRING.
checkArgument(orcEncoding != DWRF, "DWRF does not support %s type", type);
// fall through
case VARCHAR:
case STRING:
// Dictionary writer when string dictionary encoding is enabled, direct writer otherwise.
if (columnWriterOptions.isStringDictionaryEncodingEnabled()) {
return new SliceDictionaryColumnWriter(nodeIndex, sequence, type, columnWriterOptions, dwrfEncryptor, orcEncoding, metadataWriter);
}
int stringStatisticsLimit = columnWriterOptions.getStringStatisticsLimit();
return new SliceDirectColumnWriter(nodeIndex, sequence, type, columnWriterOptions, dwrfEncryptor, orcEncoding, () -> new StringStatisticsBuilder(stringStatisticsLimit), metadataWriter);
case LIST:
{
// The element writer is built for child node 0 using the first type parameter.
Type fieldType = type.getTypeParameters().get(0);
ColumnWriter elementWriter = createColumnWriter(orcType.getFieldTypeIndex(0), sequence, orcTypes, fieldType, columnWriterOptions, orcEncoding, hiveStorageTimeZone, dwrfEncryptors, metadataWriter);
return new ListColumnWriter(nodeIndex, sequence, columnWriterOptions, dwrfEncryptor, orcEncoding, elementWriter, metadataWriter);
}
case MAP:
{
// Nodes listed in getFlattenedNodes() are written as flat maps, which are accepted only for DWRF.
if (columnWriterOptions.getFlattenedNodes().contains(nodeIndex)) {
checkArgument(orcEncoding == DWRF, "%s does not support flat maps", orcEncoding);
Type valueType = type.getTypeParameters().get(1);
OrcType keyOrcType = orcTypes.get(orcType.getFieldTypeIndex(0));
Supplier<StatisticsBuilder> keyStatisticsBuilderSupplier = createStatisticsBuilderSupplier(keyOrcType, columnWriterOptions);
// value writers should not create their own expensive dictionaries, instead they should use shared dictionaries
ColumnWriterOptions valueWriterColumnWriterOptions = columnWriterOptions.copyWithDisabledDictionaryEncoding();
// Flat map writer needs to provide column statistics for the value node(s) even if there are no values.
// This lambda will provide empty column statistics right away instead of creating an expensive empty
// value writer and getting empty stats from it.
int valueNodeIndex = orcType.getFieldTypeIndex(1);
Supplier<Map<Integer, ColumnStatistics>> emptyColumnStatisticsSupplier = () -> createEmptyColumnStatistics(orcTypes, valueNodeIndex, columnWriterOptions);
// Value writers are created on demand, each with the sequence passed to this function.
IntFunction<ColumnWriter> valueWriterSupplier = (valueSequence) -> createColumnWriter(valueNodeIndex, valueSequence, orcTypes, valueType, valueWriterColumnWriterOptions, orcEncoding, hiveStorageTimeZone, dwrfEncryptors, metadataWriter);
return new MapFlatColumnWriter(nodeIndex, orcType.getFieldTypeIndex(0), valueNodeIndex, type.getTypeParameters().get(0), valueType, keyStatisticsBuilderSupplier, columnWriterOptions, dwrfEncryptor, metadataWriter, valueWriterSupplier, emptyColumnStatisticsSupplier);
}
// Regular map: key writer for child node 0 and value writer for child node 1, both using the caller's sequence.
ColumnWriter keyWriter = createColumnWriter(orcType.getFieldTypeIndex(0), sequence, orcTypes, type.getTypeParameters().get(0), columnWriterOptions, orcEncoding, hiveStorageTimeZone, dwrfEncryptors, metadataWriter);
ColumnWriter valueWriter = createColumnWriter(orcType.getFieldTypeIndex(1), sequence, orcTypes, type.getTypeParameters().get(1), columnWriterOptions, orcEncoding, hiveStorageTimeZone, dwrfEncryptors, metadataWriter);
return new MapColumnWriter(nodeIndex, sequence, columnWriterOptions, dwrfEncryptor, orcEncoding, keyWriter, valueWriter, metadataWriter);
}
case STRUCT:
{
// One child writer per struct field, in field order; field i uses the i-th type parameter.
ImmutableList.Builder<ColumnWriter> fieldWriters = ImmutableList.builder();
for (int fieldId = 0; fieldId < orcType.getFieldCount(); fieldId++) {
int childNodeIndex = orcType.getFieldTypeIndex(fieldId);
Type fieldType = type.getTypeParameters().get(fieldId);
fieldWriters.add(createColumnWriter(childNodeIndex, sequence, orcTypes, fieldType, columnWriterOptions, orcEncoding, hiveStorageTimeZone, dwrfEncryptors, metadataWriter));
}
return new StructColumnWriter(nodeIndex, sequence, columnWriterOptions, dwrfEncryptor, fieldWriters.build(), metadataWriter);
}
}
// Reached only when none of the cases above matched the node's type kind.
throw new IllegalArgumentException("Unsupported type: " + type);
}
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.
Class DoubleColumnWriter, method finishRowGroup.
/**
 * Closes out the current row group: builds its statistics, records them together with
 * their retained size, and starts a fresh statistics builder for the next row group.
 *
 * @return a single-entry map from this writer's column to the statistics just built
 */
@Override
public Map<Integer, ColumnStatistics> finishRowGroup() {
    checkState(!closed);

    ColumnStatistics rowGroupStatistics = statisticsBuilder.buildColumnStatistics();
    rowGroupColumnStatistics.add(rowGroupStatistics);

    long retainedBytes = rowGroupStatistics.getRetainedSizeInBytes();
    columnStatisticsRetainedSizeInBytes += retainedBytes;

    // The next row group accumulates into a brand-new builder.
    statisticsBuilder = new DoubleStatisticsBuilder();

    Map<Integer, ColumnStatistics> statisticsByColumn = ImmutableMap.of(column, rowGroupStatistics);
    return statisticsByColumn;
}
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.
Class FloatColumnWriter, method finishRowGroup.
/**
 * Closes out the current row group: builds its statistics, records them together with
 * their retained size, and starts a fresh statistics builder for the next row group.
 *
 * @return a single-entry map from this writer's column to the statistics just built
 */
@Override
public Map<Integer, ColumnStatistics> finishRowGroup() {
    checkState(!closed);

    ColumnStatistics rowGroupStatistics = statisticsBuilder.buildColumnStatistics();
    rowGroupColumnStatistics.add(rowGroupStatistics);

    long retainedBytes = rowGroupStatistics.getRetainedSizeInBytes();
    columnStatisticsRetainedSizeInBytes += retainedBytes;

    // The float writer resets to a DoubleStatisticsBuilder for the next row group.
    statisticsBuilder = new DoubleStatisticsBuilder();

    Map<Integer, ColumnStatistics> statisticsByColumn = ImmutableMap.of(column, rowGroupStatistics);
    return statisticsByColumn;
}
Use of com.facebook.presto.orc.metadata.statistics.ColumnStatistics in project urban-eureka by errir503.
Class SliceDirectColumnWriter, method finishRowGroup.
/**
 * Closes out the current row group: builds its statistics, records them together with
 * their retained size, and obtains a fresh statistics builder from the supplier for the
 * next row group.
 *
 * @return a single-entry map from this writer's column to the statistics just built
 */
@Override
public Map<Integer, ColumnStatistics> finishRowGroup() {
    checkState(!closed);

    ColumnStatistics rowGroupStatistics = statisticsBuilder.buildColumnStatistics();
    rowGroupColumnStatistics.add(rowGroupStatistics);

    long retainedBytes = rowGroupStatistics.getRetainedSizeInBytes();
    columnStatisticsRetainedSizeInBytes += retainedBytes;

    // The supplier decides which kind of builder the next row group uses.
    statisticsBuilder = statisticsBuilderSupplier.get();

    Map<Integer, ColumnStatistics> statisticsByColumn = ImmutableMap.of(column, rowGroupStatistics);
    return statisticsByColumn;
}
Aggregations