use of com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind in project presto by prestodb.
the class StripeReader method createValueStreams.
/**
 * Builds a value stream for each stream in the stripe that carries data.
 * Index streams and zero-length streams get no entry in the result.
 *
 * @param streams stream descriptors keyed by stream id
 * @param streamsData input streams keyed by stream id
 * @param columnEncodings column encodings, looked up by the stream's column number
 * @return an immutable map from stream id to its value stream
 */
private Map<StreamId, ValueStream<?>> createValueStreams(Map<StreamId, Stream> streams, Map<StreamId, OrcInputStream> streamsData, List<ColumnEncoding> columnEncodings) {
    ImmutableMap.Builder<StreamId, ValueStream<?>> result = ImmutableMap.builder();
    for (Entry<StreamId, Stream> streamEntry : streams.entrySet()) {
        Stream descriptor = streamEntry.getValue();
        // The encoding is resolved for every stream, including the ones skipped below.
        ColumnEncodingKind encodingKind = columnEncodings.get(descriptor.getColumn()).getColumnEncodingKind();

        // Only non-index streams with a non-zero length produce a value stream.
        boolean carriesData = !isIndexStream(descriptor) && descriptor.getLength() != 0;
        if (carriesData) {
            StreamId id = streamEntry.getKey();
            OrcInputStream data = streamsData.get(id);
            OrcTypeKind typeKind = types.get(descriptor.getColumn()).getOrcTypeKind();
            ValueStream<?> valueStream = ValueStreams.createValueStreams(id, data, typeKind, encodingKind, descriptor.isUseVInts());
            result.put(id, valueStream);
        }
    }
    return result.build();
}
use of com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind in project presto by prestodb.
the class StripeReader method createDictionaryStreamSources.
/**
 * Collects checkpointed stream sources for the dictionary streams of a stripe.
 * A stream is included only when {@code isDictionary} accepts it for its column
 * encoding and a value stream exists for it in {@code valueStreams}.
 *
 * @param streams stream descriptors keyed by stream id
 * @param valueStreams value streams keyed by stream id; a missing entry causes the stream to be skipped
 * @param columnEncodings column encodings, looked up by the stream's column number
 * @return stream sources wrapping the selected dictionary streams
 */
public StreamSources createDictionaryStreamSources(Map<StreamId, Stream> streams, Map<StreamId, ValueStream<?>> valueStreams, List<ColumnEncoding> columnEncodings) {
    ImmutableMap.Builder<StreamId, StreamSource<?>> sources = ImmutableMap.builder();
    for (Entry<StreamId, Stream> streamEntry : streams.entrySet()) {
        StreamId id = streamEntry.getKey();
        Stream descriptor = streamEntry.getValue();
        int column = descriptor.getColumn();
        ColumnEncodingKind encodingKind = columnEncodings.get(column).getColumnEncodingKind();

        // Non-dictionary streams are ignored.
        if (isDictionary(descriptor, encodingKind)) {
            ValueStream<?> dictionaryValues = valueStreams.get(id);
            // No value stream means there is no data to expose for this stream.
            if (dictionaryValues != null) {
                OrcTypeKind typeKind = types.get(column).getOrcTypeKind();
                StreamCheckpoint checkpoint = getDictionaryStreamCheckpoint(id, typeKind, encodingKind);
                sources.put(id, createCheckpointStreamSource(dictionaryValues, checkpoint));
            }
        }
    }
    return new StreamSources(sources.build());
}
use of com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind in project presto by prestodb.
the class Checkpoints method getStreamCheckpoints.
/**
 * Computes the stream checkpoints of one row group for the requested columns.
 * The checkpoint builder is chosen per column from the column's ORC type, and the
 * row-group index positions of the column are fed to it.
 *
 * @param columns column numbers to produce checkpoints for
 * @param columnTypes ORC types, looked up by column number
 * @param compressionKind compression kind forwarded to the per-type checkpoint builders
 * @param rowGroupId index of the row group within each column's row-group index list
 * @param columnEncodings column encodings, looked up by column number
 * @param streams stream descriptors; used to determine which stream kinds each column has
 * @param columnIndexes row-group indexes keyed by column number
 * @return an immutable map from stream id to checkpoint
 * @throws InvalidCheckpointException if a column has unconsumed positions and not all of its positions are zero
 * @throws IllegalArgumentException if a column is of type UNION
 */
public static Map<StreamId, StreamCheckpoint> getStreamCheckpoints(Set<Integer> columns, List<OrcType> columnTypes, CompressionKind compressionKind, int rowGroupId, List<ColumnEncoding> columnEncodings, Map<StreamId, Stream> streams, Map<Integer, List<RowGroupIndex>> columnIndexes) throws InvalidCheckpointException {
    // Group the stream kinds that are present by column number.
    ImmutableSetMultimap.Builder<Integer, StreamKind> kindsByColumnBuilder = ImmutableSetMultimap.builder();
    for (Stream descriptor : streams.values()) {
        kindsByColumnBuilder.put(descriptor.getColumn(), descriptor.getStreamKind());
    }
    SetMultimap<Integer, StreamKind> kindsByColumn = kindsByColumnBuilder.build();

    ImmutableMap.Builder<StreamId, StreamCheckpoint> result = ImmutableMap.builder();
    for (int column : columns) {
        List<Integer> offsets = columnIndexes.get(column).get(rowGroupId).getPositions();
        ColumnEncodingKind encoding = columnEncodings.get(column).getColumnEncodingKind();
        OrcTypeKind typeKind = columnTypes.get(column).getOrcTypeKind();
        Set<StreamKind> present = kindsByColumn.get(column);
        ColumnPositionsList positions = new ColumnPositionsList(column, typeKind, offsets);

        switch (typeKind) {
            case BOOLEAN:
                result.putAll(getBooleanColumnCheckpoints(column, compressionKind, present, positions));
                break;
            case BYTE:
                result.putAll(getByteColumnCheckpoints(column, compressionKind, present, positions));
                break;
            case SHORT:
            case INT:
            case LONG:
            case DATE:
                result.putAll(getLongColumnCheckpoints(column, encoding, compressionKind, present, positions));
                break;
            case FLOAT:
                result.putAll(getFloatColumnCheckpoints(column, compressionKind, present, positions));
                break;
            case DOUBLE:
                result.putAll(getDoubleColumnCheckpoints(column, compressionKind, present, positions));
                break;
            case TIMESTAMP:
                result.putAll(getTimestampColumnCheckpoints(column, encoding, compressionKind, present, positions));
                break;
            case BINARY:
            case STRING:
            case VARCHAR:
            case CHAR:
                result.putAll(getSliceColumnCheckpoints(column, encoding, compressionKind, present, positions));
                break;
            case LIST:
            case MAP:
                result.putAll(getListOrMapColumnCheckpoints(column, encoding, compressionKind, present, positions));
                break;
            case STRUCT:
                result.putAll(getStructColumnCheckpoints(column, compressionKind, present, positions));
                break;
            case DECIMAL:
                result.putAll(getDecimalColumnCheckpoints(column, encoding, compressionKind, present, positions));
                break;
            case UNION:
                throw new IllegalArgumentException("Unsupported column type " + typeKind);
        }

        // Leftover positions are tolerated only when every recorded position is zero.
        boolean leftoverPositions = positions.hasNextPosition();
        if (leftoverPositions && !Iterables.all(offsets, equalTo(0))) {
            throw new InvalidCheckpointException(format("Column %s, of type %s, contains %s offset positions, but only %s positions were consumed", column, typeKind, offsets.size(), positions.getIndex()));
        }
    }
    return result.build();
}
use of com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind in project presto by prestodb.
the class SliceStreamReader method startStripe.
/**
 * Selects the reader matching this column's encoding for the new stripe and
 * forwards the call to it.
 *
 * @param dictionaryStreamSources dictionary stream sources passed on to the selected reader
 * @param encoding column encodings; this reader's entry is looked up by its stream id
 * @throws IOException declared by the reader contract
 * @throws IllegalArgumentException if the encoding is neither a direct nor a dictionary kind
 */
@Override
public void startStripe(StreamSources dictionaryStreamSources, List<ColumnEncoding> encoding) throws IOException {
    ColumnEncodingKind kind = encoding.get(streamDescriptor.getStreamId()).getColumnEncodingKind();

    boolean direct = kind == DIRECT || kind == DIRECT_V2 || kind == DWRF_DIRECT;
    boolean dictionary = kind == DICTIONARY || kind == DICTIONARY_V2;

    // Reject anything else before touching the current reader.
    if (!direct && !dictionary) {
        throw new IllegalArgumentException("Unsupported encoding " + kind);
    }

    if (direct) {
        currentReader = directReader;
    }
    else {
        currentReader = dictionaryReader;
    }
    currentReader.startStripe(dictionaryStreamSources, encoding);
}
use of com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind in project presto by prestodb.
the class StripeReader method readStripe.
/**
 * Reads one stripe and assembles its row groups and dictionary stream sources.
 *
 * Two paths exist. When the stripe has more rows than {@code rowsInRowGroup}, or a
 * DICTIONARY-encoded column has an IN_DICTIONARY stream, the index streams are read,
 * row groups are selected, and one row group is built per selection. Otherwise, or
 * when that path hits an {@code InvalidCheckpointException} and the stripe has no
 * row-group dictionary, the stripe is read as a single row group without ROW_INDEX streams.
 *
 * @param stripe the stripe to read
 * @param systemMemoryUsage memory context passed to the read helpers; closed when every row group is skipped
 * @return the stripe, or {@code null} when no row group is selected
 * @throws IOException declared; raised by the underlying reads
 * @throws OrcCorruptionException when row-group creation fails with an invalid checkpoint and the stripe has a row-group dictionary
 */
public Stripe readStripe(StripeInformation stripe, AggregatedMemoryContext systemMemoryUsage) throws IOException {
// read the stripe footer
StripeFooter stripeFooter = readStripeFooter(stripe, systemMemoryUsage);
List<ColumnEncoding> columnEncodings = stripeFooter.getColumnEncodings();
// get streams for selected columns
Map<StreamId, Stream> streams = new HashMap<>();
// set when any included DICTIONARY-encoded column has an IN_DICTIONARY stream
boolean hasRowGroupDictionary = false;
for (Stream stream : stripeFooter.getStreams()) {
if (includedOrcColumns.contains(stream.getColumn())) {
streams.put(new StreamId(stream), stream);
ColumnEncodingKind columnEncoding = columnEncodings.get(stream.getColumn()).getColumnEncodingKind();
if (columnEncoding == DICTIONARY && stream.getStreamKind() == StreamKind.IN_DICTIONARY) {
hasRowGroupDictionary = true;
}
}
}
// handle stripes with more than one row group or a dictionary
if ((stripe.getNumberOfRows() > rowsInRowGroup) || hasRowGroupDictionary) {
// determine ranges of the stripe to read
Map<StreamId, DiskRange> diskRanges = getDiskRanges(stripeFooter.getStreams());
// keep only the ranges of streams that belong to included columns
diskRanges = Maps.filterKeys(diskRanges, Predicates.in(streams.keySet()));
// read the file regions
Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, systemMemoryUsage);
// read the bloom filter for each column
Map<Integer, List<HiveBloomFilter>> bloomFilterIndexes = readBloomFilterIndexes(streams, streamsData);
// read the row index for each column
Map<Integer, List<RowGroupIndex>> columnIndexes = readColumnIndexes(streams, streamsData, bloomFilterIndexes);
// select the row groups matching the tuple domain
Set<Integer> selectedRowGroups = selectRowGroups(stripe, columnIndexes);
// if all row groups are skipped, return null
if (selectedRowGroups.isEmpty()) {
// set accounted memory usage to zero
systemMemoryUsage.close();
return null;
}
// value streams
Map<StreamId, ValueStream<?>> valueStreams = createValueStreams(streams, streamsData, columnEncodings);
// build the dictionary streams
StreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings);
// build the row groups
try {
List<RowGroup> rowGroups = createRowGroups(stripe.getNumberOfRows(), streams, valueStreams, columnIndexes, selectedRowGroups, columnEncodings);
return new Stripe(stripe.getNumberOfRows(), columnEncodings, rowGroups, dictionaryStreamSources);
} catch (InvalidCheckpointException e) {
// we must fail because the length of the row group dictionary is contained in the checkpoint stream.
if (hasRowGroupDictionary) {
throw new OrcCorruptionException(e, "ORC file %s has corrupt checkpoints", orcDataSource);
}
// otherwise the exception is intentionally dropped: execution falls through
// and the stripe is re-read below as a single row group
}
}
// stripe only has one row group and no dictionary
// (also reached after an invalid checkpoint was ignored above)
ImmutableMap.Builder<StreamId, DiskRange> diskRangesBuilder = ImmutableMap.builder();
for (Entry<StreamId, DiskRange> entry : getDiskRanges(stripeFooter.getStreams()).entrySet()) {
StreamId streamId = entry.getKey();
// ROW_INDEX streams are left out on this path; only streams of included columns are kept
if (streamId.getStreamKind() != ROW_INDEX && streams.keySet().contains(streamId)) {
diskRangesBuilder.put(entry);
}
}
ImmutableMap<StreamId, DiskRange> diskRanges = diskRangesBuilder.build();
// read the file regions
Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, systemMemoryUsage);
// value streams
Map<StreamId, ValueStream<?>> valueStreams = createValueStreams(streams, streamsData, columnEncodings);
// build the dictionary streams
StreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings);
// build the row group
ImmutableMap.Builder<StreamId, StreamSource<?>> builder = ImmutableMap.builder();
for (Entry<StreamId, ValueStream<?>> entry : valueStreams.entrySet()) {
builder.put(entry.getKey(), new ValueStreamSource<>(entry.getValue()));
}
// a single row group that spans every row of the stripe
RowGroup rowGroup = new RowGroup(0, 0, stripe.getNumberOfRows(), new StreamSources(builder.build()));
return new Stripe(stripe.getNumberOfRows(), columnEncodings, ImmutableList.of(rowGroup), dictionaryStreamSources);
}
Aggregations