Use of io.prestosql.orc.metadata.OrcColumnId in project hetu-core by openlookeng.
The class TestBooleanStream, method testWriteMultiple.
@Test
public void testWriteMultiple()
        throws IOException
{
    BooleanOutputStream outputStream = createValueOutputStream();
    for (int i = 0; i < 3; i++) {
        outputStream.reset();

        BooleanList expectedValues = new BooleanArrayList(1024);
        outputStream.writeBooleans(32, true);
        expectedValues.addAll(Collections.nCopies(32, true));
        outputStream.writeBooleans(32, false);
        expectedValues.addAll(Collections.nCopies(32, false));
        outputStream.writeBooleans(1, true);
        expectedValues.add(true);
        outputStream.writeBooleans(1, false);
        expectedValues.add(false);
        outputStream.writeBooleans(34, true);
        expectedValues.addAll(Collections.nCopies(34, true));
        outputStream.writeBooleans(34, false);
        expectedValues.addAll(Collections.nCopies(34, false));
        outputStream.writeBoolean(true);
        expectedValues.add(true);
        outputStream.writeBoolean(false);
        expectedValues.add(false);
        outputStream.close();

        DynamicSliceOutput sliceOutput = new DynamicSliceOutput(1000);
        StreamDataOutput streamDataOutput = outputStream.getStreamDataOutput(new OrcColumnId(33));
        streamDataOutput.writeData(sliceOutput);
        Stream stream = streamDataOutput.getStream();
        assertEquals(stream.getStreamKind(), StreamKind.DATA);
        assertEquals(stream.getColumnId(), new OrcColumnId(33));
        assertEquals(stream.getLength(), sliceOutput.size());

        BooleanInputStream valueStream = createValueStream(sliceOutput.slice());
        for (int index = 0; index < expectedValues.size(); index++) {
            boolean expectedValue = expectedValues.getBoolean(index);
            boolean actualValue = readValue(valueStream);
            assertEquals(actualValue, expectedValue);
        }
    }
}
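The mixed run lengths (32, 1, 34, then single values) deliberately make writes start and end off byte boundaries: ORC packs eight booleans per byte, most significant bit first, with byte-level run-length encoding underneath. The helpers createValueOutputStream, createValueStream, and readValue come from the surrounding test class and are not shown in this snippet. As a self-contained illustration of the bit layout only (not hetu-core code):

    // Illustrative sketch: eight booleans pack into one byte, MSB first,
    // the layout used by ORC boolean (and PRESENT) streams.
    static byte[] packBooleans(boolean[] values)
    {
        byte[] packed = new byte[(values.length + 7) / 8];
        for (int i = 0; i < values.length; i++) {
            if (values[i]) {
                // set the bit for value i within its byte
                packed[i / 8] |= (byte) (0x80 >>> (i % 8));
            }
        }
        return packed;
    }

With this layout, a write of 33 values fills four bytes and one bit of a fifth, which is exactly the boundary condition the test exercises.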
Use of io.prestosql.orc.metadata.OrcColumnId in project hetu-core by openlookeng.
The class StripeReader, method createDictionaryStreamSources.
private InputStreamSources createDictionaryStreamSources(Map<StreamId, Stream> streams, Map<StreamId, ValueInputStream<?>> valueStreams, ColumnMetadata<ColumnEncoding> columnEncodings)
{
    ImmutableMap.Builder<StreamId, InputStreamSource<?>> dictionaryStreamBuilder = ImmutableMap.builder();
    for (Entry<StreamId, Stream> entry : streams.entrySet()) {
        StreamId streamId = entry.getKey();
        Stream stream = entry.getValue();
        OrcColumnId column = stream.getColumnId();

        // only process dictionary streams
        ColumnEncodingKind columnEncoding = columnEncodings.get(column).getColumnEncodingKind();
        if (!isDictionary(stream, columnEncoding)) {
            continue;
        }

        // skip streams without data
        ValueInputStream<?> valueStream = valueStreams.get(streamId);
        if (valueStream == null) {
            continue;
        }

        OrcTypeKind columnType = types.get(stream.getColumnId()).getOrcTypeKind();
        StreamCheckpoint streamCheckpoint = getDictionaryStreamCheckpoint(streamId, columnType, columnEncoding);
        InputStreamSource<?> streamSource = createCheckpointStreamSource(valueStream, streamCheckpoint);
        dictionaryStreamBuilder.put(streamId, streamSource);
    }
    return new InputStreamSources(dictionaryStreamBuilder.build());
}
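Only dictionary-encoded columns carry DICTIONARY_DATA streams (and, for variable-width types, LENGTH streams holding dictionary entry lengths), which is what the isDictionary check filters on. That helper is not shown in the snippet; in the Presto ORC reader it is essentially the following, reproduced here as a hedged sketch rather than verbatim hetu-core code:

    private static boolean isDictionary(Stream stream, ColumnEncodingKind encodingKind)
    {
        // DICTIONARY_DATA streams are dictionary state by definition;
        // LENGTH streams are dictionary state only when the column itself
        // uses a dictionary encoding.
        return stream.getStreamKind() == StreamKind.DICTIONARY_DATA
                || (stream.getStreamKind() == StreamKind.LENGTH
                        && (encodingKind == ColumnEncodingKind.DICTIONARY || encodingKind == ColumnEncodingKind.DICTIONARY_V2));
    }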
Use of io.prestosql.orc.metadata.OrcColumnId in project hetu-core by openlookeng.
The class StripeReader, method readStripe.
public Stripe readStripe(StripeInformation stripe, AggregatedMemoryContext systemMemoryUsage)
        throws IOException
{
    // read the stripe footer
    OrcStripeFooterCacheKey cacheKey = new OrcStripeFooterCacheKey();
    cacheKey.setOrcDataSourceId(new OrcDataSourceIdWithTimeStamp(orcDataSource.getId(), orcDataSource.getLastModifiedTime()));
    cacheKey.setStripeOffset(stripe.getOffset());
    StripeFooter stripeFooter;
    if (orcCacheProperties.isStripeFooterCacheEnabled()) {
        try {
            stripeFooter = orcCacheStore.getStripeFooterCache().get(cacheKey, () -> this.readStripeFooter(stripe, systemMemoryUsage));
        }
        catch (UncheckedExecutionException | ExecutionException executionException) {
            handleCacheLoadException(executionException);
            log.debug(executionException.getCause(), "Error while caching ORC stripe footer. Falling back to default flow");
            stripeFooter = readStripeFooter(stripe, systemMemoryUsage);
        }
    }
    else {
        stripeFooter = readStripeFooter(stripe, systemMemoryUsage);
    }
    ColumnMetadata<ColumnEncoding> columnEncodings = stripeFooter.getColumnEncodings();
    if (writeValidation.isPresent()) {
        writeValidation.get().validateTimeZone(orcDataSource.getId(), stripeFooter.getTimeZone());
    }
    ZoneId fileTimeZone = stripeFooter.getTimeZone();

    // get streams for selected columns
    Map<StreamId, Stream> streams = new HashMap<>();
    for (Stream stream : stripeFooter.getStreams()) {
        if (includedOrcColumnIds.contains(stream.getColumnId()) && isSupportedStreamType(stream, types.get(stream.getColumnId()).getOrcTypeKind())) {
            streams.put(new StreamId(stream), stream);
        }
    }

    // handle stripes with more than one row group
    boolean invalidCheckPoint = false;
    if (stripe.getNumberOfRows() > rowsInRowGroup) {
        // determine ranges of the stripe to read
        Map<StreamId, DiskRange> diskRanges = getDiskRanges(stripeFooter.getStreams());
        diskRanges = Maps.filterKeys(diskRanges, Predicates.in(streams.keySet()));

        // read the file regions
        Map<StreamId, OrcChunkLoader> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, systemMemoryUsage);

        // read the bloom filter for each column
        Map<OrcColumnId, List<HashableBloomFilter>> bloomFilterIndexes = readBloomFilterIndexes(streams, streamsData, stripe);

        // read the row index for each column
        Map<StreamId, List<RowGroupIndex>> columnIndexes = readColumnIndexes(streams, streamsData, bloomFilterIndexes, stripe);
        if (writeValidation.isPresent()) {
            writeValidation.get().validateRowGroupStatistics(orcDataSource.getId(), stripe.getOffset(), columnIndexes);
        }

        // select the row groups matching the tuple domain
        Set<Integer> selectedRowGroups = selectRowGroups(stripe, columnIndexes);

        // if all row groups are skipped, return null
        if (selectedRowGroups.isEmpty()) {
            // set accounted memory usage to zero
            systemMemoryUsage.close();
            return null;
        }

        // value streams
        Map<StreamId, ValueInputStream<?>> valueStreams = createValueStreams(streams, streamsData, columnEncodings);

        // build the dictionary streams
        InputStreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings);

        // build the row groups
        try {
            List<RowGroup> rowGroups = createRowGroups(stripe.getNumberOfRows(), streams, valueStreams, columnIndexes, selectedRowGroups, columnEncodings);
            return new Stripe(stripe.getNumberOfRows(), fileTimeZone, columnEncodings, rowGroups, dictionaryStreamSources);
        }
        catch (InvalidCheckpointException e) {
            // the ORC file contains a corrupt checkpoint stream; treat the stripe as a single row group
            invalidCheckPoint = true;
        }
    }

    // stripe only has one row group
    ImmutableMap.Builder<StreamId, DiskRange> diskRangesBuilder = ImmutableMap.builder();
    for (Entry<StreamId, DiskRange> entry : getDiskRanges(stripeFooter.getStreams()).entrySet()) {
        StreamId streamId = entry.getKey();
        if (streams.containsKey(streamId)) {
            diskRangesBuilder.put(entry);
        }
    }
    ImmutableMap<StreamId, DiskRange> diskRanges = diskRangesBuilder.build();

    // read the file regions
    Map<StreamId, OrcChunkLoader> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, systemMemoryUsage);

    long minAverageRowBytes = 0;
    for (Entry<StreamId, Stream> entry : streams.entrySet()) {
        if (entry.getKey().getStreamKind() == ROW_INDEX) {
            List<RowGroupIndex> rowGroupIndexes;
            if (orcCacheProperties.isRowIndexCacheEnabled()) {
                OrcRowIndexCacheKey indexCacheKey = new OrcRowIndexCacheKey();
                indexCacheKey.setOrcDataSourceId(new OrcDataSourceIdWithTimeStamp(orcDataSource.getId(), orcDataSource.getLastModifiedTime()));
                indexCacheKey.setStripeOffset(stripe.getOffset());
                indexCacheKey.setStreamId(entry.getKey());
                try {
                    rowGroupIndexes = orcCacheStore.getRowIndexCache().get(indexCacheKey, () -> metadataReader.readRowIndexes(hiveWriterVersion, new OrcInputStream(streamsData.get(entry.getKey()))));
                }
                catch (UncheckedExecutionException | ExecutionException executionException) {
                    handleCacheLoadException(executionException);
                    log.debug(executionException.getCause(), "Error while caching row group indexes. Falling back to default flow");
                    rowGroupIndexes = metadataReader.readRowIndexes(hiveWriterVersion, new OrcInputStream(streamsData.get(entry.getKey())));
                }
            }
            else {
                rowGroupIndexes = metadataReader.readRowIndexes(hiveWriterVersion, new OrcInputStream(streamsData.get(entry.getKey())));
            }
            checkState(rowGroupIndexes.size() == 1 || invalidCheckPoint, "expect a single row group or an invalid check point");

            long totalBytes = 0;
            long totalRows = 0;
            for (RowGroupIndex rowGroupIndex : rowGroupIndexes) {
                ColumnStatistics columnStatistics = rowGroupIndex.getColumnStatistics();
                if (columnStatistics.hasMinAverageValueSizeInBytes()) {
                    totalBytes += columnStatistics.getMinAverageValueSizeInBytes() * columnStatistics.getNumberOfValues();
                    totalRows += columnStatistics.getNumberOfValues();
                }
            }
            if (totalRows > 0) {
                minAverageRowBytes += totalBytes / totalRows;
            }
        }
    }

    // value streams
    Map<StreamId, ValueInputStream<?>> valueStreams = createValueStreams(streams, streamsData, columnEncodings);

    // build the dictionary streams
    InputStreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings);

    // build the row group
    ImmutableMap.Builder<StreamId, InputStreamSource<?>> builder = ImmutableMap.builder();
    for (Entry<StreamId, ValueInputStream<?>> entry : valueStreams.entrySet()) {
        builder.put(entry.getKey(), new ValueInputStreamSource<>(entry.getValue()));
    }
    RowGroup rowGroup = new RowGroup(0, 0, stripe.getNumberOfRows(), minAverageRowBytes, new InputStreamSources(builder.build()));
    return new Stripe(stripe.getNumberOfRows(), fileTimeZone, columnEncodings, ImmutableList.of(rowGroup), dictionaryStreamSources);
}
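Both the stripe-footer and row-index lookups rely on Guava's Cache.get(key, loader): the loader runs only on a cache miss, checked failures surface as ExecutionException and unchecked ones as UncheckedExecutionException, and in either case the method falls back to an uncached read. A stripped-down sketch of that pattern in isolation (the class, footerCache, and readFooterFromDisk are hypothetical stand-ins, not hetu-core names):

    import com.google.common.cache.Cache;
    import com.google.common.cache.CacheBuilder;
    import com.google.common.util.concurrent.UncheckedExecutionException;
    import java.io.IOException;
    import java.util.concurrent.ExecutionException;

    class StripeFooterCacheExample
    {
        private final Cache<Long, byte[]> footerCache = CacheBuilder.newBuilder().maximumSize(1_000).build();

        byte[] loadFooter(long stripeOffset)
                throws IOException
        {
            try {
                // the loader callable runs only when the key is absent
                return footerCache.get(stripeOffset, () -> readFooterFromDisk(stripeOffset));
            }
            catch (ExecutionException | UncheckedExecutionException e) {
                // cache load failed; fall back to a direct read, as readStripe does
                return readFooterFromDisk(stripeOffset);
            }
        }

        private byte[] readFooterFromDisk(long stripeOffset)
                throws IOException
        {
            // hypothetical direct read; stands in for readStripeFooter(...)
            return new byte[0];
        }
    }

Note that cache keys such as OrcStripeFooterCacheKey must implement value-based equals and hashCode (over data source id and stripe offset) for these lookups to ever hit.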
Use of io.prestosql.orc.metadata.OrcColumnId in project hetu-core by openlookeng.
The class Checkpoints, method getStreamCheckpoints.
public static Map<StreamId, StreamCheckpoint> getStreamCheckpoints(
        Set<OrcColumnId> columns,
        ColumnMetadata<OrcType> columnTypes,
        boolean compressed,
        int rowGroupId,
        ColumnMetadata<ColumnEncoding> columnEncodings,
        Map<StreamId, Stream> streams,
        Map<StreamId, List<RowGroupIndex>> columnIndexes)
        throws InvalidCheckpointException
{
    ImmutableSetMultimap.Builder<OrcColumnId, StreamKind> streamKindsBuilder = ImmutableSetMultimap.builder();
    for (Stream stream : streams.values()) {
        streamKindsBuilder.put(stream.getColumnId(), stream.getStreamKind());
    }
    SetMultimap<OrcColumnId, StreamKind> streamKinds = streamKindsBuilder.build();

    ImmutableMap.Builder<StreamId, StreamCheckpoint> checkpoints = ImmutableMap.builder();
    for (Map.Entry<StreamId, List<RowGroupIndex>> entry : columnIndexes.entrySet()) {
        OrcColumnId columnId = entry.getKey().getColumnId();
        if (!columns.contains(columnId)) {
            continue;
        }
        List<Integer> positionsList = entry.getValue().get(rowGroupId).getPositions();
        ColumnEncodingKind columnEncoding = columnEncodings.get(columnId).getColumnEncodingKind();
        OrcTypeKind columnType = columnTypes.get(columnId).getOrcTypeKind();
        Set<StreamKind> availableStreams = streamKinds.get(columnId);
        ColumnPositionsList columnPositionsList = new ColumnPositionsList(columnId, columnType, positionsList);
        switch (columnType) {
            case BOOLEAN:
                checkpoints.putAll(getBooleanColumnCheckpoints(columnId, compressed, availableStreams, columnPositionsList));
                break;
            case BYTE:
                checkpoints.putAll(getByteColumnCheckpoints(columnId, compressed, availableStreams, columnPositionsList));
                break;
            case SHORT:
            case INT:
            case LONG:
            case DATE:
                checkpoints.putAll(getLongColumnCheckpoints(columnId, columnEncoding, compressed, availableStreams, columnPositionsList));
                break;
            case FLOAT:
                checkpoints.putAll(getFloatColumnCheckpoints(columnId, compressed, availableStreams, columnPositionsList));
                break;
            case DOUBLE:
                checkpoints.putAll(getDoubleColumnCheckpoints(columnId, compressed, availableStreams, columnPositionsList));
                break;
            case TIMESTAMP:
                checkpoints.putAll(getTimestampColumnCheckpoints(columnId, columnEncoding, compressed, availableStreams, columnPositionsList));
                break;
            case BINARY:
            case STRING:
            case VARCHAR:
            case CHAR:
                checkpoints.putAll(getSliceColumnCheckpoints(columnId, columnEncoding, compressed, availableStreams, columnPositionsList));
                break;
            case LIST:
            case MAP:
                checkpoints.putAll(getListOrMapColumnCheckpoints(columnId, columnEncoding, compressed, availableStreams, columnPositionsList));
                break;
            case STRUCT:
                checkpoints.putAll(getStructColumnCheckpoints(columnId, compressed, availableStreams, columnPositionsList));
                break;
            case DECIMAL:
                checkpoints.putAll(getDecimalColumnCheckpoints(columnId, columnEncoding, compressed, availableStreams, columnPositionsList));
                break;
            default:
                throw new IllegalArgumentException("Unsupported column type " + columnType);
        }
    }
    return checkpoints.build();
}
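Each per-type helper consumes positions from the shared ColumnPositionsList in stream order and produces one checkpoint per stream the column actually has. The boolean case is the simplest; a sketch along the lines of the Presto implementation (hedged, names may differ slightly in hetu-core):

    private static Map<StreamId, StreamCheckpoint> getBooleanColumnCheckpoints(
            OrcColumnId columnId,
            boolean compressed,
            Set<StreamKind> availableStreams,
            ColumnPositionsList positionsList)
    {
        ImmutableMap.Builder<StreamId, StreamCheckpoint> checkpoints = ImmutableMap.builder();
        // when a PRESENT stream exists, its positions come first in the row index
        if (availableStreams.contains(StreamKind.PRESENT)) {
            checkpoints.put(new StreamId(columnId, StreamKind.PRESENT), new BooleanStreamCheckpoint(compressed, positionsList));
        }
        if (availableStreams.contains(StreamKind.DATA)) {
            checkpoints.put(new StreamId(columnId, StreamKind.DATA), new BooleanStreamCheckpoint(compressed, positionsList));
        }
        return checkpoints.build();
    }

Each checkpoint constructor pulls the next positions off the list, which is why the order of these checks must match the writer's position order; a mismatch is what surfaces as InvalidCheckpointException.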
Use of io.prestosql.orc.metadata.OrcColumnId in project hetu-core by openlookeng.
The class TestAbstractNumbericColumnReader, method testTypeCoercionInteger.
@Test
public void testTypeCoercionInteger()
        throws OrcCorruptionException
{
    OrcColumn column = new OrcColumn(
            "hdfs://hacluster/user/hive/warehouse/tpcds_orc_hive_1000.db/catalog_sales/cs_sold_date_sk=2452268/000896_0",
            new OrcColumnId(3),
            "cs_order_number",
            OrcType.OrcTypeKind.INT,
            new OrcDataSourceId("hdfs://hacluster/user/hive/warehouse/tpcds_orc_hive_1000.db/catalog_sales/cs_sold_date_sk=2452268/000896_0"),
            ImmutableList.of());
    ColumnReader actualIntegerColumnReader = ColumnReaders.createColumnReader(type, column, AggregatedMemoryContext.newSimpleAggregatedMemoryContext(), null);
    IntegerColumnReader expectedIntegerColumnReader = new IntegerColumnReader(type, column, AggregatedMemoryContext.newSimpleAggregatedMemoryContext().newLocalMemoryContext(ColumnReaders.class.getSimpleName()));
    assertEquals(actualIntegerColumnReader.toString(), expectedIntegerColumnReader.toString());
}
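Across all of these snippets, OrcColumnId is used only as a map or cache key and as the identity of a node in the flattened ORC type tree (column 0 is the root struct, so cs_order_number above is node 3). Based on that usage, it amounts to a small value class along these lines (a sketch, not the verbatim io.prestosql source):

    // Sketch of a value class matching how OrcColumnId is used above:
    // an immutable wrapper around the flattened type-tree index.
    public class OrcColumnId
    {
        public static final OrcColumnId ROOT_COLUMN = new OrcColumnId(0);

        private final int id;

        public OrcColumnId(int id)
        {
            if (id < 0) {
                throw new IllegalArgumentException("id is negative");
            }
            this.id = id;
        }

        public int getId()
        {
            return id;
        }

        @Override
        public boolean equals(Object o)
        {
            // value semantics are required: callers rebuild instances per lookup,
            // e.g. new OrcColumnId(33) in the boolean stream test
            if (this == o) {
                return true;
            }
            if (o == null || getClass() != o.getClass()) {
                return false;
            }
            return id == ((OrcColumnId) o).id;
        }

        @Override
        public int hashCode()
        {
            return Integer.hashCode(id);
        }

        @Override
        public String toString()
        {
            return String.valueOf(id);
        }
    }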