Use of io.prestosql.orc.metadata.Stream in project hetu-core by openlookeng.
From the class OrcWriter, method bufferStripeData.
/**
 * Collects the data for the current stripe. The returned entries are not the
 * actual bytes, but objects that know how to write them: index streams first,
 * then the data streams in sorted order, then the stripe footer.
 *
 * <p>As a side effect the finished stripe is appended to {@code closedStripes},
 * its retained size is accounted for, and validation and writer statistics
 * are updated.
 *
 * @param stripeStartOffset offset recorded in the stripe information and in validation
 * @param flushReason why the stripe is flushed; must be {@code CLOSED} when the stripe has no rows
 * @return the outputs to write in order, or an empty list when the stripe has no rows
 * @throws IOException declared by the signature; presumably raised by the writers involved — confirm
 */
private List<OrcDataOutput> bufferStripeData(long stripeStartOffset, FlushReason flushReason) throws IOException {
    if (stripeRowCount == 0) {
        verify(flushReason == CLOSED, "An empty stripe is not allowed");
        // the writers have to be closed, otherwise the later reset call fails
        for (ColumnWriter writer : columnWriters) {
            writer.close();
        }
        return ImmutableList.of();
    }

    if (rowGroupRowCount > 0) {
        finishRowGroup();
    }

    // dictionary encoded columns with a low compression ratio are switched to direct
    dictionaryCompressionOptimizer.finalOptimize(bufferedBytes);

    for (ColumnWriter writer : columnWriters) {
        writer.close();
    }

    List<OrcDataOutput> stripeOutputs = new ArrayList<>();
    List<Stream> streamDescriptors = new ArrayList<>(columnWriters.size() * 3);

    // index streams come first; order matters since a stream records only its length, not an offset
    long indexBytes = 0;
    for (ColumnWriter writer : columnWriters) {
        for (StreamDataOutput index : writer.getIndexStreams(metadataWriter)) {
            stripeOutputs.add(index);
            streamDescriptors.add(index.getStream());
            indexBytes += index.size();
        }
    }

    // gather every data stream and total up their sizes
    long dataBytes = 0;
    List<StreamDataOutput> sortedDataStreams = new ArrayList<>(columnWriters.size() * 2);
    for (ColumnWriter writer : columnWriters) {
        for (StreamDataOutput data : writer.getDataStreams()) {
            sortedDataStreams.add(data);
            dataBytes += data.size();
        }
    }
    Collections.sort(sortedDataStreams);

    // data streams follow the index streams, again in the exact order they will be written
    for (StreamDataOutput data : sortedDataStreams) {
        stripeOutputs.add(data);
        streamDescriptors.add(data.getStream());
    }

    Map<OrcColumnId, ColumnEncoding> encodingsByColumn = new HashMap<>();
    for (ColumnWriter writer : columnWriters) {
        encodingsByColumn.putAll(writer.getColumnEncodings());
    }

    Map<OrcColumnId, ColumnStatistics> statisticsByColumn = new HashMap<>();
    for (ColumnWriter writer : columnWriters) {
        statisticsByColumn.putAll(writer.getColumnStripeStatistics());
    }

    // column 0 is the struct column that stands for the whole row
    encodingsByColumn.put(ROOT_COLUMN, new ColumnEncoding(DIRECT, 0));
    statisticsByColumn.put(ROOT_COLUMN, new ColumnStatistics((long) stripeRowCount, 0, null, null, null, null, null, null, null, null));

    // stripe footer
    StripeFooter stripeFooter = new StripeFooter(
            streamDescriptors,
            toColumnMetadata(encodingsByColumn, orcTypes.size()),
            ZoneId.of("UTC"));
    Slice footerSlice = metadataWriter.writeStripeFooter(stripeFooter);
    stripeOutputs.add(createDataOutput(footerSlice));

    // final stripe statistics
    StripeStatistics stripeStatistics = new StripeStatistics(toColumnMetadata(statisticsByColumn, orcTypes.size()));
    recordValidation(validation -> validation.addStripeStatistics(stripeStartOffset, stripeStatistics));

    StripeInformation info = new StripeInformation(stripeRowCount, stripeStartOffset, indexBytes, dataBytes, footerSlice.length());
    ClosedStripe finishedStripe = new ClosedStripe(info, stripeStatistics);
    closedStripes.add(finishedStripe);
    closedStripesRetainedBytes += finishedStripe.getRetainedSizeInBytes();

    recordValidation(validation -> validation.addStripe(info.getNumberOfRows()));
    stats.recordStripeWritten(
            flushReason,
            info.getTotalLength(),
            info.getNumberOfRows(),
            dictionaryCompressionOptimizer.getDictionaryMemoryBytes());

    return stripeOutputs;
}
Use of io.prestosql.orc.metadata.Stream in project hetu-core by openlookeng.
From the class OrcWriter, method bufferFileFooter.
/**
 * Collects the data for the file footer. The returned entries are not the
 * actual bytes, but objects that know how to write them: the metadata
 * section, the footer, the postscript, and finally a single byte holding the
 * postscript length, in that order.
 *
 * <p>The optional pre-close callback runs first on a best-effort basis: a
 * failure is logged together with the exception and otherwise ignored, so
 * the footer is still produced. The accumulated closed stripes are cleared
 * once the footer object has been built.
 *
 * @return the outputs to write, in file order
 * @throws IOException declared by the signature; presumably raised by the metadata writer — confirm
 */
private List<OrcDataOutput> bufferFileFooter() throws IOException {
    if (preCloseCallback.isPresent()) {
        try {
            preCloseCallback.get().call();
        } catch (Exception e) {
            // Best effort: a failing callback must not prevent the footer from being
            // written, but the cause is kept in the log so the failure can be diagnosed.
            log.debug("Call pre close call back error: " + e);
        }
    }

    List<OrcDataOutput> outputData = new ArrayList<>();

    // metadata section: one statistics entry per closed stripe
    Metadata metadata = new Metadata(closedStripes.stream().map(ClosedStripe::getStatistics).map(Optional::of).collect(toList()));
    Slice metadataSlice = metadataWriter.writeMetadata(metadata);
    outputData.add(createDataOutput(metadataSlice));

    // file-level row count and statistics are derived from the closed stripes
    long numberOfRows = closedStripes.stream().mapToLong(stripe -> stripe.getStripeInformation().getNumberOfRows()).sum();
    Optional<ColumnMetadata<ColumnStatistics>> fileStats = toFileStats(closedStripes.stream().map(ClosedStripe::getStatistics).map(StripeStatistics::getColumnStatistics).collect(toList()));
    recordValidation(validation -> validation.setFileStatistics(fileStats));

    Map<String, Slice> localUserMetadata = this.userMetadata.entrySet().stream().collect(Collectors.toMap(Entry::getKey, entry -> utf8Slice(entry.getValue())));
    Footer footer = new Footer(numberOfRows, rowGroupMaxRowCount, closedStripes.stream().map(ClosedStripe::getStripeInformation).collect(toImmutableList()), orcTypes, fileStats, localUserMetadata);

    // the footer now carries everything needed from the closed stripes
    closedStripes.clear();
    closedStripesRetainedBytes = 0;

    Slice footerSlice = metadataWriter.writeFooter(footer);
    outputData.add(createDataOutput(footerSlice));
    recordValidation(validation -> validation.setVersion(metadataWriter.getOrcMetadataVersion()));

    // the postscript records the lengths of the footer and metadata sections
    Slice postscriptSlice = metadataWriter.writePostscript(footerSlice.length(), metadataSlice.length(), compression, maxCompressionBufferSize);
    outputData.add(createDataOutput(postscriptSlice));

    // last entry: postscript length as one unsigned byte (checkedCast rejects values that do not fit)
    outputData.add(createDataOutput(Slices.wrappedBuffer(UnsignedBytes.checkedCast(postscriptSlice.length()))));
    return outputData;
}
Use of io.prestosql.orc.metadata.Stream in project hetu-core by openlookeng.
From the class TestBooleanStream, method testWriteMultiple.
/**
 * Writes the same mixed sequence of boolean runs and single values three
 * times through one output stream, resetting it before each round, and checks
 * that the produced stream metadata and the decoded values match what was written.
 */
@Test
public void testWriteMultiple() throws IOException {
    BooleanOutputStream booleanOutput = createValueOutputStream();
    // lengths of the alternating true/false runs, in write order
    int[] runLengths = {32, 32, 1, 1, 34, 34};

    for (int round = 0; round < 3; round++) {
        booleanOutput.reset();
        BooleanList expectedValues = new BooleanArrayList(1024);

        boolean runValue = true;
        for (int runLength : runLengths) {
            booleanOutput.writeBooleans(runLength, runValue);
            expectedValues.addAll(Collections.nCopies(runLength, runValue));
            runValue = !runValue;
        }

        // two trailing values written one at a time
        booleanOutput.writeBoolean(true);
        expectedValues.add(true);
        booleanOutput.writeBoolean(false);
        expectedValues.add(false);

        booleanOutput.close();

        DynamicSliceOutput written = new DynamicSliceOutput(1000);
        StreamDataOutput dataOutput = booleanOutput.getStreamDataOutput(new OrcColumnId(33));
        dataOutput.writeData(written);

        Stream descriptor = dataOutput.getStream();
        assertEquals(descriptor.getStreamKind(), StreamKind.DATA);
        assertEquals(descriptor.getColumnId(), new OrcColumnId(33));
        assertEquals(descriptor.getLength(), written.size());

        // decode the written bytes and compare value by value
        BooleanInputStream decoded = createValueStream(written.slice());
        int expectedCount = 0;
        while (expectedCount < expectedValues.size()) {
            boolean expected = expectedValues.getBoolean(expectedCount);
            boolean actual = readValue(decoded);
            assertEquals(actual, expected);
            expectedCount++;
        }
    }
}
Use of io.prestosql.orc.metadata.Stream in project hetu-core by openlookeng.
From the class StripeReader, method createValueStreams.
/**
 * Builds a value input stream for every stream that is neither an index
 * stream nor empty.
 *
 * @param streams stream descriptors keyed by stream id
 * @param streamsData chunk loaders keyed by stream id
 * @param columnEncodings encodings looked up by each stream's column id
 * @return an immutable map from stream id to its value input stream
 */
private Map<StreamId, ValueInputStream<?>> createValueStreams(Map<StreamId, Stream> streams, Map<StreamId, OrcChunkLoader> streamsData, ColumnMetadata<ColumnEncoding> columnEncodings) {
    ImmutableMap.Builder<StreamId, ValueInputStream<?>> result = ImmutableMap.builder();
    for (Entry<StreamId, Stream> streamEntry : streams.entrySet()) {
        Stream descriptor = streamEntry.getValue();
        // the encoding is resolved for every stream, including the ones skipped below
        ColumnEncodingKind encodingKind = columnEncodings.get(descriptor.getColumnId()).getColumnEncodingKind();

        // only non-index streams that actually carry bytes get a value stream
        boolean carriesValues = !isIndexStream(descriptor) && descriptor.getLength() != 0;
        if (carriesValues) {
            StreamId id = streamEntry.getKey();
            OrcChunkLoader loader = streamsData.get(id);
            OrcTypeKind typeKind = types.get(descriptor.getColumnId()).getOrcTypeKind();
            result.put(id, ValueStreams.createValueStreams(id, loader, typeKind, encodingKind));
        }
    }
    return result.build();
}
Use of io.prestosql.orc.metadata.Stream in project hetu-core by openlookeng.
From the class StripeReader, method createDictionaryStreamSources.
/**
 * Builds the stream sources for dictionary streams. A stream contributes a
 * source only when it is a dictionary stream under its column's encoding and
 * a value stream exists for it.
 *
 * @param streams stream descriptors keyed by stream id
 * @param valueStreams previously created value streams keyed by stream id
 * @param columnEncodings encodings looked up by each stream's column id
 * @return the sources for all qualifying dictionary streams
 */
private InputStreamSources createDictionaryStreamSources(Map<StreamId, Stream> streams, Map<StreamId, ValueInputStream<?>> valueStreams, ColumnMetadata<ColumnEncoding> columnEncodings) {
    ImmutableMap.Builder<StreamId, InputStreamSource<?>> sources = ImmutableMap.builder();
    for (Entry<StreamId, Stream> streamEntry : streams.entrySet()) {
        Stream descriptor = streamEntry.getValue();
        OrcColumnId columnId = descriptor.getColumnId();
        ColumnEncodingKind encodingKind = columnEncodings.get(columnId).getColumnEncodingKind();

        // anything that is not a dictionary stream is ignored
        if (isDictionary(descriptor, encodingKind)) {
            StreamId id = streamEntry.getKey();
            ValueInputStream<?> dictionaryValues = valueStreams.get(id);

            // no value stream means there is no data to expose for this stream
            if (dictionaryValues != null) {
                OrcTypeKind typeKind = types.get(columnId).getOrcTypeKind();
                StreamCheckpoint checkpoint = getDictionaryStreamCheckpoint(id, typeKind, encodingKind);
                InputStreamSource<?> source = createCheckpointStreamSource(dictionaryValues, checkpoint);
                sources.put(id, source);
            }
        }
    }
    return new InputStreamSources(sources.build());
}
Aggregations