use of io.prestosql.orc.metadata.OrcColumnId in project hetu-core by openlookeng.
the class ListColumnWriter method finishRowGroup.
/**
 * Finishes the current row group for this list column and for its element writer.
 *
 * <p>A statistics entry holding the non-null value count accumulated since the previous
 * row group is created (every other constructor argument is {@code 0} or {@code null}),
 * remembered in {@code rowGroupColumnStatistics}, and the counter is reset to zero.
 *
 * @return an immutable map containing this column's row group statistics followed by
 *         everything returned by the element writer's {@code finishRowGroup()}
 * @throws IllegalStateException if the writer has already been closed
 */
@Override
public Map<OrcColumnId, ColumnStatistics> finishRowGroup() {
    checkState(!closed);

    ColumnStatistics listStatistics = new ColumnStatistics((long) nonNullValueCount, 0, null, null, null, null, null, null, null, null);
    rowGroupColumnStatistics.add(listStatistics);
    // start counting afresh for the next row group
    nonNullValueCount = 0;

    return ImmutableMap.<OrcColumnId, ColumnStatistics>builder()
            .put(columnId, listStatistics)
            .putAll(elementWriter.finishRowGroup())
            .build();
}
use of io.prestosql.orc.metadata.OrcColumnId in project hetu-core by openlookeng.
the class OrcWriteValidation method validateRowGroupStatistics.
/**
 * Validates the row group statistics read back for one stripe against the statistics
 * that were recorded for that stripe.
 *
 * <p>Every stream must carry exactly as many row group indexes as there are expected row
 * groups. For each expected row group whose validation mode is not {@code HASHED}, the set
 * of columns and each column's statistics are compared individually; for each row group
 * whose validation mode is not {@code DETAILED}, the hash of the rebuilt row group
 * statistics is compared with the expected hash.
 *
 * @param orcDataSourceId identifier of the data source, used in error reporting
 * @param stripeOffset offset of the stripe whose row groups are validated
 * @param actualRowGroupStatistics row group indexes keyed by stream, as read back
 * @throws OrcCorruptionException if the stripe is unknown, the row group count differs,
 *         the column set differs, or a checksum does not match
 * @throws NullPointerException if {@code actualRowGroupStatistics} is null
 */
public void validateRowGroupStatistics(OrcDataSourceId orcDataSourceId, long stripeOffset, Map<StreamId, List<RowGroupIndex>> actualRowGroupStatistics) throws OrcCorruptionException {
    requireNonNull(actualRowGroupStatistics, "actualRowGroupStatistics is null");
    List<RowGroupStatistics> expectedRowGroupStatistics = rowGroupStatistics.get(stripeOffset);
    if (expectedRowGroupStatistics == null) {
        throw new OrcCorruptionException(orcDataSourceId, "Unexpected stripe at offset %s", stripeOffset);
    }

    int rowGroupCount = expectedRowGroupStatistics.size();
    for (Entry<StreamId, List<RowGroupIndex>> entry : actualRowGroupStatistics.entrySet()) {
        if (entry.getValue().size() != rowGroupCount) {
            throw new OrcCorruptionException(orcDataSourceId, "Unexpected row group count in stripe at offset %s", stripeOffset);
        }
    }

    // The set of actual columns does not depend on the row group, so build it only once.
    Set<OrcColumnId> actualColumns = actualRowGroupStatistics.keySet().stream()
            .map(StreamId::getColumnId)
            .collect(Collectors.toSet());

    for (int rowGroupIndex = 0; rowGroupIndex < rowGroupCount; rowGroupIndex++) {
        RowGroupStatistics expectedRowGroup = expectedRowGroupStatistics.get(rowGroupIndex);

        // Column-by-column comparison, skipped only when the row group is hash-validated.
        if (expectedRowGroup.getValidationMode() != HASHED) {
            Map<OrcColumnId, ColumnStatistics> expectedStatistics = expectedRowGroup.getColumnStatistics();
            if (!expectedStatistics.keySet().equals(actualColumns)) {
                throw new OrcCorruptionException(orcDataSourceId, "Unexpected column in row group %s in stripe at offset %s", rowGroupIndex, stripeOffset);
            }
            for (Entry<StreamId, List<RowGroupIndex>> entry : actualRowGroupStatistics.entrySet()) {
                ColumnStatistics actual = entry.getValue().get(rowGroupIndex).getColumnStatistics();
                ColumnStatistics expected = expectedStatistics.get(entry.getKey().getColumnId());
                validateColumnStatisticsEquivalent(orcDataSourceId, "Row group " + rowGroupIndex + " in stripe at offset " + stripeOffset, actual, expected);
            }
        }

        // Checksum comparison, skipped only when the row group is validated in detail.
        if (expectedRowGroup.getValidationMode() != DETAILED) {
            RowGroupStatistics actualRowGroup = buildActualRowGroupStatistics(rowGroupIndex, actualRowGroupStatistics);
            if (expectedRowGroup.getHash() != actualRowGroup.getHash()) {
                throw new OrcCorruptionException(orcDataSourceId, "Checksum mismatch for row group %s in stripe at offset %s", rowGroupIndex, stripeOffset);
            }
        }
    }
}
use of io.prestosql.orc.metadata.OrcColumnId in project hetu-core by openlookeng.
the class OrcWriteValidation method validateColumnStatisticsEquivalent.
/**
 * Checks that two column statistics collections have the same number of columns and
 * that the statistics of each column, looked up by {@code OrcColumnId} 0..size-1, pass
 * the per-column equivalence validation.
 *
 * @param orcDataSourceId identifier of the data source, used in error reporting
 * @param name description of what is being validated; included in error messages
 * @param actualColumnStatistics statistics that were read back
 * @param expectedColumnStatistics statistics that were recorded
 * @throws OrcCorruptionException if the column counts differ (per-column validation is
 *         delegated to the {@code ColumnStatistics} overload)
 * @throws NullPointerException if {@code name} or either statistics collection is null
 */
private static void validateColumnStatisticsEquivalent(OrcDataSourceId orcDataSourceId, String name, ColumnMetadata<ColumnStatistics> actualColumnStatistics, ColumnMetadata<ColumnStatistics> expectedColumnStatistics) throws OrcCorruptionException {
    requireNonNull(name, "name is null");
    requireNonNull(actualColumnStatistics, "actualColumnStatistics is null");
    requireNonNull(expectedColumnStatistics, "expectedColumnStatistics is null");

    int columnCount = actualColumnStatistics.size();
    if (columnCount != expectedColumnStatistics.size()) {
        throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: unexpected number of columns in %s statistics", name);
    }

    for (int ordinal = 0; ordinal < columnCount; ordinal++) {
        OrcColumnId id = new OrcColumnId(ordinal);
        ColumnStatistics actualForColumn = actualColumnStatistics.get(id);
        ColumnStatistics expectedForColumn = expectedColumnStatistics.get(id);
        String columnLabel = name + " column " + ordinal;
        validateColumnStatisticsEquivalent(orcDataSourceId, columnLabel, actualForColumn, expectedForColumn);
    }
}
use of io.prestosql.orc.metadata.OrcColumnId in project hetu-core by openlookeng.
the class OrcWriter method toFileStats.
/**
 * Combines per-stripe column statistics into one set of column statistics.
 *
 * <p>For each column id 0..columnCount-1, the statistics of that column are gathered from
 * every stripe, in stripe order, and merged with
 * {@code ColumnStatistics.mergeColumnStatistics}.
 *
 * @param stripes column statistics of each stripe; all entries must have the same size
 * @return the merged statistics, or {@code Optional.empty()} when there are no stripes
 * @throws IllegalArgumentException if the stripes do not all have the same column count
 */
private static Optional<ColumnMetadata<ColumnStatistics>> toFileStats(List<ColumnMetadata<ColumnStatistics>> stripes) {
    if (stripes.isEmpty()) {
        return Optional.empty();
    }

    // every stripe must describe the same number of columns as the first one
    int columnCount = stripes.get(0).size();
    for (ColumnMetadata<ColumnStatistics> stripe : stripes) {
        checkArgument(stripe.size() == columnCount);
    }

    ImmutableList.Builder<ColumnStatistics> merged = ImmutableList.builder();
    for (int ordinal = 0; ordinal < columnCount; ordinal++) {
        OrcColumnId id = new OrcColumnId(ordinal);
        List<ColumnStatistics> perStripe = stripes.stream()
                .map(stripe -> stripe.get(id))
                .collect(toList());
        merged.add(ColumnStatistics.mergeColumnStatistics(perStripe));
    }
    return Optional.of(new ColumnMetadata<>(merged.build()));
}
use of io.prestosql.orc.metadata.OrcColumnId in project hetu-core by openlookeng.
the class OrcWriter method bufferStripeData.
/**
* Collect the data for the stripe. This is not the actual data, but
* instead are functions that know how to write the data.
* <p>
* The returned list holds the index streams first (in column writer order),
* then the sorted data streams, then the stripe footer. As a side effect the
* column writers are closed, the closed stripe is appended to
* {@code closedStripes}, and write validation and writer stats are updated.
*
* @param stripeStartOffset offset recorded as the start of this stripe
* @param flushReason why the stripe is being flushed; an empty stripe is only accepted for {@code CLOSED}
* @return the outputs to write for this stripe, or an empty list when the stripe has no rows
* @throws IOException declared by this method; NOTE(review): the throwing call is not identifiable from this block alone
*/
private List<OrcDataOutput> bufferStripeData(long stripeStartOffset, FlushReason flushReason) throws IOException {
// an empty stripe produces no output and is only legal when the writer is closing
if (stripeRowCount == 0) {
verify(flushReason == CLOSED, "An empty stripe is not allowed");
// column writers must be closed or the reset call will fail
columnWriters.forEach(ColumnWriter::close);
return ImmutableList.of();
}
// finish the partially filled row group, if any, before closing the writers
if (rowGroupRowCount > 0) {
finishRowGroup();
}
// convert any dictionary encoded column with a low compression ratio to direct
dictionaryCompressionOptimizer.finalOptimize(bufferedBytes);
columnWriters.forEach(ColumnWriter::close);
// outputData is what gets returned; allStreams is the stream list placed in the stripe footer
List<OrcDataOutput> outputData = new ArrayList<>();
List<Stream> allStreams = new ArrayList<>(columnWriters.size() * 3);
// get index streams
long indexLength = 0;
for (ColumnWriter columnWriter : columnWriters) {
for (StreamDataOutput indexStream : columnWriter.getIndexStreams(metadataWriter)) {
// The ordering is critical because each stream only contains a length with no offset.
outputData.add(indexStream);
allStreams.add(indexStream.getStream());
indexLength += indexStream.size();
}
}
// data streams (sorted by size)
long dataLength = 0;
List<StreamDataOutput> dataStreams = new ArrayList<>(columnWriters.size() * 2);
for (ColumnWriter columnWriter : columnWriters) {
List<StreamDataOutput> streams = columnWriter.getDataStreams();
dataStreams.addAll(streams);
dataLength += streams.stream().mapToLong(StreamDataOutput::size).sum();
}
// sorts using the natural ordering of StreamDataOutput
Collections.sort(dataStreams);
// add data streams
for (StreamDataOutput dataStream : dataStreams) {
// The ordering is critical because each stream only contains a length with no offset.
outputData.add(dataStream);
allStreams.add(dataStream.getStream());
}
// gather the encodings and stripe-level statistics reported by every column writer
Map<OrcColumnId, ColumnEncoding> columnEncodings = new HashMap<>();
columnWriters.forEach(columnWriter -> columnEncodings.putAll(columnWriter.getColumnEncodings()));
Map<OrcColumnId, ColumnStatistics> columnStatistics = new HashMap<>();
columnWriters.forEach(columnWriter -> columnStatistics.putAll(columnWriter.getColumnStripeStatistics()));
// the 0th column is a struct column for the whole row
columnEncodings.put(ROOT_COLUMN, new ColumnEncoding(DIRECT, 0));
columnStatistics.put(ROOT_COLUMN, new ColumnStatistics((long) stripeRowCount, 0, null, null, null, null, null, null, null, null));
// add footer
StripeFooter stripeFooter = new StripeFooter(allStreams, toColumnMetadata(columnEncodings, orcTypes.size()), ZoneId.of("UTC"));
Slice footer = metadataWriter.writeStripeFooter(stripeFooter);
outputData.add(createDataOutput(footer));
// create final stripe statistics
StripeStatistics statistics = new StripeStatistics(toColumnMetadata(columnStatistics, orcTypes.size()));
recordValidation(validation -> validation.addStripeStatistics(stripeStartOffset, statistics));
// remember the closed stripe and account for the memory it retains
StripeInformation stripeInformation = new StripeInformation(stripeRowCount, stripeStartOffset, indexLength, dataLength, footer.length());
ClosedStripe closedStripe = new ClosedStripe(stripeInformation, statistics);
closedStripes.add(closedStripe);
closedStripesRetainedBytes += closedStripe.getRetainedSizeInBytes();
recordValidation(validation -> validation.addStripe(stripeInformation.getNumberOfRows()));
stats.recordStripeWritten(flushReason, stripeInformation.getTotalLength(), stripeInformation.getNumberOfRows(), dictionaryCompressionOptimizer.getDictionaryMemoryBytes());
return outputData;
}
Aggregations