use of com.facebook.presto.orc.metadata.ColumnEncoding in project presto by prestodb.
the class SliceDictionaryColumnWriter method writeDictionary.
/**
 * Records the column encoding for the current dictionary and writes out its entries.
 *
 * <p>The encoding kind is {@code DICTIONARY} when {@code orcEncoding} is {@code DWRF} and
 * {@code DICTIONARY_V2} otherwise; the encoding also carries the dictionary entry count.
 *
 * @return whatever {@code writeSortedDictionary()} returns when {@code sortDictionaryKeys} is set;
 *         otherwise an empty {@code Optional} after every entry has been written in index order
 */
@Override
protected Optional<int[]> writeDictionary() {
    ColumnEncodingKind kind;
    if (orcEncoding == DWRF) {
        kind = DICTIONARY;
    }
    else {
        kind = DICTIONARY_V2;
    }
    int entryCount = dictionary.getEntryCount();
    columnEncoding = new ColumnEncoding(kind, entryCount);

    // Sorted output is delegated entirely to the dedicated helper.
    if (sortDictionaryKeys) {
        return writeSortedDictionary();
    }

    // Unsorted: emit entries in their existing dictionary order.
    int entryIndex = 0;
    while (entryIndex < entryCount) {
        writeDictionaryEntry(entryIndex);
        entryIndex++;
    }
    return Optional.empty();
}
use of com.facebook.presto.orc.metadata.ColumnEncoding in project presto by prestodb.
the class OrcRecordReader method advanceToNextStripe.
/**
 * Moves this reader on to the next stripe.
 *
 * <p>The memory context of the previous stripe is closed and replaced with a fresh one, the
 * row-group iterator is reset to empty, and the stripe index is incremented. If a further stripe
 * exists it is read through {@code stripeReader}; when that yields a non-null stripe, every
 * non-null stream reader is started on it and the row-group iterator is pointed at its row groups.
 *
 * @throws IOException declared on this method; {@code stripeReader.readStripe} is the call made here that reads stripe data
 */
private void advanceToNextStripe() throws IOException {
    // Swap the per-stripe memory context before touching the next stripe.
    currentStripeSystemMemoryContext.close();
    currentStripeSystemMemoryContext = systemMemoryUsage.newAggregatedMemoryContext();

    // Until a stripe is successfully loaded there are no row groups to iterate.
    rowGroups = ImmutableList.<RowGroup>of().iterator();

    currentStripe++;
    if (currentStripe >= stripes.size()) {
        return;
    }

    // Accumulate the row count of the stripe we just left.
    if (currentStripe > 0) {
        StripeInformation previousStripe = stripes.get(currentStripe - 1);
        currentStripePosition += previousStripe.getNumberOfRows();
    }

    StripeInformation nextStripe = stripes.get(currentStripe);
    Stripe loadedStripe = stripeReader.readStripe(nextStripe, currentStripeSystemMemoryContext);
    if (loadedStripe == null) {
        // readStripe produced nothing for this stripe; leave the empty iterator in place.
        return;
    }

    // Give readers access to dictionary streams
    StreamSources dictionarySources = loadedStripe.getDictionaryStreamSources();
    List<ColumnEncoding> encodings = loadedStripe.getColumnEncodings();
    for (StreamReader reader : streamReaders) {
        if (reader == null) {
            continue;
        }
        reader.startStripe(dictionarySources, encodings);
    }

    rowGroups = loadedStripe.getRowGroups().iterator();
}
use of com.facebook.presto.orc.metadata.ColumnEncoding in project presto by prestodb.
the class StripeReader method readStripe.
/**
 * Reads a single stripe and assembles the {@code Stripe} object (column encodings, row groups and
 * dictionary stream sources) for it.
 *
 * <p>Two paths exist:
 * <ul>
 * <li>Multi-row-group path: taken when the stripe has more rows than {@code rowsInRowGroup} or when a
 * selected column has a row-group dictionary. Indexes are read, row groups are selected, and one
 * {@code RowGroup} per selected group is built.</li>
 * <li>Single-row-group path: taken otherwise, and also as a fallback when the first path hits an
 * {@code InvalidCheckpointException} on a stripe without a row-group dictionary. The whole stripe is
 * exposed as one row group and {@code ROW_INDEX} streams are not read.</li>
 * </ul>
 *
 * @param stripe location and row count of the stripe to read
 * @param systemMemoryUsage memory context handed to the footer and disk-range reads; it is closed
 *        here when every row group is filtered out
 * @return the loaded stripe, or {@code null} when no row group is selected
 * @throws IOException declared on this method for failures of the underlying reads
 * @throws OrcCorruptionException if checkpoints are invalid and the stripe has a row-group dictionary
 */
public Stripe readStripe(StripeInformation stripe, AggregatedMemoryContext systemMemoryUsage) throws IOException {
// read the stripe footer
StripeFooter stripeFooter = readStripeFooter(stripe, systemMemoryUsage);
List<ColumnEncoding> columnEncodings = stripeFooter.getColumnEncodings();
// get streams for selected columns
Map<StreamId, Stream> streams = new HashMap<>();
// becomes true when a selected DICTIONARY-encoded column carries an IN_DICTIONARY stream
boolean hasRowGroupDictionary = false;
for (Stream stream : stripeFooter.getStreams()) {
// only streams belonging to columns in includedOrcColumns are kept
if (includedOrcColumns.contains(stream.getColumn())) {
streams.put(new StreamId(stream), stream);
ColumnEncodingKind columnEncoding = columnEncodings.get(stream.getColumn()).getColumnEncodingKind();
if (columnEncoding == DICTIONARY && stream.getStreamKind() == StreamKind.IN_DICTIONARY) {
hasRowGroupDictionary = true;
}
}
}
// handle stripes with more than one row group or a dictionary
if ((stripe.getNumberOfRows() > rowsInRowGroup) || hasRowGroupDictionary) {
// determine ranges of the stripe to read
Map<StreamId, DiskRange> diskRanges = getDiskRanges(stripeFooter.getStreams());
// restrict the ranges to the selected streams
diskRanges = Maps.filterKeys(diskRanges, Predicates.in(streams.keySet()));
// read the file regions
Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, systemMemoryUsage);
// read the bloom filter for each column
Map<Integer, List<HiveBloomFilter>> bloomFilterIndexes = readBloomFilterIndexes(streams, streamsData);
// read the row index for each column
Map<Integer, List<RowGroupIndex>> columnIndexes = readColumnIndexes(streams, streamsData, bloomFilterIndexes);
// select the row groups matching the tuple domain
Set<Integer> selectedRowGroups = selectRowGroups(stripe, columnIndexes);
// if all row groups are skipped, return null
if (selectedRowGroups.isEmpty()) {
// set accounted memory usage to zero
systemMemoryUsage.close();
return null;
}
// value streams
Map<StreamId, ValueStream<?>> valueStreams = createValueStreams(streams, streamsData, columnEncodings);
// build the dictionary streams
StreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings);
// build the row groups
try {
List<RowGroup> rowGroups = createRowGroups(stripe.getNumberOfRows(), streams, valueStreams, columnIndexes, selectedRowGroups, columnEncodings);
return new Stripe(stripe.getNumberOfRows(), columnEncodings, rowGroups, dictionaryStreamSources);
} catch (InvalidCheckpointException e) {
// The checkpoints could not be used to build the row groups.
// With a row group dictionary we must fail because the length of the row group dictionary
// is contained in the checkpoint stream.
if (hasRowGroupDictionary) {
throw new OrcCorruptionException(e, "ORC file %s has corrupt checkpoints", orcDataSource);
}
// Otherwise the exception is intentionally not rethrown: execution falls through to the
// single-row-group path below, which re-reads the stripe without the ROW_INDEX streams.
}
}
// stripe only has one row group and no dictionary
// (also reached as the fallback after an InvalidCheckpointException above)
ImmutableMap.Builder<StreamId, DiskRange> diskRangesBuilder = ImmutableMap.builder();
for (Entry<StreamId, DiskRange> entry : getDiskRanges(stripeFooter.getStreams()).entrySet()) {
StreamId streamId = entry.getKey();
// skip ROW_INDEX streams and anything not among the selected streams
if (streamId.getStreamKind() != ROW_INDEX && streams.keySet().contains(streamId)) {
diskRangesBuilder.put(entry);
}
}
ImmutableMap<StreamId, DiskRange> diskRanges = diskRangesBuilder.build();
// read the file regions
Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, systemMemoryUsage);
// value streams
Map<StreamId, ValueStream<?>> valueStreams = createValueStreams(streams, streamsData, columnEncodings);
// build the dictionary streams
StreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings);
// build the row group: every value stream is wrapped in a ValueStreamSource
ImmutableMap.Builder<StreamId, StreamSource<?>> builder = ImmutableMap.builder();
for (Entry<StreamId, ValueStream<?>> entry : valueStreams.entrySet()) {
builder.put(entry.getKey(), new ValueStreamSource<>(entry.getValue()));
}
// a single row group (arguments 0, 0) spanning all rows of the stripe
RowGroup rowGroup = new RowGroup(0, 0, stripe.getNumberOfRows(), new StreamSources(builder.build()));
return new Stripe(stripe.getNumberOfRows(), columnEncodings, ImmutableList.of(rowGroup), dictionaryStreamSources);
}
use of com.facebook.presto.orc.metadata.ColumnEncoding in project presto by prestodb.
the class TestDictionaryColumnWriter method verifyDwrfDirectEncoding.
/**
 * Asserts that exactly {@code stripeCount} stripe footers were produced and that, in each of them,
 * the column identified by {@code COLUMN_ID} has the {@code DWRF_DIRECT} encoding kind.
 *
 * @param stripeCount expected number of stripe footers
 * @param stripeFooters footers to inspect
 */
private void verifyDwrfDirectEncoding(int stripeCount, List<StripeFooter> stripeFooters) {
    assertEquals(stripeFooters.size(), stripeCount);
    stripeFooters.forEach(stripeFooter -> {
        // Look up the encoding of the column under test and check its kind.
        assertEquals(stripeFooter.getColumnEncodings().get(COLUMN_ID).getColumnEncodingKind(), DWRF_DIRECT);
    });
}
use of com.facebook.presto.orc.metadata.ColumnEncoding in project presto by prestodb.
the class OrcWriter method createEncryptedGroups.
/**
 * Serializes one stripe encryption group per group id and returns the resulting slices in group-id order.
 *
 * <p>Group ids are taken to be {@code 0 .. encryptedStreams.keySet().size() - 1}. For each id, the
 * column encodings whose node maps to that group (per {@code dwrfEncryptionInfo}) are combined with
 * the group's streams, written through an {@code OrcOutputBuffer} built with the group's encryptor,
 * and the buffer's output is copied into a slice.
 *
 * @param encryptedStreams streams keyed by encryption group id
 * @param encryptedColumnEncodings column encodings keyed by node id
 * @return one slice per encryption group
 * @throws IOException declared on this method for failures while writing the group
 */
private List<Slice> createEncryptedGroups(Multimap<Integer, Stream> encryptedStreams, Map<Integer, ColumnEncoding> encryptedColumnEncodings) throws IOException {
    ImmutableList.Builder<Slice> groupSlices = ImmutableList.builder();
    for (int groupIndex = 0; groupIndex < encryptedStreams.keySet().size(); groupIndex++) {
        // Effectively-final copy so the lambda below can capture it.
        int groupId = groupIndex;

        // Keep only the encodings of nodes that belong to this group.
        Map<Integer, ColumnEncoding> encodingsForGroup = encryptedColumnEncodings.entrySet().stream()
                .filter(encodingEntry -> {
                    return dwrfEncryptionInfo.getGroupByNodeId(encodingEntry.getKey())
                            .orElseThrow(() -> new VerifyError("missing group for encryptedColumn")) == groupId;
                })
                .collect(toImmutableMap(Entry::getKey, Entry::getValue));

        DwrfDataEncryptor groupEncryptor = dwrfEncryptionInfo.getEncryptorByGroupId(groupId);
        OrcOutputBuffer encryptedBuffer = new OrcOutputBuffer(columnWriterOptions, Optional.of(groupEncryptor));

        StripeEncryptionGroup stripeGroup = new StripeEncryptionGroup(ImmutableList.copyOf(encryptedStreams.get(groupId)), encodingsForGroup);
        toStripeEncryptionGroup(stripeGroup).writeTo(encryptedBuffer);
        encryptedBuffer.close();

        // Copy the buffer's output into a slice sized from the buffer's reported output size.
        DynamicSliceOutput sliceOutput = new DynamicSliceOutput(toIntExact(encryptedBuffer.getOutputDataSize()));
        encryptedBuffer.writeDataTo(sliceOutput);
        groupSlices.add(sliceOutput.slice());
    }
    return groupSlices.build();
}
Aggregations