Use of com.facebook.presto.orc.stream.LongInputStream in project presto by prestodb.
In the class TestSliceDictionaryColumnWriter, method getDictionaryKeys.
/**
 * Writes the given values through a dictionary column writer, one row group per batch of at most
 * 10,000 values, then decodes the dictionary keys back out of the writer's data streams.
 *
 * @param values strings to write, in order
 * @param orcEncoding encoding used to create the writer and to read the dictionary length stream
 * @param sortDictionaryKeys passed through to the writer factory
 * @return the dictionary keys in the order they appear in the written dictionary
 * @throws IOException declared for the writer and stream calls made here
 */
private List<String> getDictionaryKeys(List<String> values, OrcEncoding orcEncoding, boolean sortDictionaryKeys) throws IOException {
    DictionaryColumnWriter writer = getDictionaryColumnWriter(orcEncoding, sortDictionaryKeys);

    // Feed the writer one row group per batch of up to 10,000 values.
    int totalValues = values.size();
    int position = 0;
    while (position < totalValues) {
        int batchEnd = Math.min(position + 10_000, totalValues);
        BlockBuilder batch = VARCHAR.createBlockBuilder(null, 10_000);
        for (; position < batchEnd; position++) {
            VARCHAR.writeSlice(batch, utf8Slice(values.get(position)));
        }
        writer.beginRowGroup();
        writer.writeBlock(batch);
        writer.finishRowGroup();
    }
    writer.close();

    // Decode the dictionary: each key is the next `length` bytes of the dictionary data stream.
    List<StreamDataOutput> outputStreams = writer.getDataStreams();
    int keyCount = writer.getColumnEncodings().get(COLUMN_ID).getDictionarySize();
    ByteArrayInputStream keyBytes = new ByteArrayInputStream(getOrcInputStream(outputStreams, DICTIONARY_DATA));
    LongInputStream keyLengths = getDictionaryLengthStream(outputStreams, orcEncoding);

    List<String> keys = new ArrayList<>(keyCount);
    while (keys.size() < keyCount) {
        int keyLength = toIntExact(keyLengths.next());
        keys.add(new String(keyBytes.next(keyLength), UTF_8));
    }
    return keys;
}
Use of com.facebook.presto.orc.stream.LongInputStream in project presto by prestodb.
In the class LongDictionaryProvider, method getDictionary.
/**
 * Returns the long dictionary for a stream, reusing the supplied buffer when possible.
 *
 * <p>If the stream descriptor has its own DICTIONARY_DATA stream, that dictionary is loaded
 * directly. Otherwise the dictionary of the DEFAULT_SEQUENCE_ID duplicate of the descriptor is
 * used: it is loaded on the first request for a given stream id and cached for later requests.
 *
 * @param streamDescriptor descriptor indicating node and sequence of the stream reader
 * the dictionary is associated with.
 * @param dictionary dictionary buffer the method attempts to fill.
 * @param items number of items expected in the dictionary.
 * @return a DictionaryResult with two parts:
 * 1) the final dictionary buffer, which differs from the input buffer when the input buffer
 * had to be replaced by a larger one or when a previously cached shared dictionary is returned.
 * 2) whether the caller owns the dictionary for memory accounting purposes. Callers own all
 * non-shared dictionaries; for a shared dictionary only the first caller is the owner.
 * @throws IOException declared for the stream-opening and dictionary-loading calls made here
 */
public DictionaryResult getDictionary(StreamDescriptor streamDescriptor, long[] dictionary, int items) throws IOException {
    // Non-shared case: the descriptor has a dictionary data stream of its own.
    InputStreamSource<LongInputStream> ownSource = dictionaryStreamSources.getInputStreamSource(streamDescriptor, DICTIONARY_DATA, LongInputStream.class);
    boolean hasOwnDictionary = ownSource.openStream() != null;
    if (hasOwnDictionary) {
        return loadDictionary(streamDescriptor, ownSource, dictionary, items);
    }

    // Shared case: look the dictionary up by stream id, loading and caching it on first use.
    int streamId = streamDescriptor.getStreamId();
    SharedDictionary cached = sharedDictionaries.get(streamId);
    boolean firstCaller = false;
    if (cached == null) {
        InputStreamSource<LongInputStream> sharedSource = dictionaryStreamSources.getInputStreamSource(
                streamDescriptor.duplicate(DEFAULT_SEQUENCE_ID),
                DICTIONARY_DATA,
                LongInputStream.class);
        DictionaryResult loaded = loadDictionary(streamDescriptor, sharedSource, dictionary, items);
        cached = new SharedDictionary(loaded.dictionaryBuffer(), items);
        sharedDictionaries.put(streamId, cached);
        firstCaller = true;
    }

    checkState(cached.size == items, "Shared dictionary size mismatch for stream: %s", streamDescriptor);
    return new DictionaryResult(cached.values, firstCaller);
}
Use of com.facebook.presto.orc.stream.LongInputStream in project presto by prestodb.
In the class SliceDictionarySelectiveReader, method openRowGroup.
/**
 * Opens the current row group: loads the stripe dictionary if it is not open yet, appends the
 * row group dictionary when one is present, and opens the present, in-dictionary and data streams.
 *
 * @throws IOException declared for the stream-opening and dictionary-reading calls made here
 * @throws OrcCorruptionException if the stripe dictionary is non-empty but its length stream is absent
 */
private void openRowGroup() throws IOException {
    // Load the stripe dictionary unless it is already open.
    if (!stripeDictionaryOpen) {
        if (stripeDictionarySize <= 0) {
            dictionaryData = EMPTY_DICTIONARY_DATA;
            dictionaryOffsetVector = EMPTY_DICTIONARY_OFFSETS;
        } else {
            // Grow the lengths array only when it is too small for this stripe.
            if (stripeDictionaryLength.length < stripeDictionarySize) {
                stripeDictionaryLength = new int[stripeDictionarySize];
            }

            LongInputStream stripeLengths = stripeDictionaryLengthStreamSource.openStream();
            if (stripeLengths == null) {
                throw new OrcCorruptionException(streamDescriptor.getOrcDataSourceId(), "Dictionary is not empty but dictionary length stream is not present");
            }
            stripeLengths.nextIntVector(stripeDictionarySize, stripeDictionaryLength, 0);

            // Total byte length of all stripe dictionary entries.
            long stripeBytes = 0;
            for (int entry = 0; entry < stripeDictionarySize; entry++) {
                stripeBytes += stripeDictionaryLength[entry];
            }

            dictionaryData = ensureCapacity(dictionaryData, toIntExact(stripeBytes));
            dictionaryOffsetVector = ensureCapacity(dictionaryOffsetVector, stripeDictionarySize + 2);

            ByteArrayInputStream stripeValues = stripeDictionaryDataStreamSource.openStream();
            readDictionary(stripeValues, stripeDictionarySize, stripeDictionaryLength, 0, dictionaryData, dictionaryOffsetVector, maxCodePointCount, isCharType);
        }
        // Wrapping the dictionary is very expensive, so without a row group dictionary the stripe
        // dictionary only needs to be wrapped once per stripe; mark it as not yet wrapped here.
        dictionaryWrapped = false;
    }

    // Append the row group dictionary, if this row group has one.
    RowGroupDictionaryLengthInputStream rowGroupLengths = rowGroupDictionaryLengthStreamSource.openStream();
    int dictionaryEntries;
    if (rowGroupLengths == null) {
        // No row group dictionary: only the stripe dictionary plus one trailing entry is used.
        dictionaryEntries = stripeDictionarySize + 1;
    } else {
        int rowGroupEntries = rowGroupLengths.getEntryCount();
        rowGroupDictionaryLength = ensureCapacity(rowGroupDictionaryLength, rowGroupEntries);
        rowGroupLengths.nextIntVector(rowGroupEntries, rowGroupDictionaryLength, 0);

        long rowGroupBytes = 0;
        for (int entry = 0; entry < rowGroupEntries; entry++) {
            rowGroupBytes += rowGroupDictionaryLength[entry];
        }

        // Row group entries are stored after the stripe entries, so both buffers are grown
        // with their existing contents preserved.
        int stripeBytesEnd = dictionaryOffsetVector[stripeDictionarySize];
        dictionaryData = ensureCapacity(dictionaryData, stripeBytesEnd + toIntExact(rowGroupBytes), MEDIUM, PRESERVE);
        dictionaryOffsetVector = ensureCapacity(dictionaryOffsetVector, stripeDictionarySize + rowGroupEntries + 2, MEDIUM, PRESERVE);
        dictionaryWrapped = false;

        ByteArrayInputStream rowGroupValues = rowGroupDictionaryDataStreamSource.openStream();
        readDictionary(rowGroupValues, rowGroupEntries, rowGroupDictionaryLength, stripeDictionarySize, dictionaryData, dictionaryOffsetVector, maxCodePointCount, isCharType);

        dictionaryEntries = stripeDictionarySize + rowGroupEntries + 1;
    }
    currentDictionarySize = dictionaryEntries;
    initiateEvaluationStatus(dictionaryEntries);

    // Close off the trailing entry by repeating the previous offset, making it zero-length.
    dictionaryOffsetVector[currentDictionarySize] = dictionaryOffsetVector[currentDictionarySize - 1];
    stripeDictionaryOpen = true;

    presentStream = presentStreamSource.openStream();
    inDictionaryStream = inDictionaryStreamSource.openStream();
    dataStream = dataStreamSource.openStream();
    rowGroupOpen = true;
}
Use of com.facebook.presto.orc.stream.LongInputStream in project presto by prestodb.
In the class SliceDictionaryBatchStreamReader, method openRowGroup.
/**
 * Opens the current row group: loads the stripe dictionary if it is not open yet, builds a
 * combined stripe + row group dictionary when the row group has its own dictionary, and opens
 * the present, in-dictionary and data streams.
 *
 * <p>Memory accounting: the three stripe-level arrays retained by this reader (lengths, data and
 * offsets) are reported to {@code systemMemoryContext} as a single total.
 *
 * @throws IOException declared for the stream-opening and dictionary-reading calls made here
 * @throws OrcCorruptionException if the stripe dictionary is non-empty but its length stream is absent
 */
private void openRowGroup() throws IOException {
    // read the dictionary
    if (!stripeDictionaryOpen) {
        if (stripeDictionarySize > 0) {
            // resize the dictionary lengths array if necessary
            if (stripeDictionaryLength.length < stripeDictionarySize) {
                stripeDictionaryLength = new int[stripeDictionarySize];
            }
            // read the lengths
            LongInputStream lengthStream = stripeDictionaryLengthStreamSource.openStream();
            if (lengthStream == null) {
                throw new OrcCorruptionException(streamDescriptor.getOrcDataSourceId(), "Dictionary is not empty but dictionary length stream is not present");
            }
            lengthStream.next(stripeDictionaryLength, stripeDictionarySize);
            long dataLength = 0;
            for (int i = 0; i < stripeDictionarySize; i++) {
                dataLength += stripeDictionaryLength[i];
            }
            // we must always create a new dictionary array because the previous dictionary may still be referenced
            stripeDictionaryData = new byte[toIntExact(dataLength)];
            // add one extra entry for null
            stripeDictionaryOffsetVector = new int[stripeDictionarySize + 2];
            // Report the retained stripe arrays as one total. Previously each array was reported with
            // its own setBytes call; assuming setBytes replaces (rather than adds to) the previously
            // reported value, as Presto's LocalMemoryContext does, only the last array was counted.
            systemMemoryContext.setBytes(sizeOf(stripeDictionaryLength) + sizeOf(stripeDictionaryData) + sizeOf(stripeDictionaryOffsetVector));
            // read dictionary values
            ByteArrayInputStream dictionaryDataStream = stripeDictionaryDataStreamSource.openStream();
            readDictionary(dictionaryDataStream, stripeDictionarySize, stripeDictionaryLength, 0, stripeDictionaryData, stripeDictionaryOffsetVector, maxCodePointCount, isCharType);
        } else {
            stripeDictionaryData = EMPTY_DICTIONARY_DATA;
            stripeDictionaryOffsetVector = EMPTY_DICTIONARY_OFFSETS;
        }
    }
    stripeDictionaryOpen = true;
    // read row group dictionary
    RowGroupDictionaryLengthInputStream dictionaryLengthStream = rowGroupDictionaryLengthStreamSource.openStream();
    if (dictionaryLengthStream != null) {
        int rowGroupDictionarySize = dictionaryLengthStream.getEntryCount();
        // resize the dictionary lengths array if necessary
        if (rowGroupDictionaryLength.length < rowGroupDictionarySize) {
            rowGroupDictionaryLength = new int[rowGroupDictionarySize];
        }
        // read the lengths
        dictionaryLengthStream.next(rowGroupDictionaryLength, rowGroupDictionarySize);
        long dataLength = 0;
        for (int i = 0; i < rowGroupDictionarySize; i++) {
            dataLength += rowGroupDictionaryLength[i];
        }
        // We must always create a new dictionary array because the previous dictionary may still be referenced
        // The first elements of the dictionary are from the stripe dictionary, then the row group dictionary elements, and then a null
        byte[] rowGroupDictionaryData = Arrays.copyOf(stripeDictionaryData, stripeDictionaryOffsetVector[stripeDictionarySize] + toIntExact(dataLength));
        int[] rowGroupDictionaryOffsetVector = Arrays.copyOf(stripeDictionaryOffsetVector, stripeDictionarySize + rowGroupDictionarySize + 2);
        // read dictionary values
        ByteArrayInputStream dictionaryDataStream = rowGroupDictionaryDataStreamSource.openStream();
        readDictionary(dictionaryDataStream, rowGroupDictionarySize, rowGroupDictionaryLength, stripeDictionarySize, rowGroupDictionaryData, rowGroupDictionaryOffsetVector, maxCodePointCount, isCharType);
        setDictionaryBlockData(rowGroupDictionaryData, rowGroupDictionaryOffsetVector, stripeDictionarySize + rowGroupDictionarySize + 1);
    } else {
        // there is no row group dictionary so use the stripe dictionary
        setDictionaryBlockData(stripeDictionaryData, stripeDictionaryOffsetVector, stripeDictionarySize + 1);
    }
    presentStream = presentStreamSource.openStream();
    inDictionaryStream = inDictionaryStreamSource.openStream();
    dataStream = dataStreamSource.openStream();
    rowGroupOpen = true;
}
Use of com.facebook.presto.orc.stream.LongInputStream in project presto by prestodb.
In the class LongDictionaryProvider, method loadDictionary.
/**
 * Reads {@code items} dictionary values from the given stream source into a buffer.
 *
 * <p>The supplied buffer is reused when it is non-null and holds at least {@code items}
 * entries; otherwise a new array of exactly {@code items} entries is allocated.
 *
 * @param streamDescriptor descriptor used for error reporting
 * @param dictionaryDataStream source of the dictionary data stream
 * @param dictionaryBuffer candidate buffer to fill; may be null
 * @param items number of dictionary entries to read
 * @return a DictionaryResult holding the filled buffer, always marked as owned by the caller
 * @throws IOException declared for the stream-opening and reading calls made here
 * @throws OrcCorruptionException if the data stream is not present
 */
private DictionaryResult loadDictionary(StreamDescriptor streamDescriptor, InputStreamSource<LongInputStream> dictionaryDataStream, long[] dictionaryBuffer, int items) throws IOException {
    // The stream is opened and consumed exactly once per stream descriptor per stripe,
    // so there is no real need to cache it.
    LongInputStream values = dictionaryDataStream.openStream();
    if (values == null) {
        throw new OrcCorruptionException(streamDescriptor.getOrcDataSourceId(), "Dictionary is not empty but data stream is not present for %s", streamDescriptor);
    }

    boolean bufferFits = dictionaryBuffer != null && dictionaryBuffer.length >= items;
    long[] target = bufferFits ? dictionaryBuffer : new long[items];
    values.next(target, items);
    return new DictionaryResult(target, true);
}
Aggregations