Use of com.facebook.presto.orc.stream.ByteArrayInputStream in the project presto by prestodb.
Class: TestSliceDictionaryColumnWriter; method: getDictionaryKeys.
/**
 * Pushes {@code values} through a dictionary column writer and returns the dictionary keys
 * decoded from the streams that writer produced.
 *
 * <p>The values are written in batches of at most 10,000, each batch in its own row group.
 * After the writer is closed, the dictionary size is taken from the column encoding and that
 * many keys are read back sequentially from the dictionary data and length streams.
 *
 * @param values the strings to write
 * @param orcEncoding encoding passed to the writer and used to pick the dictionary length stream
 * @param sortDictionaryKeys forwarded to {@code getDictionaryColumnWriter}
 * @return the dictionary keys, in the order they appear in the dictionary data stream
 * @throws IOException declared by this helper; propagated from the calls it makes
 */
private List<String> getDictionaryKeys(List<String> values, OrcEncoding orcEncoding, boolean sortDictionaryKeys) throws IOException {
    final int batchSize = 10_000;
    DictionaryColumnWriter columnWriter = getDictionaryColumnWriter(orcEncoding, sortDictionaryKeys);

    // Feed the values to the writer batch by batch, one row group per batch.
    for (int batchStart = 0; batchStart < values.size(); batchStart += batchSize) {
        int batchEnd = Math.min(batchStart + batchSize, values.size());
        BlockBuilder batch = VARCHAR.createBlockBuilder(null, batchSize);
        for (int position = batchStart; position < batchEnd; position++) {
            VARCHAR.writeSlice(batch, utf8Slice(values.get(position)));
        }
        columnWriter.beginRowGroup();
        columnWriter.writeBlock(batch);
        columnWriter.finishRowGroup();
    }
    columnWriter.close();

    // Decode the dictionary back out of the streams the writer emitted.
    List<StreamDataOutput> writtenStreams = columnWriter.getDataStreams();
    int keyCount = columnWriter.getColumnEncodings().get(COLUMN_ID).getDictionarySize();
    ByteArrayInputStream keyBytes = new ByteArrayInputStream(getOrcInputStream(writtenStreams, DICTIONARY_DATA));
    LongInputStream keyLengths = getDictionaryLengthStream(writtenStreams, orcEncoding);

    List<String> keys = new ArrayList<>(keyCount);
    for (int keyIndex = 0; keyIndex < keyCount; keyIndex++) {
        // Each key's byte length comes from the length stream; its bytes follow in the data stream.
        int keyLength = toIntExact(keyLengths.next());
        keys.add(new String(keyBytes.next(keyLength), UTF_8));
    }
    return keys;
}
Use of com.facebook.presto.orc.stream.ByteArrayInputStream in the project presto by prestodb.
Class: SliceDictionarySelectiveReader; method: openRowGroup.
/**
 * Opens the streams for the current row group and prepares the dictionary used to decode it.
 *
 * <p>In order, this method:
 * <ol>
 * <li>loads the stripe dictionary into {@code dictionaryData} / {@code dictionaryOffsetVector},
 *     unless {@code stripeDictionaryOpen} is already set;</li>
 * <li>if a row group dictionary length stream is present, appends the row group dictionary
 *     after the stripe dictionary entries;</li>
 * <li>sets {@code currentDictionarySize} and calls {@code initiateEvaluationStatus} with it;</li>
 * <li>opens the present, in-dictionary and data streams and marks the row group open.</li>
 * </ol>
 *
 * @throws OrcCorruptionException if the stripe dictionary is non-empty but has no length stream
 * @throws ArithmeticException if the total dictionary byte size does not fit in an {@code int}
 * @throws IOException declared by this method; propagated from the stream calls it makes
 */
private void openRowGroup() throws IOException {
    // read the dictionary
    if (!stripeDictionaryOpen) {
        if (stripeDictionarySize > 0) {
            // resize the dictionary lengths array if necessary
            if (stripeDictionaryLength.length < stripeDictionarySize) {
                stripeDictionaryLength = new int[stripeDictionarySize];
            }

            // read the lengths
            LongInputStream lengthStream = stripeDictionaryLengthStreamSource.openStream();
            if (lengthStream == null) {
                throw new OrcCorruptionException(streamDescriptor.getOrcDataSourceId(), "Dictionary is not empty but dictionary length stream is not present");
            }
            lengthStream.nextIntVector(stripeDictionarySize, stripeDictionaryLength, 0);

            // Accumulate in a long; toIntExact below rejects totals that do not fit in an int.
            long dataLength = 0;
            for (int i = 0; i < stripeDictionarySize; i++) {
                dataLength += stripeDictionaryLength[i];
            }
            dictionaryData = ensureCapacity(dictionaryData, toIntExact(dataLength));
            dictionaryOffsetVector = ensureCapacity(dictionaryOffsetVector, stripeDictionarySize + 2);

            // read dictionary values
            ByteArrayInputStream dictionaryDataStream = stripeDictionaryDataStreamSource.openStream();
            readDictionary(dictionaryDataStream, stripeDictionarySize, stripeDictionaryLength, 0, dictionaryData, dictionaryOffsetVector, maxCodePointCount, isCharType);
        } else {
            dictionaryData = EMPTY_DICTIONARY_DATA;
            dictionaryOffsetVector = EMPTY_DICTIONARY_OFFSETS;
        }
        // If there is no rowgroup dictionary, we only need to wrap the stripe dictionary once per stripe because wrapping dictionary is very expensive.
        dictionaryWrapped = false;
    }

    // read row group dictionary
    RowGroupDictionaryLengthInputStream dictionaryLengthStream = rowGroupDictionaryLengthStreamSource.openStream();
    if (dictionaryLengthStream != null) {
        int rowGroupDictionarySize = dictionaryLengthStream.getEntryCount();
        rowGroupDictionaryLength = ensureCapacity(rowGroupDictionaryLength, rowGroupDictionarySize);

        // read the lengths
        dictionaryLengthStream.nextIntVector(rowGroupDictionarySize, rowGroupDictionaryLength, 0);
        long dataLength = 0;
        for (int i = 0; i < rowGroupDictionarySize; i++) {
            dataLength += rowGroupDictionaryLength[i];
        }

        // Add the stripe dictionary's end offset and the row group bytes in long arithmetic and
        // only then narrow, so an oversized combined dictionary fails with ArithmeticException
        // instead of silently wrapping to a negative capacity.
        int combinedDataLength = toIntExact(dictionaryOffsetVector[stripeDictionarySize] + dataLength);
        // PRESERVE keeps the stripe dictionary entries already stored at the front of both arrays.
        dictionaryData = ensureCapacity(dictionaryData, combinedDataLength, MEDIUM, PRESERVE);
        dictionaryOffsetVector = ensureCapacity(dictionaryOffsetVector, stripeDictionarySize + rowGroupDictionarySize + 2, MEDIUM, PRESERVE);
        dictionaryWrapped = false;

        // read dictionary values; the row group entries start right after the stripe entries
        ByteArrayInputStream dictionaryDataStream = rowGroupDictionaryDataStreamSource.openStream();
        readDictionary(dictionaryDataStream, rowGroupDictionarySize, rowGroupDictionaryLength, stripeDictionarySize, dictionaryData, dictionaryOffsetVector, maxCodePointCount, isCharType);
        currentDictionarySize = stripeDictionarySize + rowGroupDictionarySize + 1;
        initiateEvaluationStatus(stripeDictionarySize + rowGroupDictionarySize + 1);
    } else {
        // there is no row group dictionary so use the stripe dictionary
        currentDictionarySize = stripeDictionarySize + 1;
        initiateEvaluationStatus(stripeDictionarySize + 1);
    }

    // Repeat the last offset in the slot past the final entry.
    // NOTE(review): presumably this gives the extra "+1" entry zero length (a null placeholder) - confirm.
    dictionaryOffsetVector[currentDictionarySize] = dictionaryOffsetVector[currentDictionarySize - 1];

    stripeDictionaryOpen = true;

    presentStream = presentStreamSource.openStream();
    inDictionaryStream = inDictionaryStreamSource.openStream();
    dataStream = dataStreamSource.openStream();

    rowGroupOpen = true;
}
Use of com.facebook.presto.orc.stream.ByteArrayInputStream in the project presto by prestodb.
Class: SliceDictionaryBatchStreamReader; method: openRowGroup.
/**
 * Opens the streams for the current row group and installs the dictionary used to decode it.
 *
 * <p>In order, this method:
 * <ol>
 * <li>loads the stripe dictionary into freshly allocated arrays, unless
 *     {@code stripeDictionaryOpen} is already set, and reports their size to
 *     {@code systemMemoryContext};</li>
 * <li>if a row group dictionary length stream is present, builds a combined dictionary
 *     (stripe entries followed by row group entries) in new arrays; otherwise uses the stripe
 *     dictionary as is;</li>
 * <li>opens the present, in-dictionary and data streams and marks the row group open.</li>
 * </ol>
 *
 * @throws OrcCorruptionException if the stripe dictionary is non-empty but has no length stream
 * @throws ArithmeticException if the total dictionary byte size does not fit in an {@code int}
 * @throws IOException declared by this method; propagated from the stream calls it makes
 */
private void openRowGroup() throws IOException {
    // read the dictionary
    if (!stripeDictionaryOpen) {
        if (stripeDictionarySize > 0) {
            // resize the dictionary lengths array if necessary
            if (stripeDictionaryLength.length < stripeDictionarySize) {
                stripeDictionaryLength = new int[stripeDictionarySize];
            }

            // read the lengths
            LongInputStream lengthStream = stripeDictionaryLengthStreamSource.openStream();
            if (lengthStream == null) {
                throw new OrcCorruptionException(streamDescriptor.getOrcDataSourceId(), "Dictionary is not empty but dictionary length stream is not present");
            }
            lengthStream.next(stripeDictionaryLength, stripeDictionarySize);

            // Accumulate in a long; toIntExact below rejects totals that do not fit in an int.
            long dataLength = 0;
            for (int i = 0; i < stripeDictionarySize; i++) {
                dataLength += stripeDictionaryLength[i];
            }

            // we must always create a new dictionary array because the previous dictionary may still be referenced
            stripeDictionaryData = new byte[toIntExact(dataLength)];
            // add one extra entry for null
            stripeDictionaryOffsetVector = new int[stripeDictionarySize + 2];

            // Report the combined footprint in one call. setBytes records an absolute value (per the
            // Presto memory-context API), so calling it once per array would leave only the last
            // array accounted for and drop the dictionary bytes from the reservation.
            systemMemoryContext.setBytes(sizeOf(stripeDictionaryLength) + sizeOf(stripeDictionaryData) + sizeOf(stripeDictionaryOffsetVector));

            // read dictionary values
            ByteArrayInputStream dictionaryDataStream = stripeDictionaryDataStreamSource.openStream();
            readDictionary(dictionaryDataStream, stripeDictionarySize, stripeDictionaryLength, 0, stripeDictionaryData, stripeDictionaryOffsetVector, maxCodePointCount, isCharType);
        } else {
            stripeDictionaryData = EMPTY_DICTIONARY_DATA;
            stripeDictionaryOffsetVector = EMPTY_DICTIONARY_OFFSETS;
        }
    }
    stripeDictionaryOpen = true;

    // read row group dictionary
    RowGroupDictionaryLengthInputStream dictionaryLengthStream = rowGroupDictionaryLengthStreamSource.openStream();
    if (dictionaryLengthStream != null) {
        int rowGroupDictionarySize = dictionaryLengthStream.getEntryCount();

        // resize the dictionary lengths array if necessary
        if (rowGroupDictionaryLength.length < rowGroupDictionarySize) {
            rowGroupDictionaryLength = new int[rowGroupDictionarySize];
        }

        // read the lengths
        dictionaryLengthStream.next(rowGroupDictionaryLength, rowGroupDictionarySize);
        long dataLength = 0;
        for (int i = 0; i < rowGroupDictionarySize; i++) {
            dataLength += rowGroupDictionaryLength[i];
        }

        // Add the stripe dictionary's end offset and the row group bytes in long arithmetic and
        // only then narrow, so an oversized combined dictionary fails with ArithmeticException
        // instead of wrapping to a negative array length.
        int combinedDataLength = toIntExact(stripeDictionaryOffsetVector[stripeDictionarySize] + dataLength);

        // We must always create a new dictionary array because the previous dictionary may still be referenced
        // The first elements of the dictionary are from the stripe dictionary, then the row group dictionary elements, and then a null
        byte[] rowGroupDictionaryData = Arrays.copyOf(stripeDictionaryData, combinedDataLength);
        int[] rowGroupDictionaryOffsetVector = Arrays.copyOf(stripeDictionaryOffsetVector, stripeDictionarySize + rowGroupDictionarySize + 2);

        // read dictionary values; the row group entries start right after the stripe entries
        ByteArrayInputStream dictionaryDataStream = rowGroupDictionaryDataStreamSource.openStream();
        readDictionary(dictionaryDataStream, rowGroupDictionarySize, rowGroupDictionaryLength, stripeDictionarySize, rowGroupDictionaryData, rowGroupDictionaryOffsetVector, maxCodePointCount, isCharType);
        setDictionaryBlockData(rowGroupDictionaryData, rowGroupDictionaryOffsetVector, stripeDictionarySize + rowGroupDictionarySize + 1);
    } else {
        // there is no row group dictionary so use the stripe dictionary
        setDictionaryBlockData(stripeDictionaryData, stripeDictionaryOffsetVector, stripeDictionarySize + 1);
    }

    presentStream = presentStreamSource.openStream();
    inDictionaryStream = inDictionaryStreamSource.openStream();
    dataStream = dataStreamSource.openStream();

    rowGroupOpen = true;
}
Aggregations