Use of com.facebook.presto.orc.reader.SelectiveStreamReader in project presto by prestodb.
From the class TestOrcSelectiveStreamReaders, method testEmptyStrings.
/**
 * Tests SliceDirectSelectiveStreamReader for the case where all elements to read are empty strings. The output Block
 * should be a valid VariableWidthBlock with an empty Slice. The test simulates a problem seen in production. The state
 * of SliceDirectSelectiveStreamReader needed to reproduce the problem is:
 * - dataStream: null
 * - presentStream: null
 * - lengthStream: not null
 * - filter: null
 * - outputRequired: true
 * - offsets array: non-zero
 * The test issues two reads: the first reads a non-empty string and populates non-zero offsets; the second reads the
 * empty string with the above conditions met.
 */
@Test
public void testEmptyStrings()
        throws Exception
{
    Type type = VARCHAR;
    List<Type> types = ImmutableList.of(type);
    List<List<?>> values = ImmutableList.of(ImmutableList.of("a", ""));
    for (OrcTester.Format format : formats) {
        if (!types.stream().allMatch(readType -> format.supportsType(readType))) {
            return;
        }
        for (CompressionKind compression : compressions) {
            TempFile tempFile = new TempFile();
            writeOrcColumnsPresto(tempFile.getFile(), format, compression, Optional.empty(), types, values, new OrcWriterStats());
            OrcPredicate orcPredicate = createOrcPredicate(types, values, DWRF, false);
            Map<Integer, Type> includedColumns = IntStream.range(0, types.size())
                    .boxed()
                    .collect(toImmutableMap(Function.identity(), types::get));
            List<Integer> outputColumns = IntStream.range(0, types.size())
                    .boxed()
                    .collect(toImmutableList());
            OrcAggregatedMemoryContext systemMemoryUsage = new TestingHiveOrcAggregatedMemoryContext();
            try (OrcSelectiveRecordReader recordReader = createCustomOrcSelectiveRecordReader(
                    tempFile.getFile(),
                    format.getOrcEncoding(),
                    orcPredicate,
                    types,
                    1,
                    ImmutableMap.of(),
                    ImmutableList.of(),
                    ImmutableMap.of(),
                    OrcTester.OrcReaderSettings.builder().build().getRequiredSubfields(),
                    ImmutableMap.of(),
                    ImmutableMap.of(),
                    includedColumns,
                    outputColumns,
                    false,
                    systemMemoryUsage,
                    false)) {
                assertEquals(recordReader.getReaderPosition(), 0);
                assertEquals(recordReader.getFilePosition(), 0);
                SelectiveStreamReader streamReader = recordReader.getStreamReaders()[0];

                // Read the first non-empty element. Do not call streamReader.getBlock() to preserve
                // the offsets array in SliceDirectSelectiveStreamReader.
                int batchSize = min(recordReader.prepareNextBatch(), 1);
                int[] positions = IntStream.range(0, batchSize).toArray();
                streamReader.read(0, positions, batchSize);
                recordReader.batchRead(batchSize);

                // Read the second element: an empty string. Set the dataStream in
                // SliceDirectSelectiveStreamReader to null to simulate the conditions causing the problem.
                ((SliceSelectiveStreamReader) streamReader).resetDataStream();
                batchSize = min(recordReader.prepareNextBatch(), 1);
                positions = IntStream.range(0, batchSize).toArray();
                streamReader.read(0, positions, batchSize);
                recordReader.batchRead(batchSize);

                Block block = streamReader.getBlock(positions, batchSize);
                List<?> expectedValues = ImmutableList.of("");
                assertBlockEquals(type, block, expectedValues, 0);
                assertEquals(recordReader.getReaderPosition(), 1);
                assertEquals(recordReader.getFilePosition(), 1);
            }
        }
    }
}
Use of com.facebook.presto.orc.reader.SelectiveStreamReader in project presto by prestodb.
From the class OrcSelectiveRecordReader, method createStreamReaders.
private static SelectiveStreamReader[] createStreamReaders(
        OrcDataSource orcDataSource,
        List<OrcType> types,
        DateTimeZone hiveStorageTimeZone,
        OrcRecordReaderOptions options,
        boolean legacyMapSubscript,
        Map<Integer, Type> includedColumns,
        List<Integer> outputColumns,
        Map<Integer, Map<Subfield, TupleDomainFilter>> filters,
        List<FilterFunction> filterFunctions,
        Map<Integer, Integer> filterFunctionInputMapping,
        Map<Integer, List<Subfield>> requiredSubfields,
        OrcAggregatedMemoryContext systemMemoryContext)
{
    List<StreamDescriptor> streamDescriptors = createStreamDescriptor("", "", 0, types, orcDataSource).getNestedStreams();
    requireNonNull(filterFunctions, "filterFunctions is null");
    requireNonNull(filterFunctionInputMapping, "filterFunctionInputMapping is null");

    // Columns that feed filter functions must produce output even if they are not projected
    Set<Integer> filterFunctionInputColumns = filterFunctions.stream()
            .flatMapToInt(function -> Arrays.stream(function.getInputChannels()))
            .boxed()
            .map(filterFunctionInputMapping::get)
            .collect(toImmutableSet());

    OrcType rowType = types.get(0);
    SelectiveStreamReader[] streamReaders = new SelectiveStreamReader[rowType.getFieldCount()];
    for (int columnId = 0; columnId < rowType.getFieldCount(); columnId++) {
        if (includedColumns.containsKey(columnId)) {
            StreamDescriptor streamDescriptor = streamDescriptors.get(columnId);
            boolean outputRequired = outputColumns.contains(columnId) || filterFunctionInputColumns.contains(columnId);
            streamReaders[columnId] = createStreamReader(
                    streamDescriptor,
                    Optional.ofNullable(filters.get(columnId)).orElse(ImmutableMap.of()),
                    outputRequired ? Optional.of(includedColumns.get(columnId)) : Optional.empty(),
                    Optional.ofNullable(requiredSubfields.get(columnId)).orElse(ImmutableList.of()),
                    hiveStorageTimeZone,
                    options,
                    legacyMapSubscript,
                    systemMemoryContext);
        }
    }
    return streamReaders;
}
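The key decision above is outputRequired: an included column materializes output only if it is projected (outputColumns) or feeds a filter function. Below is a minimal, standalone sketch of just that rule; the class, column ids, and type names are made up for illustration and are not part of the Presto API.

import java.util.List;
import java.util.Map;
import java.util.Set;

// Hypothetical illustration of the outputRequired rule in createStreamReaders above.
// Only the boolean logic mirrors the real method; everything else is invented for the example.
public class OutputRequiredSketch
{
    public static void main(String[] args)
    {
        Map<Integer, String> includedColumns = Map.of(0, "varchar", 1, "bigint", 2, "bigint");
        List<Integer> outputColumns = List.of(0);             // projected by the query
        Set<Integer> filterFunctionInputColumns = Set.of(2);  // feeds a filter function

        for (int columnId : includedColumns.keySet()) {
            boolean outputRequired = outputColumns.contains(columnId) || filterFunctionInputColumns.contains(columnId);
            // A column with outputRequired == false (column 1 here) would get a reader created with
            // Optional.empty() as its output type in the real method.
            System.out.printf("column %d (%s): outputRequired=%s%n", columnId, includedColumns.get(columnId), outputRequired);
        }
    }
}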
Use of com.facebook.presto.orc.reader.SelectiveStreamReader in project presto by prestodb.
From the class OrcSelectiveRecordReader, method getNextPage.
public Page getNextPage()
        throws IOException
{
    if (constantFilterIsFalse) {
        return null;
    }
    int batchSize = prepareNextBatch();
    if (batchSize < 0) {
        // no more rows to read
        return null;
    }
    readPositions += batchSize;
    initializePositions(batchSize);
    int[] positionsToRead = this.positions;
    int positionCount = batchSize;

    // Apply filter functions that take no column inputs before reading any column data
    if (filterFunctionWithoutInput.isPresent()) {
        positionCount = applyFilterFunctionWithNoInputs(positionCount);
        if (positionCount == 0) {
            batchRead(batchSize);
            return EMPTY_PAGE;
        }
        positionsToRead = outputPositions;
    }
    if (!filterFunctionsWithConstantInputs.isEmpty()) {
        positionCount = applyFilterFunctions(filterFunctionsWithConstantInputs, filterFunctionConstantInputs, positionsToRead, positionCount);
        if (positionCount == 0) {
            batchRead(batchSize);
            return EMPTY_PAGE;
        }
        positionsToRead = outputPositions;
    }

    int offset = getNextRowInGroup();
    if (reorderFilters && offset >= MAX_BATCH_SIZE) {
        reorderFiltersIfNeeded();
    }

    // Read columns that have filters first, in streamReaderOrder, narrowing the set of positions as we go
    for (int i = 0; i < streamReaderOrder.length; i++) {
        int columnIndex = streamReaderOrder[i];
        if (!hasAnyFilter(columnIndex)) {
            break;
        }
        SelectiveStreamReader streamReader = getStreamReader(columnIndex);
        positionCount = streamReader.read(offset, positionsToRead, positionCount);
        if (positionCount == 0) {
            break;
        }
        positionsToRead = streamReader.getReadPositions();
        verify(positionCount == 1 || positionsToRead[positionCount - 1] - positionsToRead[0] >= positionCount - 1, "positions must monotonically increase");
        if (filterFunctionsOrder[i] != null) {
            positionCount = applyFilterFunctions(filterFunctionsOrder[i], filterFunctionInputs[i], positionsToRead, positionCount);
            if (positionCount == 0) {
                break;
            }
            positionsToRead = outputPositions;
        }
    }

    localMemoryContext.setBytes(getSelfRetainedSizeInBytes());
    batchRead(batchSize);

    if (positionCount == 0) {
        return EMPTY_PAGE;
    }
    if (constantFilterError != null) {
        throw constantFilterError;
    }
    for (int i = 0; i < positionCount; i++) {
        if (errors[positionsToRead[i]] != null) {
            throw errors[positionsToRead[i]];
        }
    }
    for (SelectiveStreamReader reader : getStreamReaders()) {
        if (reader != null) {
            reader.throwAnyError(positionsToRead, positionCount);
        }
    }

    // Assemble output blocks: constant columns become run-length-encoded blocks, columns without
    // filters are loaded lazily, and filtered columns are materialized from their stream readers
    Block[] blocks = new Block[appendRowNumber ? outputColumns.size() + 1 : outputColumns.size()];
    for (int i = 0; i < outputColumns.size(); i++) {
        int columnIndex = outputColumns.get(i);
        if (constantValues[columnIndex] != null) {
            blocks[i] = RunLengthEncodedBlock.create(columnTypes.get(columnIndex), constantValues[columnIndex] == NULL_MARKER ? null : constantValues[columnIndex], positionCount);
        }
        else if (!hasAnyFilter(columnIndex)) {
            blocks[i] = new LazyBlock(positionCount, new OrcBlockLoader(columnIndex, offset, positionsToRead, positionCount));
        }
        else {
            Block block = getStreamReader(columnIndex).getBlock(positionsToRead, positionCount);
            updateMaxCombinedBytesPerRow(hiveColumnIndices[columnIndex], block);
            if (coercers[columnIndex] != null) {
                block = coercers[columnIndex].apply(block);
            }
            blocks[i] = block;
        }
    }
    if (appendRowNumber) {
        blocks[outputColumns.size()] = createRowNumbersBlock(positionsToRead, positionCount, this.getFilePosition());
    }

    Page page = new Page(positionCount, blocks);
    validateWritePageChecksum(page);
    return page;
}
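For context, a caller typically drains the reader in a loop. The following is a minimal sketch, not code from the Presto codebase: the DrainSketch class, drain(), and processPage() are hypothetical, and the import paths are assumed. It relies only on behavior visible in the method above: getNextPage() returns an empty page when a whole batch is filtered out and null once there are no more rows.

// Hypothetical consumption-loop sketch; import paths are assumed to match the module this reader lives in.
import com.facebook.presto.common.Page;
import com.facebook.presto.orc.OrcSelectiveRecordReader;
import java.io.IOException;

public class DrainSketch
{
    static void drain(OrcSelectiveRecordReader recordReader)
            throws IOException
    {
        Page page;
        while ((page = recordReader.getNextPage()) != null) {
            if (page.getPositionCount() == 0) {
                // the whole batch was eliminated by filters; nothing to process
                continue;
            }
            processPage(page); // hypothetical downstream consumer
        }
    }

    static void processPage(Page page)
    {
        // placeholder consumer for the sketch
        System.out.println("positions: " + page.getPositionCount());
    }
}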