Use of com.facebook.presto.orc.OrcEncoding.DWRF in project presto by prestodb: the class TestDecryption, method validateFileStatistics. The test reads the same encrypted DWRF file with every subset of the reader's intermediate keys and verifies that file statistics are exposed only for the column subtrees those keys can decrypt.
private static void validateFileStatistics(TempFile tempFile, Optional<DwrfWriterEncryption> dwrfWriterEncryption, Map<Integer, Slice> readerIntermediateKeys)
        throws IOException
{
    OrcReader readerNoKeys = OrcTester.createCustomOrcReader(tempFile, DWRF, false, ImmutableMap.of());
    if (readerNoKeys.getFooter().getStripes().isEmpty()) {
        // files w/o stripes don't have stats
        assertEquals(readerNoKeys.getFooter().getFileStats().size(), 0);
        return;
    }
    if (dwrfWriterEncryption.isPresent()) {
        List<OrcType> types = readerNoKeys.getTypes();
        List<ColumnStatistics> fileStatsNoKey = readerNoKeys.getFooter().getFileStats();
        assertEquals(fileStatsNoKey.size(), types.size());

        Set<Integer> allEncryptedNodes = dwrfWriterEncryption.get().getWriterEncryptionGroups().stream()
                .flatMap(group -> group.getNodes().stream())
                .flatMap(node -> collectNodeTree(types, node).stream())
                .collect(Collectors.toSet());

        for (Set<Integer> readerKeyNodes : Sets.powerSet(readerIntermediateKeys.keySet())) {
            Map<Integer, Slice> readerKeys = new HashMap<>();
            readerKeyNodes.forEach(node -> readerKeys.put(node, readerIntermediateKeys.get(node)));

            // nodes that are supposed to be decrypted by the reader
            Set<Integer> decryptedNodes = readerKeys.keySet().stream()
                    .flatMap(node -> collectNodeTree(types, node).stream())
                    .collect(Collectors.toSet());

            // decryptedNodes should be a subset of encrypted nodes
            assertTrue(allEncryptedNodes.containsAll(decryptedNodes));

            OrcReader readerWithKeys = OrcTester.createCustomOrcReader(tempFile, DWRF, false, readerKeys);
            List<ColumnStatistics> fileStatsWithKey = readerWithKeys.getFooter().getFileStats();
            assertEquals(fileStatsWithKey.size(), types.size());

            for (int node = 0; node < types.size(); node++) {
                ColumnStatistics statsWithKey = fileStatsWithKey.get(node);
                ColumnStatistics statsNoKey = fileStatsNoKey.get(node);
                OrcType type = types.get(node);

                // encrypted nodes should have no type info when read without a key
                if (allEncryptedNodes.contains(node)) {
                    assertTrue(hasNoTypeStats(statsNoKey));
                }
                else {
                    assertStatsTypeMatch(statsNoKey, type);
                    assertStatsTypeMatch(statsWithKey, type);
                    assertEquals(statsNoKey, statsWithKey);
                }
                if (decryptedNodes.contains(node)) {
                    assertStatsTypeMatch(statsWithKey, type);
                }
            }
        }
    }
}
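The interesting part of this test is the Sets.powerSet loop: it exercises every combination of intermediate keys the reader might hold, from no keys at all to the full key set. A minimal, self-contained sketch of the Guava call follows; the class name and node ids are hypothetical, standing in for readerIntermediateKeys.keySet():

    import com.google.common.collect.ImmutableSet;
    import com.google.common.collect.Sets;

    import java.util.Set;

    public class PowerSetDemo
    {
        public static void main(String[] args)
        {
            // hypothetical encryption-group root nodes
            Set<Integer> keyNodes = ImmutableSet.of(1, 4);

            // Sets.powerSet yields all 2^n subsets, including the empty set,
            // so the test loop also covers a reader that holds no keys at all
            for (Set<Integer> subset : Sets.powerSet(keyNodes)) {
                System.out.println(subset);
            }
            // prints the four subsets: [], [1], [4], [1, 4]
        }
    }

Because the subsets include the empty set, the first combination effectively repeats the readerNoKeys case, and the full set exercises a reader that can decrypt every encrypted subtree.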
Use of com.facebook.presto.orc.OrcEncoding.DWRF in project presto by prestodb: the class OrcFileWriterFactory, method createFileWriter. The factory inspects the table's output format class to choose between the ORC and DWRF encodings before constructing an OrcFileWriter.
@Override
public Optional<HiveFileWriter> createFileWriter(
        Path path,
        List<String> inputColumnNames,
        StorageFormat storageFormat,
        Properties schema,
        JobConf configuration,
        ConnectorSession session,
        Optional<EncryptionInformation> encryptionInformation)
{
    if (!HiveSessionProperties.isOrcOptimizedWriterEnabled(session)) {
        return Optional.empty();
    }

    OrcEncoding orcEncoding;
    if (OrcOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        orcEncoding = ORC;
    }
    else if (com.facebook.hive.orc.OrcOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        orcEncoding = DWRF;
    }
    else {
        return Optional.empty();
    }

    CompressionKind compression = getCompression(schema, configuration, orcEncoding);

    // existing tables and partitions may have columns in a different order than the writer is providing, so build
    // an index to rearrange columns in the proper order
    List<String> fileColumnNames = Splitter.on(',').trimResults().omitEmptyStrings().splitToList(schema.getProperty(META_TABLE_COLUMNS, ""));
    List<Type> fileColumnTypes = toHiveTypes(schema.getProperty(META_TABLE_COLUMN_TYPES, "")).stream()
            .map(hiveType -> hiveType.getType(typeManager))
            .collect(toList());
    int[] fileInputColumnIndexes = fileColumnNames.stream()
            .mapToInt(inputColumnNames::indexOf)
            .toArray();

    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration);
        DataSink dataSink = createDataSink(session, fileSystem, path);

        Optional<Supplier<OrcDataSource>> validationInputFactory = Optional.empty();
        if (HiveSessionProperties.isOrcOptimizedWriterValidate(session)) {
            validationInputFactory = Optional.of(() -> {
                try {
                    return new HdfsOrcDataSource(
                            new OrcDataSourceId(path.toString()),
                            fileSystem.getFileStatus(path).getLen(),
                            getOrcMaxMergeDistance(session),
                            getOrcMaxBufferSize(session),
                            getOrcStreamBufferSize(session),
                            false,
                            fileSystem.open(path),
                            readStats);
                }
                catch (IOException e) {
                    throw new PrestoException(HIVE_WRITE_VALIDATION_FAILED, e);
                }
            });
        }

        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };

        Optional<DwrfWriterEncryption> dwrfWriterEncryption = createDwrfEncryption(encryptionInformation, fileColumnNames, fileColumnTypes);
        return Optional.of(new OrcFileWriter(
                dataSink,
                rollbackAction,
                orcEncoding,
                fileColumnNames,
                fileColumnTypes,
                compression,
                orcFileWriterConfig.toOrcWriterOptionsBuilder()
                        .withFlushPolicy(DefaultOrcWriterFlushPolicy.builder()
                                .withStripeMinSize(getOrcOptimizedWriterMinStripeSize(session))
                                .withStripeMaxSize(getOrcOptimizedWriterMaxStripeSize(session))
                                .withStripeMaxRowCount(getOrcOptimizedWriterMaxStripeRows(session))
                                .build())
                        .withDictionaryMaxMemory(getOrcOptimizedWriterMaxDictionaryMemory(session))
                        .withMaxStringStatisticsLimit(getOrcStringStatisticsLimit(session))
                        .withIgnoreDictionaryRowGroupSizes(isExecutionBasedMemoryAccountingEnabled(session))
                        .withDwrfStripeCacheEnabled(isDwrfWriterStripeCacheEnabled(session))
                        .withDwrfStripeCacheMaxSize(getDwrfWriterStripeCacheeMaxSize(session))
                        .build(),
                fileInputColumnIndexes,
                ImmutableMap.<String, String>builder()
                        .put(HiveMetadata.PRESTO_VERSION_NAME, nodeVersion.toString())
                        .put(MetastoreUtil.PRESTO_QUERY_ID_NAME, session.getQueryId())
                        .build(),
                hiveStorageTimeZone,
                validationInputFactory,
                getOrcOptimizedWriterValidateMode(session),
                stats,
                dwrfEncryptionProvider,
                dwrfWriterEncryption));
    }
    catch (IOException e) {
        throw new PrestoException(HIVE_WRITER_OPEN_ERROR, "Error creating " + orcEncoding + " file. " + e.getMessage(), e);
    }
}
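The comment about column order is worth unpacking: the table schema (META_TABLE_COLUMNS) and the writer's input may list the same columns in different orders, and fileInputColumnIndexes maps each file column to its position in the input, with -1 marking table columns the writer does not supply. A minimal sketch of that index construction; the class and column names are hypothetical:

    import com.google.common.base.Splitter;
    import com.google.common.collect.ImmutableList;

    import java.util.Arrays;
    import java.util.List;

    public class ColumnIndexDemo
    {
        public static void main(String[] args)
        {
            // hypothetical schema property, as stored under META_TABLE_COLUMNS
            String metaTableColumns = "ds,user_id,event";
            List<String> fileColumnNames = Splitter.on(',')
                    .trimResults()
                    .omitEmptyStrings()
                    .splitToList(metaTableColumns);

            // hypothetical writer input: the same columns in a different order
            List<String> inputColumnNames = ImmutableList.of("user_id", "event", "ds");

            // indexOf maps each file column to its input position; a column
            // missing from the input maps to -1
            int[] fileInputColumnIndexes = fileColumnNames.stream()
                    .mapToInt(inputColumnNames::indexOf)
                    .toArray();

            System.out.println(Arrays.toString(fileInputColumnIndexes)); // [2, 0, 1]
        }
    }

Keeping the file's column order fixed to the table schema, rather than to whatever order the writer happens to receive, is what lets readers of existing partitions keep resolving columns by position.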