Use of com.facebook.presto.common.type.TypeManager in project presto by prestodb.
From the class OrcFileWriterFactory, the method createFileWriter:
@Override
public Optional<HiveFileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf configuration, ConnectorSession session, Optional<EncryptionInformation> encryptionInformation) {
    if (!HiveSessionProperties.isOrcOptimizedWriterEnabled(session)) {
        return Optional.empty();
    }
    OrcEncoding orcEncoding;
    if (OrcOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        orcEncoding = ORC;
    }
    else if (com.facebook.hive.orc.OrcOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        orcEncoding = DWRF;
    }
    else {
        return Optional.empty();
    }
    CompressionKind compression = getCompression(schema, configuration, orcEncoding);
    // existing tables and partitions may have columns in a different order than the writer is providing, so build
    // an index to rearrange columns in the proper order
    List<String> fileColumnNames = Splitter.on(',').trimResults().omitEmptyStrings().splitToList(schema.getProperty(META_TABLE_COLUMNS, ""));
    List<Type> fileColumnTypes = toHiveTypes(schema.getProperty(META_TABLE_COLUMN_TYPES, "")).stream().map(hiveType -> hiveType.getType(typeManager)).collect(toList());
    int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration);
        DataSink dataSink = createDataSink(session, fileSystem, path);
        Optional<Supplier<OrcDataSource>> validationInputFactory = Optional.empty();
        if (HiveSessionProperties.isOrcOptimizedWriterValidate(session)) {
            validationInputFactory = Optional.of(() -> {
                try {
                    return new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSystem.getFileStatus(path).getLen(), getOrcMaxMergeDistance(session), getOrcMaxBufferSize(session), getOrcStreamBufferSize(session), false, fileSystem.open(path), readStats);
                }
                catch (IOException e) {
                    throw new PrestoException(HIVE_WRITE_VALIDATION_FAILED, e);
                }
            });
        }
        Callable<Void> rollbackAction = () -> {
            fileSystem.delete(path, false);
            return null;
        };
        Optional<DwrfWriterEncryption> dwrfWriterEncryption = createDwrfEncryption(encryptionInformation, fileColumnNames, fileColumnTypes);
        return Optional.of(new OrcFileWriter(
                dataSink,
                rollbackAction,
                orcEncoding,
                fileColumnNames,
                fileColumnTypes,
                compression,
                orcFileWriterConfig.toOrcWriterOptionsBuilder()
                        .withFlushPolicy(DefaultOrcWriterFlushPolicy.builder()
                                .withStripeMinSize(getOrcOptimizedWriterMinStripeSize(session))
                                .withStripeMaxSize(getOrcOptimizedWriterMaxStripeSize(session))
                                .withStripeMaxRowCount(getOrcOptimizedWriterMaxStripeRows(session))
                                .build())
                        .withDictionaryMaxMemory(getOrcOptimizedWriterMaxDictionaryMemory(session))
                        .withMaxStringStatisticsLimit(getOrcStringStatisticsLimit(session))
                        .withIgnoreDictionaryRowGroupSizes(isExecutionBasedMemoryAccountingEnabled(session))
                        .withDwrfStripeCacheEnabled(isDwrfWriterStripeCacheEnabled(session))
                        .withDwrfStripeCacheMaxSize(getDwrfWriterStripeCacheeMaxSize(session))
                        .build(),
                fileInputColumnIndexes,
                ImmutableMap.<String, String>builder()
                        .put(HiveMetadata.PRESTO_VERSION_NAME, nodeVersion.toString())
                        .put(MetastoreUtil.PRESTO_QUERY_ID_NAME, session.getQueryId())
                        .build(),
                hiveStorageTimeZone,
                validationInputFactory,
                getOrcOptimizedWriterValidateMode(session),
                stats,
                dwrfEncryptionProvider,
                dwrfWriterEncryption));
    }
    catch (IOException e) {
        throw new PrestoException(HIVE_WRITER_OPEN_ERROR, "Error creating " + orcEncoding + " file. " + e.getMessage(), e);
    }
}
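The fileInputColumnIndexes array built before the try block is what lets the writer accept input columns in a different order from the file schema: for each file column, indexOf finds its position in the input, or -1 when the input does not provide it. A minimal, self-contained sketch of that mapping (the column names below are hypothetical, not from the Presto sources):

import java.util.Arrays;
import java.util.List;

public class ColumnIndexSketch {
    public static void main(String[] args) {
        // Hypothetical column orders: the file declares (ds, user_id, event),
        // while the writer's input provides (user_id, event, ds).
        List<String> fileColumnNames = List.of("ds", "user_id", "event");
        List<String> inputColumnNames = List.of("user_id", "event", "ds");
        // Same construction as in createFileWriter: for each file column, record
        // where it appears in the input; indexOf yields -1 for columns the input lacks.
        int[] fileInputColumnIndexes = fileColumnNames.stream()
                .mapToInt(inputColumnNames::indexOf)
                .toArray();
        System.out.println(Arrays.toString(fileInputColumnIndexes)); // prints [2, 0, 1]
    }
}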
Use of com.facebook.presto.common.type.TypeManager in project presto by prestodb.
From the class OrcStorageManager, the method toOrcFileType:
static Type toOrcFileType(Type raptorType, TypeManager typeManager) {
    // TIMESTAMPS are stored as BIGINT to avoid the poor encoding in ORC
    if (raptorType == TimestampType.TIMESTAMP) {
        return BIGINT;
    }
    if (raptorType instanceof ArrayType) {
        Type elementType = toOrcFileType(((ArrayType) raptorType).getElementType(), typeManager);
        return new ArrayType(elementType);
    }
    if (raptorType instanceof MapType) {
        TypeSignature keyType = toOrcFileType(((MapType) raptorType).getKeyType(), typeManager).getTypeSignature();
        TypeSignature valueType = toOrcFileType(((MapType) raptorType).getValueType(), typeManager).getTypeSignature();
        return typeManager.getParameterizedType(StandardTypes.MAP, ImmutableList.of(TypeSignatureParameter.of(keyType), TypeSignatureParameter.of(valueType)));
    }
    if (raptorType instanceof RowType) {
        List<RowType.Field> fields = ((RowType) raptorType).getFields().stream().map(field -> new RowType.Field(field.getName(), toOrcFileType(field.getType(), typeManager))).collect(toImmutableList());
        return RowType.from(fields);
    }
    return raptorType;
}
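A hedged usage sketch of the mapping, not part of OrcStorageManager: it assumes code running in the same package (toOrcFileType is package-private) and that a TypeManager instance is available, for example via FunctionAndTypeManager.createTestFunctionAndTypeManager() from Presto's test utilities (the exact factory may differ between versions):

// Scalars pass through unchanged, except TIMESTAMP, which is stored as BIGINT.
TypeManager typeManager = FunctionAndTypeManager.createTestFunctionAndTypeManager();
Type scalar = toOrcFileType(TimestampType.TIMESTAMP, typeManager);                // bigint
// Container types are rebuilt recursively, so nested timestamps are rewritten too.
Type nested = toOrcFileType(new ArrayType(TimestampType.TIMESTAMP), typeManager); // array(bigint)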
Use of com.facebook.presto.common.type.TypeManager in project presto by prestodb.
From the class OrcFileRewriter, the method rewrite:
public OrcFileInfo rewrite(FileSystem fileSystem, Map<String, Type> allColumnTypes, Path input, Path output, BitSet rowsToDelete) throws IOException {
    try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(FileSystem.class.getClassLoader());
            OrcDataSource dataSource = orcDataEnvironment.createOrcDataSource(fileSystem, input, readerAttributes)) {
        OrcReader reader = new OrcReader(dataSource, ORC, orcFileTailSource, stripeMetadataSourceFactory, new RaptorOrcAggregatedMemoryContext(), new OrcReaderOptions(readerAttributes.getMaxMergeDistance(), readerAttributes.getTinyStripeThreshold(), HUGE_MAX_READ_BLOCK_SIZE, readerAttributes.isZstdJniDecompressionEnabled()), false, NO_ENCRYPTION, DwrfKeyProvider.EMPTY, new RuntimeStats());
        if (reader.getFooter().getNumberOfRows() < rowsToDelete.length()) {
            throw new IOException("File has fewer rows than deletion vector");
        }
        int deleteRowCount = rowsToDelete.cardinality();
        if (reader.getFooter().getNumberOfRows() == deleteRowCount) {
            return new OrcFileInfo(0, 0);
        }
        if (reader.getFooter().getNumberOfRows() >= Integer.MAX_VALUE) {
            throw new IOException("File has too many rows");
        }
        int inputRowCount = toIntExact(reader.getFooter().getNumberOfRows());
        Map<String, Integer> currentColumnIds = IntStream.range(0, reader.getColumnNames().size()).boxed().collect(toMap(reader.getColumnNames()::get, i -> i));
        ImmutableList.Builder<Type> writerColumnTypesBuilder = ImmutableList.builder();
        ImmutableList.Builder<String> writerColumnIdsBuilder = ImmutableList.builder();
        ImmutableList.Builder<Integer> readerColumnIndexBuilder = ImmutableList.builder();
        // Build columns for writer; keep the right ordinal
        Map<String, Type> orderedAllColumnTypes = new TreeMap<>(Comparator.comparingLong(Long::parseLong));
        orderedAllColumnTypes.putAll(allColumnTypes);
        for (Map.Entry<String, Type> columnType : orderedAllColumnTypes.entrySet()) {
            // Get the intersection of the provided columns and the actual columns
            Integer currentColumnIndex = currentColumnIds.get(columnType.getKey());
            if (currentColumnIndex != null) {
                readerColumnIndexBuilder.add(currentColumnIndex);
                writerColumnTypesBuilder.add(columnType.getValue());
                writerColumnIdsBuilder.add(columnType.getKey());
            }
        }
        List<Type> writerColumnTypes = writerColumnTypesBuilder.build();
        List<String> writerColumnIds = writerColumnIdsBuilder.build();
        List<Integer> readerColumnIndex = readerColumnIndexBuilder.build();
        Map<Integer, Type> readerColumns = IntStream.range(0, readerColumnIndex.size()).boxed().collect(toMap(readerColumnIndex::get, writerColumnTypes::get));
        if (writerColumnTypes.isEmpty()) {
            // no intersection; directly return
            return new OrcFileInfo(0, 0);
        }
        StorageTypeConverter converter = new StorageTypeConverter(typeManager);
        List<Type> writerStorageTypes = writerColumnTypes.stream().map(converter::toStorageType).collect(toImmutableList());
        long start = System.nanoTime();
        Map<String, String> userMetadata = ImmutableMap.of();
        if (reader.getFooter().getUserMetadata().containsKey(OrcFileMetadata.KEY)) {
            // build metadata if the original file has it
            ImmutableMap.Builder<Long, TypeSignature> metadataBuilder = ImmutableMap.builder();
            for (int i = 0; i < writerColumnIds.size(); i++) {
                metadataBuilder.put(Long.parseLong(writerColumnIds.get(i)), writerColumnTypes.get(i).getTypeSignature());
            }
            userMetadata = ImmutableMap.of(OrcFileMetadata.KEY, METADATA_CODEC.toJson(new OrcFileMetadata(metadataBuilder.build())));
        }
        StorageTypeConverter storageTypeConverter = new StorageTypeConverter(typeManager);
        try (Closer<OrcBatchRecordReader, IOException> recordReader = closer(reader.createBatchRecordReader(storageTypeConverter.toStorageTypes(readerColumns), TRUE, DEFAULT_STORAGE_TIMEZONE, new RaptorOrcAggregatedMemoryContext(), INITIAL_BATCH_SIZE), OrcBatchRecordReader::close);
                Closer<OrcWriter, IOException> writer = closer(new OrcWriter(orcDataEnvironment.createOrcDataSink(fileSystem, output), writerColumnIds, writerStorageTypes, ORC, compression, Optional.empty(), NO_ENCRYPTION, getDefaultOrcWriterOptions(), userMetadata, DEFAULT_STORAGE_TIMEZONE, validate, HASHED, stats), OrcWriter::close)) {
            OrcFileInfo fileInfo = rewrite(recordReader.get(), writer.get(), rowsToDelete, writerColumnTypes, readerColumnIndexBuilder.build());
            log.debug("Rewrote file %s in %s (input rows: %s, output rows: %s)", input.getName(), nanosSince(start), inputRowCount, inputRowCount - deleteRowCount);
            return fileInfo;
        }
    }
    catch (NotSupportedException e) {
        throw new PrestoException(NOT_SUPPORTED, e.getMessage(), e);
    }
}
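The rowsToDelete argument is a plain java.util.BitSet used as a deletion vector: a set bit at index i means row i of the input file is dropped. The row-count checks at the top of rewrite follow directly from BitSet semantics, as this small self-contained sketch (with hypothetical numbers) illustrates:

import java.util.BitSet;

public class DeletionVectorSketch {
    public static void main(String[] args) {
        // Hypothetical file with 10 rows from which rows 2 and 7 are to be deleted.
        int inputRowCount = 10;
        BitSet rowsToDelete = new BitSet(inputRowCount);
        rowsToDelete.set(2);
        rowsToDelete.set(7);

        // Mirrors the validation in rewrite(): length() is one past the highest set
        // bit, so it can never exceed the row count of a valid deletion vector.
        if (inputRowCount < rowsToDelete.length()) {
            throw new IllegalArgumentException("File has fewer rows than deletion vector");
        }
        // cardinality() is the number of deleted rows; if it equals the row count,
        // every row is deleted and rewrite() returns an empty OrcFileInfo.
        int deleteRowCount = rowsToDelete.cardinality();
        System.out.println("output rows: " + (inputRowCount - deleteRowCount)); // output rows: 8
    }
}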