Use of io.trino.plugin.hive.FileWriter in project trino by trinodb.
Class OrcFileWriterFactory, method createFileWriter:
@Override
public Optional<FileWriter> createFileWriter(
        Path path,
        List<String> inputColumnNames,
        StorageFormat storageFormat,
        Properties schema,
        JobConf configuration,
        ConnectorSession session,
        OptionalInt bucketNumber,
        AcidTransaction transaction,
        boolean useAcidSchema,
        WriterKind writerKind) {
if (!OrcOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
return Optional.empty();
}
CompressionKind compression = getCompression(schema, configuration);
// existing tables and partitions may have columns in a different order than the writer is providing, so build
// an index to rearrange columns in the proper order
List<String> fileColumnNames = getColumnNames(schema);
List<Type> fileColumnTypes = getColumnTypes(schema).stream().map(hiveType -> hiveType.getType(typeManager, getTimestampPrecision(session))).collect(toList());
int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();
if (transaction.isAcidDeleteOperation(writerKind)) {
// For delete, set the "row" column to -1
fileInputColumnIndexes[fileInputColumnIndexes.length - 1] = -1;
}
try {
FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, configuration);
OrcDataSink orcDataSink = createOrcDataSink(fileSystem, path);
Optional<Supplier<OrcDataSource>> validationInputFactory = Optional.empty();
if (isOrcOptimizedWriterValidate(session)) {
validationInputFactory = Optional.of(() -> {
try {
return new HdfsOrcDataSource(new OrcDataSourceId(path.toString()), fileSystem.getFileStatus(path).getLen(), new OrcReaderOptions(), fileSystem.open(path), readStats);
} catch (IOException e) {
throw new TrinoException(HIVE_WRITE_VALIDATION_FAILED, e);
}
});
}
Callable<Void> rollbackAction = () -> {
fileSystem.delete(path, false);
return null;
};
if (transaction.isInsert() && useAcidSchema) {
// Only add the ACID columns if the request is for insert-type operations; for delete operations,
// the columns are added by the caller. This is because the ACID columns for delete operations
// depend on the rows being deleted, whereas the ACID columns for INSERT are completely determined
// by bucket and writeId.
Type rowType = createRowType(fileColumnNames, fileColumnTypes);
fileColumnNames = ACID_COLUMN_NAMES;
fileColumnTypes = createAcidColumnPrestoTypes(rowType);
}
return Optional.of(new OrcFileWriter(
        orcDataSink,
        writerKind,
        transaction,
        useAcidSchema,
        bucketNumber,
        rollbackAction,
        fileColumnNames,
        fileColumnTypes,
        createRootOrcType(fileColumnNames, fileColumnTypes),
        compression,
        getOrcWriterOptions(schema, orcWriterOptions)
                .withStripeMinSize(getOrcOptimizedWriterMinStripeSize(session))
                .withStripeMaxSize(getOrcOptimizedWriterMaxStripeSize(session))
                .withStripeMaxRowCount(getOrcOptimizedWriterMaxStripeRows(session))
                .withDictionaryMaxMemory(getOrcOptimizedWriterMaxDictionaryMemory(session))
                .withMaxStringStatisticsLimit(getOrcStringStatisticsLimit(session)),
        fileInputColumnIndexes,
        ImmutableMap.<String, String>builder()
                .put(PRESTO_VERSION_NAME, nodeVersion.toString())
                .put(PRESTO_QUERY_ID_NAME, session.getQueryId())
                .buildOrThrow(),
        validationInputFactory,
        getOrcOptimizedWriterValidateMode(session),
        stats));
} catch (IOException e) {
throw new TrinoException(HIVE_WRITER_OPEN_ERROR, "Error creating ORC file", e);
}
}
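The column-reordering step above is compact; here is a minimal, self-contained sketch of the same idea (the column names and the delete flag are hypothetical), showing how the index maps each file-schema position to the writer's input position and why the trailing ACID "row" column is forced to -1 for deletes:

import java.util.Arrays;
import java.util.List;

public class ColumnIndexSketch {
    public static void main(String[] args) {
        // Column order expected by the file schema (e.g. from existing table metadata).
        List<String> fileColumnNames = List.of("id", "name", "row");
        // Column order in which the writer supplies data.
        List<String> inputColumnNames = List.of("name", "id", "row");

        // Build an index: for each file column, find its position among the input columns.
        int[] fileInputColumnIndexes = fileColumnNames.stream()
                .mapToInt(inputColumnNames::indexOf)
                .toArray();
        // -> [1, 0, 2]

        boolean isAcidDelete = true; // hypothetical; mirrors transaction.isAcidDeleteOperation(writerKind)
        if (isAcidDelete) {
            // For deletes, the "row" column is supplied by the caller, not taken from the input page.
            fileInputColumnIndexes[fileInputColumnIndexes.length - 1] = -1;
        }

        System.out.println(Arrays.toString(fileInputColumnIndexes)); // [1, 0, -1]
    }
}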
Use of io.trino.plugin.hive.FileWriter in project trino by trinodb.
Class ParquetFileWriterFactory, method createFileWriter:
@Override
public Optional<FileWriter> createFileWriter(
        Path path,
        List<String> inputColumnNames,
        StorageFormat storageFormat,
        Properties schema,
        JobConf conf,
        ConnectorSession session,
        OptionalInt bucketNumber,
        AcidTransaction transaction,
        boolean useAcidSchema,
        WriterKind writerKind) {
if (!HiveSessionProperties.isParquetOptimizedWriterEnabled(session)) {
return Optional.empty();
}
if (!MapredParquetOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
return Optional.empty();
}
ParquetWriterOptions parquetWriterOptions = ParquetWriterOptions.builder()
        .setMaxPageSize(HiveSessionProperties.getParquetWriterPageSize(session))
        .setMaxBlockSize(HiveSessionProperties.getParquetWriterBlockSize(session))
        .setBatchSize(HiveSessionProperties.getParquetBatchSize(session))
        .build();
CompressionCodecName compressionCodecName = getCompression(conf);
List<String> fileColumnNames = getColumnNames(schema);
List<Type> fileColumnTypes = getColumnTypes(schema).stream().map(hiveType -> hiveType.getType(typeManager, getTimestampPrecision(session))).collect(toList());
int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();
try {
FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, conf);
Callable<Void> rollbackAction = () -> {
fileSystem.delete(path, false);
return null;
};
ParquetSchemaConverter schemaConverter = new ParquetSchemaConverter(fileColumnTypes, fileColumnNames);
return Optional.of(new ParquetFileWriter(
        fileSystem.create(path, false),
        rollbackAction,
        fileColumnTypes,
        schemaConverter.getMessageType(),
        schemaConverter.getPrimitiveTypes(),
        parquetWriterOptions,
        fileInputColumnIndexes,
        compressionCodecName,
        nodeVersion.toString()));
} catch (IOException e) {
throw new TrinoException(HIVE_WRITER_OPEN_ERROR, "Error creating Parquet file", e);
}
}
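Both factories return Optional.empty() when they do not handle the requested output format, which lets a caller try each factory in turn. A simplified, self-contained sketch of that selection pattern follows; the WriterFactory interface and the string "writers" are illustrative stand-ins, not Trino types, and the caller-side loop is an assumption rather than code from the source:

import java.util.List;
import java.util.Optional;

public class FactorySelectionSketch {
    // Hypothetical, simplified stand-in for the factory interface shown above.
    interface WriterFactory {
        Optional<String> createWriter(String outputFormat);
    }

    public static void main(String[] args) {
        WriterFactory orc = format -> "OrcOutputFormat".equals(format)
                ? Optional.of("orc-writer")
                : Optional.empty();
        WriterFactory parquet = format -> "MapredParquetOutputFormat".equals(format)
                ? Optional.of("parquet-writer")
                : Optional.empty();

        List<WriterFactory> factories = List.of(orc, parquet);
        String requested = "MapredParquetOutputFormat";

        // Take the first factory that recognizes the requested format.
        String writer = factories.stream()
                .map(factory -> factory.createWriter(requested))
                .flatMap(Optional::stream)
                .findFirst()
                .orElseThrow(() -> new IllegalStateException("No factory for " + requested));

        System.out.println(writer); // parquet-writer
    }
}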
Use of io.trino.plugin.hive.FileWriter in project trino by trinodb.
Class DeltaLakePageSink, method getWriterIndexes:
private int[] getWriterIndexes(Page page) {
Page partitionColumns = extractColumns(page, partitionColumnsInputIndex);
int[] writerIndexes = pageIndexer.indexPage(partitionColumns);
if (pageIndexer.getMaxIndex() >= maxOpenWriters) {
throw new TrinoException(DELTA_LAKE_BAD_WRITE, format("Exceeded limit of %s open writers for partitions", maxOpenWriters));
}
// expand writers list to new size
while (writers.size() <= pageIndexer.getMaxIndex()) {
writers.add(null);
}
// create missing writers
for (int position = 0; position < page.getPositionCount(); position++) {
int writerIndex = writerIndexes[position];
if (writers.get(writerIndex) != null) {
continue;
}
Path filePath = new Path(outputPath);
List<String> partitionValues = createPartitionValues(partitionColumnTypes, partitionColumns, position);
Optional<String> partitionName = Optional.empty();
if (!originalPartitionColumnNames.isEmpty()) {
String partName = makePartName(originalPartitionColumnNames, partitionValues);
filePath = new Path(outputPath, partName);
partitionName = Optional.of(partName);
}
String fileName = randomUUID().toString();
filePath = new Path(filePath, fileName);
FileWriter fileWriter;
if (isParquetOptimizedWriterEnabled(session)) {
fileWriter = createParquetFileWriter(filePath);
} else {
fileWriter = createRecordFileWriter(filePath);
}
Path rootTableLocation = new Path(outputPath);
try {
DeltaLakeWriter writer = new DeltaLakeWriter(
        hdfsEnvironment.getFileSystem(session.getIdentity(), rootTableLocation, conf),
        fileWriter,
        rootTableLocation,
        partitionName.map(partition -> new Path(partition, fileName).toString()).orElse(fileName),
        partitionValues,
        stats,
        dataColumnHandles);
writers.set(writerIndex, writer);
} catch (IOException e) {
throw new TrinoException(DELTA_LAKE_BAD_WRITE, "Unable to create writer for location: " + outputPath, e);
}
}
verify(writers.size() == pageIndexer.getMaxIndex() + 1);
verify(!writers.contains(null));
return writerIndexes;
}
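A self-contained sketch of how the target path above is assembled from the table location, the Hive-style partition name, and a random file name; this makePartName is a simplified stand-in for the Hive utility (the real one also escapes characters that are unsafe in paths), and the table location and partition values are hypothetical:

import java.util.List;
import java.util.UUID;

public class PartitionPathSketch {
    // Simplified stand-in for Hive's makePartName: join "column=value" pairs with '/'.
    static String makePartName(List<String> columns, List<String> values) {
        StringBuilder name = new StringBuilder();
        for (int i = 0; i < columns.size(); i++) {
            if (i > 0) {
                name.append('/');
            }
            name.append(columns.get(i)).append('=').append(values.get(i));
        }
        return name.toString();
    }

    public static void main(String[] args) {
        String outputPath = "s3://bucket/table";              // hypothetical table location
        List<String> partitionColumns = List.of("ds", "region");
        List<String> partitionValues = List.of("2024-01-01", "eu");

        String partName = makePartName(partitionColumns, partitionValues);
        String fileName = UUID.randomUUID().toString();
        String filePath = outputPath + "/" + partName + "/" + fileName;

        System.out.println(filePath);
        // e.g. s3://bucket/table/ds=2024-01-01/region=eu/<uuid>
    }
}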
Use of io.trino.plugin.hive.FileWriter in project trino by trinodb.
Class DeltaLakePageSink, method createParquetFileWriter:
private FileWriter createParquetFileWriter(Path path) {
ParquetWriterOptions parquetWriterOptions = ParquetWriterOptions.builder()
        .setMaxBlockSize(getParquetWriterBlockSize(session))
        .setMaxPageSize(getParquetWriterPageSize(session))
        .build();
CompressionCodecName compressionCodecName = getCompressionCodec(session).getParquetCompressionCodec();
try {
FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, conf);
Callable<Void> rollbackAction = () -> {
fileSystem.delete(path, false);
return null;
};
List<Type> parquetTypes = dataColumnTypes.stream().map(type -> {
if (type instanceof TimestampWithTimeZoneType) {
verify(((TimestampWithTimeZoneType) type).getPrecision() == 3, "Unsupported type: %s", type);
return TIMESTAMP_MILLIS;
}
return type;
}).collect(toImmutableList());
// We use identity column mapping; the input page already contains only data columns per
// DeltaLakePageSink.getDataPage().
int[] identityMapping = new int[dataColumnTypes.size()];
for (int i = 0; i < identityMapping.length; ++i) {
identityMapping[i] = i;
}
ParquetSchemaConverter schemaConverter = new ParquetSchemaConverter(parquetTypes, dataColumnNames);
return new ParquetFileWriter(
        fileSystem.create(path),
        rollbackAction,
        parquetTypes,
        schemaConverter.getMessageType(),
        schemaConverter.getPrimitiveTypes(),
        parquetWriterOptions,
        identityMapping,
        compressionCodecName,
        trinoVersion);
} catch (IOException e) {
throw new TrinoException(DELTA_LAKE_BAD_WRITE, "Error creating Parquet file", e);
}
}
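Each of the writers above registers the same kind of rollback action: a Callable that deletes the half-written file if the write cannot be committed. A minimal sketch of that pattern using java.nio in place of Hadoop's FileSystem; the surrounding try/catch orchestration is assumed for illustration, not taken from the source:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.Callable;

public class RollbackSketch {
    public static void main(String[] args) throws Exception {
        Path path = Files.createTempFile("writer-sketch", ".parquet");

        // Same shape as the rollbackAction above: delete the target file and return null.
        Callable<Void> rollbackAction = () -> {
            Files.deleteIfExists(path);
            return null;
        };

        try {
            // ... write data to the file; here we simulate a failure instead.
            throw new IOException("simulated write failure");
        } catch (IOException e) {
            // On failure, invoke the rollback so no partial file is left behind.
            rollbackAction.call();
        }

        System.out.println("file exists after rollback: " + Files.exists(path)); // false
    }
}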