use of io.trino.rcfile.RcFileDataSource in project trino by trinodb.
the class RcFileFileWriterFactory method createFileWriter.
@Override
public Optional<FileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf configuration, ConnectorSession session, OptionalInt bucketNumber, AcidTransaction transaction, boolean useAcidSchema, WriterKind writerKind) {
if (!RCFileOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
return Optional.empty();
}
RcFileEncoding rcFileEncoding;
if (LazyBinaryColumnarSerDe.class.getName().equals(storageFormat.getSerde())) {
rcFileEncoding = new BinaryRcFileEncoding(timeZone);
} else if (ColumnarSerDe.class.getName().equals(storageFormat.getSerde())) {
rcFileEncoding = createTextVectorEncoding(schema);
} else {
return Optional.empty();
}
Optional<String> codecName = Optional.ofNullable(configuration.get(FileOutputFormat.COMPRESS_CODEC));
// existing tables and partitions may have columns in a different order than the writer is providing, so build
// an index to rearrange columns in the proper order
List<String> fileColumnNames = getColumnNames(schema);
List<Type> fileColumnTypes = getColumnTypes(schema).stream().map(hiveType -> hiveType.getType(typeManager, getTimestampPrecision(session))).collect(toList());
int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();
try {
FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, configuration);
OutputStream outputStream = fileSystem.create(path, false);
Optional<Supplier<RcFileDataSource>> validationInputFactory = Optional.empty();
if (isRcfileOptimizedWriterValidate(session)) {
validationInputFactory = Optional.of(() -> {
try {
return new HdfsRcFileDataSource(path.toString(), fileSystem.open(path), fileSystem.getFileStatus(path).getLen(), stats);
} catch (IOException e) {
throw new TrinoException(HIVE_WRITE_VALIDATION_FAILED, e);
}
});
}
Callable<Void> rollbackAction = () -> {
fileSystem.delete(path, false);
return null;
};
return Optional.of(new RcFileFileWriter(outputStream, rollbackAction, rcFileEncoding, fileColumnTypes, codecName, fileInputColumnIndexes, ImmutableMap.<String, String>builder().put(PRESTO_VERSION_NAME, nodeVersion.toString()).put(PRESTO_QUERY_ID_NAME, session.getQueryId()).buildOrThrow(), validationInputFactory));
} catch (Exception e) {
throw new TrinoException(HIVE_WRITER_OPEN_ERROR, "Error creating RCFile file", e);
}
}
use of io.trino.rcfile.RcFileDataSource in project trino by trinodb.
the class RcFilePageSourceFactory method createPageSource.
@Override
public Optional<ReaderPageSource> createPageSource(Configuration configuration, ConnectorSession session, Path path, long start, long length, long estimatedFileSize, Properties schema, List<HiveColumnHandle> columns, TupleDomain<HiveColumnHandle> effectivePredicate, Optional<AcidInfo> acidInfo, OptionalInt bucketNumber, boolean originalFile, AcidTransaction transaction) {
RcFileEncoding rcFileEncoding;
String deserializerClassName = getDeserializerClassName(schema);
if (deserializerClassName.equals(LazyBinaryColumnarSerDe.class.getName())) {
rcFileEncoding = new BinaryRcFileEncoding(timeZone);
} else if (deserializerClassName.equals(ColumnarSerDe.class.getName())) {
rcFileEncoding = createTextVectorEncoding(schema);
} else {
return Optional.empty();
}
checkArgument(acidInfo.isEmpty(), "Acid is not supported");
List<HiveColumnHandle> projectedReaderColumns = columns;
Optional<ReaderColumns> readerProjections = projectBaseColumns(columns);
if (readerProjections.isPresent()) {
projectedReaderColumns = readerProjections.get().get().stream().map(HiveColumnHandle.class::cast).collect(toImmutableList());
}
RcFileDataSource dataSource;
try {
FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, configuration);
FSDataInputStream inputStream = hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.open(path));
if (estimatedFileSize < BUFFER_SIZE.toBytes()) {
// Handle potentially imprecise file lengths by reading the footer
try {
FSDataInputStreamTail fileTail = FSDataInputStreamTail.readTail(path.toString(), estimatedFileSize, inputStream, toIntExact(BUFFER_SIZE.toBytes()));
dataSource = new MemoryRcFileDataSource(new RcFileDataSourceId(path.toString()), fileTail.getTailSlice());
} finally {
inputStream.close();
}
} else {
long fileSize = hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.getFileStatus(path).getLen());
dataSource = new HdfsRcFileDataSource(path.toString(), inputStream, fileSize, stats);
}
} catch (Exception e) {
if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
}
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
}
length = min(dataSource.getSize() - start, length);
// Split may be empty now that the correct file size is known
if (length <= 0) {
return Optional.of(noProjectionAdaptation(new EmptyPageSource()));
}
try {
ImmutableMap.Builder<Integer, Type> readColumns = ImmutableMap.builder();
HiveTimestampPrecision timestampPrecision = getTimestampPrecision(session);
for (HiveColumnHandle column : projectedReaderColumns) {
readColumns.put(column.getBaseHiveColumnIndex(), column.getHiveType().getType(typeManager, timestampPrecision));
}
RcFileReader rcFileReader = new RcFileReader(dataSource, rcFileEncoding, readColumns.buildOrThrow(), new AircompressorCodecFactory(new HadoopCodecFactory(configuration.getClassLoader())), start, length, BUFFER_SIZE);
ConnectorPageSource pageSource = new RcFilePageSource(rcFileReader, projectedReaderColumns);
return Optional.of(new ReaderPageSource(pageSource, readerProjections));
} catch (Throwable e) {
try {
dataSource.close();
} catch (IOException ignored) {
}
if (e instanceof TrinoException) {
throw (TrinoException) e;
}
String message = splitError(e, path, start, length);
if (e instanceof RcFileCorruptionException) {
throw new TrinoException(HIVE_BAD_DATA, message, e);
}
if (e instanceof BlockMissingException) {
throw new TrinoException(HIVE_MISSING_DATA, message, e);
}
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
}
}
use of io.trino.rcfile.RcFileDataSource in project trino by trinodb.
the class RcFileFileWriter method commit.
@Override
public void commit() {
try {
rcFileWriter.close();
} catch (IOException | UncheckedIOException e) {
try {
rollbackAction.call();
} catch (Exception ignored) {
// ignore
}
throw new TrinoException(HIVE_WRITER_CLOSE_ERROR, "Error committing write to Hive", e);
}
if (validationInputFactory.isPresent()) {
try {
try (RcFileDataSource input = validationInputFactory.get().get()) {
long startThreadCpuTime = THREAD_MX_BEAN.getCurrentThreadCpuTime();
rcFileWriter.validate(input);
validationCpuNanos += THREAD_MX_BEAN.getCurrentThreadCpuTime() - startThreadCpuTime;
}
} catch (IOException | UncheckedIOException e) {
throw new TrinoException(HIVE_WRITE_VALIDATION_FAILED, e);
}
}
}
Aggregations