Search in sources :

Example 1 with HadoopCodecFactory

use of io.trino.rcfile.HadoopCodecFactory in project trino by trinodb.

the class RcFilePageSourceFactory method createPageSource.

@Override
public Optional<ReaderPageSource> createPageSource(Configuration configuration, ConnectorSession session, Path path, long start, long length, long estimatedFileSize, Properties schema, List<HiveColumnHandle> columns, TupleDomain<HiveColumnHandle> effectivePredicate, Optional<AcidInfo> acidInfo, OptionalInt bucketNumber, boolean originalFile, AcidTransaction transaction) {
    RcFileEncoding rcFileEncoding;
    String deserializerClassName = getDeserializerClassName(schema);
    if (deserializerClassName.equals(LazyBinaryColumnarSerDe.class.getName())) {
        rcFileEncoding = new BinaryRcFileEncoding(timeZone);
    } else if (deserializerClassName.equals(ColumnarSerDe.class.getName())) {
        rcFileEncoding = createTextVectorEncoding(schema);
    } else {
        return Optional.empty();
    }
    checkArgument(acidInfo.isEmpty(), "Acid is not supported");
    List<HiveColumnHandle> projectedReaderColumns = columns;
    Optional<ReaderColumns> readerProjections = projectBaseColumns(columns);
    if (readerProjections.isPresent()) {
        projectedReaderColumns = readerProjections.get().get().stream().map(HiveColumnHandle.class::cast).collect(toImmutableList());
    }
    RcFileDataSource dataSource;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.open(path));
        if (estimatedFileSize < BUFFER_SIZE.toBytes()) {
            // Handle potentially imprecise file lengths by reading the footer
            try {
                FSDataInputStreamTail fileTail = FSDataInputStreamTail.readTail(path.toString(), estimatedFileSize, inputStream, toIntExact(BUFFER_SIZE.toBytes()));
                dataSource = new MemoryRcFileDataSource(new RcFileDataSourceId(path.toString()), fileTail.getTailSlice());
            } finally {
                inputStream.close();
            }
        } else {
            long fileSize = hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.getFileStatus(path).getLen());
            dataSource = new HdfsRcFileDataSource(path.toString(), inputStream, fileSize, stats);
        }
    } catch (Exception e) {
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
    }
    length = min(dataSource.getSize() - start, length);
    // Split may be empty now that the correct file size is known
    if (length <= 0) {
        return Optional.of(noProjectionAdaptation(new EmptyPageSource()));
    }
    try {
        ImmutableMap.Builder<Integer, Type> readColumns = ImmutableMap.builder();
        HiveTimestampPrecision timestampPrecision = getTimestampPrecision(session);
        for (HiveColumnHandle column : projectedReaderColumns) {
            readColumns.put(column.getBaseHiveColumnIndex(), column.getHiveType().getType(typeManager, timestampPrecision));
        }
        RcFileReader rcFileReader = new RcFileReader(dataSource, rcFileEncoding, readColumns.buildOrThrow(), new AircompressorCodecFactory(new HadoopCodecFactory(configuration.getClassLoader())), start, length, BUFFER_SIZE);
        ConnectorPageSource pageSource = new RcFilePageSource(rcFileReader, projectedReaderColumns);
        return Optional.of(new ReaderPageSource(pageSource, readerProjections));
    } catch (Throwable e) {
        try {
            dataSource.close();
        } catch (IOException ignored) {
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        String message = splitError(e, path, start, length);
        if (e instanceof RcFileCorruptionException) {
            throw new TrinoException(HIVE_BAD_DATA, message, e);
        }
        if (e instanceof BlockMissingException) {
            throw new TrinoException(HIVE_MISSING_DATA, message, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
Also used : FileNotFoundException(java.io.FileNotFoundException) BinaryRcFileEncoding(io.trino.rcfile.binary.BinaryRcFileEncoding) TextRcFileEncoding(io.trino.rcfile.text.TextRcFileEncoding) RcFileEncoding(io.trino.rcfile.RcFileEncoding) ConnectorPageSource(io.trino.spi.connector.ConnectorPageSource) RcFileCorruptionException(io.trino.rcfile.RcFileCorruptionException) EmptyPageSource(io.trino.spi.connector.EmptyPageSource) FileSystem(org.apache.hadoop.fs.FileSystem) LazyBinaryColumnarSerDe(org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) RcFileDataSourceId(io.trino.rcfile.RcFileDataSourceId) HiveTimestampPrecision(io.trino.plugin.hive.HiveTimestampPrecision) IOException(java.io.IOException) FSDataInputStreamTail(io.trino.plugin.hive.util.FSDataInputStreamTail) RcFileReader(io.trino.rcfile.RcFileReader) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) TrinoException(io.trino.spi.TrinoException) FileNotFoundException(java.io.FileNotFoundException) IOException(java.io.IOException) RcFileCorruptionException(io.trino.rcfile.RcFileCorruptionException) ImmutableMap(com.google.common.collect.ImmutableMap) Type(io.trino.spi.type.Type) MemoryRcFileDataSource(io.trino.rcfile.MemoryRcFileDataSource) HadoopCodecFactory(io.trino.rcfile.HadoopCodecFactory) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) TrinoException(io.trino.spi.TrinoException) ReaderPageSource(io.trino.plugin.hive.ReaderPageSource) ReaderColumns(io.trino.plugin.hive.ReaderColumns) BinaryRcFileEncoding(io.trino.rcfile.binary.BinaryRcFileEncoding) AircompressorCodecFactory(io.trino.rcfile.AircompressorCodecFactory) RcFileDataSource(io.trino.rcfile.RcFileDataSource) MemoryRcFileDataSource(io.trino.rcfile.MemoryRcFileDataSource)

Aggregations

ImmutableMap (com.google.common.collect.ImmutableMap)1 HiveColumnHandle (io.trino.plugin.hive.HiveColumnHandle)1 HiveTimestampPrecision (io.trino.plugin.hive.HiveTimestampPrecision)1 ReaderColumns (io.trino.plugin.hive.ReaderColumns)1 ReaderPageSource (io.trino.plugin.hive.ReaderPageSource)1 FSDataInputStreamTail (io.trino.plugin.hive.util.FSDataInputStreamTail)1 AircompressorCodecFactory (io.trino.rcfile.AircompressorCodecFactory)1 HadoopCodecFactory (io.trino.rcfile.HadoopCodecFactory)1 MemoryRcFileDataSource (io.trino.rcfile.MemoryRcFileDataSource)1 RcFileCorruptionException (io.trino.rcfile.RcFileCorruptionException)1 RcFileDataSource (io.trino.rcfile.RcFileDataSource)1 RcFileDataSourceId (io.trino.rcfile.RcFileDataSourceId)1 RcFileEncoding (io.trino.rcfile.RcFileEncoding)1 RcFileReader (io.trino.rcfile.RcFileReader)1 BinaryRcFileEncoding (io.trino.rcfile.binary.BinaryRcFileEncoding)1 TextRcFileEncoding (io.trino.rcfile.text.TextRcFileEncoding)1 TrinoException (io.trino.spi.TrinoException)1 ConnectorPageSource (io.trino.spi.connector.ConnectorPageSource)1 EmptyPageSource (io.trino.spi.connector.EmptyPageSource)1 Type (io.trino.spi.type.Type)1