
Example 1 with FSDataInputStreamTail

Use of io.trino.plugin.hive.util.FSDataInputStreamTail in project trino by trinodb.

The class RcFilePageSourceFactory, method createPageSource:

@Override
public Optional<ReaderPageSource> createPageSource(
        Configuration configuration,
        ConnectorSession session,
        Path path,
        long start,
        long length,
        long estimatedFileSize,
        Properties schema,
        List<HiveColumnHandle> columns,
        TupleDomain<HiveColumnHandle> effectivePredicate,
        Optional<AcidInfo> acidInfo,
        OptionalInt bucketNumber,
        boolean originalFile,
        AcidTransaction transaction) {
    RcFileEncoding rcFileEncoding;
    String deserializerClassName = getDeserializerClassName(schema);
    if (deserializerClassName.equals(LazyBinaryColumnarSerDe.class.getName())) {
        rcFileEncoding = new BinaryRcFileEncoding(timeZone);
    } else if (deserializerClassName.equals(ColumnarSerDe.class.getName())) {
        rcFileEncoding = createTextVectorEncoding(schema);
    } else {
        return Optional.empty();
    }
    checkArgument(acidInfo.isEmpty(), "Acid is not supported");
    List<HiveColumnHandle> projectedReaderColumns = columns;
    Optional<ReaderColumns> readerProjections = projectBaseColumns(columns);
    if (readerProjections.isPresent()) {
        projectedReaderColumns = readerProjections.get().get().stream()
                .map(HiveColumnHandle.class::cast)
                .collect(toImmutableList());
    }
    RcFileDataSource dataSource;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, configuration);
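        // doAs runs the filesystem call under the session user's identity (HDFS impersonation)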
        FSDataInputStream inputStream = hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.open(path));
        if (estimatedFileSize < BUFFER_SIZE.toBytes()) {
            // Handle potentially imprecise file lengths by reading the footer
            try {
                FSDataInputStreamTail fileTail = FSDataInputStreamTail.readTail(
                        path.toString(), estimatedFileSize, inputStream, toIntExact(BUFFER_SIZE.toBytes()));
                dataSource = new MemoryRcFileDataSource(new RcFileDataSourceId(path.toString()), fileTail.getTailSlice());
            } finally {
                inputStream.close();
            }
        } else {
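        // Large file: fetch the authoritative length from the file status and stream the data lazily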
            long fileSize = hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.getFileStatus(path).getLen());
            dataSource = new HdfsRcFileDataSource(path.toString(), inputStream, fileSize, stats);
        }
    } catch (Exception e) {
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
    }
    length = min(dataSource.getSize() - start, length);
    // Split may be empty now that the correct file size is known
    if (length <= 0) {
        return Optional.of(noProjectionAdaptation(new EmptyPageSource()));
    }
    try {
        ImmutableMap.Builder<Integer, Type> readColumns = ImmutableMap.builder();
        HiveTimestampPrecision timestampPrecision = getTimestampPrecision(session);
        for (HiveColumnHandle column : projectedReaderColumns) {
            readColumns.put(column.getBaseHiveColumnIndex(), column.getHiveType().getType(typeManager, timestampPrecision));
        }
        RcFileReader rcFileReader = new RcFileReader(
                dataSource,
                rcFileEncoding,
                readColumns.buildOrThrow(),
                new AircompressorCodecFactory(new HadoopCodecFactory(configuration.getClassLoader())),
                start,
                length,
                BUFFER_SIZE);
        ConnectorPageSource pageSource = new RcFilePageSource(rcFileReader, projectedReaderColumns);
        return Optional.of(new ReaderPageSource(pageSource, readerProjections));
    } catch (Throwable e) {
        try {
            dataSource.close();
        } catch (IOException ignored) {
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        String message = splitError(e, path, start, length);
        if (e instanceof RcFileCorruptionException) {
            throw new TrinoException(HIVE_BAD_DATA, message, e);
        }
        if (e instanceof BlockMissingException) {
            throw new TrinoException(HIVE_MISSING_DATA, message, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
Also used : FileNotFoundException (java.io.FileNotFoundException), BinaryRcFileEncoding (io.trino.rcfile.binary.BinaryRcFileEncoding), TextRcFileEncoding (io.trino.rcfile.text.TextRcFileEncoding), RcFileEncoding (io.trino.rcfile.RcFileEncoding), ConnectorPageSource (io.trino.spi.connector.ConnectorPageSource), RcFileCorruptionException (io.trino.rcfile.RcFileCorruptionException), EmptyPageSource (io.trino.spi.connector.EmptyPageSource), FileSystem (org.apache.hadoop.fs.FileSystem), LazyBinaryColumnarSerDe (org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe), BlockMissingException (org.apache.hadoop.hdfs.BlockMissingException), HiveColumnHandle (io.trino.plugin.hive.HiveColumnHandle), RcFileDataSourceId (io.trino.rcfile.RcFileDataSourceId), HiveTimestampPrecision (io.trino.plugin.hive.HiveTimestampPrecision), IOException (java.io.IOException), FSDataInputStreamTail (io.trino.plugin.hive.util.FSDataInputStreamTail), RcFileReader (io.trino.rcfile.RcFileReader), TrinoException (io.trino.spi.TrinoException), ImmutableMap (com.google.common.collect.ImmutableMap), Type (io.trino.spi.type.Type), MemoryRcFileDataSource (io.trino.rcfile.MemoryRcFileDataSource), HadoopCodecFactory (io.trino.rcfile.HadoopCodecFactory), FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream), ReaderPageSource (io.trino.plugin.hive.ReaderPageSource), ReaderColumns (io.trino.plugin.hive.ReaderColumns), AircompressorCodecFactory (io.trino.rcfile.AircompressorCodecFactory), RcFileDataSource (io.trino.rcfile.RcFileDataSource)
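
The interesting branch here is the small-file path: when the estimated size fits within the read buffer, the whole tail is read once into memory, which also guards against imprecise file lengths reported by the filesystem. A minimal sketch of that branch in isolation, assuming a hypothetical helper class and an 8MB buffer constant (the plugin defines its own BUFFER_SIZE; the value here is an assumption):

import static java.lang.Math.toIntExact;

import io.airlift.units.DataSize;
import io.trino.plugin.hive.util.FSDataInputStreamTail;
import io.trino.rcfile.MemoryRcFileDataSource;
import io.trino.rcfile.RcFileDataSource;
import io.trino.rcfile.RcFileDataSourceId;
import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

final class TailReadSketch {
    // Hypothetical buffer size for illustration; not the plugin's actual constant
    private static final DataSize BUFFER_SIZE = DataSize.of(8, DataSize.Unit.MEGABYTE);

    static RcFileDataSource openSmallFile(FileSystem fileSystem, Path path, long estimatedFileSize)
            throws IOException {
        FSDataInputStream inputStream = fileSystem.open(path);
        try {
            // readTail re-checks the real length, so an imprecise
            // estimatedFileSize cannot cause a short read of the footer
            FSDataInputStreamTail tail = FSDataInputStreamTail.readTail(
                    path.toString(), estimatedFileSize, inputStream, toIntExact(BUFFER_SIZE.toBytes()));
            return new MemoryRcFileDataSource(new RcFileDataSourceId(path.toString()), tail.getTailSlice());
        } finally {
            // the tail is fully buffered in memory, so the stream can be closed immediately
            inputStream.close();
        }
    }
}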

Example 2 with FSDataInputStreamTail

Use of io.trino.plugin.hive.util.FSDataInputStreamTail in project trino by trinodb.

The class HdfsOrcDataSource, method readTailInternal:

@Override
protected Slice readTailInternal(int length) throws IOException {
    // Handle potentially imprecise file lengths by reading the footer
    long readStart = System.nanoTime();
    FSDataInputStreamTail fileTail = FSDataInputStreamTail.readTail(getId().toString(), getEstimatedSize(), inputStream, length);
    Slice tailSlice = fileTail.getTailSlice();
    stats.readDataBytesPerSecond(tailSlice.length(), System.nanoTime() - readStart);
    return tailSlice;
}
Also used : Slice (io.airlift.slice.Slice), FSDataInputStreamTail (io.trino.plugin.hive.util.FSDataInputStreamTail)
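
This method is mostly a thin instrumentation wrapper: it times the tail read with System.nanoTime() and reports the byte count against the elapsed nanoseconds. A hedged sketch of that pattern, with ReadOp and Stats as hypothetical stand-ins for the real collaborators (the stats call matches the signature used above):

import io.airlift.slice.Slice;
import java.io.IOException;

final class TimedReadSketch {
    // Hypothetical stand-ins for the real collaborators in HdfsOrcDataSource
    interface ReadOp { Slice read() throws IOException; }
    interface Stats { void readDataBytesPerSecond(long bytes, long elapsedNanos); }

    static Slice timedRead(ReadOp op, Stats stats) throws IOException {
        long readStart = System.nanoTime();
        Slice result = op.read(); // the actual I/O, e.g. FSDataInputStreamTail.readTail(...)
        // report throughput as (bytes read, elapsed nanos), matching the call above
        stats.readDataBytesPerSecond(result.length(), System.nanoTime() - readStart);
        return result;
    }
}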

Example 3 with FSDataInputStreamTail

Use of io.trino.plugin.hive.util.FSDataInputStreamTail in project trino by trinodb.

The class HdfsParquetDataSource, method readTail:

@Override
public Slice readTail(int length) {
    long start = System.nanoTime();
    Slice tailSlice;
    try {
        // Handle potentially imprecise file lengths by reading the footer
        FSDataInputStreamTail fileTail = FSDataInputStreamTail.readTail(getId().toString(), getEstimatedSize(), inputStream, length);
        tailSlice = fileTail.getTailSlice();
    } catch (IOException e) {
        throw new TrinoException(HIVE_FILESYSTEM_ERROR, format("Error reading tail from %s with length %s", id, length), e);
    }
    long currentReadTimeNanos = System.nanoTime() - start;
    readTimeNanos += currentReadTimeNanos;
    readBytes += tailSlice.length();
    return tailSlice;
}
Also used : Slice (io.airlift.slice.Slice), TrinoException (io.trino.spi.TrinoException), IOException (java.io.IOException), FSDataInputStreamTail (io.trino.plugin.hive.util.FSDataInputStreamTail)
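
Unlike the ORC variant, readTail declares no checked exception, so the IOException is translated into a TrinoException carrying an error code and a message naming the file and requested length. A minimal sketch of that translation, assuming HIVE_FILESYSTEM_ERROR lives in io.trino.plugin.hive.HiveErrorCode (the import location is an assumption here):

import static io.trino.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR; // assumed location
import static java.lang.String.format;

import io.airlift.slice.Slice;
import io.trino.spi.TrinoException;
import java.io.IOException;

final class TailErrorSketch {
    // Hypothetical stand-in for the data source's tail-reading internals
    interface TailReader { Slice readTail(int length) throws IOException; }

    // Wrap the checked IOException in an unchecked TrinoException so callers
    // up the ConnectorPageSource stack see a classified Trino error
    static Slice readTailOrFail(TailReader reader, String id, int length) {
        try {
            return reader.readTail(length);
        } catch (IOException e) {
            throw new TrinoException(HIVE_FILESYSTEM_ERROR,
                    format("Error reading tail from %s with length %s", id, length), e);
        }
    }
}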

Aggregations

FSDataInputStreamTail (io.trino.plugin.hive.util.FSDataInputStreamTail) 3
Slice (io.airlift.slice.Slice) 2
TrinoException (io.trino.spi.TrinoException) 2
IOException (java.io.IOException) 2
ImmutableMap (com.google.common.collect.ImmutableMap) 1
HiveColumnHandle (io.trino.plugin.hive.HiveColumnHandle) 1
HiveTimestampPrecision (io.trino.plugin.hive.HiveTimestampPrecision) 1
ReaderColumns (io.trino.plugin.hive.ReaderColumns) 1
ReaderPageSource (io.trino.plugin.hive.ReaderPageSource) 1
AircompressorCodecFactory (io.trino.rcfile.AircompressorCodecFactory) 1
HadoopCodecFactory (io.trino.rcfile.HadoopCodecFactory) 1
MemoryRcFileDataSource (io.trino.rcfile.MemoryRcFileDataSource) 1
RcFileCorruptionException (io.trino.rcfile.RcFileCorruptionException) 1
RcFileDataSource (io.trino.rcfile.RcFileDataSource) 1
RcFileDataSourceId (io.trino.rcfile.RcFileDataSourceId) 1
RcFileEncoding (io.trino.rcfile.RcFileEncoding) 1
RcFileReader (io.trino.rcfile.RcFileReader) 1
BinaryRcFileEncoding (io.trino.rcfile.binary.BinaryRcFileEncoding) 1
TextRcFileEncoding (io.trino.rcfile.text.TextRcFileEncoding) 1
ConnectorPageSource (io.trino.spi.connector.ConnectorPageSource) 1