
Example 1 with SeekableInputStream

Use of org.apache.parquet.io.SeekableInputStream in project hive by apache.

From the class VectorizedParquetRecordReader, method readSplitFooter:

private ParquetMetadata readSplitFooter(JobConf configuration, final Path file, Object cacheKey, MetadataFilter filter, String tag) throws IOException {
    MemoryBufferOrBuffers footerData = (cacheKey == null || metadataCache == null) ? null : metadataCache.getFileMetadata(cacheKey);
    if (footerData != null) {
        if (LOG.isInfoEnabled()) {
            LOG.info("Found the footer in cache for " + cacheKey);
        }
        try {
            return ParquetFileReader.readFooter(new ParquetFooterInputFromCache(footerData), filter);
        } finally {
            metadataCache.decRefBuffer(footerData);
        }
    }
    final FileSystem fs = file.getFileSystem(configuration);
    final FileStatus stat = fs.getFileStatus(file);
    if (cacheKey == null || metadataCache == null) {
        return readFooterFromFile(file, fs, stat, filter);
    }
    // Parquet calls protobuf methods directly on the stream and we can't get bytes after the fact.
    try (SeekableInputStream stream = HadoopStreams.wrap(fs.open(file))) {
        long footerLengthIndex = stat.getLen() - ParquetFooterInputFromCache.FOOTER_LENGTH_SIZE - ParquetFileWriter.MAGIC.length;
        stream.seek(footerLengthIndex);
        int footerLength = BytesUtils.readIntLittleEndian(stream);
        stream.seek(footerLengthIndex - footerLength);
        if (LOG.isInfoEnabled()) {
            LOG.info("Caching the footer of length " + footerLength + " for " + cacheKey);
        }
        footerData = metadataCache.putFileMetadata(cacheKey, footerLength, stream, tag);
        try {
            return ParquetFileReader.readFooter(new ParquetFooterInputFromCache(footerData), filter);
        } finally {
            metadataCache.decRefBuffer(footerData);
        }
    }
}
Also used: MemoryBufferOrBuffers (org.apache.hadoop.hive.common.io.encoded.MemoryBufferOrBuffers), FileStatus (org.apache.hadoop.fs.FileStatus), SeekableInputStream (org.apache.parquet.io.SeekableInputStream), FileSystem (org.apache.hadoop.fs.FileSystem)
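For reference, a minimal standalone sketch of the seek arithmetic used above (not part of the Hive code; the class and helper name are made up for illustration). A Parquet file ends with the footer bytes, a 4-byte little-endian footer length, and the 4-byte "PAR1" magic, so the length field starts at fileLen - 4 - MAGIC.length and the footer itself starts footerLength bytes earlier:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.bytes.BytesUtils;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.util.HadoopStreams;
import org.apache.parquet.io.SeekableInputStream;

public class FooterLocator {
    // Hypothetical helper: returns the byte offset at which the Parquet footer starts.
    static long findFooterStart(Configuration conf, Path file) throws IOException {
        FileSystem fs = file.getFileSystem(conf);
        long fileLen = fs.getFileStatus(file).getLen();
        try (SeekableInputStream stream = HadoopStreams.wrap(fs.open(file))) {
            // The 4-byte footer-length field sits immediately before the trailing "PAR1" magic.
            long footerLengthIndex = fileLen - 4 - ParquetFileWriter.MAGIC.length;
            stream.seek(footerLengthIndex);
            int footerLength = BytesUtils.readIntLittleEndian(stream);
            // The footer ends where the length field begins.
            return footerLengthIndex - footerLength;
        }
    }
}

The Hive method above performs the same computation but, instead of returning an offset, copies the footer bytes into the LLAP metadata cache via metadataCache.putFileMetadata and parses them from there.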

Example 2 with SeekableInputStream

Use of org.apache.parquet.io.SeekableInputStream in project drill by apache.

From the class DirectBufInputStream, method read:

public synchronized int read(DrillBuf buf, int off, int len) throws IOException {
    buf.clear();
    ByteBuffer directBuffer = buf.nioBuffer(0, len);
    int lengthLeftToRead = len;
    SeekableInputStream seekableInputStream = HadoopStreams.wrap(getInputStream());
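    // read(ByteBuffer) may return fewer bytes than requested, so keep looping until all 'len' bytes have landed in the direct buffer.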
    while (lengthLeftToRead > 0) {
        if (logger.isTraceEnabled()) {
            logger.trace("PERF: Disk read start. {}, StartOffset: {}, TotalByteSize: {}", this.streamId, this.startOffset, this.totalByteSize);
        }
        Stopwatch timer = Stopwatch.createStarted();
        int bytesRead = seekableInputStream.read(directBuffer);
        if (bytesRead < 0) {
            return bytesRead;
        }
        lengthLeftToRead -= bytesRead;
        if (logger.isTraceEnabled()) {
            logger.trace("PERF: Disk read complete. {}, StartOffset: {}, TotalByteSize: {}, BytesRead: {}, Time: {} ms", this.streamId, this.startOffset, this.totalByteSize, bytesRead, ((double) timer.elapsed(TimeUnit.MICROSECONDS)) / 1000);
        }
    }
    buf.writerIndex(len);
    return len;
}
Also used: SeekableInputStream (org.apache.parquet.io.SeekableInputStream), Stopwatch (org.apache.drill.shaded.guava.com.google.common.base.Stopwatch), ByteBuffer (java.nio.ByteBuffer)
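A minimal sketch of the same idea outside Drill (the Configuration, Path, offset, and length parameters are assumptions for illustration): wrap the Hadoop stream with HadoopStreams.wrap and let SeekableInputStream.readFully drive the ByteBuffer fill instead of looping by hand:

import java.io.IOException;
import java.nio.ByteBuffer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.util.HadoopStreams;
import org.apache.parquet.io.SeekableInputStream;

public class ByteBufferRead {
    // Hypothetical helper: reads 'len' bytes starting at 'offset' of 'file' into a direct buffer.
    static ByteBuffer readRange(Configuration conf, Path file, long offset, int len) throws IOException {
        ByteBuffer buffer = ByteBuffer.allocateDirect(len);
        FileSystem fs = file.getFileSystem(conf);
        try (SeekableInputStream stream = HadoopStreams.wrap(fs.open(file))) {
            stream.seek(offset);
            // readFully keeps reading until the buffer has no remaining space,
            // which is what the manual while loop in the Drill method does by hand.
            stream.readFully(buffer);
        }
        buffer.flip();
        return buffer;
    }
}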

Example 3 with SeekableInputStream

Use of org.apache.parquet.io.SeekableInputStream in project hive by apache.

From the class VectorizedParquetRecordReader, method readSplitFooter (a variant that takes a CacheTag instead of a String tag):

private ParquetMetadata readSplitFooter(JobConf configuration, final Path file, Object cacheKey, MetadataFilter filter, CacheTag tag) throws IOException {
    MemoryBufferOrBuffers footerData = (cacheKey == null || metadataCache == null) ? null : metadataCache.getFileMetadata(cacheKey);
    if (footerData != null) {
        LOG.info("Found the footer in cache for " + cacheKey);
        try {
            return ParquetFileReader.readFooter(new ParquetFooterInputFromCache(footerData), filter);
        } finally {
            metadataCache.decRefBuffer(footerData);
        }
    } else {
        throwIfCacheOnlyRead(isReadCacheOnly);
    }
    final FileSystem fs = file.getFileSystem(configuration);
    final FileStatus stat = fs.getFileStatus(file);
    if (cacheKey == null || metadataCache == null) {
        return readFooterFromFile(file, fs, stat, filter);
    }
    // Parquet calls protobuf methods directly on the stream and we can't get bytes after the fact.
    try (SeekableInputStream stream = HadoopStreams.wrap(fs.open(file))) {
        long footerLengthIndex = stat.getLen() - ParquetFooterInputFromCache.FOOTER_LENGTH_SIZE - ParquetFileWriter.MAGIC.length;
        stream.seek(footerLengthIndex);
        int footerLength = BytesUtils.readIntLittleEndian(stream);
        stream.seek(footerLengthIndex - footerLength);
        LOG.info("Caching the footer of length " + footerLength + " for " + cacheKey);
        // Note: we don't pass in isStopped here - this is not on an IO thread.
        footerData = metadataCache.putFileMetadata(cacheKey, footerLength, stream, tag, null);
        try {
            return ParquetFileReader.readFooter(new ParquetFooterInputFromCache(footerData), filter);
        } finally {
            metadataCache.decRefBuffer(footerData);
        }
    }
}
Also used: MemoryBufferOrBuffers (org.apache.hadoop.hive.common.io.encoded.MemoryBufferOrBuffers), FileStatus (org.apache.hadoop.fs.FileStatus), SeekableInputStream (org.apache.parquet.io.SeekableInputStream), FileSystem (org.apache.hadoop.fs.FileSystem)
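When the footer is not cached, both Hive examples fall back to readFooterFromFile (not shown here). A minimal sketch of what an uncached footer read can look like with stock parquet-hadoop, assuming only a Configuration and a Path (this is not the Hive implementation):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class PlainFooterRead {
    // Hypothetical helper: reads the footer without any cache; ParquetFileReader handles the seeks internally.
    static ParquetMetadata readFooterPlain(Configuration conf, Path file) throws IOException {
        try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(file, conf))) {
            return reader.getFooter();
        }
    }
}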

Aggregations

SeekableInputStream (org.apache.parquet.io.SeekableInputStream): 3
FileStatus (org.apache.hadoop.fs.FileStatus): 2
FileSystem (org.apache.hadoop.fs.FileSystem): 2
MemoryBufferOrBuffers (org.apache.hadoop.hive.common.io.encoded.MemoryBufferOrBuffers): 2
ByteBuffer (java.nio.ByteBuffer): 1
Stopwatch (org.apache.drill.shaded.guava.com.google.common.base.Stopwatch): 1