Search in sources :

Example 31 with PageReadStore

Example use of org.apache.parquet.column.page.PageReadStore in the project parquet-mr by Apache.

The checkRead method of the class InternalParquetRecordReader.

/**
 * Ensures a row group is loaded before the next record is assembled.
 *
 * <p>When the reader has consumed every record loaded so far ({@code current ==
 * totalCountLoadedSoFar}), this logs throughput statistics for the block just
 * finished, reads the next (filtered) row group from the file, rebuilds the
 * record assembly ({@code recordReader}) for it, and advances the block
 * counters. Otherwise it is a no-op.
 *
 * @throws IOException if no further row group is available even though fewer
 *     than {@code total} records have been read, i.e. the file ended early
 */
private void checkRead() throws IOException {
    if (current == totalCountLoadedSoFar) {
        if (current != 0) {
            // Close out the timing window for the block we just finished assembling.
            totalTimeSpentProcessingRecords += (System.currentTimeMillis() - startedAssemblingCurrentBlockAt);
            if (LOG.isInfoEnabled()) {
                // Parameterized logging keeps message construction out of the hot path
                // and matches the SLF4J style used elsewhere in this method.
                LOG.info("Assembled and processed {} records from {} columns in {} ms: {} rec/ms, {} cell/ms",
                    totalCountLoadedSoFar, columnCount, totalTimeSpentProcessingRecords,
                    ((float) totalCountLoadedSoFar / totalTimeSpentProcessingRecords),
                    ((float) totalCountLoadedSoFar * columnCount / totalTimeSpentProcessingRecords));
                final long totalTime = totalTimeSpentProcessingRecords + totalTimeSpentReadingBytes;
                // Guard against division by zero when both timers are still 0 ms.
                if (totalTime != 0) {
                    final long percentReading = 100 * totalTimeSpentReadingBytes / totalTime;
                    final long percentProcessing = 100 * totalTimeSpentProcessingRecords / totalTime;
                    LOG.info("time spent so far {}% reading ({} ms) and {}% processing ({} ms)",
                        percentReading, totalTimeSpentReadingBytes, percentProcessing, totalTimeSpentProcessingRecords);
                }
            }
        }
        LOG.info("at row {}. reading next block", current);
        long t0 = System.currentTimeMillis();
        PageReadStore pages = reader.readNextFilteredRowGroup();
        if (pages == null) {
            // More records were promised by the footer metadata than the file delivered.
            throw new IOException("expecting more rows but reached last block. Read " + current + " out of " + total);
        }
        long timeSpentReading = System.currentTimeMillis() - t0;
        totalTimeSpentReadingBytes += timeSpentReading;
        BenchmarkCounter.incrementTime(timeSpentReading);
        if (LOG.isInfoEnabled()) {
            LOG.info("block read in memory in {} ms. row count = {}", timeSpentReading, pages.getRowCount());
        }
        LOG.debug("initializing Record assembly with requested schema {}", requestedSchema);
        // Rebuild the record reader for the newly loaded row group; filtering is
        // disabled at record level when filterRecords is false.
        MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking);
        recordReader = columnIO.getRecordReader(pages, recordConverter, filterRecords ? filter : FilterCompat.NOOP);
        // Open the timing window for assembling the new block.
        startedAssemblingCurrentBlockAt = System.currentTimeMillis();
        totalCountLoadedSoFar += pages.getRowCount();
        ++currentBlock;
    }
}
Also used : PageReadStore(org.apache.parquet.column.page.PageReadStore) IOException(java.io.IOException) MessageColumnIO(org.apache.parquet.io.MessageColumnIO)

Aggregations

PageReadStore (org.apache.parquet.column.page.PageReadStore)31 Configuration (org.apache.hadoop.conf.Configuration)22 Path (org.apache.hadoop.fs.Path)22 IOException (java.io.IOException)14 MessageType (org.apache.parquet.schema.MessageType)14 Test (org.junit.Test)13 ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader)12 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)10 MessageColumnIO (org.apache.parquet.io.MessageColumnIO)8 SimpleGroup (org.apache.parquet.example.data.simple.SimpleGroup)7 GroupRecordConverter (org.apache.parquet.example.data.simple.convert.GroupRecordConverter)7 ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData)7 ColumnIOFactory (org.apache.parquet.io.ColumnIOFactory)7 RecordReader (org.apache.parquet.io.RecordReader)7 DataPageV1 (org.apache.parquet.column.page.DataPageV1)6 Encoding (org.apache.parquet.column.Encoding)5 HadoopInputFile (org.apache.parquet.hadoop.util.HadoopInputFile)5 File (java.io.File)4 List (java.util.List)4 Vector (org.apache.ignite.ml.math.primitives.vector.Vector)4