
Example 1 with RecordReader

use of org.apache.hadoop.hive.ql.io.orc.RecordReader in project h2o-3 by h2oai.

the class OrcTestUtils method compareFrameContents.

static int compareFrameContents(String fileName, Set<String> failedFiles, Frame h2oFrame, Reader orcReader, String[] colTypes, String[] colNames, boolean[] toInclude) {
    // get all stripe info
    List<StripeInformation> stripesInfo = orcReader.getStripes();
    int wrongTests = 0;
    if (stripesInfo.size() == 0) {
        // Orc file contains no data
        assertEquals("Orc file is empty.  H2O frame row number should be zero: ", 0, h2oFrame.numRows());
    } else {
        // row index into H2O frame
        Long startRowIndex = 0L;
        for (StripeInformation oneStripe : stripesInfo) {
            try {
                RecordReader perStripe = orcReader.rows(oneStripe.getOffset(), oneStripe.getDataLength(), toInclude, null, colNames);
                // read orc file stripes in vectorizedRowBatch
                VectorizedRowBatch batch = perStripe.nextBatch(null);
                boolean done = false;
                Long rowCounts = 0L;
                // row number of current stripe
                Long rowNumber = oneStripe.getNumberOfRows();
                while (!done) {
                    // row number of current batch
                    long currentBatchRow = batch.count();
                    ColumnVector[] dataVectors = batch.cols;
                    int colIndex = 0;
                    for (int cIdx = 0; cIdx < batch.numCols; cIdx++) {
                        // read one column at a time;
                        if (toInclude[cIdx + 1]) {
                            compare1Cloumn(dataVectors[cIdx], colTypes[colIndex].toLowerCase(), colIndex, currentBatchRow, h2oFrame.vec(colNames[colIndex]), startRowIndex);
                            colIndex++;
                        }
                    }
                    // record number of rows of data actually read
                    rowCounts = rowCounts + currentBatchRow;
                    startRowIndex = startRowIndex + currentBatchRow;
                    // stop once all rows of the stripe have been read
                    if (rowCounts >= rowNumber)
                        done = true;
                    // not done yet, get the next batch
                    if (!done)
                        batch = perStripe.nextBatch(batch);
                }
                perStripe.close();
            } catch (Throwable e) {
                failedFiles.add(fileName);
                e.printStackTrace();
                wrongTests += 1;
            }
        }
    }
    return wrongTests;
}
Also used: VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) RecordReader(org.apache.hadoop.hive.ql.io.orc.RecordReader) StripeInformation(org.apache.hadoop.hive.ql.io.orc.StripeInformation) DecimalColumnVector(org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector) BytesColumnVector(org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector) LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector) ColumnVector(org.apache.hadoop.hive.ql.exec.vector.ColumnVector) DoubleColumnVector(org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector)
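
The stripe-scoped rows(offset, length, ...) call above exists because the h2o-3 test compares one stripe at a time. For reference, a minimal sketch of the same hasNext()/nextBatch() loop applied to a whole file, assuming the pre-Hive-2 ORC API used in these examples and a hypothetical file name:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.ql.io.orc.RecordReader;

static long countRows(String fileName) throws java.io.IOException {
    Reader reader = OrcFile.createReader(new Path(fileName), OrcFile.readerOptions(new Configuration()));
    // rows() with no arguments scans every stripe and every column
    RecordReader rows = reader.rows();
    VectorizedRowBatch batch = null;
    long total = 0;
    while (rows.hasNext()) {
        // passing the previous batch back in lets the reader reuse its buffers
        batch = rows.nextBatch(batch);
        total += batch.count();
    }
    rows.close();
    return total;
}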

Example 2 with RecordReader

use of org.apache.hadoop.hive.ql.io.orc.RecordReader in project hive by apache.

the class TestStreaming method dumpBucket.

private ArrayList<SampleRec> dumpBucket(Path orcFile) throws IOException {
    org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.getLocal(new Configuration());
    Reader reader = OrcFile.createReader(orcFile, OrcFile.readerOptions(conf).filesystem(fs));
    RecordReader rows = reader.rows();
    StructObjectInspector inspector = (StructObjectInspector) reader.getObjectInspector();
    System.out.format("Found Bucket File : %s \n", orcFile.getName());
    ArrayList<SampleRec> result = new ArrayList<SampleRec>();
    while (rows.hasNext()) {
        Object row = rows.next(null);
        SampleRec rec = (SampleRec) deserializeDeltaFileRow(row, inspector)[5];
        result.add(rec);
    }
    rows.close();
    return result;
}
Also used: Configuration(org.apache.hadoop.conf.Configuration) FileSystem(org.apache.hadoop.fs.FileSystem) RecordReader(org.apache.hadoop.hive.ql.io.orc.RecordReader) ArrayList(java.util.ArrayList) Reader(org.apache.hadoop.hive.ql.io.orc.Reader) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
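
deserializeDeltaFileRow(...) is a helper defined elsewhere in TestStreaming and not shown here. As a rough sketch of that kind of decode step, the StructObjectInspector returned by the reader can walk the fields of each row object (the printing below is illustrative, not the test's actual logic):

import java.util.List;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;

static void printRow(Object row, StructObjectInspector inspector) {
    // each StructField describes one column of the row struct
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    for (StructField field : fields) {
        Object value = inspector.getStructFieldData(row, field);
        System.out.println(field.getFieldName() + " = " + value);
    }
}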

Example 3 with RecordReader

use of org.apache.hadoop.hive.ql.io.orc.RecordReader in project h2o-3 by h2oai.

the class OrcParser method parseChunk.

/**
   * This method calculates the number of stripes that will be read for each chunk.  Since
   * only single-threaded reading of each stripe is supported, we never split one stripe
   * across different chunks.
   *
   * @param chunkId chunk index, calculated as file size / chunk size.  Because the file size
   *            includes headers and other overhead in addition to the data, the number of
   *            chunks calculated may be higher than the number of chunks actually needed.
   *            If the chunk index is too high, the method returns without writing to dout.
   * @param din ParseReader, not used for parsing ORC files
   * @param dout ParseWriter, used to add data to the H2O frame
   * @return the ParseWriter dout
   */
@Override
protected final ParseWriter parseChunk(int chunkId, ParseReader din, ParseWriter dout) {
    _cidx = chunkId;
    // only do something if within file size and the orc file is not empty
    List<StripeInformation> stripesInfo = ((OrcParseSetup) this._setup).getStripes();
    if (stripesInfo.size() == 0) {
        dout.addError(new ParseWriter.ParseErr("Orc Parser: Empty file.", chunkId, 0L, -2L));
        // empty file
        return dout;
    }
    OrcParseSetup setup = (OrcParseSetup) this._setup;
    // get one stripe
    StripeInformation thisStripe = stripesInfo.get(chunkId);
    // write one stripe of data to H2O frame
    String[] orcTypes = setup.getColumnTypesString();
    boolean[] toInclude = setup.getToInclude();
    try {
        RecordReader perStripe = orcFileReader.rows(thisStripe.getOffset(), thisStripe.getDataLength(), setup.getToInclude(), null, setup.getColumnNames());
        VectorizedRowBatch batch = null;
        long rows = 0;
        long rowCount = thisStripe.getNumberOfRows();
        while (rows != rowCount) {
            // read orc file stripes in vectorizedRowBatch
            batch = perStripe.nextBatch(batch);
            long currentBatchRow = batch.count();
            int nrows = (int) currentBatchRow;
            if (currentBatchRow != nrows)
                throw new IllegalArgumentException("got batch with too many records, does not fit in int");
            ColumnVector[] dataVectors = batch.cols;
            int colIndex = 0;
            for (int col = 0; col < batch.numCols; ++col) {
                // read one column at a time;
                if (toInclude[col + 1]) {
                    // only write a column if we actually want it
                    write1column(dataVectors[col], orcTypes[colIndex], colIndex, nrows, dout);
                    colIndex++;
                }
            }
            // record number of rows of data actually read
            rows += currentBatchRow;
        }
        perStripe.close();
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }
    return dout;
}
Also used: RecordReader(org.apache.hadoop.hive.ql.io.orc.RecordReader) IOException(java.io.IOException) StripeInformation(org.apache.hadoop.hive.ql.io.orc.StripeInformation)
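
The toInclude[col + 1] offset seen in both h2o-3 examples follows the ORC include-array convention: index 0 stands for the root struct and indexes 1..n for its columns. A minimal sketch of building such an array (the column count and indexes are hypothetical):

// builds the include argument for rows(offset, length, include, ...):
// entry 0 is the root struct, entry c + 1 selects column c
static boolean[] buildInclude(int numColumns, int... wantedColumns) {
    boolean[] include = new boolean[numColumns + 1];
    // the root struct itself
    include[0] = true;
    for (int c : wantedColumns) {
        // shift past the root entry
        include[c + 1] = true;
    }
    return include;
}

// e.g. read only columns 0 and 2 of a five-column file:
// boolean[] toInclude = buildInclude(5, 0, 2);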

Example 4 with RecordReader

use of org.apache.hadoop.hive.ql.io.orc.RecordReader in project presto by prestodb.

the class OrcFileRewriter method rewrite.

public static OrcFileInfo rewrite(File input, File output, BitSet rowsToDelete) throws IOException {
    try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(FileSystem.class.getClassLoader());
        FileSystem fileSystem = new SyncingFileSystem(CONFIGURATION)) {
        Reader reader = createReader(fileSystem, path(input));
        if (reader.getNumberOfRows() < rowsToDelete.length()) {
            throw new IOException("File has fewer rows than deletion vector");
        }
        int deleteRowCount = rowsToDelete.cardinality();
        if (reader.getNumberOfRows() == deleteRowCount) {
            return new OrcFileInfo(0, 0);
        }
        if (reader.getNumberOfRows() >= Integer.MAX_VALUE) {
            throw new IOException("File has too many rows");
        }
        int inputRowCount = toIntExact(reader.getNumberOfRows());
        WriterOptions writerOptions = new OrcWriterOptions(CONFIGURATION)
                .memory(new NullMemoryManager(CONFIGURATION))
                .fileSystem(fileSystem)
                .compress(reader.getCompression())
                .inspector(reader.getObjectInspector());
        long start = System.nanoTime();
        try (Closer<RecordReader, IOException> recordReader = closer(reader.rows(), RecordReader::close);
            Closer<Writer, IOException> writer = closer(createWriter(path(output), writerOptions), Writer::close)) {
            if (reader.hasMetadataValue(OrcFileMetadata.KEY)) {
                ByteBuffer orcFileMetadata = reader.getMetadataValue(OrcFileMetadata.KEY);
                writer.get().addUserMetadata(OrcFileMetadata.KEY, orcFileMetadata);
            }
            OrcFileInfo fileInfo = rewrite(recordReader.get(), writer.get(), rowsToDelete, inputRowCount);
            log.debug("Rewrote file %s in %s (input rows: %s, output rows: %s)", input.getName(), nanosSince(start), inputRowCount, inputRowCount - deleteRowCount);
            return fileInfo;
        }
    }
}
Also used: RecordReader(org.apache.hadoop.hive.ql.io.orc.RecordReader) Reader(org.apache.hadoop.hive.ql.io.orc.Reader) OrcFile.createReader(org.apache.hadoop.hive.ql.io.orc.OrcFile.createReader) InterruptedIOException(java.io.InterruptedIOException) IOException(java.io.IOException) NullMemoryManager(org.apache.hadoop.hive.ql.io.orc.NullMemoryManager) ByteBuffer(java.nio.ByteBuffer) OrcWriterOptions(org.apache.hadoop.hive.ql.io.orc.OrcWriterOptions) SyncingFileSystem(com.facebook.presto.raptor.util.SyncingFileSystem) FileSystem(org.apache.hadoop.fs.FileSystem) WriterOptions(org.apache.hadoop.hive.ql.io.orc.OrcFile.WriterOptions) ThreadContextClassLoader(com.facebook.presto.spi.classloader.ThreadContextClassLoader) Writer(org.apache.hadoop.hive.ql.io.orc.Writer) OrcFile.createWriter(org.apache.hadoop.hive.ql.io.orc.OrcFile.createWriter)
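
Closer and the closer(...) factory are Presto-internal helpers (com.facebook.presto.raptor.util) that pair a resource with its close action so both reader and writer participate in try-with-resources. Their exact signatures are not shown on this page; an equivalent generic helper might look like this sketch:

final class Closer<C, E extends Exception> implements AutoCloseable {
    interface CloseAction<C, E extends Exception> {
        void close(C value) throws E;
    }

    private final C value;
    private final CloseAction<C, E> action;

    private Closer(C value, CloseAction<C, E> action) {
        this.value = value;
        this.action = action;
    }

    // mirrors the closer(reader.rows(), RecordReader::close) call style above
    static <C, E extends Exception> Closer<C, E> closer(C value, CloseAction<C, E> action) {
        return new Closer<>(value, action);
    }

    C get() {
        return value;
    }

    @Override
    public void close() throws E {
        action.close(value);
    }
}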

Aggregations

RecordReader (org.apache.hadoop.hive.ql.io.orc.RecordReader): 4
IOException (java.io.IOException): 2
FileSystem (org.apache.hadoop.fs.FileSystem): 2
Reader (org.apache.hadoop.hive.ql.io.orc.Reader): 2
StripeInformation (org.apache.hadoop.hive.ql.io.orc.StripeInformation): 2
SyncingFileSystem (com.facebook.presto.raptor.util.SyncingFileSystem): 1
ThreadContextClassLoader (com.facebook.presto.spi.classloader.ThreadContextClassLoader): 1
InterruptedIOException (java.io.InterruptedIOException): 1
ByteBuffer (java.nio.ByteBuffer): 1
ArrayList (java.util.ArrayList): 1
Configuration (org.apache.hadoop.conf.Configuration): 1
BytesColumnVector (org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector): 1
ColumnVector (org.apache.hadoop.hive.ql.exec.vector.ColumnVector): 1
DecimalColumnVector (org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector): 1
DoubleColumnVector (org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector): 1
LongColumnVector (org.apache.hadoop.hive.ql.exec.vector.LongColumnVector): 1
VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch): 1
NullMemoryManager (org.apache.hadoop.hive.ql.io.orc.NullMemoryManager): 1
WriterOptions (org.apache.hadoop.hive.ql.io.orc.OrcFile.WriterOptions): 1
OrcFile.createReader (org.apache.hadoop.hive.ql.io.orc.OrcFile.createReader): 1