
Example 1 with StripeInformation

use of org.apache.hadoop.hive.ql.io.orc.StripeInformation in project h2o-3 by h2oai.

the class OrcParserProvider method readSetup.

/**
   * This method creates the readers and other info needed to parse an ORC file.
   * It does not override the columnNames and columnTypes that the user may want
   * to force upon it.  However, at this point we only honor user requests to set
   * column types to enum, and ignore all other type requests.
   *
   * @param f the ORC file to parse, wrapped as a FileVec
   * @param columnNames column names to force, or null to keep the derived names
   * @param columnTypes column types to force; only Vec.T_CAT entries are honored
   * @return the derived ParseSetup, with the allowed user overrides applied
   */
public ParseSetup readSetup(FileVec f, String[] columnNames, byte[] columnTypes) {
    try {
        Reader orcFileReader = getReader(f);
        StructObjectInspector insp = (StructObjectInspector) orcFileReader.getObjectInspector();
        OrcParser.OrcParseSetup stp = OrcParser.deriveParseSetup(orcFileReader, insp);
        // change back the columnNames and columnTypes if they are specified already
        if (columnNames != null && stp.getAllColNames().length == columnNames.length) {
            // copy column name
            stp.setColumnNames(columnNames);
            stp.setAllColNames(columnNames);
        }
        if (columnTypes != null && columnTypes.length == stp.getColumnTypes().length) {
            // copy enum type only
            byte[] old_columnTypes = stp.getColumnTypes();
            String[] old_columnTypeNames = stp.getColumnTypesString();
            for (int index = 0; index < columnTypes.length; index++) {
                if (columnTypes[index] == Vec.T_CAT)  // only copy the enum types
                    old_columnTypes[index] = columnTypes[index];
            }
            stp.setColumnTypes(old_columnTypes);
            stp.setColumnTypeStrings(old_columnTypeNames);
        }
        List<StripeInformation> stripesInfo = orcFileReader.getStripes();
        if (stripesInfo.size() == 0) {
            // empty file
            f.setChunkSize(stp._chunk_size = (int) f.length());
            return stp;
        }
        f.setNChunks(stripesInfo.size());
        stp._chunk_size = f._chunkSize;
        // the ORC parser needs a one-to-one mapping between chunks and stripes (just ids, offsets do not matter)
        assert f.nChunks() == stripesInfo.size();
        return stp;
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }
}
Also used : Reader(org.apache.hadoop.hive.ql.io.orc.Reader) IOException(java.io.IOException) StripeInformation(org.apache.hadoop.hive.ql.io.orc.StripeInformation) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
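
For context, here is a minimal usage sketch of readSetup. It is not part of the h2o-3 sources above: the three-column layout, the fv handle, and the directly constructed provider are assumptions for illustration. It shows the enum-only override rule in action.

// Sketch only: fv is assumed to be a FileVec already backed by an ORC file,
// and OrcParserProvider is assumed to be directly constructible here.
OrcParserProvider provider = new OrcParserProvider();
String[] names = { "id", "label", "value" };
byte[] types = { Vec.T_NUM, Vec.T_CAT, Vec.T_NUM };
ParseSetup setup = provider.readSetup(fv, names, types);
// Only the T_CAT request survives; for the two columns requested as T_NUM,
// the types derived from the ORC schema are kept instead.
assert setup.getColumnTypes()[1] == Vec.T_CAT;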

Example 2 with StripeInformation

use of org.apache.hadoop.hive.ql.io.orc.StripeInformation in project h2o-3 by h2oai.

the class OrcTestUtils method compareFrameContents.

static int compareFrameContents(String fileName, Set<String> failedFiles, Frame h2oFrame, Reader orcReader, String[] colTypes, String[] colNames, boolean[] toInclude) {
    // get all stripe info
    List<StripeInformation> stripesInfo = orcReader.getStripes();
    int wrongTests = 0;
    if (stripesInfo.size() == 0) {
        // Orc file contains no data
        assertEquals("Orc file is empty.  H2O frame row number should be zero: ", 0, h2oFrame.numRows());
    } else {
        // row index into H2O frame
        Long startRowIndex = 0L;
        for (StripeInformation oneStripe : stripesInfo) {
            try {
                RecordReader perStripe = orcReader.rows(oneStripe.getOffset(), oneStripe.getDataLength(), toInclude, null, colNames);
                // read orc file stripes in vectorizedRowBatch
                VectorizedRowBatch batch = perStripe.nextBatch(null);
                boolean done = false;
                Long rowCounts = 0L;
                // row number of current stripe
                Long rowNumber = oneStripe.getNumberOfRows();
                while (!done) {
                    // row number of current batch
                    long currentBatchRow = batch.count();
                    ColumnVector[] dataVectors = batch.cols;
                    int colIndex = 0;
                    for (int cIdx = 0; cIdx < batch.numCols; cIdx++) {
                        // read one column at a time;
                        if (toInclude[cIdx + 1]) {
                            compare1Cloumn(dataVectors[cIdx], colTypes[colIndex].toLowerCase(), colIndex, currentBatchRow, h2oFrame.vec(colNames[colIndex]), startRowIndex);
                            colIndex++;
                        }
                    }
                    // record number of rows of data actually read
                    rowCounts = rowCounts + currentBatchRow;
                    startRowIndex = startRowIndex + currentBatchRow;
                    if (rowCounts >= rowNumber)  // read all rows of the stripe already
                        done = true;
                    if (!done)  // not done yet, get next batch
                        batch = perStripe.nextBatch(batch);
                }
                perStripe.close();
            } catch (Throwable e) {
                failedFiles.add(fileName);
                e.printStackTrace();
                wrongTests += 1;
            }
        }
    }
    return wrongTests;
}
Also used : VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) RecordReader(org.apache.hadoop.hive.ql.io.orc.RecordReader) StripeInformation(org.apache.hadoop.hive.ql.io.orc.StripeInformation) DecimalColumnVector(org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector) BytesColumnVector(org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector) LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector) ColumnVector(org.apache.hadoop.hive.ql.exec.vector.ColumnVector) DoubleColumnVector(org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector)
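
Stripped of the comparison logic, the stripe-iteration pattern above reduces to the skeleton below. This is a sketch against the same hive-exec ORC API the test uses; conf and path are assumed to point at an existing ORC file, and passing null for the include array is assumed to select all columns.

// Sketch: count rows stripe by stripe, mirroring the loop shape of
// compareFrameContents above.
static long countRowsByStripe(Configuration conf, Path path) throws IOException {
    Reader reader = OrcFile.createReader(FileSystem.get(conf), path);
    long total = 0;
    for (StripeInformation stripe : reader.getStripes()) {
        RecordReader rows = reader.rows(stripe.getOffset(), stripe.getDataLength(), null, null, null);
        VectorizedRowBatch batch = null;
        long seen = 0;
        // keep fetching batches until the stripe's declared row count is consumed
        while (seen < stripe.getNumberOfRows()) {
            batch = rows.nextBatch(batch);
            seen += batch.count();
        }
        rows.close();
        total += seen;
    }
    return total;
}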

Example 3 with StripeInformation

use of org.apache.hadoop.hive.ql.io.orc.StripeInformation in project h2o-3 by h2oai.

the class OrcParser method parseChunk.

/**
   * This method reads the ORC stripe assigned to each chunk.  Since only
   * single-threaded reading of each stripe is supported, one stripe is never
   * split across different chunks.
   *
   * @param chunkId chunk index, derived from file size / chunk size.  Because the
   *            file size covers the data plus header and other overhead, the number
   *            of chunks computed this way can be higher than the number actually
   *            needed; if the chunk index is too high, the method returns without
   *            writing to dout.
   * @param din ParseReader, not used for parsing ORC files
   * @param dout ParseWriter, used to add data to the H2O frame
   * @return the ParseWriter dout
   */
@Override
protected final ParseWriter parseChunk(int chunkId, ParseReader din, ParseWriter dout) {
    _cidx = chunkId;
    // only do something if within file size and the orc file is not empty
    List<StripeInformation> stripesInfo = ((OrcParseSetup) this._setup).getStripes();
    if (stripesInfo.size() == 0) {
        dout.addError(new ParseWriter.ParseErr("Orc Parser: Empty file.", chunkId, 0L, -2L));
        // empty file
        return dout;
    }
    OrcParseSetup setup = (OrcParseSetup) this._setup;
    // get one stripe
    StripeInformation thisStripe = stripesInfo.get(chunkId);
    // write one stripe of data to H2O frame
    String[] orcTypes = setup.getColumnTypesString();
    boolean[] toInclude = setup.getToInclude();
    try {
        RecordReader perStripe = orcFileReader.rows(thisStripe.getOffset(), thisStripe.getDataLength(), setup.getToInclude(), null, setup.getColumnNames());
        VectorizedRowBatch batch = null;
        long rows = 0;
        long rowCount = thisStripe.getNumberOfRows();
        while (rows != rowCount) {
            // read orc file stripes in vectorizedRowBatch
            batch = perStripe.nextBatch(batch);
            long currentBatchRow = batch.count();
            int nrows = (int) currentBatchRow;
            if (currentBatchRow != nrows)
                throw new IllegalArgumentException("got batch with too many records, does not fit in int");
            ColumnVector[] dataVectors = batch.cols;
            int colIndex = 0;
            for (int col = 0; col < batch.numCols; ++col) {
                // read one column at a time;
                if (toInclude[col + 1]) {
                    // only write a column if we actually want it
                    write1column(dataVectors[col], orcTypes[colIndex], colIndex, nrows, dout);
                    colIndex++;
                }
            }
            // record number of rows of data actually read
            rows += currentBatchRow;
        }
        perStripe.close();
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }
    return dout;
}
Also used : RecordReader(org.apache.hadoop.hive.ql.io.orc.RecordReader) IOException(java.io.IOException) StripeInformation(org.apache.hadoop.hive.ql.io.orc.StripeInformation)
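
One detail worth noting: the javadoc promises that an out-of-range chunkId returns without writing to dout, yet the snippet as shown would throw on stripesInfo.get(chunkId) instead. The intended guard presumably looks like the sketch below (our reconstruction, not verbatim h2o-3 code), placed just before the stripe lookup.

// Hypothetical guard: chunk ids beyond the stripe count come from the
// size-based chunk estimate (data plus header overhead) and carry no data,
// so they are skipped rather than parsed.
if (chunkId >= stripesInfo.size())
    return dout;
StripeInformation thisStripe = stripesInfo.get(chunkId);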

Aggregations

StripeInformation (org.apache.hadoop.hive.ql.io.orc.StripeInformation) 3
IOException (java.io.IOException) 2
RecordReader (org.apache.hadoop.hive.ql.io.orc.RecordReader) 2
BytesColumnVector (org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector) 1
ColumnVector (org.apache.hadoop.hive.ql.exec.vector.ColumnVector) 1
DecimalColumnVector (org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector) 1
DoubleColumnVector (org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector) 1
LongColumnVector (org.apache.hadoop.hive.ql.exec.vector.LongColumnVector) 1
VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) 1
Reader (org.apache.hadoop.hive.ql.io.orc.Reader) 1
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) 1