Search in sources :

Example 1 with StripeInformation

Use of StripeInformation in project h2o-3 by h2oai.

the class OrcParserProvider method readSetup.

  /**
   * This method creates the reader and the other information needed to parse an ORC file.
   * In addition, it will not override the columnNames and columnTypes that the user
   * may want to force upon it.  However, we only allow users to set column types to
   * enum at this point and ignore all the other requests.
   * @param f
   * @param columnNames
   * @param columnTypes
   * @return
   */
// NOTE(review): this listing appears to have lost lines during extraction (the closing
// braces and the body of the column-name branch are not present). The comments below
// describe only the statements that are still visible.
public ParseSetup readSetup(FileVec f, String[] columnNames, byte[] columnTypes) {
    try {
        // Obtain an ORC reader for the file; getReader is defined outside this excerpt.
        Reader orcFileReader = getReader(f);
        StructObjectInspector insp = (StructObjectInspector) orcFileReader.getObjectInspector();
        // Derive the initial parse setup from the reader and its struct inspector.
        OrcParser.OrcParseSetup stp = OrcParser.deriveParseSetup(orcFileReader, insp);
        // Restore the caller-supplied columnNames and columnTypes if they were specified,
        // but only when their lengths match what was derived from the file.
        if (!(columnNames == null) && (stp.getAllColNames().length == columnNames.length)) {
            // copy column name
            // NOTE(review): no copy statement is visible in this branch -- presumably dropped
            // by the extraction; confirm against the project sources.
        if (!(columnTypes == null) && (columnTypes.length == stp.getColumnTypes().length)) {
            // copy enum type only
            byte[] old_columnTypes = stp.getColumnTypes();
            // NOTE(review): old_columnTypeNames is not read anywhere in the visible lines.
            String[] old_columnTypeNames = stp.getColumnTypesString();
            for (int index = 0; index < columnTypes.length; index++) {
                if (// only copy the enum types
                columnTypes[index] == Vec.T_CAT)
                    // Writes into the array returned by stp.getColumnTypes(); this only affects
                    // the setup if that getter returns its backing array -- TODO confirm.
                    old_columnTypes[index] = columnTypes[index];
        List<StripeInformation> stripesInfo = orcFileReader.getStripes();
        if (stripesInfo.size() == 0) {
            // empty file
            // No stripes: set both the setup's chunk size and the file's chunk size to the
            // file length (narrowed to int) and return early.
            f.setChunkSize(stp._chunk_size = (int) f.length());
            return stp;
        // Otherwise the setup takes over the chunk size already stored on the file.
        stp._chunk_size = f._chunkSize;
        // ORC parser needs a one-to-one mapping between chunk and stripe (just ids, offsets do not matter)
        // This is only checked when assertions are enabled (-ea).
        assert f.nChunks() == stripesInfo.size();
        return stp;
    } catch (IOException ioe) {
        // Rethrow as unchecked, keeping the original exception as the cause.
        throw new RuntimeException(ioe);
Also used: Reader, IOException, StripeInformation, StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)

Example 2 with StripeInformation

Use of StripeInformation in project h2o-3 by h2oai.

the class OrcTestUtils method compareFrameContents.

// Compares the contents of an H2O frame against an ORC file, stripe by stripe, and returns
// the number of stripes whose comparison threw (see the catch block below).
// fileName and failedFiles are not referenced in the visible lines.
// NOTE(review): this listing appears to have lost lines during extraction (closing braces,
// and at least one condition); comments describe only what is still visible.
static int compareFrameContents(String fileName, Set<String> failedFiles, Frame h2oFrame, Reader orcReader, String[] colTypes, String[] colNames, boolean[] toInclude) {
    // get all stripe info
    List<StripeInformation> stripesInfo = orcReader.getStripes();
    // counts stripes for which the comparison threw
    int wrongTests = 0;
    if (stripesInfo.size() == 0) {
        // Orc file contains no data
        // This assertion is outside the try block below, so a failure here propagates to the caller.
        assertEquals("Orc file is empty.  H2O frame row number should be zero: ", 0, h2oFrame.numRows());
    } else {
        // row index into H2O frame; carried across stripes (boxed Long, so each update re-boxes)
        Long startRowIndex = 0L;
        for (StripeInformation oneStripe : stripesInfo) {
            try {
                // Open a record reader restricted to this stripe's offset and data length.
                RecordReader perStripe = orcReader.rows(oneStripe.getOffset(), oneStripe.getDataLength(), toInclude, null, colNames);
                // read orc file stripes in vectorizedRowBatch
                VectorizedRowBatch batch = perStripe.nextBatch(null);
                boolean done = false;
                // rows of this stripe compared so far
                Long rowCounts = 0L;
                // row number of current stripe
                Long rowNumber = oneStripe.getNumberOfRows();
                while (!done) {
                    // row number of current batch
                    long currentBatchRow = batch.count();
                    ColumnVector[] dataVectors = batch.cols;
                    // NOTE(review): colIndex is never incremented in the visible lines --
                    // presumably lost in extraction; confirm against the project sources.
                    int colIndex = 0;
                    for (int cIdx = 0; cIdx < batch.numCols; cIdx++) {
                        // read one column at a time;
                        // toInclude is indexed one past the batch column index
                        // (presumably entry 0 describes the enclosing struct -- TODO confirm).
                        if (toInclude[cIdx + 1]) {
                            compare1Cloumn(dataVectors[cIdx], colTypes[colIndex].toLowerCase(), colIndex, currentBatchRow, h2oFrame.vec(colNames[colIndex]), startRowIndex);
                    // record number of rows of data actually read
                    rowCounts = rowCounts + currentBatchRow;
                    startRowIndex = startRowIndex + currentBatchRow;
                    if (// read all rows of the stripe already.
                    rowCounts >= rowNumber)
                        done = true;
                    // NOTE(review): the condition of the following if is missing from this
                    // listing; only its comment and body survived.
                    if (// not done yet, get next batch
                        batch = perStripe.nextBatch(batch);
            } catch (Throwable e) {
                // Anything thrown while comparing this stripe is counted rather than rethrown;
                // the exception itself is not recorded in the visible lines.
                wrongTests += 1;
    return wrongTests;
Also used: VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch), RecordReader, StripeInformation, DecimalColumnVector (org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector), BytesColumnVector (org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector), LongColumnVector (org.apache.hadoop.hive.ql.exec.vector.LongColumnVector), ColumnVector (org.apache.hadoop.hive.ql.exec.vector.ColumnVector), DoubleColumnVector (org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector)

Example 3 with StripeInformation

Use of StripeInformation in project h2o-3 by h2oai.

the class OrcParser method parseChunk.

  /**
   * This method calculates the number of stripes that will be read for each chunk.  Since
   * only single threading is supported in reading each stripe, we will never split one stripe
   * over different chunks.
   * @param chunkId chunk index, calculated as file size / chunk size.  Because the file size
   *            includes data plus overhead in terms of headers and other info, the number of
   *            chunks calculated will be higher than the actual chunks needed.  If the chunk
   *            number is too high, the method will return without writing to
   *            dout.
   * @param din ParseReader, not used for parsing ORC files
   * @param dout ParseWriter, used to add data to the H2O frame.
   * @return ParseWriter dout.
   */
// Reads the stripe whose index equals chunkId and writes its included columns to dout.
// din is not referenced in the visible lines.
// NOTE(review): this listing appears to have lost lines during extraction (closing braces at
// least); comments describe only the statements that are still visible.
protected final ParseWriter parseChunk(int chunkId, ParseReader din, ParseWriter dout) {
    // remember which chunk this parser instance is working on
    _cidx = chunkId;
    // only do something if within file size and the orc file is not empty
    List<StripeInformation> stripesInfo = ((OrcParseSetup) this._setup).getStripes();
    if (stripesInfo.size() == 0) {
        // Record the empty file as a parse error on the writer instead of throwing.
        dout.addError(new ParseWriter.ParseErr("Orc Parser: Empty file.", chunkId, 0L, -2L));
        // empty file
        return dout;
    OrcParseSetup setup = (OrcParseSetup) this._setup;
    // get one stripe
    // NOTE(review): no bounds check on chunkId is visible before this get(); a chunkId that is
    // not a valid index into stripesInfo would throw here -- confirm whether a guard was
    // dropped from this listing.
    StripeInformation thisStripe = stripesInfo.get(chunkId);
    // write one stripe of data to H2O frame
    String[] orcTypes = setup.getColumnTypesString();
    boolean[] toInclude = setup.getToInclude();
    try {
        // orcFileReader is not declared in this excerpt (presumably a field -- TODO confirm).
        // The reader is restricted to this stripe's offset and data length.
        RecordReader perStripe = orcFileReader.rows(thisStripe.getOffset(), thisStripe.getDataLength(), setup.getToInclude(), null, setup.getColumnNames());
        VectorizedRowBatch batch = null;
        // rows of this stripe consumed so far
        long rows = 0;
        long rowCount = thisStripe.getNumberOfRows();
        // Loops until exactly rowCount rows have been consumed; the exit test is an
        // inequality, so it relies on the batches summing to rowCount exactly.
        while (rows != rowCount) {
            // read orc file stripes in vectorizedRowBatch
            batch = perStripe.nextBatch(batch);
            long currentBatchRow = batch.count();
            // Narrow to int and verify nothing was lost in the conversion.
            int nrows = (int) currentBatchRow;
            if (currentBatchRow != nrows)
                throw new IllegalArgumentException("got batch with too many records, does not fit in int");
            ColumnVector[] dataVectors = batch.cols;
            // NOTE(review): colIndex is never incremented in the visible lines -- presumably
            // lost in extraction; confirm against the project sources.
            int colIndex = 0;
            for (int col = 0; col < batch.numCols; ++col) {
                // read one column at a time;
                // toInclude is indexed one past the batch column index
                // (presumably entry 0 describes the enclosing struct -- TODO confirm).
                if (toInclude[col + 1]) {
                    // only write a column if we actually want it
                    write1column(dataVectors[col], orcTypes[colIndex], colIndex, nrows, dout);
            // record number of rows of data actually read
            rows += currentBatchRow;
    } catch (IOException ioe) {
        // Rethrow as unchecked, keeping the original exception as the cause.
        throw new RuntimeException(ioe);
    return dout;
Also used: RecordReader, IOException, StripeInformation


StripeInformation, IOException, RecordReader, BytesColumnVector (org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector) 1, ColumnVector (org.apache.hadoop.hive.ql.exec.vector.ColumnVector) 1, DecimalColumnVector (org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector) 1, DoubleColumnVector (org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector) 1, LongColumnVector (org.apache.hadoop.hive.ql.exec.vector.LongColumnVector) 1, VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) 1, Reader, StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) 1