Use of org.apache.hadoop.hive.ql.io.orc.StripeInformation in project h2o-3 by h2oai.
From the class OrcParserProvider, method readSetup.
/**
 * This method creates the readers and other info needed to parse an ORC file.
 * It will not override the columnNames and columnTypes that the user may want
 * to force upon it. However, users are only allowed to set column types to
 * enum at this point; all other type requests are ignored.
 *
 * @param f the ORC file to parse
 * @param columnNames user-specified column names, or null
 * @param columnTypes user-specified column types, or null
 * @return a ParseSetup describing how the file is to be parsed
 */
public ParseSetup readSetup(FileVec f, String[] columnNames, byte[] columnTypes) {
  try {
    Reader orcFileReader = getReader(f);
    StructObjectInspector insp = (StructObjectInspector) orcFileReader.getObjectInspector();
    OrcParser.OrcParseSetup stp = OrcParser.deriveParseSetup(orcFileReader, insp);
    // restore the columnNames and columnTypes if they were specified already
    if (columnNames != null && stp.getAllColNames().length == columnNames.length) {
      // copy column names
      stp.setColumnNames(columnNames);
      stp.setAllColNames(columnNames);
    }
    if (columnTypes != null && columnTypes.length == stp.getColumnTypes().length) {
      // copy enum types only
      byte[] old_columnTypes = stp.getColumnTypes();
      String[] old_columnTypeNames = stp.getColumnTypesString();
      for (int index = 0; index < columnTypes.length; index++) {
        if (columnTypes[index] == Vec.T_CAT)  // only copy the enum types
          old_columnTypes[index] = columnTypes[index];
      }
      stp.setColumnTypes(old_columnTypes);
      stp.setColumnTypeStrings(old_columnTypeNames);
    }
    List<StripeInformation> stripesInfo = orcFileReader.getStripes();
    if (stripesInfo.size() == 0) { // empty file
      f.setChunkSize(stp._chunk_size = (int) f.length());
      return stp;
    }
    f.setNChunks(stripesInfo.size());
    stp._chunk_size = f._chunkSize;
    // the ORC parser needs a one-to-one mapping between chunks and stripes
    // (just ids; offsets do not matter)
    assert f.nChunks() == stripesInfo.size();
    return stp;
  } catch (IOException ioe) {
    throw new RuntimeException(ioe);
  }
}
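For context, here is a minimal caller sketch; it is an illustration under assumptions, not code from h2o-3. The column names, the provider instance, and the FileVec fv are all hypothetical. It shows that readSetup honors only T_CAT (enum) type requests and ignores the rest.

static ParseSetup forceEnumSetup(OrcParserProvider provider, FileVec fv) {
  String[] names = {"id", "label", "value"};
  // request: keep "id" numeric, force "label" to enum, force "value" to string
  byte[] types = {Vec.T_NUM, Vec.T_CAT, Vec.T_STR};
  // only the T_CAT request for "label" takes effect; the T_STR request is ignored
  return provider.readSetup(fv, names, types);
}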
Use of org.apache.hadoop.hive.ql.io.orc.StripeInformation in project h2o-3 by h2oai.
From the class OrcTestUtils, method compareFrameContents.
static int compareFrameContents(String fileName, Set<String> failedFiles, Frame h2oFrame, Reader orcReader, String[] colTypes, String[] colNames, boolean[] toInclude) {
  // get all stripe info
  List<StripeInformation> stripesInfo = orcReader.getStripes();
  int wrongTests = 0;
  if (stripesInfo.size() == 0) { // ORC file contains no data
    assertEquals("Orc file is empty. H2O frame row number should be zero: ", 0, h2oFrame.numRows());
  } else {
    long startRowIndex = 0L; // row index into the H2O frame
    for (StripeInformation oneStripe : stripesInfo) {
      try {
        RecordReader perStripe = orcReader.rows(oneStripe.getOffset(), oneStripe.getDataLength(), toInclude, null, colNames);
        // read ORC file stripes into a VectorizedRowBatch
        VectorizedRowBatch batch = perStripe.nextBatch(null);
        boolean done = false;
        long rowCounts = 0L;
        long rowNumber = oneStripe.getNumberOfRows(); // row count of the current stripe
        while (!done) {
          long currentBatchRow = batch.count(); // row count of the current batch
          ColumnVector[] dataVectors = batch.cols;
          int colIndex = 0;
          for (int cIdx = 0; cIdx < batch.numCols; cIdx++) {
            // read one column at a time; toInclude[0] is the root struct,
            // so the per-column flags start at index 1
            if (toInclude[cIdx + 1]) {
              compare1Cloumn(dataVectors[cIdx], colTypes[colIndex].toLowerCase(), colIndex, currentBatchRow, h2oFrame.vec(colNames[colIndex]), startRowIndex);
              colIndex++;
            }
          }
          // record the number of rows of data actually read
          rowCounts = rowCounts + currentBatchRow;
          startRowIndex = startRowIndex + currentBatchRow;
          if (rowCounts >= rowNumber) // read all rows of the stripe already
            done = true;
          if (!done) // not done yet, get the next batch
            batch = perStripe.nextBatch(batch);
        }
        perStripe.close();
      } catch (Throwable e) {
        failedFiles.add(fileName);
        e.printStackTrace();
        wrongTests += 1;
      }
    }
  }
  return wrongTests;
}
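The loop above follows the stripe-at-a-time read pattern of the old Hive ORC API (org.apache.hadoop.hive.ql.io.orc). A stripped-down sketch of just that pattern, outside any H2O code, might look as follows; the method name is hypothetical, and it assumes the usual Hive ORC imports (org.apache.hadoop.hive.ql.io.orc.*, org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch, Hadoop Configuration and Path).

static void readAllStripes(Path orcPath) throws IOException {
  Reader reader = OrcFile.createReader(orcPath, OrcFile.readerOptions(new Configuration()));
  boolean[] include = new boolean[reader.getTypes().size()];
  java.util.Arrays.fill(include, true); // index 0 is the root struct; all true reads every column
  for (StripeInformation stripe : reader.getStripes()) {
    // a RecordReader restricted to this stripe's byte range
    RecordReader rows = reader.rows(stripe.getOffset(), stripe.getDataLength(), include, null, null);
    VectorizedRowBatch batch = rows.nextBatch(null);
    long read = batch.count();
    while (read < stripe.getNumberOfRows()) { // keep fetching until the stripe is exhausted
      batch = rows.nextBatch(batch); // reuse the batch buffer between reads
      read += batch.count();
    }
    rows.close();
  }
}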
Use of org.apache.hadoop.hive.ql.io.orc.StripeInformation in project h2o-3 by h2oai.
From the class OrcParser, method parseChunk.
/**
 * This method calculates the number of stripes that will be read for each chunk. Since
 * reading each stripe is single-threaded, one stripe is never split across
 * different chunks.
 *
 * @param chunkId chunk index, calculated as file size / chunk size. Because the file size
 *                includes data plus overhead (headers and other info), the number of
 *                chunks calculated may be higher than the actual number of chunks needed.
 *                If the chunk number is too high, the method returns without writing to
 *                dout.
 * @param din ParseReader, not used for parsing ORC files
 * @param dout ParseWriter, used to add data to the H2O frame
 * @return the ParseWriter dout
 */
@Override
protected final ParseWriter parseChunk(int chunkId, ParseReader din, ParseWriter dout) {
  _cidx = chunkId;
  // only do something if within the file size and the ORC file is not empty
  List<StripeInformation> stripesInfo = ((OrcParseSetup) this._setup).getStripes();
  if (stripesInfo.size() == 0) { // empty file
    dout.addError(new ParseWriter.ParseErr("Orc Parser: Empty file.", chunkId, 0L, -2L));
    return dout;
  }
  OrcParseSetup setup = (OrcParseSetup) this._setup;
  // get one stripe
  StripeInformation thisStripe = stripesInfo.get(chunkId);
  // write one stripe of data to the H2O frame
  String[] orcTypes = setup.getColumnTypesString();
  boolean[] toInclude = setup.getToInclude();
  try {
    RecordReader perStripe = orcFileReader.rows(thisStripe.getOffset(), thisStripe.getDataLength(), setup.getToInclude(), null, setup.getColumnNames());
    VectorizedRowBatch batch = null;
    long rows = 0;
    long rowCount = thisStripe.getNumberOfRows();
    while (rows != rowCount) {
      // read ORC file stripes into a VectorizedRowBatch
      batch = perStripe.nextBatch(batch);
      long currentBatchRow = batch.count();
      int nrows = (int) currentBatchRow;
      if (currentBatchRow != nrows)
        throw new IllegalArgumentException("got batch with too many records, does not fit in int");
      ColumnVector[] dataVectors = batch.cols;
      int colIndex = 0;
      for (int col = 0; col < batch.numCols; ++col) {
        // read one column at a time; toInclude[0] is the root struct,
        // so the per-column flags start at index 1
        if (toInclude[col + 1]) { // only write a column if we actually want it
          write1column(dataVectors[col], orcTypes[colIndex], colIndex, nrows, dout);
          colIndex++;
        }
      }
      // record the number of rows of data actually read
      rows += currentBatchRow;
    }
    perStripe.close();
  } catch (IOException ioe) {
    throw new RuntimeException(ioe);
  }
  return dout;
}
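Since readSetup in the first snippet sets the frame's chunk count to the stripe count, chunkId doubles as a stripe index in parseChunk. A small illustrative sketch of that one-to-one mapping (the method name is hypothetical, and the Reader is assumed to be open already):

static void printChunkToStripeMapping(Reader reader) {
  List<StripeInformation> stripes = reader.getStripes();
  // one chunk per stripe: the chunk id is also the stripe index
  for (int chunkId = 0; chunkId < stripes.size(); chunkId++) {
    StripeInformation s = stripes.get(chunkId);
    System.out.printf("chunk %d -> stripe at offset %d (%d rows)%n",
        chunkId, s.getOffset(), s.getNumberOfRows());
  }
}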