Use of org.apache.hadoop.hive.ql.io.orc.RecordReader in project h2o-3 by h2oai.
Class OrcTestUtils, method compareFrameContents.
static int compareFrameContents(String fileName, Set<String> failedFiles, Frame h2oFrame, Reader orcReader,
                                String[] colTypes, String[] colNames, boolean[] toInclude) {
    // get all stripe info
    List<StripeInformation> stripesInfo = orcReader.getStripes();
    int wrongTests = 0;
    if (stripesInfo.size() == 0) {
        // Orc file contains no data
        assertEquals("Orc file is empty. H2O frame row number should be zero: ", 0, h2oFrame.numRows());
    } else {
        // row index into H2O frame
        Long startRowIndex = 0L;
        for (StripeInformation oneStripe : stripesInfo) {
            try {
                RecordReader perStripe = orcReader.rows(oneStripe.getOffset(), oneStripe.getDataLength(),
                                                        toInclude, null, colNames);
                // read orc file stripes in vectorizedRowBatch
                VectorizedRowBatch batch = perStripe.nextBatch(null);
                boolean done = false;
                Long rowCounts = 0L;
                // row number of current stripe
                Long rowNumber = oneStripe.getNumberOfRows();
                while (!done) {
                    // row number of current batch
                    long currentBatchRow = batch.count();
                    ColumnVector[] dataVectors = batch.cols;
                    int colIndex = 0;
                    for (int cIdx = 0; cIdx < batch.numCols; cIdx++) {
                        // read one column at a time
                        if (toInclude[cIdx + 1]) {
                            compare1Cloumn(dataVectors[cIdx], colTypes[colIndex].toLowerCase(), colIndex,
                                           currentBatchRow, h2oFrame.vec(colNames[colIndex]), startRowIndex);
                            colIndex++;
                        }
                    }
                    // record number of rows of data actually read
                    rowCounts = rowCounts + currentBatchRow;
                    startRowIndex = startRowIndex + currentBatchRow;
                    // read all rows of the stripe already
                    if (rowCounts >= rowNumber)
                        done = true;
                    // not done yet, get next batch
                    if (!done)
                        batch = perStripe.nextBatch(batch);
                }
                perStripe.close();
            } catch (Throwable e) {
                failedFiles.add(fileName);
                e.printStackTrace();
                wrongTests += 1;
            }
        }
    }
    return wrongTests;
}
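For context, a hedged sketch of how a test might drive compareFrameContents: open the ORC file with OrcFile.createReader and pass the resulting Reader together with a previously parsed H2O Frame. The driver method below and its include-all-columns mask are illustrative assumptions, not code from h2o-3.

import static org.junit.Assert.assertEquals;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Reader;

import water.fvec.Frame;

// Hypothetical test driver; colTypes/colNames describe the expected schema of the parsed frame.
static void checkOrcAgainstFrame(String fileName, Frame h2oFrame,
                                 String[] colTypes, String[] colNames) throws Exception {
    Configuration conf = new Configuration();
    Reader orcReader = OrcFile.createReader(new Path(fileName), OrcFile.readerOptions(conf));
    // Index 0 of the include mask is the struct root; the remaining flags select every column.
    boolean[] toInclude = new boolean[colNames.length + 1];
    Arrays.fill(toInclude, true);
    Set<String> failedFiles = new HashSet<>();
    int failures = OrcTestUtils.compareFrameContents(
            fileName, failedFiles, h2oFrame, orcReader, colTypes, colNames, toInclude);
    assertEquals("Files with mismatches: " + failedFiles, 0, failures);
}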
Use of org.apache.hadoop.hive.ql.io.orc.RecordReader in project hive by apache.
Class TestStreaming, method dumpBucket.
private ArrayList<SampleRec> dumpBucket(Path orcFile) throws IOException {
    org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.getLocal(new Configuration());
    Reader reader = OrcFile.createReader(orcFile, OrcFile.readerOptions(conf).filesystem(fs));
    RecordReader rows = reader.rows();
    StructObjectInspector inspector = (StructObjectInspector) reader.getObjectInspector();
    System.out.format("Found Bucket File : %s \n", orcFile.getName());
    ArrayList<SampleRec> result = new ArrayList<SampleRec>();
    while (rows.hasNext()) {
        Object row = rows.next(null);
        SampleRec rec = (SampleRec) deserializeDeltaFileRow(row, inspector)[5];
        result.add(rec);
    }
    return result;
}
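The same row-by-row read pattern in isolation, as a hedged standalone sketch: open a local ORC file, iterate with hasNext()/next(), and close the RecordReader. Only API calls that already appear in the snippets on this page are used; the class name and the command-line path are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.ql.io.orc.RecordReader;

public class OrcRowDump {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.getLocal(conf);
        Path orcFile = new Path(args[0]);   // path to an ORC file, supplied by the caller
        Reader reader = OrcFile.createReader(orcFile, OrcFile.readerOptions(conf).filesystem(fs));
        RecordReader rows = reader.rows();
        Object row = null;
        long count = 0;
        while (rows.hasNext()) {
            // Reuse the previous row object to reduce allocation; next() may return a new one.
            row = rows.next(row);
            count++;
        }
        rows.close();
        System.out.format("%s: %d rows%n", orcFile.getName(), count);
    }
}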
Use of org.apache.hadoop.hive.ql.io.orc.RecordReader in project h2o-3 by h2oai.
Class OrcParser, method parseChunk.
/**
 * This method calculates the number of stripes that will be read for each chunk. Since
 * only single threading is supported in reading each stripe, we will never split one stripe
 * over different chunks.
 *
 * @param chunkId chunk index, calculated as file size / chunk size. Because the file size
 *                includes data plus overhead (headers and other info), the number of chunks
 *                calculated will be higher than the number actually needed. If the chunk
 *                index is too high, the method returns without writing to dout.
 * @param din ParseReader, not used for parsing ORC files
 * @param dout ParseWriter, used to add data to the H2O frame
 * @return ParseWriter dout
 */
@Override
protected final ParseWriter parseChunk(int chunkId, ParseReader din, ParseWriter dout) {
    _cidx = chunkId;
    // only do something if within file size and the orc file is not empty
    List<StripeInformation> stripesInfo = ((OrcParseSetup) this._setup).getStripes();
    if (stripesInfo.size() == 0) {
        // empty file
        dout.addError(new ParseWriter.ParseErr("Orc Parser: Empty file.", chunkId, 0L, -2L));
        return dout;
    }
    OrcParseSetup setup = (OrcParseSetup) this._setup;
    // get one stripe
    StripeInformation thisStripe = stripesInfo.get(chunkId);
    // write one stripe of data to H2O frame
    String[] orcTypes = setup.getColumnTypesString();
    boolean[] toInclude = setup.getToInclude();
    try {
        RecordReader perStripe = orcFileReader.rows(thisStripe.getOffset(), thisStripe.getDataLength(),
                                                    setup.getToInclude(), null, setup.getColumnNames());
        VectorizedRowBatch batch = null;
        long rows = 0;
        long rowCount = thisStripe.getNumberOfRows();
        while (rows != rowCount) {
            // read orc file stripes in vectorizedRowBatch
            batch = perStripe.nextBatch(batch);
            long currentBatchRow = batch.count();
            int nrows = (int) currentBatchRow;
            if (currentBatchRow != nrows)
                throw new IllegalArgumentException("got batch with too many records, does not fit in int");
            ColumnVector[] dataVectors = batch.cols;
            int colIndex = 0;
            for (int col = 0; col < batch.numCols; ++col) {
                // read one column at a time; only write a column if we actually want it
                if (toInclude[col + 1]) {
                    write1column(dataVectors[col], orcTypes[colIndex], colIndex, nrows, dout);
                    colIndex++;
                }
            }
            // record number of rows of data actually read
            rows += currentBatchRow;
        }
        perStripe.close();
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }
    return dout;
}
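A hedged, standalone version of the same stripe-at-a-time loop, counting rows instead of writing them to a ParseWriter: one RecordReader per stripe, with batches pulled via nextBatch until the stripe's row count is reached. The method name, include mask, and column names below are illustrative assumptions, not part of OrcParser.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.ql.io.orc.RecordReader;
import org.apache.hadoop.hive.ql.io.orc.StripeInformation;

// Counts rows stripe by stripe with one RecordReader per stripe, mirroring the
// one-stripe-per-chunk rule above. The include mask and column names are caller-supplied.
static long countRowsByStripe(Path orcPath, boolean[] include, String[] colNames) throws Exception {
    Configuration conf = new Configuration();
    Reader reader = OrcFile.createReader(orcPath, OrcFile.readerOptions(conf));
    long total = 0;
    for (StripeInformation stripe : reader.getStripes()) {
        RecordReader perStripe = reader.rows(stripe.getOffset(), stripe.getDataLength(),
                                             include, null, colNames);
        VectorizedRowBatch batch = null;
        long read = 0;
        while (read < stripe.getNumberOfRows()) {
            // nextBatch reuses the previous batch object when possible.
            batch = perStripe.nextBatch(batch);
            read += batch.count();
        }
        perStripe.close();
        total += read;
    }
    return total;
}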
Use of org.apache.hadoop.hive.ql.io.orc.RecordReader in project presto by prestodb.
Class OrcFileRewriter, method rewrite.
public static OrcFileInfo rewrite(File input, File output, BitSet rowsToDelete) throws IOException {
    try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(FileSystem.class.getClassLoader());
            FileSystem fileSystem = new SyncingFileSystem(CONFIGURATION)) {
        Reader reader = createReader(fileSystem, path(input));
        if (reader.getNumberOfRows() < rowsToDelete.length()) {
            throw new IOException("File has fewer rows than deletion vector");
        }
        int deleteRowCount = rowsToDelete.cardinality();
        if (reader.getNumberOfRows() == deleteRowCount) {
            return new OrcFileInfo(0, 0);
        }
        if (reader.getNumberOfRows() >= Integer.MAX_VALUE) {
            throw new IOException("File has too many rows");
        }
        int inputRowCount = toIntExact(reader.getNumberOfRows());
        WriterOptions writerOptions = new OrcWriterOptions(CONFIGURATION)
                .memory(new NullMemoryManager(CONFIGURATION))
                .fileSystem(fileSystem)
                .compress(reader.getCompression())
                .inspector(reader.getObjectInspector());
        long start = System.nanoTime();
        try (Closer<RecordReader, IOException> recordReader = closer(reader.rows(), RecordReader::close);
                Closer<Writer, IOException> writer = closer(createWriter(path(output), writerOptions), Writer::close)) {
            if (reader.hasMetadataValue(OrcFileMetadata.KEY)) {
                ByteBuffer orcFileMetadata = reader.getMetadataValue(OrcFileMetadata.KEY);
                writer.get().addUserMetadata(OrcFileMetadata.KEY, orcFileMetadata);
            }
            OrcFileInfo fileInfo = rewrite(recordReader.get(), writer.get(), rowsToDelete, inputRowCount);
            log.debug("Rewrote file %s in %s (input rows: %s, output rows: %s)",
                    input.getName(), nanosSince(start), inputRowCount, inputRowCount - deleteRowCount);
            return fileInfo;
        }
    }
}
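A hedged sketch of a caller for rewrite: build a BitSet of the row positions to drop and hand it to OrcFileRewriter.rewrite together with the input and output files. The helper name and file names below are illustrative, not part of Presto.

import java.io.File;
import java.io.IOException;
import java.util.BitSet;

// Hypothetical caller: compact a shard file by deleting a known set of row positions.
static OrcFileInfo dropRows(File input, File output, int... rowPositions) throws IOException {
    BitSet rowsToDelete = new BitSet();
    for (int pos : rowPositions) {
        rowsToDelete.set(pos);    // mark this row index for deletion
    }
    return OrcFileRewriter.rewrite(input, output, rowsToDelete);
}

// e.g. OrcFileInfo info = dropRows(new File("input.orc"), new File("output.orc"), 3, 7);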