Use of org.apache.orc.RecordReader in project flink by apache.
In the class OrcBulkWriterTestUtil, the method getResults:
private static List<Record> getResults(Reader reader) throws IOException {
    List<Record> results = new ArrayList<>();
    RecordReader recordReader = reader.rows();
    VectorizedRowBatch batch = reader.getSchema().createRowBatch();
    while (recordReader.nextBatch(batch)) {
        BytesColumnVector stringVector = (BytesColumnVector) batch.cols[0];
        LongColumnVector intVector = (LongColumnVector) batch.cols[1];
        for (int r = 0; r < batch.size; r++) {
            String name = new String(stringVector.vector[r], stringVector.start[r], stringVector.length[r]);
            int age = (int) intVector.vector[r];
            results.add(new Record(name, age));
        }
    }
    // Close once all batches have been consumed; closing inside the loop would fail on multi-batch files.
    recordReader.close();
    return results;
}
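The same batched-read pattern can be exercised outside of the Flink test utilities. Below is a minimal, self-contained sketch against the plain ORC core API; the file path /tmp/people.orc and the struct<name:string,age:int> schema are illustrative assumptions, and null/isRepeating handling is omitted for brevity.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;

public class OrcBatchReadExample {
    public static void main(String[] args) throws IOException {
        // Path and schema are assumptions for this sketch: struct<name:string,age:int>.
        Reader reader = OrcFile.createReader(
                new Path("/tmp/people.orc"), OrcFile.readerOptions(new Configuration()));
        try (RecordReader rows = reader.rows()) {
            VectorizedRowBatch batch = reader.getSchema().createRowBatch();
            // nextBatch() refills the batch and returns false once the file is exhausted.
            while (rows.nextBatch(batch)) {
                BytesColumnVector names = (BytesColumnVector) batch.cols[0];
                LongColumnVector ages = (LongColumnVector) batch.cols[1];
                for (int r = 0; r < batch.size; r++) {
                    String name = new String(names.vector[r], names.start[r], names.length[r]);
                    long age = ages.vector[r];
                    System.out.println(name + " is " + age);
                }
            }
        }
    }
}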
Use of org.apache.orc.RecordReader in project druid by druid-io.
In the class OrcReader, the method intermediateRowIterator:
@Override
protected CloseableIterator<OrcStruct> intermediateRowIterator() throws IOException {
    final Closer closer = Closer.create();

    // We fetch here to cache a copy locally. However, this might need to be changed if we want
    // to split an ORC file into several InputSplits in the future.
    final byte[] buffer = new byte[InputEntity.DEFAULT_FETCH_BUFFER_SIZE];
    final CleanableFile file = closer.register(source.fetch(temporaryDirectory, buffer));
    final Path path = new Path(file.file().toURI());

    // Swap the context classloader while creating the reader so ORC resolves its dependencies
    // from this class's classloader, then restore the original one.
    final ClassLoader currentClassLoader = Thread.currentThread().getContextClassLoader();
    final Reader reader;
    try {
        Thread.currentThread().setContextClassLoader(getClass().getClassLoader());
        reader = closer.register(OrcFile.createReader(path, OrcFile.readerOptions(conf)));
    } finally {
        Thread.currentThread().setContextClassLoader(currentClassLoader);
    }

    // The line below gets the schema for reading all columns.
    // This could be improved in the future by projecting only the columns users actually want.
    final TypeDescription schema = reader.getSchema();
    final RecordReader batchReader = reader.rows(reader.options());
    final OrcMapredRecordReader<OrcStruct> recordReader = new OrcMapredRecordReader<>(batchReader, schema);
    closer.register(recordReader::close);

    return new CloseableIterator<OrcStruct>() {
        final NullWritable key = recordReader.createKey();
        OrcStruct value = null;

        @Override
        public boolean hasNext() {
            if (value == null) {
                try {
                    // The OrcStruct returned by next() can be kept in memory for a while.
                    // Create a new OrcStruct instance before calling RecordReader.next(),
                    // so that rows do not share the same "value" reference.
                    value = recordReader.createValue();
                    if (!recordReader.next(key, value)) {
                        value = null;
                    }
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
            return value != null;
        }

        @Override
        public OrcStruct next() {
            if (value == null) {
                throw new NoSuchElementException();
            }
            final OrcStruct currentValue = value;
            value = null;
            return currentValue;
        }

        @Override
        public void close() throws IOException {
            closer.close();
        }
    };
}
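As a usage note, the iterator above is meant to be drained and then closed so that the registered Closer releases the underlying readers. A hedged sketch of a consumer follows, assuming Druid's CloseableIterator lives in org.apache.druid.java.util.common.parsers; the printNames helper and the "name" field are hypothetical.

import java.io.IOException;
import org.apache.druid.java.util.common.parsers.CloseableIterator;
import org.apache.orc.mapred.OrcStruct;

public class OrcStructConsumer {
    // Drains an iterator like the one produced by intermediateRowIterator() above.
    // "name" is a hypothetical field; real files expose whatever the ORC schema defines.
    static void printNames(CloseableIterator<OrcStruct> rows) throws IOException {
        try (CloseableIterator<OrcStruct> it = rows) {
            while (it.hasNext()) {
                OrcStruct row = it.next();
                System.out.println(row.getFieldValue("name"));
            }
        }
    }
}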
Use of org.apache.orc.RecordReader in project hive by apache.
In the class TestCrudCompactorOnTez, the method checkBucketIdAndRowIdInAcidFile:
/**
 * Read the file, and
 * 1. make sure that the bucket property in each row matches the file name.
 *    For example, if the bucketId is 0, we check file bucket_00000 to make sure that the third
 *    column contains only the value 536870912.
 * 2. make sure that writeIds, and rowIds within each writeId, are in ascending order.
 * @param fs file system
 * @param path where to look for the bucket file
 * @param bucketId bucket Id to check, e.g. 0.
 */
private void checkBucketIdAndRowIdInAcidFile(FileSystem fs, Path path, int bucketId) throws IOException {
    Path bucketFilePath = AcidUtils.createBucketFile(path, bucketId);
    Reader orcReader = OrcFile.createReader(bucketFilePath, OrcFile.readerOptions(fs.getConf()).filesystem(fs));
    TypeDescription schema = orcReader.getSchema();
    try (RecordReader rows = orcReader.rows()) {
        VectorizedRowBatch batch = schema.createRowBatch();
        rows.nextBatch(batch);
        // check that the bucket property in each row matches the bucket in the file name
        long[] bucketIdVector = ((LongColumnVector) batch.cols[2]).vector;
        for (int i = 0; i < batch.count(); i++) {
            Assert.assertEquals(bucketId, decodeBucketProperty(bucketIdVector[i]));
        }
        // check that writeIds, then rowIds, are sorted in ascending order
        long[] writeIdVector = ((LongColumnVector) batch.cols[1]).vector;
        long[] rowIdVector = ((LongColumnVector) batch.cols[3]).vector;
        long writeId = writeIdVector[0];
        long rowId = 0;
        for (int i = 0; i < batch.count(); i++) {
            long currentWriteId = writeIdVector[i];
            long currentRowId = rowIdVector[i];
            if (writeId == currentWriteId) {
                Assert.assertTrue(rowId <= currentRowId);
                rowId = currentRowId;
            } else {
                Assert.assertTrue(writeId < currentWriteId);
                writeId = currentWriteId;
                rowId = 0;
            }
        }
    }
}
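The decodeBucketProperty helper is not shown in the snippet above; here is a plausible sketch, assuming Hive's org.apache.hadoop.hive.ql.io.BucketCodec, which packs a codec version, the writer (bucket) id, and a statement id into a single int. For bucket 0 this yields the 536870912 (0x20000000) value mentioned in the javadoc.

import org.apache.hadoop.hive.ql.io.BucketCodec;

public class BucketPropertyDemo {
    // Plausible sketch of the helper used above; the real test's implementation may differ.
    static int decodeBucketProperty(long bucketProperty) {
        int encoded = (int) bucketProperty;
        return BucketCodec.determineVersion(encoded).decodeWriterId(encoded);
    }

    public static void main(String[] args) {
        // 0x20000000: codec version 1, writer id 0, statement id 0.
        System.out.println(decodeBucketProperty(536870912L)); // prints 0
    }
}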