Use of org.apache.orc.TypeDescription in project hive by apache.
The class OrcInputFormat, method createOptionsForReader.
static Reader.Options createOptionsForReader(Configuration conf) {
  /**
   * Do we have schema on read in the configuration variables?
   */
  TypeDescription schema = OrcInputFormat.getDesiredRowTypeDescr(conf, true, Integer.MAX_VALUE);
  Reader.Options readerOptions = new Reader.Options().schema(schema);
  // TODO: Convert genIncludedColumns and setSearchArgument to use TypeDescription.
  final List<OrcProto.Type> schemaTypes = OrcUtils.getOrcTypes(schema);
  readerOptions.include(OrcInputFormat.genIncludedColumns(schema, conf));
  // todo: last param is bogus. why is this hardcoded?
  OrcInputFormat.setSearchArgument(readerOptions, schemaTypes, conf, true);
  return readerOptions;
}
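For orientation, here is a hedged usage sketch (not part of the Hive source) showing how the options built above can drive a vectorized read. The path, conf, and fs variables are assumptions, and createOptionsForReader is package-private, so a real caller would live in the same package.

// Sketch only: assumes 'conf' already carries the schema-evolution properties
// and that 'path'/'fs' point at an existing ORC file; names are illustrative.
TypeDescription schema = OrcInputFormat.getDesiredRowTypeDescr(conf, true, Integer.MAX_VALUE);
Reader orcReader = OrcFile.createReader(path, OrcFile.readerOptions(conf).filesystem(fs));
RecordReader rows = orcReader.rowsOptions(OrcInputFormat.createOptionsForReader(conf));
VectorizedRowBatch batch = schema.createRowBatch();
while (rows.nextBatch(batch)) {
  // process batch.size rows here
}
rows.close();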
Use of org.apache.orc.TypeDescription in project hive by apache.
The class OrcInputFormat, method getDesiredRowTypeDescr.
/**
 * Generate the desired schema for reading the file.
 * @param conf the configuration
 * @param isAcidRead whether this is an ACID read
 * @param dataColumns the desired number of data columns for a vectorized read
 * @return the desired schema, or null if schema evolution isn't enabled
 * @throws IllegalArgumentException if this is an ACID read and the schema-evolution
 *         properties are missing
 */
public static TypeDescription getDesiredRowTypeDescr(Configuration conf, boolean isAcidRead, int dataColumns) {
  String columnNameProperty = null;
  String columnTypeProperty = null;
  ArrayList<String> schemaEvolutionColumnNames = null;
  ArrayList<TypeDescription> schemaEvolutionTypeDescrs = null;
  boolean haveSchemaEvolutionProperties = false;
  if (isAcidRead || HiveConf.getBoolVar(conf, ConfVars.HIVE_SCHEMA_EVOLUTION)) {
    columnNameProperty = conf.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS);
    columnTypeProperty = conf.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES);
    haveSchemaEvolutionProperties = (columnNameProperty != null && columnTypeProperty != null);
    if (haveSchemaEvolutionProperties) {
      schemaEvolutionColumnNames = Lists.newArrayList(columnNameProperty.split(","));
      if (schemaEvolutionColumnNames.size() == 0) {
        haveSchemaEvolutionProperties = false;
      } else {
        schemaEvolutionTypeDescrs = typeDescriptionsFromHiveTypeProperty(columnTypeProperty, dataColumns);
        if (schemaEvolutionTypeDescrs.size() != Math.min(dataColumns, schemaEvolutionColumnNames.size())) {
          haveSchemaEvolutionProperties = false;
        }
      }
    } else if (isAcidRead) {
      throw new IllegalArgumentException(ErrorMsg.SCHEMA_REQUIRED_TO_READ_ACID_TABLES.getErrorCodedMsg());
    }
  }
  if (haveSchemaEvolutionProperties) {
    if (LOG.isInfoEnabled()) {
      LOG.info("Using schema evolution configuration variables schema.evolution.columns "
          + schemaEvolutionColumnNames.toString() + " / schema.evolution.columns.types "
          + schemaEvolutionTypeDescrs.toString() + " (isAcidRead " + isAcidRead + ")");
    }
  } else {
    // Try the regular properties.
    columnNameProperty = conf.get(serdeConstants.LIST_COLUMNS);
    columnTypeProperty = conf.get(serdeConstants.LIST_COLUMN_TYPES);
    if (columnTypeProperty == null || columnNameProperty == null) {
      return null;
    }
    schemaEvolutionColumnNames = Lists.newArrayList(columnNameProperty.split(","));
    if (schemaEvolutionColumnNames.size() == 0) {
      return null;
    }
    schemaEvolutionTypeDescrs = typeDescriptionsFromHiveTypeProperty(columnTypeProperty, dataColumns);
    if (schemaEvolutionTypeDescrs.size() != Math.min(dataColumns, schemaEvolutionColumnNames.size())) {
      return null;
    }
    // Find the first virtual column and clip it (and everything after it) off.
    int virtualColumnClipNum = -1;
    int columnNum = 0;
    for (String columnName : schemaEvolutionColumnNames) {
      if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(columnName)) {
        virtualColumnClipNum = columnNum;
        break;
      }
      columnNum++;
    }
    if (virtualColumnClipNum != -1 && virtualColumnClipNum < dataColumns) {
      schemaEvolutionColumnNames = Lists.newArrayList(schemaEvolutionColumnNames.subList(0, virtualColumnClipNum));
      schemaEvolutionTypeDescrs = Lists.newArrayList(schemaEvolutionTypeDescrs.subList(0, virtualColumnClipNum));
    }
    if (LOG.isInfoEnabled()) {
      LOG.info("Using column configuration variables columns "
          + schemaEvolutionColumnNames.toString() + " / columns.types "
          + schemaEvolutionTypeDescrs.toString() + " (isAcidRead " + isAcidRead + ")");
    }
  }
  // The desired schema does not include virtual columns or partition columns.
  TypeDescription result = TypeDescription.createStruct();
  for (int i = 0; i < schemaEvolutionTypeDescrs.size(); i++) {
    result.addField(schemaEvolutionColumnNames.get(i), schemaEvolutionTypeDescrs.get(i));
  }
  return result;
}
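A hedged illustration of the non-ACID fallback path above. The property values are invented for the example; it assumes ROW__ID is registered in VirtualColumn.VIRTUAL_COLUMN_NAMES (as the row-id virtual column normally is), so the trailing column gets clipped.

// Sketch only: with just the regular serde properties set, the desired schema is
// built from columns/columns.types and clipped at the first virtual column.
Configuration conf = new Configuration();
conf.set(serdeConstants.LIST_COLUMNS, "a,b,d,ROW__ID");
conf.set(serdeConstants.LIST_COLUMN_TYPES,
    "int,struct<c:int>,string,struct<writeid:bigint,bucketid:int,rowid:bigint>");
TypeDescription desired = OrcInputFormat.getDesiredRowTypeDescr(conf, false, Integer.MAX_VALUE);
// Expected: desired is struct<a:int,b:struct<c:int>,d:string> -- ROW__ID is clipped off.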
Use of org.apache.orc.TypeDescription in project hive by apache.
The class OrcEncodedDataConsumer, method decodeBatch.
@Override
protected void decodeBatch(OrcEncodedColumnBatch batch,
    Consumer<ColumnVectorBatch> downstreamConsumer) throws InterruptedException {
  long startTime = counters.startTimeCounter();
  int currentStripeIndex = batch.getBatchKey().stripeIx;
  boolean sameStripe = currentStripeIndex == previousStripeIndex;
  try {
    ConsumerStripeMetadata stripeMetadata = stripes.get(currentStripeIndex);
    // Get the non-null row count from the root column to determine the maximum number of vector batches.
    int rgIdx = batch.getBatchKey().rgIx;
    long nonNullRowCount = -1;
    if (rgIdx == OrcEncodedColumnBatch.ALL_RGS) {
      nonNullRowCount = stripeMetadata.getRowCount();
    } else {
      OrcProto.RowIndexEntry rowIndex = stripeMetadata.getRowIndexEntry(0, rgIdx);
      nonNullRowCount = getRowCount(rowIndex);
    }
    int maxBatchesRG = (int) ((nonNullRowCount / VectorizedRowBatch.DEFAULT_SIZE) + 1);
    int batchSize = VectorizedRowBatch.DEFAULT_SIZE;
    TypeDescription fileSchema = fileMetadata.getSchema();
    if (columnReaders == null || !sameStripe) {
      createColumnReaders(batch, stripeMetadata, fileSchema);
    } else {
      repositionInStreams(this.columnReaders, batch, sameStripe, stripeMetadata);
    }
    previousStripeIndex = currentStripeIndex;
    for (int i = 0; i < maxBatchesRG; i++) {
      // For the last batch in the row group, adjust the batch size.
      if (i == maxBatchesRG - 1) {
        batchSize = (int) (nonNullRowCount % VectorizedRowBatch.DEFAULT_SIZE);
        if (batchSize == 0)
          break;
      }
      ColumnVectorBatch cvb = cvbPool.take();
      // assert cvb.cols.length == batch.getColumnIxs().length; // Must be constant per split.
      cvb.size = batchSize;
      for (int idx = 0; idx < columnReaders.length; ++idx) {
        TreeReader reader = columnReaders[idx];
        if (cvb.cols[idx] == null) {
          // ORC stores rows inside a root struct (Hive writes it this way).
          // When we populate column vectors we skip over the root struct.
          cvb.cols[idx] = createColumn(batchSchemas[idx], VectorizedRowBatch.DEFAULT_SIZE);
        }
        trace.logTreeReaderNextVector(idx);
        /*
         * Currently, ORC's TreeReaderFactory class does this:
         *
         *   public void nextBatch(VectorizedRowBatch batch,
         *                         int batchSize) throws IOException {
         *     batch.cols[0].reset();
         *     batch.cols[0].ensureSize(batchSize, false);
         *     nextVector(batch.cols[0], null, batchSize);
         *   }
         *
         * CONCERN:
         * For better performance, we'd like to *not* do a ColumnVector.reset(),
         * which zeroes out isNull. Why? Because there are common cases where ORC
         * will *immediately* copy its null flags into the isNull array, making
         * the reset a waste.
         *
         * For correctness, we must do it for now.
         *
         * The best solution is for ORC to manage the noNulls flag and isNull array
         * itself, because it knows which NULLs the next set of rows contains.
         *
         * Its management of the fields of ColumnVector is a little different from what
         * we must do for vector expressions. For those, we must maintain the invariant
         * that if noNulls is true there are no NULLs in any part of the isNull array,
         * because the next vector expression relies on that invariant.
         *
         * Given that ORC (or any other producer) is providing *read-only* batches to the
         * consumer, what matters is that the isNull array through batch.size has
         * integrity with the noNulls flag. So, if ORC is giving us 100 rows (for example)
         * and none of them are NULL, it can safely make sure the first 100 isNull
         * entries are false and set noNulls to true. Any other NULLs (true entries)
         * in isNull are irrelevant because ORC owns the batch; it just needs to make
         * sure it doesn't get confused.
         */
        ColumnVector cv = cvb.cols[idx];
        cv.reset();
        cv.ensureSize(batchSize, false);
        reader.nextVector(cv, null, batchSize);
      }
      // We are done reading a batch; send it to the consumer for processing.
      downstreamConsumer.consumeData(cvb);
      counters.incrCounter(LlapIOCounters.ROWS_EMITTED, batchSize);
    }
    LlapIoImpl.ORC_LOGGER.debug("Done with decode");
    counters.incrTimeCounter(LlapIOCounters.DECODE_TIME_NS, startTime);
    counters.incrCounter(LlapIOCounters.NUM_VECTOR_BATCHES, maxBatchesRG);
    counters.incrCounter(LlapIOCounters.NUM_DECODED_BATCHES);
  } catch (IOException e) {
    // Caller will return the batch.
    downstreamConsumer.setError(e);
  }
}
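The noNulls/isNull contract discussed in the long comment above can be shown with a small standalone sketch (not Hive source; it only touches public ColumnVector fields).

// A producer handing out a read-only batch only has to keep isNull consistent with
// noNulls for the first batch.size entries; stale values past that are harmless.
LongColumnVector col = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
int size = 100;                                    // rows actually populated
java.util.Arrays.fill(col.isNull, 0, size, false); // no NULLs among the first 100 rows
col.noNulls = true;                                // safe: consumers must not look past 'size'
col.isNull[500] = true;                            // stale entry beyond 'size'; irrelevant to a read-only consumer

For vector expressions the invariant is stricter (noNulls true means no true entries anywhere in isNull), which is why the reset()/ensureSize() calls above are still done for now.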
Use of org.apache.orc.TypeDescription in project hive by apache.
The class TestInputOutputFormat, method testColumnProjectionWithAcid.
/**
 * Test column projection when using ACID.
 */
@Test
public void testColumnProjectionWithAcid() throws Exception {
  Path baseDir = new Path(workDir, "base_00100");
  testFilePath = new Path(baseDir, "bucket_00000");
  fs.mkdirs(baseDir);
  fs.delete(testFilePath, true);
  TypeDescription fileSchema = TypeDescription.fromString(
      "struct<operation:int,originalTransaction:bigint,bucket:int,rowId:bigint,"
      + "currentTransaction:bigint,row:struct<a:int,b:struct<c:int>,d:string>>");
  Writer writer = OrcFile.createWriter(testFilePath,
      OrcFile.writerOptions(conf)
          .fileSystem(fs)
          .setSchema(fileSchema)
          .compress(org.apache.orc.CompressionKind.NONE));
  VectorizedRowBatch batch = fileSchema.createRowBatch(1000);
  batch.size = 1000;
  StructColumnVector scv = (StructColumnVector) batch.cols[5];
  // operation
  batch.cols[0].isRepeating = true;
  ((LongColumnVector) batch.cols[0]).vector[0] = 0;
  // original transaction
  batch.cols[1].isRepeating = true;
  ((LongColumnVector) batch.cols[1]).vector[0] = 1;
  // bucket
  batch.cols[2].isRepeating = true;
  ((LongColumnVector) batch.cols[2]).vector[0] = 0;
  // current transaction
  batch.cols[4].isRepeating = true;
  ((LongColumnVector) batch.cols[4]).vector[0] = 1;
  LongColumnVector lcv = (LongColumnVector) ((StructColumnVector) scv.fields[1]).fields[0];
  for (int r = 0; r < 1000; r++) {
    // row id
    ((LongColumnVector) batch.cols[3]).vector[r] = r;
    // a
    ((LongColumnVector) scv.fields[0]).vector[r] = r * 42;
    // b.c
    lcv.vector[r] = r * 10001;
    // d
    ((BytesColumnVector) scv.fields[2]).setVal(r, Integer.toHexString(r).getBytes(StandardCharsets.UTF_8));
  }
  writer.addRowBatch(batch);
  writer.addUserMetadata(OrcRecordUpdater.ACID_KEY_INDEX_NAME,
      ByteBuffer.wrap("0,0,999".getBytes(StandardCharsets.UTF_8)));
  writer.close();
  long fileLength = fs.getFileStatus(testFilePath).getLen();
  // test with same schema with include
  conf.set(ValidWriteIdList.VALID_WRITEIDS_KEY, "tbl:100:99:");
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "a,b,d");
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "int,struct<c:int>,string");
  conf.set(ColumnProjectionUtils.READ_ALL_COLUMNS, "false");
  conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2");
  OrcSplit split = new OrcSplit(testFilePath, null, 0, fileLength, new String[0], null, false, true,
      new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength, workDir);
  OrcInputFormat inputFormat = new OrcInputFormat();
  AcidInputFormat.RowReader<OrcStruct> reader = inputFormat.getReader(split, new AcidInputFormat.Options(conf));
  int record = 0;
  RecordIdentifier id = reader.createKey();
  OrcStruct struct = reader.createValue();
  while (reader.next(id, struct)) {
    assertEquals("id " + record, record, id.getRowId());
    assertEquals("bucket " + record, 0, id.getBucketProperty());
    assertEquals("writeid " + record, 1, id.getWriteId());
    assertEquals("a " + record, 42 * record, ((IntWritable) struct.getFieldValue(0)).get());
    assertEquals(null, struct.getFieldValue(1));
    assertEquals("d " + record, Integer.toHexString(record), struct.getFieldValue(2).toString());
    record += 1;
  }
  assertEquals(1000, record);
  reader.close();
  // test with schema evolution and include
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "a,b,d,f");
  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "int,struct<c:int,e:string>,string,int");
  conf.set(ColumnProjectionUtils.READ_ALL_COLUMNS, "false");
  conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2,3");
  split = new OrcSplit(testFilePath, null, 0, fileLength, new String[0], null, false, true,
      new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength, workDir);
  inputFormat = new OrcInputFormat();
  reader = inputFormat.getReader(split, new AcidInputFormat.Options(conf));
  record = 0;
  id = reader.createKey();
  struct = reader.createValue();
  while (reader.next(id, struct)) {
    assertEquals("id " + record, record, id.getRowId());
    assertEquals("bucket " + record, 0, id.getBucketProperty());
    assertEquals("writeid " + record, 1, id.getWriteId());
    assertEquals("a " + record, 42 * record, ((IntWritable) struct.getFieldValue(0)).get());
    assertEquals(null, struct.getFieldValue(1));
    assertEquals("d " + record, Integer.toHexString(record), struct.getFieldValue(2).toString());
    assertEquals("f " + record, null, struct.getFieldValue(3));
    record += 1;
  }
  assertEquals(1000, record);
  reader.close();
}
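To connect this test back to getDesiredRowTypeDescr above: for an ACID read, the schema-evolution properties describe only the inner row struct, not the wrapper event columns. A standalone sketch (fresh Configuration, property values copied from the first half of the test):

Configuration acidConf = new Configuration();
acidConf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "a,b,d");
acidConf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "int,struct<c:int>,string");
TypeDescription rowSchema = OrcInputFormat.getDesiredRowTypeDescr(acidConf, true, Integer.MAX_VALUE);
// rowSchema is struct<a:int,b:struct<c:int>,d:string>; in the file those columns live
// nested under the ACID wrapper's "row" field.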
Use of org.apache.orc.TypeDescription in project hive by apache.
The class TestInputOutputFormat, method testSchemaEvolution.
/**
 * Test schema evolution when using the reader directly.
 */
@Test
public void testSchemaEvolution() throws Exception {
  TypeDescription fileSchema = TypeDescription.fromString("struct<a:int,b:struct<c:int>,d:string>");
  Writer writer = OrcFile.createWriter(testFilePath,
      OrcFile.writerOptions(conf)
          .fileSystem(fs)
          .setSchema(fileSchema)
          .compress(org.apache.orc.CompressionKind.NONE));
  VectorizedRowBatch batch = fileSchema.createRowBatch(1000);
  batch.size = 1000;
  LongColumnVector lcv = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[0]);
  for (int r = 0; r < 1000; r++) {
    ((LongColumnVector) batch.cols[0]).vector[r] = r * 42;
    lcv.vector[r] = r * 10001;
    ((BytesColumnVector) batch.cols[2]).setVal(r, Integer.toHexString(r).getBytes(StandardCharsets.UTF_8));
  }
  writer.addRowBatch(batch);
  writer.close();
  TypeDescription readerSchema = TypeDescription.fromString(
      "struct<a:int,b:struct<c:int,future1:int>,d:string,future2:int>");
  Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
  RecordReader rows = reader.rowsOptions(new Reader.Options().schema(readerSchema));
  batch = readerSchema.createRowBatch();
  lcv = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[0]);
  LongColumnVector future1 = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[1]);
  assertEquals(true, rows.nextBatch(batch));
  assertEquals(1000, batch.size);
  assertEquals(true, future1.isRepeating);
  assertEquals(true, future1.isNull[0]);
  assertEquals(true, batch.cols[3].isRepeating);
  assertEquals(true, batch.cols[3].isNull[0]);
  for (int r = 0; r < batch.size; ++r) {
    assertEquals("row " + r, r * 42, ((LongColumnVector) batch.cols[0]).vector[r]);
    assertEquals("row " + r, r * 10001, lcv.vector[r]);
    assertEquals("row " + r, Integer.toHexString(r), ((BytesColumnVector) batch.cols[2]).toString(r));
  }
  assertEquals(false, rows.nextBatch(batch));
  rows.close();
  // try it again with an include vector
  rows = reader.rowsOptions(new Reader.Options()
      .schema(readerSchema)
      .include(new boolean[] { false, true, true, true, false, false, true }));
  batch = readerSchema.createRowBatch();
  lcv = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[0]);
  future1 = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[1]);
  assertEquals(true, rows.nextBatch(batch));
  assertEquals(1000, batch.size);
  assertEquals(true, future1.isRepeating);
  assertEquals(true, future1.isNull[0]);
  assertEquals(true, batch.cols[3].isRepeating);
  assertEquals(true, batch.cols[3].isNull[0]);
  assertEquals(true, batch.cols[2].isRepeating);
  assertEquals(true, batch.cols[2].isNull[0]);
  for (int r = 0; r < batch.size; ++r) {
    assertEquals("row " + r, r * 42, ((LongColumnVector) batch.cols[0]).vector[r]);
    assertEquals("row " + r, r * 10001, lcv.vector[r]);
  }
  assertEquals(false, rows.nextBatch(batch));
  rows.close();
}
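The include vector in the second half of this test is indexed by TypeDescription column ids. A small standalone sketch (reusing the test's reader schema string) shows how those ids line up:

TypeDescription readerSchema = TypeDescription.fromString(
    "struct<a:int,b:struct<c:int,future1:int>,d:string,future2:int>");
// The root struct is id 0; print the ids of the top-level fields.
List<String> names = readerSchema.getFieldNames();
List<TypeDescription> children = readerSchema.getChildren();
for (int i = 0; i < names.size(); i++) {
  System.out.println(names.get(i) + " -> id " + children.get(i).getId());
}
// Prints a -> id 1, b -> id 2, d -> id 5, future2 -> id 6; b's children c and future1
// take ids 3 and 4. That is why the include array has length 7 and why index 5 being
// false leaves column d null in the second pass.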