Use of org.apache.orc.TypeDescription in project hive by apache.
In class TestOrcFile, method testUnionAndTimestamp.
/**
* We test union, timestamp, and decimal separately since we need to make the
* object inspector manually. (The Hive reflection-based ObjectInspector doesn't
* handle them properly.)
*/
@Test
public void testUnionAndTimestamp() throws Exception {
List<OrcProto.Type> types = new ArrayList<OrcProto.Type>();
types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRUCT)
    .addFieldNames("time").addFieldNames("union").addFieldNames("decimal")
    .addSubtypes(1).addSubtypes(2).addSubtypes(5).build());
types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.TIMESTAMP).build());
types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.UNION).addSubtypes(3).addSubtypes(4).build());
types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRING).build());
types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.DECIMAL).build());
ObjectInspector inspector;
synchronized (TestOrcFile.class) {
inspector = OrcStruct.createObjectInspector(0, types);
}
HiveDecimal maxValue = HiveDecimal.create("10000000000000000000");
Writer writer = OrcFile.createWriter(testFilePath,
    OrcFile.writerOptions(conf)
        .inspector(inspector)
        .stripeSize(1000)
        .compress(CompressionKind.NONE)
        .batchSize(1000)
        .bufferSize(100)
        .blockPadding(false));
OrcStruct row = new OrcStruct(3);
OrcUnion union = new OrcUnion();
row.setFieldValue(1, union);
row.setFieldValue(0, new TimestampWritable(Timestamp.valueOf("2000-03-12 15:00:00")));
HiveDecimal value = HiveDecimal.create("12345678.6547456");
row.setFieldValue(2, new HiveDecimalWritable(value));
union.set((byte) 0, new IntWritable(42));
writer.addRow(row);
row.setFieldValue(0, new TimestampWritable(Timestamp.valueOf("2000-03-20 12:00:00.123456789")));
union.set((byte) 1, new Text("hello"));
value = HiveDecimal.create("-5643.234");
row.setFieldValue(2, new HiveDecimalWritable(value));
writer.addRow(row);
row.setFieldValue(0, null);
row.setFieldValue(1, null);
row.setFieldValue(2, null);
writer.addRow(row);
row.setFieldValue(1, union);
union.set((byte) 0, null);
writer.addRow(row);
union.set((byte) 1, null);
writer.addRow(row);
union.set((byte) 0, new IntWritable(200000));
row.setFieldValue(0, new TimestampWritable(Timestamp.valueOf("1970-01-01 00:00:00")));
value = HiveDecimal.create("10000000000000000000");
row.setFieldValue(2, new HiveDecimalWritable(value));
writer.addRow(row);
Random rand = new Random(42);
for (int i = 1970; i < 2038; ++i) {
row.setFieldValue(0, new TimestampWritable(Timestamp.valueOf(i + "-05-05 12:34:56." + i)));
if ((i & 1) == 0) {
union.set((byte) 0, new IntWritable(i * i));
} else {
union.set((byte) 1, new Text(Integer.toString(i * i)));
}
value = HiveDecimal.create(new BigInteger(64, rand), rand.nextInt(18));
row.setFieldValue(2, new HiveDecimalWritable(value));
if (maxValue.compareTo(value) < 0) {
maxValue = value;
}
writer.addRow(row);
}
// let's add a lot of constant rows to test the rle
row.setFieldValue(0, null);
union.set((byte) 0, new IntWritable(1732050807));
row.setFieldValue(2, null);
for (int i = 0; i < 5000; ++i) {
writer.addRow(row);
}
union.set((byte) 0, new IntWritable(0));
writer.addRow(row);
union.set((byte) 0, new IntWritable(10));
writer.addRow(row);
union.set((byte) 0, new IntWritable(138));
writer.addRow(row);
writer.close();
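// Read the file back and verify the schema, the included-column masks, the decimal statistics,
// the stripe layout, and each row value written above.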
Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
TypeDescription schema = writer.getSchema();
assertEquals(5, schema.getMaximumId());
boolean[] expected = new boolean[] { false, false, false, false, false, false };
boolean[] included = OrcUtils.includeColumns("", schema);
assertEquals(true, Arrays.equals(expected, included));
expected = new boolean[] { false, true, false, false, false, true };
included = OrcUtils.includeColumns("time,decimal", schema);
assertEquals(true, Arrays.equals(expected, included));
expected = new boolean[] { false, false, true, true, true, false };
included = OrcUtils.includeColumns("union", schema);
assertEquals(true, Arrays.equals(expected, included));
assertEquals(false, reader.getMetadataKeys().iterator().hasNext());
assertEquals(5077, reader.getNumberOfRows());
DecimalColumnStatistics stats = (DecimalColumnStatistics) reader.getStatistics()[5];
assertEquals(71, stats.getNumberOfValues());
assertEquals(HiveDecimal.create("-5643.234"), stats.getMinimum());
assertEquals(maxValue, stats.getMaximum());
// TODO: fix this
// assertEquals(null,stats.getSum());
int stripeCount = 0;
int rowCount = 0;
long currentOffset = -1;
for (StripeInformation stripe : reader.getStripes()) {
stripeCount += 1;
rowCount += stripe.getNumberOfRows();
if (currentOffset < 0) {
currentOffset = stripe.getOffset() + stripe.getLength();
} else {
assertEquals(currentOffset, stripe.getOffset());
currentOffset += stripe.getLength();
}
}
assertEquals(reader.getNumberOfRows(), rowCount);
assertEquals(2, stripeCount);
assertEquals(reader.getContentLength(), currentOffset);
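// Iterate over the rows and check every value written above, including the null rows
// and the long constant run used to exercise RLE.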
RecordReader rows = reader.rows();
assertEquals(0, rows.getRowNumber());
assertEquals(0.0, rows.getProgress(), 0.000001);
assertEquals(true, rows.hasNext());
row = (OrcStruct) rows.next(null);
assertEquals(1, rows.getRowNumber());
inspector = reader.getObjectInspector();
assertEquals("struct<time:timestamp,union:uniontype<int,string>,decimal:decimal(38,18)>", inspector.getTypeName());
assertEquals(new TimestampWritable(Timestamp.valueOf("2000-03-12 15:00:00")), row.getFieldValue(0));
union = (OrcUnion) row.getFieldValue(1);
assertEquals(0, union.getTag());
assertEquals(new IntWritable(42), union.getObject());
assertEquals(new HiveDecimalWritable(HiveDecimal.create("12345678.6547456")), row.getFieldValue(2));
row = (OrcStruct) rows.next(row);
assertEquals(2, rows.getRowNumber());
assertEquals(new TimestampWritable(Timestamp.valueOf("2000-03-20 12:00:00.123456789")), row.getFieldValue(0));
assertEquals(1, union.getTag());
assertEquals(new Text("hello"), union.getObject());
assertEquals(new HiveDecimalWritable(HiveDecimal.create("-5643.234")), row.getFieldValue(2));
row = (OrcStruct) rows.next(row);
assertEquals(null, row.getFieldValue(0));
assertEquals(null, row.getFieldValue(1));
assertEquals(null, row.getFieldValue(2));
row = (OrcStruct) rows.next(row);
assertEquals(null, row.getFieldValue(0));
union = (OrcUnion) row.getFieldValue(1);
assertEquals(0, union.getTag());
assertEquals(null, union.getObject());
assertEquals(null, row.getFieldValue(2));
row = (OrcStruct) rows.next(row);
assertEquals(null, row.getFieldValue(0));
assertEquals(1, union.getTag());
assertEquals(null, union.getObject());
assertEquals(null, row.getFieldValue(2));
row = (OrcStruct) rows.next(row);
assertEquals(new TimestampWritable(Timestamp.valueOf("1970-01-01 00:00:00")), row.getFieldValue(0));
assertEquals(new IntWritable(200000), union.getObject());
assertEquals(new HiveDecimalWritable(HiveDecimal.create("10000000000000000000")), row.getFieldValue(2));
rand = new Random(42);
for (int i = 1970; i < 2038; ++i) {
row = (OrcStruct) rows.next(row);
assertEquals(new TimestampWritable(Timestamp.valueOf(i + "-05-05 12:34:56." + i)), row.getFieldValue(0));
if ((i & 1) == 0) {
assertEquals(0, union.getTag());
assertEquals(new IntWritable(i * i), union.getObject());
} else {
assertEquals(1, union.getTag());
assertEquals(new Text(Integer.toString(i * i)), union.getObject());
}
assertEquals(new HiveDecimalWritable(HiveDecimal.create(new BigInteger(64, rand), rand.nextInt(18))), row.getFieldValue(2));
}
for (int i = 0; i < 5000; ++i) {
row = (OrcStruct) rows.next(row);
assertEquals(new IntWritable(1732050807), union.getObject());
}
row = (OrcStruct) rows.next(row);
assertEquals(new IntWritable(0), union.getObject());
row = (OrcStruct) rows.next(row);
assertEquals(new IntWritable(10), union.getObject());
row = (OrcStruct) rows.next(row);
assertEquals(new IntWritable(138), union.getObject());
assertEquals(false, rows.hasNext());
assertEquals(1.0, rows.getProgress(), 0.00001);
assertEquals(reader.getNumberOfRows(), rows.getRowNumber());
rows.seekToRow(1);
row = (OrcStruct) rows.next(row);
assertEquals(new TimestampWritable(Timestamp.valueOf("2000-03-20 12:00:00.123456789")), row.getFieldValue(0));
assertEquals(1, union.getTag());
assertEquals(new Text("hello"), union.getObject());
assertEquals(new HiveDecimalWritable(HiveDecimal.create("-5643.234")), row.getFieldValue(2));
rows.close();
}
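The test above builds its schema from raw OrcProto.Type entries. For comparison, here is a minimal sketch (not part of the Hive test; variable names are illustrative) of describing the same shape directly with org.apache.orc.TypeDescription, the class this page centers on.

// Same struct<time:timestamp,union:uniontype<int,string>,decimal(38,18)> shape,
// expressed with the TypeDescription builder API instead of OrcProto.Type.
TypeDescription schema = TypeDescription.createStruct()
    .addField("time", TypeDescription.createTimestamp())
    .addField("union", TypeDescription.createUnion()
        .addUnionChild(TypeDescription.createInt())
        .addUnionChild(TypeDescription.createString()))
    .addField("decimal", TypeDescription.createDecimal().withPrecision(38).withScale(18));
// Equivalent form parsed from the type string the test asserts on:
TypeDescription parsed = TypeDescription.fromString(
    "struct<time:timestamp,union:uniontype<int,string>,decimal:decimal(38,18)>");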
Use of org.apache.orc.TypeDescription in project hive by apache.
In class EncodedTreeReaderFactory, method createEncodedTreeReader.
private static TreeReader createEncodedTreeReader(TypeDescription schema, List<OrcProto.ColumnEncoding> encodings, OrcEncodedColumnBatch batch, CompressionCodec codec, TreeReaderFactory.Context context) throws IOException {
int columnIndex = schema.getId();
ColumnStreamData[] streamBuffers = null;
List<ColumnVector> vectors = null;
if (batch.hasData(columnIndex)) {
streamBuffers = batch.getColumnData(columnIndex);
} else if (batch.hasVectors(columnIndex)) {
vectors = batch.getColumnVectors(columnIndex);
} else {
throw new AssertionError("Batch has no data for " + columnIndex + ": " + batch);
}
// EncodedColumnBatch is already decompressed, we don't really need to pass codec.
// But we need to know if the original data is compressed or not. This is used to skip
// positions in row index properly. If the file is originally compressed,
// then 1st position (compressed offset) in row index should be skipped to get
// uncompressed offset, else 1st position should not be skipped.
// TODO: there should be a better way to do this, code just needs to be modified
OrcProto.ColumnEncoding columnEncoding = encodings.get(columnIndex);
// stream buffers are arranged in enum order of stream kind
ColumnStreamData present = null, data = null, dictionary = null, lengths = null, secondary = null;
if (streamBuffers != null) {
present = streamBuffers[OrcProto.Stream.Kind.PRESENT_VALUE];
data = streamBuffers[OrcProto.Stream.Kind.DATA_VALUE];
dictionary = streamBuffers[OrcProto.Stream.Kind.DICTIONARY_DATA_VALUE];
lengths = streamBuffers[OrcProto.Stream.Kind.LENGTH_VALUE];
secondary = streamBuffers[OrcProto.Stream.Kind.SECONDARY_VALUE];
}
if (LOG.isDebugEnabled()) {
LOG.debug("columnIndex: {} columnType: {} streamBuffers.length: {} vectors: {} columnEncoding: {}" + " present: {} data: {} dictionary: {} lengths: {} secondary: {} tz: {}", columnIndex, schema, streamBuffers == null ? 0 : streamBuffers.length, vectors == null ? 0 : vectors.size(), columnEncoding, present != null, data, dictionary != null, lengths != null, secondary != null, context.getWriterTimezone());
}
// TODO: get rid of the builders - they serve no purpose... just call ctors directly.
switch(schema.getCategory()) {
case BINARY:
case BOOLEAN:
case BYTE:
case SHORT:
case INT:
case LONG:
case FLOAT:
case DOUBLE:
case CHAR:
case VARCHAR:
case STRING:
case DECIMAL:
case TIMESTAMP:
case DATE:
return getPrimitiveTreeReader(columnIndex, schema, codec, columnEncoding, present, data, dictionary, lengths, secondary, context, vectors);
case LIST:
// Not currently supported.
assert vectors == null;
TypeDescription elementType = schema.getChildren().get(0);
TreeReader elementReader = createEncodedTreeReader(elementType, encodings, batch, codec, context);
return ListStreamReader.builder()
    .setColumnIndex(columnIndex)
    .setColumnEncoding(columnEncoding)
    .setCompressionCodec(codec)
    .setPresentStream(present)
    .setLengthStream(lengths)
    .setElementReader(elementReader)
    .setContext(context)
    .build();
case MAP:
// Not currently supported.
assert vectors == null;
TypeDescription keyType = schema.getChildren().get(0);
TypeDescription valueType = schema.getChildren().get(1);
TreeReader keyReader = createEncodedTreeReader(keyType, encodings, batch, codec, context);
TreeReader valueReader = createEncodedTreeReader(valueType, encodings, batch, codec, context);
return MapStreamReader.builder()
    .setColumnIndex(columnIndex)
    .setColumnEncoding(columnEncoding)
    .setCompressionCodec(codec)
    .setPresentStream(present)
    .setLengthStream(lengths)
    .setKeyReader(keyReader)
    .setValueReader(valueReader)
    .setContext(context)
    .build();
case STRUCT:
{
// Not currently supported.
assert vectors == null;
int childCount = schema.getChildren().size();
TreeReader[] childReaders = new TreeReader[childCount];
for (int i = 0; i < childCount; i++) {
TypeDescription childType = schema.getChildren().get(i);
childReaders[i] = createEncodedTreeReader(childType, encodings, batch, codec, context);
}
return StructStreamReader.builder()
    .setColumnIndex(columnIndex)
    .setCompressionCodec(codec)
    .setColumnEncoding(columnEncoding)
    .setPresentStream(present)
    .setChildReaders(childReaders)
    .setContext(context)
    .build();
}
case UNION:
{
// Not currently supported.
assert vectors == null;
int childCount = schema.getChildren().size();
TreeReader[] childReaders = new TreeReader[childCount];
for (int i = 0; i < childCount; i++) {
TypeDescription childType = schema.getChildren().get(i);
childReaders[i] = createEncodedTreeReader(childType, encodings, batch, codec, context);
}
return UnionStreamReader.builder()
    .setColumnIndex(columnIndex)
    .setCompressionCodec(codec)
    .setColumnEncoding(columnEncoding)
    .setPresentStream(present)
    .setDataStream(data)
    .setChildReaders(childReaders)
    .setContext(context)
    .build();
}
default:
throw new UnsupportedOperationException("Data type not supported: " + schema);
}
}
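The dispatch above is driven entirely by the TypeDescription tree: getId() locates the column's streams and encoding, getCategory() selects the reader kind, and getChildren() recurses into complex types. Below is a minimal standalone sketch of that traversal pattern; the helper name is hypothetical and not part of EncodedTreeReaderFactory.

// Hypothetical helper: walk a TypeDescription tree the same way createEncodedTreeReader does,
// printing each node's column id range and category.
static void printSchemaTree(TypeDescription type, int depth) {
  StringBuilder indent = new StringBuilder();
  for (int i = 0; i < depth; i++) {
    indent.append("  ");
  }
  System.out.println(indent + "id=" + type.getId() + " maxId=" + type.getMaximumId()
      + " category=" + type.getCategory());
  if (type.getChildren() != null) {
    for (TypeDescription child : type.getChildren()) {
      printSchemaTree(child, depth + 1);
    }
  }
}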
Use of org.apache.orc.TypeDescription in project hive by apache.
In class OrcInputFormat, method createOptionsForReader.
static Reader.Options createOptionsForReader(Configuration conf) {
/**
* Do we have schema on read in the configuration variables?
*/
TypeDescription schema = OrcInputFormat.getDesiredRowTypeDescr(conf, true, Integer.MAX_VALUE);
Reader.Options readerOptions = new Reader.Options().schema(schema);
// TODO: Convert genIncludedColumns and setSearchArgument to use TypeDescription.
final List<OrcProto.Type> schemaTypes = OrcUtils.getOrcTypes(schema);
readerOptions.include(OrcInputFormat.genIncludedColumns(schema, conf));
OrcInputFormat.setSearchArgument(readerOptions, schemaTypes, conf, true);
return readerOptions;
}
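A hedged sketch of how such options are typically consumed, written against the core org.apache.orc reader API rather than the Hive wrapper used in this file; the file path and read schema are placeholders.

// Illustrative only: build Options carrying a read schema and include mask, then scan a file.
Configuration conf = new Configuration();
TypeDescription readSchema = TypeDescription.fromString("struct<id:int,name:string>"); // placeholder
boolean[] include = OrcUtils.includeColumns("id,name", readSchema);
include[0] = true; // index 0 is the root struct, which readers expect to be included
Reader.Options options = new Reader.Options().schema(readSchema).include(include);
Reader reader = OrcFile.createReader(new Path("/tmp/example.orc"), // placeholder path
    OrcFile.readerOptions(conf));
RecordReader rows = reader.rows(options);
VectorizedRowBatch batch = readSchema.createRowBatch();
while (rows.nextBatch(batch)) {
  // process batch.size rows here
}
rows.close();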
Use of org.apache.orc.TypeDescription in project hive by apache.
In class OrcInputFormat, method genIncludedColumnsReverse.
/**
* Reverses genIncludedColumns; produces the table column indexes from the ORC included columns.
* @param readerSchema The ORC reader schema for the table.
* @param included The included ORC columns.
* @param isFullColumnMatch Whether full column match should be enforced (i.e. whether to expect
* that all the sub-columns of a complex type column should be included or excluded
* together in the included array). If false, any sub-column being included for a complex
* type is sufficient for the entire complex column to be included in the result.
* @return The list of table column indexes.
*/
public static List<Integer> genIncludedColumnsReverse(TypeDescription readerSchema, boolean[] included, boolean isFullColumnMatch) {
assert included != null;
List<Integer> result = new ArrayList<>();
List<TypeDescription> children = readerSchema.getChildren();
for (int columnNumber = 0; columnNumber < children.size(); ++columnNumber) {
TypeDescription child = children.get(columnNumber);
int id = child.getId();
int maxId = child.getMaximumId();
if (id >= included.length || maxId >= included.length) {
throw new AssertionError("Inconsistent includes: " + included.length + " elements; found column ID " + id);
}
boolean isIncluded = included[id];
for (int col = id + 1; col <= maxId; ++col) {
if (isFullColumnMatch && included[col] != isIncluded) {
throw new AssertionError("Inconsistent includes: root column IDs are [" + id + ", " + maxId + "]; included[" + col + "] = " + included[col] + ", which is different " + " from the previous IDs of the same root column.");
}
isIncluded = isIncluded || included[col];
}
if (isIncluded) {
result.add(columnNumber);
}
}
return result;
}
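A short usage sketch, reusing the schema and the "union" include mask that testUnionAndTimestamp asserts on above; the expected result is the author's reading of the contract, not an assertion taken from the Hive sources.

// Illustrative only: map an ORC include mask back to table column indexes.
TypeDescription readerSchema = TypeDescription.fromString(
    "struct<time:timestamp,union:uniontype<int,string>,decimal:decimal(38,18)>");
boolean[] included = OrcUtils.includeColumns("union", readerSchema);
// included is {false, false, true, true, true, false}, as the test above asserts.
List<Integer> tableColumns =
    OrcInputFormat.genIncludedColumnsReverse(readerSchema, included, true);
// Expected: [1], the table index of the "union" column (ids 2..4 are all included).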
Use of org.apache.orc.TypeDescription in project hive by apache.
In class OrcInputFormat, method getDesiredRowTypeDescr.
/**
* Generate the desired schema for reading the file.
* @param conf the configuration
* @param isAcidRead whether this is an ACID-format read
* @param dataColumns the desired number of data columns for vectorized read
* @return the desired schema or null if schema evolution isn't enabled
* @throws IllegalArgumentException
*/
public static TypeDescription getDesiredRowTypeDescr(Configuration conf, boolean isAcidRead, int dataColumns) {
String columnNameProperty = null;
String columnTypeProperty = null;
ArrayList<String> schemaEvolutionColumnNames = null;
ArrayList<TypeDescription> schemaEvolutionTypeDescrs = null;
boolean haveSchemaEvolutionProperties = false;
if (isAcidRead || HiveConf.getBoolVar(conf, ConfVars.HIVE_SCHEMA_EVOLUTION)) {
columnNameProperty = conf.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS);
columnTypeProperty = conf.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES);
haveSchemaEvolutionProperties = (columnNameProperty != null && columnTypeProperty != null);
if (haveSchemaEvolutionProperties) {
schemaEvolutionColumnNames = Lists.newArrayList(columnNameProperty.split(","));
if (schemaEvolutionColumnNames.size() == 0) {
haveSchemaEvolutionProperties = false;
} else {
schemaEvolutionTypeDescrs = typeDescriptionsFromHiveTypeProperty(columnTypeProperty, dataColumns);
if (schemaEvolutionTypeDescrs.size() != Math.min(dataColumns, schemaEvolutionColumnNames.size())) {
haveSchemaEvolutionProperties = false;
}
}
} else if (isAcidRead) {
throw new IllegalArgumentException(ErrorMsg.SCHEMA_REQUIRED_TO_READ_ACID_TABLES.getErrorCodedMsg());
}
}
if (haveSchemaEvolutionProperties) {
if (LOG.isInfoEnabled()) {
LOG.info("Using schema evolution configuration variables schema.evolution.columns " + schemaEvolutionColumnNames.toString() + " / schema.evolution.columns.types " + schemaEvolutionTypeDescrs.toString() + " (isAcidRead " + isAcidRead + ")");
}
} else {
// Try the regular column properties.
columnNameProperty = conf.get(serdeConstants.LIST_COLUMNS);
columnTypeProperty = conf.get(serdeConstants.LIST_COLUMN_TYPES);
if (columnTypeProperty == null || columnNameProperty == null) {
return null;
}
schemaEvolutionColumnNames = Lists.newArrayList(columnNameProperty.split(","));
if (schemaEvolutionColumnNames.size() == 0) {
return null;
}
schemaEvolutionTypeDescrs = typeDescriptionsFromHiveTypeProperty(columnTypeProperty, dataColumns);
if (schemaEvolutionTypeDescrs.size() != Math.min(dataColumns, schemaEvolutionColumnNames.size())) {
return null;
}
// Find the first virtual column and clip it and the columns after it off.
int virtualColumnClipNum = -1;
int columnNum = 0;
for (String columnName : schemaEvolutionColumnNames) {
if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(columnName)) {
virtualColumnClipNum = columnNum;
break;
}
columnNum++;
}
if (virtualColumnClipNum != -1 && virtualColumnClipNum < dataColumns) {
schemaEvolutionColumnNames = Lists.newArrayList(schemaEvolutionColumnNames.subList(0, virtualColumnClipNum));
schemaEvolutionTypeDescrs = Lists.newArrayList(schemaEvolutionTypeDescrs.subList(0, virtualColumnClipNum));
}
if (LOG.isInfoEnabled()) {
LOG.info("Using column configuration variables columns " + schemaEvolutionColumnNames.toString() + " / columns.types " + schemaEvolutionTypeDescrs.toString() + " (isAcidRead " + isAcidRead + ")");
}
}
// Desired schema does not include virtual columns or partition columns.
TypeDescription result = TypeDescription.createStruct();
for (int i = 0; i < schemaEvolutionTypeDescrs.size(); i++) {
result.addField(schemaEvolutionColumnNames.get(i), schemaEvolutionTypeDescrs.get(i));
}
return result;
}
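A hedged sketch of driving this method through the regular column properties branch; the column names and types are made up for illustration (serdeConstants.LIST_COLUMNS and LIST_COLUMN_TYPES resolve to "columns" and "columns.types").

// Illustrative only: exercise the "regular properties" branch of getDesiredRowTypeDescr.
Configuration conf = new Configuration();
conf.set(serdeConstants.LIST_COLUMNS, "id,name,ts");                // made-up column names
conf.set(serdeConstants.LIST_COLUMN_TYPES, "int,string,timestamp"); // matching Hive type names
TypeDescription desired = OrcInputFormat.getDesiredRowTypeDescr(conf, false, Integer.MAX_VALUE);
// Expected (author's reading of the code above): struct<id:int,name:string,ts:timestamp>,
// since no schema.evolution.* properties are set and none of the names are virtual columns.
// With isAcidRead == true and no schema.evolution.* properties, the method throws instead.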