use of org.apache.parquet.thrift.struct.ThriftType.StructType in project parquet-mr by apache.
the class ThriftSchemaConvertVisitor method visit.
/**
 * Converts a thrift struct into a parquet group, applying the projection to
 * its children in the same pass.
 *
 * <p>Every child is visited with its own {@code State}. A child's converted
 * type is added to the resulting group when the child is kept, when it is a
 * sentinel union whose repetition is {@code REQUIRED}, or when this struct is
 * a union for which one column per member must be retained; in that last case
 * the child is re-visited with a {@code KeepOnlyFirstPrimitiveFilter}.
 *
 * @param structType the thrift struct (or union) being converted
 * @param state path, repetition and name under which the struct is converted
 * @return a {@code Keep} if at least one child was kept, a
 *         {@code SentinelUnion} if only sentinel columns were collected,
 *         otherwise a {@code Drop}
 */
@Override
public ConvertedField visit(StructType structType, State state) {
  // Conversion and projection happen together, so unions get special
  // treatment: each member may need a placeholder column even when the
  // projection did not select it.
  final boolean mustKeepOneOfEachUnion =
      keepOneOfEachUnion && isUnion(structType.getStructOrUnionType());

  boolean sawSentinelColumn = false;
  boolean sawSelectedColumn = false;
  List<Type> childTypes = new ArrayList<Type>();

  for (ThriftField field : structType.getChildren()) {
    State fieldState =
        new State(state.path.push(field), getRepetition(field), field.getName());
    ConvertedField result = field.getType().accept(this, fieldState);

    if (!result.isKeep() && mustKeepOneOfEachUnion) {
      // This union member was not selected, yet one primitive of it has to
      // survive so the "kind" of union stored in each record stays known.
      // TODO: in the future, we should just filter these records out instead.
      // Run the conversion again with a filter that keeps only the first
      // primitive it encounters.
      ConvertedField firstPrimitive =
          field.getType().accept(
              new ThriftSchemaConvertVisitor(
                  new KeepOnlyFirstPrimitiveFilter(), true, keepOneOfEachUnion),
              fieldState);
      childTypes.add(firstPrimitive.asKeep().getType().withId(field.getFieldId()));
      sawSentinelColumn = true;
    }

    if (!result.isSentinelUnion()) {
      if (result.isKeep()) {
        // The projection selected this column, so it stays.
        childTypes.add(result.asKeep().getType().withId(field.getFieldId()));
        sawSelectedColumn = true;
      }
    } else if (fieldState.repetition == REQUIRED) {
      // A sentinel union would ideally be dropped, but a required field
      // may still be needed, so its type is retained.
      childTypes.add(result.asSentinelUnion().getType().withId(field.getFieldId()));
      sawSentinelColumn = true;
    }
  }

  if (sawSelectedColumn) {
    // At least one field of this struct was requested: keep the struct.
    return new Keep(state.path, new GroupType(state.repetition, state.name, childTypes));
  }
  if (sawSentinelColumn) {
    // Only sentinel columns were collected; tag the struct as a sentinel so
    // the caller can decide whether it can be dropped.
    return new SentinelUnion(
        state.path, new GroupType(state.repetition, state.name, childTypes));
  }
  // None of the fields of this struct were requested: drop it.
  return new Drop(state.path);
}
use of org.apache.parquet.thrift.struct.ThriftType.StructType in project parquet-mr by apache.
the class ThriftMetaData method fromExtraMetaData.
/**
 * Builds a ThriftMetaData from the key/value metadata of a parquet file footer.
 *
 * <p>Both the {@code THRIFT_CLASS} and the {@code THRIFT_DESCRIPTOR} entries
 * are looked up; the descriptor string is parsed with {@code parseDescriptor}.
 *
 * @param extraMetaData extraMetaData field of the parquet footer
 * @return the ThriftMetaData used to write a data file, or {@code null} when
 *         either of the two entries is absent
 */
public static ThriftMetaData fromExtraMetaData(Map<String, String> extraMetaData) {
  final String className = extraMetaData.get(THRIFT_CLASS);
  final String descriptorText = extraMetaData.get(THRIFT_DESCRIPTOR);

  final boolean hasThriftInfo = className != null && descriptorText != null;
  if (!hasThriftInfo) {
    // Without both entries there is nothing to build the metadata from.
    return null;
  }

  return new ThriftMetaData(className, parseDescriptor(descriptorText));
}
use of org.apache.parquet.thrift.struct.ThriftType.StructType in project parquet-mr by apache.
the class TestThriftToPigCompatibility method validateSameTupleAsEB.
/**
 * Verifies that the thrift write path combined with the pig read path yields
 * the same tuple as a direct thrift-to-pig conversion.
 *
 * <p>Steps:
 * <ul>
 *   <li>Writes the object using the thrift mapping</li>
 *   <li>Reads it back using the pig mapping</li>
 *   <li>Uses Elephant Bird to convert the object from thrift to pig</li>
 *   <li>Checks that both transformations give the same result</li>
 *   <li>Checks that filtering the schema by the pig schema leaves it unchanged</li>
 * </ul>
 *
 * @param o the object to convert
 * @throws TException declared by the thrift API; presumably raised while
 *         writing {@code o} to the protocol
 */
public static <T extends TBase<?, ?>> void validateSameTupleAsEB(T o) throws TException {
  @SuppressWarnings("unchecked")
  final Class<T> thriftClass = (Class<T>) o.getClass();

  // Derive the parquet schema, the thrift struct description and the pig
  // schema from the same thrift class.
  final MessageType parquetSchema = new ThriftSchemaConverter().convert(thriftClass);
  final StructType thriftStruct = ThriftSchemaConverter.toStructType(thriftClass);
  final ThriftToPig<T> pigConverter = new ThriftToPig<T>(thriftClass);
  final Schema pigSchema = pigConverter.toSchema();

  // Wire the write protocol so that the records it emits are materialized
  // as pig tuples.
  final TupleRecordMaterializer materializer =
      new TupleRecordMaterializer(parquetSchema, pigSchema, true);
  RecordConsumer tupleConsumer =
      new ConverterConsumer(materializer.getRootConverter(), parquetSchema);
  final MessageColumnIO messageColumnIO = new ColumnIOFactory().getColumnIO(parquetSchema);
  ParquetWriteProtocol writeProtocol =
      new ParquetWriteProtocol(
          new RecordConsumerLoggingWrapper(tupleConsumer), messageColumnIO, thriftStruct);
  o.write(writeProtocol);

  // Compare the materialized tuple with the direct conversion.
  final Tuple actualTuple = materializer.getCurrentRecord();
  final Tuple expectedTuple = pigConverter.getPigTuple(o);
  assertEquals(expectedTuple.toString(), actualTuple.toString());

  // Filtering the parquet schema by the pig schema must be a no-op here.
  final MessageType filteredSchema = new PigSchemaConverter().filter(parquetSchema, pigSchema);
  assertEquals(parquetSchema.toString(), filteredSchema.toString());
}
use of org.apache.parquet.thrift.struct.ThriftType.StructType in project parquet-mr by apache.
the class TestThriftSchemaConverter method testToThriftType.
/**
 * Checks that the StructType derived from {@code AddressBook} survives a JSON
 * round trip: serializing, parsing and serializing again yields the same JSON.
 */
@Test
public void testToThriftType() throws Exception {
  final String originalJson = ThriftSchemaConverter.toStructType(AddressBook.class).toJSON();
  final ThriftType reparsed = StructType.fromJSON(originalJson);
  assertEquals(originalJson, reparsed.toJSON());
}
use of org.apache.parquet.thrift.struct.ThriftType.StructType in project parquet-mr by apache.
the class TestThriftType method testParseUnionInfo.
/**
 * Checks how the {@code structOrUnionType} attribute of a JSON struct
 * descriptor is parsed: explicit {@code UNION}, {@code STRUCT} and
 * {@code UNKNOWN} values are preserved, and a descriptor without the
 * attribute is reported as {@code UNKNOWN}.
 */
@Test
public void testParseUnionInfo() throws Exception {
  // assertEquals takes (expected, actual); keeping that order makes a
  // failure message report the right value as "expected".
  StructType st = (StructType) StructType.fromJSON("{\"id\": \"STRUCT\", \"children\":[], \"structOrUnionType\": \"UNION\"}");
  assertEquals(StructOrUnionType.UNION, st.getStructOrUnionType());

  st = (StructType) StructType.fromJSON("{\"id\": \"STRUCT\", \"children\":[], \"structOrUnionType\": \"STRUCT\"}");
  assertEquals(StructOrUnionType.STRUCT, st.getStructOrUnionType());

  // No structOrUnionType attribute at all.
  st = (StructType) StructType.fromJSON("{\"id\": \"STRUCT\", \"children\":[]}");
  assertEquals(StructOrUnionType.UNKNOWN, st.getStructOrUnionType());

  st = (StructType) StructType.fromJSON("{\"id\": \"STRUCT\", \"children\":[], \"structOrUnionType\": \"UNKNOWN\"}");
  assertEquals(StructOrUnionType.UNKNOWN, st.getStructOrUnionType());
}
Aggregations