Use of org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo in project hive by apache.
The class VectorizedParquetRecordReader, method buildVectorizedParquetReader.
// Build a VectorizedColumnReader from the Hive TypeInfo and the Parquet schema.
private VectorizedColumnReader buildVectorizedParquetReader(TypeInfo typeInfo, Type type, PageReadStore pages,
    List<ColumnDescriptor> columnDescriptors, boolean skipTimestampConversion, ZoneId writerTimezone,
    boolean skipProlepticConversion, boolean legacyConversionEnabled, int depth) throws IOException {
  List<ColumnDescriptor> descriptors = getAllColumnDescriptorByType(depth, type, columnDescriptors);
  switch (typeInfo.getCategory()) {
    case PRIMITIVE:
      if (columnDescriptors == null || columnDescriptors.isEmpty()) {
        throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
      }
      if (fileSchema.getColumns().contains(descriptors.get(0))) {
        return new VectorizedPrimitiveColumnReader(descriptors.get(0), pages.getPageReader(descriptors.get(0)),
            skipTimestampConversion, writerTimezone, skipProlepticConversion, legacyConversionEnabled, type, typeInfo);
      } else {
        // Support for schema evolution
        return new VectorizedDummyColumnReader();
      }
    case STRUCT:
      StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
      List<VectorizedColumnReader> fieldReaders = new ArrayList<>();
      List<TypeInfo> fieldTypes = structTypeInfo.getAllStructFieldTypeInfos();
      List<Type> types = type.asGroupType().getFields();
      for (int i = 0; i < fieldTypes.size(); i++) {
        VectorizedColumnReader r = buildVectorizedParquetReader(fieldTypes.get(i), types.get(i), pages, descriptors,
            skipTimestampConversion, writerTimezone, skipProlepticConversion, legacyConversionEnabled, depth + 1);
        if (r != null) {
          fieldReaders.add(r);
        } else {
          throw new RuntimeException("Failed to build Parquet vectorized reader based on Hive type "
              + fieldTypes.get(i).getTypeName() + " and Parquet type " + types.get(i).toString());
        }
      }
      return new VectorizedStructColumnReader(fieldReaders);
    case LIST:
      checkListColumnSupport(((ListTypeInfo) typeInfo).getListElementTypeInfo());
      if (columnDescriptors == null || columnDescriptors.isEmpty()) {
        throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
      }
      return new VectorizedListColumnReader(descriptors.get(0), pages.getPageReader(descriptors.get(0)),
          skipTimestampConversion, writerTimezone, skipProlepticConversion, legacyConversionEnabled,
          getElementType(type), typeInfo);
    case MAP:
      if (columnDescriptors == null || columnDescriptors.isEmpty()) {
        throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
      }
      // Handle the different Map definitions in Parquet, e.g.:
      // definition with 1 group:
      //   repeated group map (MAP_KEY_VALUE)
      //     {required binary key (UTF8); optional binary value (UTF8);}
      // definition with 2 groups:
      //   optional group m1 (MAP) {
      //     repeated group map (MAP_KEY_VALUE)
      //       {required binary key (UTF8); optional binary value (UTF8);}
      //   }
      int nestGroup = 0;
      GroupType groupType = type.asGroupType();
      // Unwrap nested groups until the key/value pair is reached, giving up after MAP_DEFINITION_LEVEL_MAX levels.
      while (groupType.getFieldCount() < 2) {
        if (nestGroup > MAP_DEFINITION_LEVEL_MAX) {
          throw new RuntimeException("More than " + MAP_DEFINITION_LEVEL_MAX
              + " levels found in Map definition; failed to get the field types for Map with type " + type);
        }
        groupType = groupType.getFields().get(0).asGroupType();
        nestGroup++;
      }
      List<Type> kvTypes = groupType.getFields();
      VectorizedListColumnReader keyListColumnReader = new VectorizedListColumnReader(descriptors.get(0),
          pages.getPageReader(descriptors.get(0)), skipTimestampConversion, writerTimezone, skipProlepticConversion,
          legacyConversionEnabled, kvTypes.get(0), typeInfo);
      VectorizedListColumnReader valueListColumnReader = new VectorizedListColumnReader(descriptors.get(1),
          pages.getPageReader(descriptors.get(1)), skipTimestampConversion, writerTimezone, skipProlepticConversion,
          legacyConversionEnabled, kvTypes.get(1), typeInfo);
      return new VectorizedMapColumnReader(keyListColumnReader, valueListColumnReader);
    case UNION:
    default:
      throw new RuntimeException("Unsupported category " + typeInfo.getCategory().name());
  }
}
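The STRUCT branch above mirrors the shape of the StructTypeInfo itself: it builds one child reader per struct field by walking the Hive field types and the Parquet group fields in parallel. The following standalone sketch (hypothetical column names, no Parquet reader involved) shows the two parallel lists that the recursion consumes:

import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class StructTypeInfoWalk {
  public static void main(String[] args) {
    // Hypothetical row schema: struct<id:int,tags:array<string>>
    StructTypeInfo rowType = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(
        Arrays.asList("id", "tags"),
        Arrays.asList(TypeInfoFactory.intTypeInfo,
            TypeInfoFactory.getListTypeInfo(TypeInfoFactory.stringTypeInfo)));
    List<String> fieldNames = rowType.getAllStructFieldNames();
    List<TypeInfo> fieldTypes = rowType.getAllStructFieldTypeInfos();
    // One entry per field, in declaration order; these are the lists the STRUCT
    // case iterates while building one child column reader per field.
    for (int i = 0; i < fieldNames.size(); i++) {
      System.out.println(fieldNames.get(i) + " -> " + fieldTypes.get(i).getTypeName());
    }
  }
}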
Use of org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo in project hive by apache.
The class JsonSerDe, method initialize.
/**
 * Initialize the SerDe.
 *
 * @param conf system properties; may be null at compile time
 * @param tbl the table properties
 * @param writeablePrimitivesDeserialize true if deserialized primitives should be returned as Hadoop Writable objects
 */
private void initialize(final Configuration conf, final Properties tbl,
    final boolean writeablePrimitivesDeserialize) {
  log.debug("Initializing JsonSerDe: {}", tbl.entrySet());
  final String nullEmpty = tbl.getProperty(NULL_EMPTY_LINES, "false");
  this.nullEmptyLines = Boolean.parseBoolean(nullEmpty);
  this.rowTypeInfo = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(getColumnNames(), getColumnTypes());
  this.soi = (StructObjectInspector) TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(this.rowTypeInfo);
  final TimestampParser tsParser;
  final String parserFormats = tbl.getProperty(serdeConstants.TIMESTAMP_FORMATS);
  if (parserFormats != null) {
    tsParser = new TimestampParser(HiveStringUtils.splitAndUnEscape(parserFormats));
  } else {
    tsParser = new TimestampParser();
  }
  final String binaryEncodingStr = tbl.getProperty(BINARY_FORMAT, "base64");
  this.binaryEncoding = BinaryEncoding.valueOf(binaryEncodingStr.toUpperCase());
  this.jsonReader = new HiveJsonReader(this.soi, tsParser);
  this.jsonWriter = new HiveJsonWriter(this.binaryEncoding, getColumnNames());
  this.jsonReader.setBinaryEncoding(binaryEncoding);
  this.jsonReader.enable(HiveJsonReader.Feature.COL_INDEX_PARSING);
  if (writeablePrimitivesDeserialize) {
    this.jsonReader.enable(HiveJsonReader.Feature.PRIMITIVE_TO_WRITABLE);
  }
  final String ignoreExtras = tbl.getProperty(IGNORE_EXTRA, "true");
  if (Boolean.parseBoolean(ignoreExtras)) {
    this.jsonReader.enable(HiveJsonReader.Feature.IGNORE_UNKNOWN_FIELDS);
  }
  final String stringifyComplex = tbl.getProperty(STRINGIFY_COMPLEX, "true");
  if (Boolean.parseBoolean(stringifyComplex)) {
    this.jsonReader.enable(HiveJsonReader.Feature.STRINGIFY_COMPLEX_FIELDS);
  }
  log.debug("Initialized SerDe {}", this);
  log.debug("JSON Struct Reader: {}", jsonReader);
  log.debug("JSON Struct Writer: {}", jsonWriter);
}
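The rowTypeInfo and soi fields above are built from getColumnNames() and getColumnTypes(), which the SerDe base class derives from the table properties. As a minimal sketch, assuming hypothetical column properties rather than the SerDe's own accessors, the same StructTypeInfo and writable inspector can be produced directly:

import java.util.Arrays;
import java.util.Properties;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

public class RowTypeFromTableProps {
  public static void main(String[] args) {
    // Hypothetical table properties; in a real query these come from the metastore.
    Properties tbl = new Properties();
    tbl.setProperty(serdeConstants.LIST_COLUMNS, "id,name,ts");
    tbl.setProperty(serdeConstants.LIST_COLUMN_TYPES, "int:string:timestamp");

    StructTypeInfo rowTypeInfo = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(
        Arrays.asList(tbl.getProperty(serdeConstants.LIST_COLUMNS).split(",")),
        TypeInfoUtils.getTypeInfosFromTypeString(tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES)));
    StructObjectInspector soi = (StructObjectInspector)
        TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(rowTypeInfo);

    // Prints struct<id:int,name:string,ts:timestamp> and the three field refs backing it.
    System.out.println(rowTypeInfo.getTypeName());
    System.out.println(soi.getAllStructFieldRefs().size());
  }
}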
Use of org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo in project hive by apache.
The class ConstantVectorExpression, method setStructValue.
public void setStructValue(Object structValue) throws HiveException {
  StructTypeInfo structTypeInfo = (StructTypeInfo) outputTypeInfo;
  List<TypeInfo> fieldTypeInfoList = structTypeInfo.getAllStructFieldTypeInfos();
  final int size = fieldTypeInfoList.size();
  this.structValue = new ConstantVectorExpression[size];
  List<Object> fieldValueList = (List<Object>) structValue;
  for (int i = 0; i < size; i++) {
    this.structValue[i] = create(i, fieldValueList.get(i), fieldTypeInfoList.get(i));
  }
}
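The loop relies on a purely positional contract: element i of the incoming value list is wrapped in a child ConstantVectorExpression typed by field i of the StructTypeInfo. A small illustrative sketch of that pairing, using a hypothetical struct type and values (the create(...) call itself is omitted):

import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

public class StructConstantShape {
  public static void main(String[] args) {
    // Hypothetical constant of type struct<amount:double,currency:string>
    StructTypeInfo structType = (StructTypeInfo)
        TypeInfoUtils.getTypeInfoFromTypeString("struct<amount:double,currency:string>");
    List<Object> fieldValues = Arrays.<Object>asList(9.99d, "EUR");

    List<TypeInfo> fieldTypes = structType.getAllStructFieldTypeInfos();
    // Positional pairing: value i goes with field type i, exactly as setStructValue assumes.
    for (int i = 0; i < fieldTypes.size(); i++) {
      System.out.println(fieldTypes.get(i).getTypeName() + " <- " + fieldValues.get(i));
    }
  }
}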
Use of org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo in project hive by apache.
The class TestNewInputOutputFormat, method testNewOutputFormat.
// Test regular outputformat
@Test
public void testNewOutputFormat() throws Exception {
  int rownum = 1000;
  Path inputPath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".txt");
  Path outputPath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".orc");
  localFs.delete(outputPath, true);
  PrintWriter pw = new PrintWriter(new OutputStreamWriter(localFs.create(inputPath)));
  Random r = new Random(1000L);
  boolean firstRow = true;
  int firstIntValue = 0;
  String firstStringValue = null;
  for (int i = 0; i < rownum; i++) {
    int intValue = r.nextInt();
    String stringValue = UUID.randomUUID().toString();
    if (firstRow) {
      firstRow = false;
      firstIntValue = intValue;
      firstStringValue = stringValue;
    }
    pw.println(intValue + "," + stringValue);
  }
  pw.close();
  Job job = new Job(conf, "orc test");
  job.setOutputFormatClass(OrcNewOutputFormat.class);
  job.setJarByClass(TestNewInputOutputFormat.class);
  job.setMapperClass(OrcTestMapper2.class);
  job.setNumReduceTasks(0);
  job.setOutputKeyClass(NullWritable.class);
  job.setOutputValueClass(Writable.class);
  FileInputFormat.addInputPath(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);
  boolean result = job.waitForCompletion(true);
  assertTrue(result);
  Path outputFilePath = new Path(outputPath, "part-m-00000");
  assertTrue(localFs.exists(outputFilePath));
  Reader reader = OrcFile.createReader(outputFilePath, OrcFile.readerOptions(conf).filesystem(localFs));
  assertTrue(reader.getNumberOfRows() == rownum);
  assertEquals(reader.getCompression(), CompressionKind.ZLIB);
  StructObjectInspector soi = (StructObjectInspector) reader.getObjectInspector();
  StructTypeInfo ti = (StructTypeInfo) TypeInfoUtils.getTypeInfoFromObjectInspector(soi);
  assertEquals(((PrimitiveTypeInfo) ti.getAllStructFieldTypeInfos().get(0)).getPrimitiveCategory(),
      PrimitiveObjectInspector.PrimitiveCategory.INT);
  assertEquals(((PrimitiveTypeInfo) ti.getAllStructFieldTypeInfos().get(1)).getPrimitiveCategory(),
      PrimitiveObjectInspector.PrimitiveCategory.STRING);
  RecordReader rows = reader.rows();
  Object row = rows.next(null);
  IntWritable intWritable = (IntWritable) soi.getStructFieldData(row, soi.getAllStructFieldRefs().get(0));
  Text text = (Text) soi.getStructFieldData(row, soi.getAllStructFieldRefs().get(1));
  assertEquals(intWritable.get(), firstIntValue);
  assertEquals(text.toString(), firstStringValue);
  rows.close();
  localFs.delete(outputPath, true);
}
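The second half of the test is essentially a generic "inspect an ORC file through its StructTypeInfo" routine. A helper along these lines could factor that out; this is only a sketch, not part of the test class, and assumes the necessary imports (FileSystem, StructField, etc.) are present:

// Sketch of a reusable helper; not part of TestNewInputOutputFormat.
static void printFirstRow(Configuration conf, FileSystem fs, Path orcFile) throws IOException {
  Reader reader = OrcFile.createReader(orcFile, OrcFile.readerOptions(conf).filesystem(fs));
  StructObjectInspector soi = (StructObjectInspector) reader.getObjectInspector();
  StructTypeInfo rowType = (StructTypeInfo) TypeInfoUtils.getTypeInfoFromObjectInspector(soi);
  System.out.println("Row type: " + rowType.getTypeName());
  RecordReader rows = reader.rows();
  Object row = rows.next(null);
  // Print each top-level field of the first row using its StructField reference.
  for (StructField field : soi.getAllStructFieldRefs()) {
    System.out.println(field.getFieldName() + " = " + soi.getStructFieldData(row, field));
  }
  rows.close();
}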
Use of org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo in project hive by apache.
The class VectorizedColumnReaderTestBase, method createStructObjectInspector.
private static StructObjectInspector createStructObjectInspector(Configuration conf) {
  // Create row related objects
  String columnNames = conf.get(IOConstants.COLUMNS);
  List<String> columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
  String columnTypes = conf.get(IOConstants.COLUMNS_TYPES);
  List<TypeInfo> columnTypesList = DataWritableReadSupport.getColumnTypes(columnTypes);
  TypeInfo rowTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNamesList, columnTypesList);
  return new ArrayWritableObjectInspector((StructTypeInfo) rowTypeInfo);
}
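If this helper were exercised directly, the Configuration would first need the column metadata it reads; a minimal sketch with hypothetical columns:

// Hypothetical settings; IOConstants.COLUMNS and COLUMNS_TYPES carry the Hive column metadata.
Configuration conf = new Configuration();
conf.set(IOConstants.COLUMNS, "id,name");
conf.set(IOConstants.COLUMNS_TYPES, "int:string");
// Backed by struct<id:int,name:string>, so two top-level field refs are exposed.
StructObjectInspector inspector = createStructObjectInspector(conf);
System.out.println(inspector.getAllStructFieldRefs().size());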