
Example 81 with StructTypeInfo

use of org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo in project hive by apache.

the class VectorizedParquetRecordReader method buildVectorizedParquetReader.

// Build VectorizedParquetColumnReader via Hive typeInfo and Parquet schema
private VectorizedColumnReader buildVectorizedParquetReader(TypeInfo typeInfo, Type type, PageReadStore pages,
        List<ColumnDescriptor> columnDescriptors, boolean skipTimestampConversion, ZoneId writerTimezone,
        boolean skipProlepticConversion, boolean legacyConversionEnabled, int depth) throws IOException {
    List<ColumnDescriptor> descriptors = getAllColumnDescriptorByType(depth, type, columnDescriptors);
    switch(typeInfo.getCategory()) {
        case PRIMITIVE:
            if (columnDescriptors == null || columnDescriptors.isEmpty()) {
                throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
            }
            if (fileSchema.getColumns().contains(descriptors.get(0))) {
                return new VectorizedPrimitiveColumnReader(descriptors.get(0),
                        pages.getPageReader(descriptors.get(0)), skipTimestampConversion, writerTimezone,
                        skipProlepticConversion, legacyConversionEnabled, type, typeInfo);
            } else {
                // Support for schema evolution
                return new VectorizedDummyColumnReader();
            }
        case STRUCT:
            StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
            List<VectorizedColumnReader> fieldReaders = new ArrayList<>();
            List<TypeInfo> fieldTypes = structTypeInfo.getAllStructFieldTypeInfos();
            List<Type> types = type.asGroupType().getFields();
            for (int i = 0; i < fieldTypes.size(); i++) {
                VectorizedColumnReader r = buildVectorizedParquetReader(fieldTypes.get(i), types.get(i),
                        pages, descriptors, skipTimestampConversion, writerTimezone, skipProlepticConversion,
                        legacyConversionEnabled, depth + 1);
                if (r != null) {
                    fieldReaders.add(r);
                } else {
                    throw new RuntimeException("Fail to build Parquet vectorized reader based on Hive type " + fieldTypes.get(i).getTypeName() + " and Parquet type" + types.get(i).toString());
                }
            }
            return new VectorizedStructColumnReader(fieldReaders);
        case LIST:
            checkListColumnSupport(((ListTypeInfo) typeInfo).getListElementTypeInfo());
            if (columnDescriptors == null || columnDescriptors.isEmpty()) {
                throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
            }
            return new VectorizedListColumnReader(descriptors.get(0), pages.getPageReader(descriptors.get(0)),
                    skipTimestampConversion, writerTimezone, skipProlepticConversion, legacyConversionEnabled,
                    getElementType(type), typeInfo);
        case MAP:
            if (columnDescriptors == null || columnDescriptors.isEmpty()) {
                throw new RuntimeException("Failed to find related Parquet column descriptor with type " + type);
            }
            // Handle the different Map definitions found in Parquet, e.g.:
            // definition with 1 group:
            //   repeated group map (MAP_KEY_VALUE)
            //     {required binary key (UTF8); optional binary value (UTF8);}
            // definition with 2 groups:
            //   optional group m1 (MAP) {
            //     repeated group map (MAP_KEY_VALUE)
            //       {required binary key (UTF8); optional binary value (UTF8);}
            //   }
            int nestGroup = 0;
            GroupType groupType = type.asGroupType();
            // If the group already has two fields, they are the key and value types;
            // otherwise, keep descending into the group type until MAP_DEFINITION_LEVEL_MAX is exceeded.
            while (groupType.getFieldCount() < 2) {
                if (nestGroup > MAP_DEFINITION_LEVEL_MAX) {
                    throw new RuntimeException("More than " + MAP_DEFINITION_LEVEL_MAX + " level is found in Map definition, " + "Failed to get the field types for Map with type " + type);
                }
                groupType = groupType.getFields().get(0).asGroupType();
                nestGroup++;
            }
            List<Type> kvTypes = groupType.getFields();
            VectorizedListColumnReader keyListColumnReader = new VectorizedListColumnReader(
                    descriptors.get(0), pages.getPageReader(descriptors.get(0)), skipTimestampConversion,
                    writerTimezone, skipProlepticConversion, legacyConversionEnabled, kvTypes.get(0), typeInfo);
            VectorizedListColumnReader valueListColumnReader = new VectorizedListColumnReader(
                    descriptors.get(1), pages.getPageReader(descriptors.get(1)), skipTimestampConversion,
                    writerTimezone, skipProlepticConversion, legacyConversionEnabled, kvTypes.get(1), typeInfo);
            return new VectorizedMapColumnReader(keyListColumnReader, valueListColumnReader);
        case UNION:
        default:
            throw new RuntimeException("Unsupported category " + typeInfo.getCategory().name());
    }
}
Also used : ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ArrayList(java.util.ArrayList) StructTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo) PrimitiveTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo) ListTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) PrimitiveType(org.apache.parquet.schema.PrimitiveType) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) ParquetRuntimeException(org.apache.parquet.ParquetRuntimeException)
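
The STRUCT branch above pairs each entry of structTypeInfo.getAllStructFieldTypeInfos() with the corresponding field of the Parquet group type and recurses once per field. A minimal, self-contained sketch of that traversal pattern, using a hypothetical struct<id:int,name:string> built with TypeInfoFactory (the field names and types are illustrative, not taken from the Hive code above):

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class StructTypeInfoWalk {
    public static void main(String[] args) {
        // Hypothetical struct<id:int,name:string>; field names and types are illustrative.
        StructTypeInfo structType = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(
                Arrays.asList("id", "name"),
                Arrays.<TypeInfo>asList(TypeInfoFactory.intTypeInfo, TypeInfoFactory.stringTypeInfo));

        // Same iteration pattern as the STRUCT case above: one child reader per struct field.
        List<String> names = structType.getAllStructFieldNames();
        List<TypeInfo> types = structType.getAllStructFieldTypeInfos();
        for (int i = 0; i < types.size(); i++) {
            System.out.println(names.get(i) + " -> " + types.get(i).getTypeName());
        }
    }
}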

Example 82 with StructTypeInfo

use of org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo in project hive by apache.

the class JsonSerDe method initialize.

/**
 * Initialize the SerDe.
 *
 * @param conf System properties; may be null at compile time
 * @param tbl Table properties
 * @param writeablePrimitivesDeserialize true if deserialized primitives should be Hadoop Writable objects
 */
private void initialize(final Configuration conf, final Properties tbl, final boolean writeablePrimitivesDeserialize) {
    log.debug("Initializing JsonSerDe: {}", tbl.entrySet());
    final String nullEmpty = tbl.getProperty(NULL_EMPTY_LINES, "false");
    this.nullEmptyLines = Boolean.parseBoolean(nullEmpty);
    this.rowTypeInfo = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(getColumnNames(), getColumnTypes());
    this.soi = (StructObjectInspector) TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(this.rowTypeInfo);
    final TimestampParser tsParser;
    final String parserFormats = tbl.getProperty(serdeConstants.TIMESTAMP_FORMATS);
    if (parserFormats != null) {
        tsParser = new TimestampParser(HiveStringUtils.splitAndUnEscape(parserFormats));
    } else {
        tsParser = new TimestampParser();
    }
    final String binaryEncodingStr = tbl.getProperty(BINARY_FORMAT, "base64");
    this.binaryEncoding = BinaryEncoding.valueOf(binaryEncodingStr.toUpperCase());
    this.jsonReader = new HiveJsonReader(this.soi, tsParser);
    this.jsonWriter = new HiveJsonWriter(this.binaryEncoding, getColumnNames());
    this.jsonReader.setBinaryEncoding(binaryEncoding);
    this.jsonReader.enable(HiveJsonReader.Feature.COL_INDEX_PARSING);
    if (writeablePrimitivesDeserialize) {
        this.jsonReader.enable(HiveJsonReader.Feature.PRIMITIVE_TO_WRITABLE);
    }
    final String ignoreExtras = tbl.getProperty(IGNORE_EXTRA, "true");
    if (Boolean.parseBoolean(ignoreExtras)) {
        this.jsonReader.enable(HiveJsonReader.Feature.IGNORE_UNKNOWN_FIELDS);
    }
    final String stringifyComplex = tbl.getProperty(STRINGIFY_COMPLEX, "true");
    if (Boolean.parseBoolean(stringifyComplex)) {
        this.jsonReader.enable(HiveJsonReader.Feature.STRINGIFY_COMPLEX_FIELDS);
    }
    log.debug("Initialized SerDe {}", this);
    log.debug("JSON Struct Reader: {}", jsonReader);
    log.debug("JSON Struct Writer: {}", jsonWriter);
}
Also used : HiveJsonReader(org.apache.hadoop.hive.serde2.json.HiveJsonReader) TimestampParser(org.apache.hive.common.util.TimestampParser) HiveJsonWriter(org.apache.hadoop.hive.serde2.json.HiveJsonWriter)
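
The core of the initialization is the two-step conversion from column names and types to a StructTypeInfo and then to a writable StructObjectInspector; getColumnNames() and getColumnTypes() in the SerDe supply the real values. A small sketch of those two calls in isolation, with a hypothetical ts/payload schema parsed from a type string:

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

public class RowInspectorSketch {
    public static void main(String[] args) {
        // Hypothetical schema: columns "ts" and "payload"; the type string is parsed the same way
        // Hive parses the columns.types table property.
        List<String> columnNames = Arrays.asList("ts", "payload");
        List<TypeInfo> columnTypes = TypeInfoUtils.getTypeInfosFromTypeString("timestamp,string");

        // Same two steps as JsonSerDe.initialize: struct type info first, then the writable inspector.
        StructTypeInfo rowTypeInfo =
                (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes);
        StructObjectInspector soi = (StructObjectInspector)
                TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(rowTypeInfo);

        System.out.println(rowTypeInfo.getTypeName());           // struct<ts:timestamp,payload:string>
        System.out.println(soi.getAllStructFieldRefs().size());  // 2
    }
}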

Example 83 with StructTypeInfo

use of org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo in project hive by apache.

the class ConstantVectorExpression method setStructValue.

public void setStructValue(Object structValue) throws HiveException {
    StructTypeInfo structTypeInfo = (StructTypeInfo) outputTypeInfo;
    List<TypeInfo> fieldTypeInfoList = structTypeInfo.getAllStructFieldTypeInfos();
    final int size = fieldTypeInfoList.size();
    this.structValue = new ConstantVectorExpression[size];
    List<Object> fieldValueList = (List<Object>) structValue;
    for (int i = 0; i < size; i++) {
        this.structValue[i] = create(i, fieldValueList.get(i), fieldTypeInfoList.get(i));
    }
}
Also used : StructTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo) ArrayList(java.util.ArrayList) List(java.util.List) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) PrimitiveTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo)
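
setStructValue assumes outputTypeInfo is a StructTypeInfo and that structValue is a List whose elements line up positionally with getAllStructFieldTypeInfos(). A hedged usage sketch built around the static create(...) factory that the method itself calls per field; the struct<flag:boolean,score:double> constant and the output column number 0 are purely illustrative:

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.vector.expressions.ConstantVectorExpression;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class StructConstantSketch {
    public static void main(String[] args) throws HiveException {
        // Hypothetical constant of type struct<flag:boolean,score:double>.
        StructTypeInfo structType = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(
                Arrays.asList("flag", "score"),
                Arrays.<TypeInfo>asList(TypeInfoFactory.booleanTypeInfo, TypeInfoFactory.doubleTypeInfo));

        // The field values must line up positionally with getAllStructFieldTypeInfos().
        List<Object> fieldValues = Arrays.<Object>asList(Boolean.TRUE, 0.75d);

        // create(...) is the same static factory setStructValue uses for each field;
        // for a STRUCT-typed constant it delegates back to setStructValue.
        ConstantVectorExpression expr = ConstantVectorExpression.create(0, fieldValues, structType);
        System.out.println(expr.getOutputTypeInfo().getTypeName());
    }
}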

Example 84 with StructTypeInfo

use of org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo in project hive by apache.

the class TestNewInputOutputFormat method testNewOutputFormat.

@Test
// Test the regular output format
public void testNewOutputFormat() throws Exception {
    int rownum = 1000;
    Path inputPath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".txt");
    Path outputPath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".orc");
    localFs.delete(outputPath, true);
    PrintWriter pw = new PrintWriter(new OutputStreamWriter(localFs.create(inputPath)));
    Random r = new Random(1000L);
    boolean firstRow = true;
    int firstIntValue = 0;
    String firstStringValue = null;
    for (int i = 0; i < rownum; i++) {
        int intValue = r.nextInt();
        String stringValue = UUID.randomUUID().toString();
        if (firstRow) {
            firstRow = false;
            firstIntValue = intValue;
            firstStringValue = stringValue;
        }
        pw.println(intValue + "," + stringValue);
    }
    pw.close();
    Job job = new Job(conf, "orc test");
    job.setOutputFormatClass(OrcNewOutputFormat.class);
    job.setJarByClass(TestNewInputOutputFormat.class);
    job.setMapperClass(OrcTestMapper2.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Writable.class);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    boolean result = job.waitForCompletion(true);
    assertTrue(result);
    Path outputFilePath = new Path(outputPath, "part-m-00000");
    assertTrue(localFs.exists(outputFilePath));
    Reader reader = OrcFile.createReader(outputFilePath, OrcFile.readerOptions(conf).filesystem(localFs));
    assertTrue(reader.getNumberOfRows() == rownum);
    assertEquals(reader.getCompression(), CompressionKind.ZLIB);
    StructObjectInspector soi = (StructObjectInspector) reader.getObjectInspector();
    StructTypeInfo ti = (StructTypeInfo) TypeInfoUtils.getTypeInfoFromObjectInspector(soi);
    assertEquals(((PrimitiveTypeInfo) ti.getAllStructFieldTypeInfos().get(0)).getPrimitiveCategory(), PrimitiveObjectInspector.PrimitiveCategory.INT);
    assertEquals(((PrimitiveTypeInfo) ti.getAllStructFieldTypeInfos().get(1)).getPrimitiveCategory(), PrimitiveObjectInspector.PrimitiveCategory.STRING);
    RecordReader rows = reader.rows();
    Object row = rows.next(null);
    IntWritable intWritable = (IntWritable) soi.getStructFieldData(row, soi.getAllStructFieldRefs().get(0));
    Text text = (Text) soi.getStructFieldData(row, soi.getAllStructFieldRefs().get(1));
    assertEquals(intWritable.get(), firstIntValue);
    assertEquals(text.toString(), firstStringValue);
    rows.close();
    localFs.delete(outputPath, true);
}
Also used : Path(org.apache.hadoop.fs.Path) InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) StructTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo) Text(org.apache.hadoop.io.Text) Random(java.util.Random) OutputStreamWriter(java.io.OutputStreamWriter) Job(org.apache.hadoop.mapreduce.Job) IntWritable(org.apache.hadoop.io.IntWritable) PrintWriter(java.io.PrintWriter) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Test(org.junit.Test)
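
The schema assertions near the end rely on TypeInfoUtils.getTypeInfoFromObjectInspector turning the ORC reader's StructObjectInspector back into a StructTypeInfo. A self-contained sketch of that round trip without an ORC file, using the same int/string row shape the test writes (field names are illustrative):

import java.util.Arrays;

import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

public class InspectorRoundTrip {
    public static void main(String[] args) {
        // Build an inspector for struct<i:int,s:string>, the same row shape the test writes out.
        StructTypeInfo written = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(
                Arrays.asList("i", "s"),
                Arrays.<TypeInfo>asList(TypeInfoFactory.intTypeInfo, TypeInfoFactory.stringTypeInfo));
        ObjectInspector oi = TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(written);

        // Recover the StructTypeInfo from the inspector, as the test does with the ORC reader's inspector.
        StructTypeInfo recovered = (StructTypeInfo) TypeInfoUtils.getTypeInfoFromObjectInspector(oi);
        PrimitiveTypeInfo first = (PrimitiveTypeInfo) recovered.getAllStructFieldTypeInfos().get(0);
        System.out.println(first.getPrimitiveCategory());  // INT
    }
}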

Example 85 with StructTypeInfo

use of org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo in project hive by apache.

the class VectorizedColumnReaderTestBase method createStructObjectInspector.

private static StructObjectInspector createStructObjectInspector(Configuration conf) {
    // Create row related objects
    String columnNames = conf.get(IOConstants.COLUMNS);
    List<String> columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
    String columnTypes = conf.get(IOConstants.COLUMNS_TYPES);
    List<TypeInfo> columnTypesList = DataWritableReadSupport.getColumnTypes(columnTypes);
    TypeInfo rowTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNamesList, columnTypesList);
    return new ArrayWritableObjectInspector((StructTypeInfo) rowTypeInfo);
}
Also used : ArrayWritableObjectInspector(org.apache.hadoop.hive.ql.io.parquet.serde.ArrayWritableObjectInspector) StructTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo)
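
The helper reads the column names and types from the IOConstants.COLUMNS and IOConstants.COLUMNS_TYPES job properties. A hedged sketch of how a caller might populate those properties and build the same ArrayWritableObjectInspector; the id/name schema is hypothetical:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport;
import org.apache.hadoop.hive.ql.io.parquet.serde.ArrayWritableObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class ParquetInspectorSketch {
    public static void main(String[] args) {
        // Hypothetical two-column schema, passed the way Hive ships it through the job configuration.
        Configuration conf = new Configuration();
        conf.set(IOConstants.COLUMNS, "id,name");
        conf.set(IOConstants.COLUMNS_TYPES, "int,string");

        // Same steps as createStructObjectInspector above.
        StructTypeInfo rowTypeInfo = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(
                DataWritableReadSupport.getColumnNames(conf.get(IOConstants.COLUMNS)),
                DataWritableReadSupport.getColumnTypes(conf.get(IOConstants.COLUMNS_TYPES)));
        ArrayWritableObjectInspector inspector = new ArrayWritableObjectInspector(rowTypeInfo);
        System.out.println(inspector.getTypeName());
    }
}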

Aggregations

StructTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo): 100
TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo): 78
ListTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo): 59
MapTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo): 54
PrimitiveTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo): 54
ArrayList (java.util.ArrayList): 42
UnionTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo): 32
DecimalTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo): 30
CharTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo): 24
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 23
List (java.util.List): 21
VarcharTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo): 21
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 17
IntWritable (org.apache.hadoop.io.IntWritable): 12
Text (org.apache.hadoop.io.Text): 12
BytesWritable (org.apache.hadoop.io.BytesWritable): 11
HiveDecimalWritable (org.apache.hadoop.hive.serde2.io.HiveDecimalWritable): 10
Category (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category): 10
PrimitiveObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector): 10
BooleanWritable (org.apache.hadoop.io.BooleanWritable): 10