
Example 1 with StructType

Use of org.apache.carbondata.core.metadata.datatype.StructType in project carbondata by apache.

From the class CarbonTableOutputFormat, method setFileHeader:

private static void setFileHeader(Configuration configuration, CarbonLoadModel model) throws IOException {
    StructType inputSchema = getInputSchema(configuration);
    if (inputSchema == null || inputSchema.getFields().size() == 0) {
        throw new UnsupportedOperationException("Input schema must be set");
    }
    List<StructField> fields = inputSchema.getFields();
    // Join the field names into a comma-separated CSV header while also collecting them as an array.
    StringBuilder builder = new StringBuilder();
    String[] columns = new String[fields.size()];
    int i = 0;
    for (StructField field : fields) {
        builder.append(field.getFieldName());
        builder.append(",");
        columns[i++] = field.getFieldName();
    }
    String header = builder.toString();
    model.setCsvHeader(header.substring(0, header.length() - 1));
    model.setCsvHeaderColumns(columns);
}
Also used : StructField(org.apache.carbondata.core.metadata.datatype.StructField) StructType(org.apache.carbondata.core.metadata.datatype.StructType)
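
For context, a minimal sketch of the input side. The field names are hypothetical, and it assumes DataTypes.createStructType returns a StructType (as DataTypes.createMapType returns a MapType in Example 5):

List<StructField> inputFields = new ArrayList<>();
inputFields.add(new StructField("id", DataTypes.INT));
inputFields.add(new StructField("name", DataTypes.STRING));
StructType inputSchema = DataTypes.createStructType(inputFields);
// With this schema, setFileHeader would set the CSV header to "id,name"
// and the header columns to ["id", "name"].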

Example 2 with StructType

Use of org.apache.carbondata.core.metadata.datatype.StructType in project carbondata by apache.

From the class CSVCarbonWriterTest, test method testWritingAndReadingArrayStruct:

@Test
public void testWritingAndReadingArrayStruct() throws IOException {
    String path = "./testWriteFilesArrayStruct";
    FileUtils.deleteDirectory(new File(path));
    Field[] fields = new Field[4];
    fields[0] = new Field("id", DataTypes.STRING);
    fields[1] = new Field("source", DataTypes.STRING);
    fields[2] = new Field("usage", DataTypes.STRING);
    List<StructField> structFieldsList = new ArrayList<>();
    structFieldsList.add(new StructField("name", DataTypes.STRING));
    structFieldsList.add(new StructField("type", DataTypes.STRING));
    structFieldsList.add(new StructField("creation-time", DataTypes.STRING));
    structFieldsList.add(new StructField("property", DataTypes.STRING));
    StructField structTypeByList = new StructField("annotation", DataTypes.createStructType(structFieldsList), structFieldsList);
    List<StructField> list = new ArrayList<>();
    list.add(structTypeByList);
    Field arrayType = new Field("annotations", "array", list);
    fields[3] = arrayType;
    try {
        CarbonWriterBuilder builder = CarbonWriter.builder().taskNo(5).outputPath(path);
        CarbonWriter writer = builder.withCsvInput(new Schema(fields)).writtenBy("CSVCarbonWriterTest").build();
        for (int i = 0; i < 15; i++) {
            String[] row = new String[] { "robot" + (i % 10), String.valueOf(i), i + "." + i,
                "sunflowers" + (i % 10) + "\002" + "modelarts/image_classification" + "\002"
                    + "2019-03-30 17:22:31" + "\002"
                    + "{\"@modelarts:start_index\":0,\"@modelarts:end_index\":5}"
                + "\001"
                + "roses" + (i % 10) + "\002" + "modelarts/image_classification" + "\002"
                    + "2019-03-30 17:22:32" + "\002"
                    + "{\"@modelarts:start_index\":0,\"@modelarts:end_index\":5}" };
            writer.write(row);
        }
        writer.close();
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail();
    }
    Schema schema = CarbonSchemaReader.readSchema(path).asOriginOrder();
    assert (4 == schema.getFieldsLength());
    Field[] fields1 = schema.getFields();
    boolean flag = false;
    for (int i = 0; i < fields1.length; i++) {
        if (DataTypes.isArrayType(fields1[i].getDataType())) {
            ArrayType arrayType1 = (ArrayType) fields1[i].getDataType();
            assert ("annotations.annotation".equalsIgnoreCase(arrayType1.getElementName()));
            assert (DataTypes.isStructType(fields1[i].getChildren().get(0).getDataType()));
            assert (4 == (((StructType) fields1[i].getChildren().get(0).getDataType()).getFields()).size());
            flag = true;
        }
    }
    assert (flag);
    // Read again
    CarbonReader reader = null;
    try {
        reader = CarbonReader.builder(path).projection(new String[] { "id", "source", "usage", "annotations" }).build();
        int i = 0;
        while (reader.hasNext()) {
            Object[] row = (Object[]) reader.readNextRow();
            assert (4 == row.length);
            assert (((String) row[0]).contains("robot"));
            int value = Integer.valueOf((String) row[1]);
            Float value2 = Float.valueOf((String) row[2]);
            // 15 rows are written with ids 0..14, so both parsed values must lie in that range
            assert (value > -1 && value < 15);
            assert (value2 > -1 && value2 < 15);
            Object[] annotations = (Object[]) row[3];
            for (int j = 0; j < annotations.length; j++) {
                Object[] annotation = (Object[]) annotations[j];
                assert (((String) annotation[0]).contains("sunflowers") || ((String) annotation[0]).contains("roses"));
                assert (((String) annotation[1]).contains("modelarts/image_classification"));
                assert (((String) annotation[2]).contains("2019-03-30 17:22:3"));
                assert (((String) annotation[3]).contains("{\"@modelarts:start_index\":0,\"@modelarts:end_index\":5}"));
                Object[] annotation1 = readObjects(annotations, j);
                assert (((String) annotation1[0]).contains("sunflowers") || ((String) annotation1[0]).contains("roses"));
                assert (((String) annotation1[1]).contains("modelarts/image_classification"));
                assert (((String) annotation1[2]).contains("2019-03-30 17:22:3"));
                assert (((String) annotation1[3]).contains("{\"@modelarts:start_index\":0,\"@modelarts:end_index\":5}"));
            }
            i++;
        }
        assert (15 == i);
        reader.close();
    } catch (InterruptedException e) {
        e.printStackTrace();
    } finally {
        FileUtils.deleteDirectory(new File(path));
    }
}
Also used : StructType(org.apache.carbondata.core.metadata.datatype.StructType) ColumnSchema(org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema) ArrayList(java.util.ArrayList) IOException(java.io.IOException) InvalidLoadOptionException(org.apache.carbondata.common.exceptions.sql.InvalidLoadOptionException) ArrayType(org.apache.carbondata.core.metadata.datatype.ArrayType) Field(org.apache.carbondata.core.metadata.datatype.Field) StructField(org.apache.carbondata.core.metadata.datatype.StructField) CarbonFile(org.apache.carbondata.core.datastore.filesystem.CarbonFile) File(java.io.File) Test(org.junit.Test)
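
The array-of-struct cell in each row is assembled with the two complex-type delimiters this test relies on: '\001' between array elements and '\002' between the struct members of each element. A minimal sketch of one such cell, with hypothetical values mirroring the row strings above:

String structDelim = "\002";
String arrayDelim = "\001";
String element1 = String.join(structDelim, "sunflowers0", "modelarts/image_classification",
    "2019-03-30 17:22:31", "{\"@modelarts:start_index\":0,\"@modelarts:end_index\":5}");
String element2 = String.join(structDelim, "roses0", "modelarts/image_classification",
    "2019-03-30 17:22:32", "{\"@modelarts:start_index\":0,\"@modelarts:end_index\":5}");
// Two annotation structs joined into one "annotations" array cell.
String annotationsCell = element1 + arrayDelim + element2;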

Example 3 with StructType

Use of org.apache.carbondata.core.metadata.datatype.StructType in project carbondata by apache.

From the class ArrowUtils, method toArrowField:

public static org.apache.arrow.vector.types.pojo.Field toArrowField(String name, DataType dataType, String timeZoneId) {
    if (dataType instanceof ArrayType) {
        // instanceof check used instead of a DataTypes check to satisfy findbugs
        FieldType fieldType = new FieldType(true, ArrowType.List.INSTANCE, null);
        List<org.apache.arrow.vector.types.pojo.Field> structFields = new ArrayList<>();
        DataType elementType = ((ArrayType) dataType).getElementType();
        structFields.add(toArrowField("element", elementType, timeZoneId));
        return new org.apache.arrow.vector.types.pojo.Field(name, fieldType, structFields);
    } else if (dataType instanceof StructType) {
        // instanceof check used instead of a DataTypes check to satisfy findbugs
        StructType dataType1 = (StructType) dataType;
        FieldType fieldType = new FieldType(true, ArrowType.Struct.INSTANCE, null);
        List<StructField> fields = dataType1.getFields();
        List<org.apache.arrow.vector.types.pojo.Field> structFields = new ArrayList<>();
        for (int i = 0; i < fields.size(); i++) {
            structFields.add(toArrowField(fields.get(i).getFieldName(), fields.get(i).getDataType(), timeZoneId));
        }
        return new org.apache.arrow.vector.types.pojo.Field(name, fieldType, structFields);
    } else {
        FieldType fieldType = new FieldType(true, toArrowType(dataType, timeZoneId), null);
        return new org.apache.arrow.vector.types.pojo.Field(name, fieldType, new ArrayList<org.apache.arrow.vector.types.pojo.Field>());
    }
}
Also used : StructType(org.apache.carbondata.core.metadata.datatype.StructType) ArrayList(java.util.ArrayList) FieldType(org.apache.arrow.vector.types.pojo.FieldType) ArrayType(org.apache.carbondata.core.metadata.datatype.ArrayType) StructField(org.apache.carbondata.core.metadata.datatype.StructField) Field(org.apache.carbondata.core.metadata.datatype.Field) DataType(org.apache.carbondata.core.metadata.datatype.DataType) List(java.util.List)
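
A minimal usage sketch for toArrowField with a struct input. The member names and the "UTC" time zone id are hypothetical, and it assumes DataTypes.createStructType returns a StructType as in the other examples on this page:

List<StructField> members = new ArrayList<>();
members.add(new StructField("name", DataTypes.STRING));
members.add(new StructField("age", DataTypes.INT));
StructType person = DataTypes.createStructType(members);
org.apache.arrow.vector.types.pojo.Field arrowField =
    ArrowUtils.toArrowField("person", person, "UTC");
// Expected result: a nullable Arrow Struct field named "person" whose children
// "name" and "age" are converted through the primitive branch above.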

Example 4 with StructType

Use of org.apache.carbondata.core.metadata.datatype.StructType in project carbondata by apache.

From the class TableSchemaBuilder, method addColumn:

private ColumnSchema addColumn(StructField field, String parentName, AtomicInteger valIndex, boolean isSortColumn, boolean isComplexChild, boolean isInvertedIdxColumn) {
    Objects.requireNonNull(field);
    if (isComplexChild) {
        // if field is complex then append parent name to the child field to check
        // if any other field with same name exists
        checkRepeatColumnName(field, parentName);
    } else {
        checkRepeatColumnName(field);
    }
    ColumnSchema newColumn = new ColumnSchema();
    if (parentName != null) {
        newColumn.setColumnName(parentName + "." + field.getFieldName());
    } else {
        newColumn.setColumnName(field.getFieldName());
    }
    newColumn.setDataType(field.getDataType());
    newColumn.setDimensionColumn(isSortColumn
        || field.getDataType() == DataTypes.STRING
        || field.getDataType() == DataTypes.VARCHAR
        || field.getDataType() == DataTypes.DATE
        || field.getDataType() == DataTypes.TIMESTAMP
        || field.getDataType() == DataTypes.BINARY
        || field.getDataType().isComplexType()
        || (isComplexChild));
    if (!isComplexChild) {
        newColumn.setSchemaOrdinal(ordinal++);
    } else {
        // child column should not be counted for schema ordinal
        newColumn.setSchemaOrdinal(-1);
    }
    // For a NonTransactionalTable, output from multiple SDK writers using the same column name
    // can be placed in a single folder for querying. In that case column-ID checks in many places
    // in the code would fail, so keep the column ID the same as the column name.
    // ALTER TABLE is not supported for NonTransactionalTable anyway, so this has no impact.
    newColumn.setColumnUniqueId(field.getFieldName());
    newColumn.setColumnReferenceId(newColumn.getColumnUniqueId());
    newColumn.setEncodingList(createEncoding(field.getDataType(), isInvertedIdxColumn, isComplexChild));
    if (field.getDataType().isComplexType()) {
        if (DataTypes.isArrayType(field.getDataType()) || DataTypes.isMapType(field.getDataType())) {
            newColumn.setNumberOfChild(1);
        } else {
            newColumn.setNumberOfChild(((StructType) field.getDataType()).getFields().size());
        }
    }
    if (DataTypes.isDecimal(field.getDataType())) {
        DecimalType decimalType = (DecimalType) field.getDataType();
        newColumn.setPrecision(decimalType.getPrecision());
        newColumn.setScale(decimalType.getScale());
    }
    if (!isSortColumn) {
        if (!newColumn.isDimensionColumn()) {
            measures.add(newColumn);
        } else if (DataTypes.isStructType(field.getDataType()) || DataTypes.isArrayType(field.getDataType()) || DataTypes.isMapType(field.getDataType()) || isComplexChild) {
            complex.add(newColumn);
        } else {
            if (field.getDataType() == DataTypes.VARCHAR) {
                varCharColumns.add(newColumn);
            } else {
                dimension.add(newColumn);
            }
        }
    } else {
        newColumn.setSortColumn(true);
        sortColumns.add(newColumn);
    }
    if (field.getDataType().isComplexType()) {
        String parentFieldName = newColumn.getColumnName();
        if (DataTypes.isArrayType(field.getDataType())) {
            for (StructField structField : field.getChildren()) {
                String colName = getColNameForArray(valIndex);
                if (null != ((ArrayType) field.getDataType()).getElementName()) {
                    colName = ((ArrayType) field.getDataType()).getElementName();
                }
                structField.setFieldName(colName);
                addColumn(structField, parentFieldName, valIndex, false, true, isInvertedIdxColumn);
            }
        } else if (DataTypes.isStructType(field.getDataType()) && ((StructType) field.getDataType()).getFields().size() > 0) {
            // This field has children.
            if (field.getChildren() != null) {
                for (StructField structField : field.getChildren()) {
                    addColumn(structField, parentFieldName, valIndex, false, true, isInvertedIdxColumn);
                }
            }
        } else if (DataTypes.isMapType(field.getDataType())) {
            for (StructField structField : field.getChildren()) {
                structField.setFieldName(getColNameForArray(valIndex));
                addColumn(structField, parentFieldName, valIndex, false, true, isInvertedIdxColumn);
            }
        }
    }
    // todo: need more information such as long_string_columns
    return newColumn;
}
Also used : StructField(org.apache.carbondata.core.metadata.datatype.StructField) StructType(org.apache.carbondata.core.metadata.datatype.StructType) DecimalType(org.apache.carbondata.core.metadata.datatype.DecimalType) ColumnSchema(org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema)
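
addColumn is private, so the sketch below only builds the kind of StructField it receives; the comments note the column layout the code above would produce for it. The field names are hypothetical:

List<StructField> members = new ArrayList<>();
members.add(new StructField("street", DataTypes.STRING));
members.add(new StructField("zip", DataTypes.INT));
StructField address = new StructField("address", DataTypes.createStructType(members), members);
// Passed through addColumn with parentName == null, this field would emit:
//   "address"        -> complex dimension column, schemaOrdinal = next ordinal
//   "address.street" -> complex child, schemaOrdinal = -1, columnUniqueId = "street"
//   "address.zip"    -> complex child, schemaOrdinal = -1, columnUniqueId = "zip"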

Example 5 with StructType

Use of org.apache.carbondata.core.metadata.datatype.StructType in project carbondata by apache.

From the class AvroCarbonWriter, method prepareFields:

private static Field prepareFields(Schema.Field avroField) {
    String fieldName = avroField.name();
    Schema childSchema = avroField.schema();
    Schema.Type type = childSchema.getType();
    LogicalType logicalType = childSchema.getLogicalType();
    switch(type) {
        case BOOLEAN:
            return new Field(fieldName, DataTypes.BOOLEAN);
        case INT:
            if (logicalType instanceof LogicalTypes.Date) {
                return new Field(fieldName, DataTypes.DATE);
            } else {
                // plain INT (no date logical type) is mapped to the carbon INT data type
                return new Field(fieldName, DataTypes.INT);
            }
        case LONG:
            if (logicalType instanceof LogicalTypes.TimestampMillis || logicalType instanceof LogicalTypes.TimestampMicros) {
                return new Field(fieldName, DataTypes.TIMESTAMP);
            } else {
                // plain LONG (no timestamp logical type) is mapped to the carbon LONG data type
                return new Field(fieldName, DataTypes.LONG);
            }
        case DOUBLE:
            return new Field(fieldName, DataTypes.DOUBLE);
        case ENUM:
        case STRING:
            return new Field(fieldName, DataTypes.STRING);
        case FLOAT:
            return new Field(fieldName, DataTypes.FLOAT);
        case MAP:
            // recursively get the sub fields
            ArrayList<StructField> mapSubFields = new ArrayList<>();
            StructField mapField = prepareSubFields(fieldName, childSchema);
            if (null != mapField) {
                // key value field will be wrapped inside a map struct field
                StructField keyValueField = mapField.getChildren().get(0);
                // value dataType will be at position 1 in the fields
                DataType valueType = ((StructType) keyValueField.getDataType()).getFields().get(1).getDataType();
                MapType mapType = DataTypes.createMapType(DataTypes.STRING, valueType);
                mapSubFields.add(keyValueField);
                return new Field(fieldName, mapType, mapSubFields);
            }
            return null;
        case RECORD:
            // recursively get the sub fields
            ArrayList<StructField> structSubFields = new ArrayList<>();
            for (Schema.Field avroSubField : childSchema.getFields()) {
                StructField structField = prepareSubFields(avroSubField.name(), avroSubField.schema());
                if (structField != null) {
                    structSubFields.add(structField);
                }
            }
            return new Field(fieldName, "struct", structSubFields);
        case ARRAY:
            // recursively get the sub fields
            ArrayList<StructField> arraySubField = new ArrayList<>();
            // array will have only one sub field.
            StructField structField = prepareSubFields(fieldName, childSchema.getElementType());
            if (structField != null) {
                arraySubField.add(structField);
                return new Field(fieldName, "array", arraySubField);
            } else {
                return null;
            }
        case UNION:
            int i = 0;
            // Get union types and store as Struct<type>
            ArrayList<StructField> unionFields = new ArrayList<>();
            for (Schema avroSubField : avroField.schema().getTypes()) {
                if (!avroSubField.getType().equals(Schema.Type.NULL)) {
                    StructField unionField = prepareSubFields(avroField.name() + i++, avroSubField);
                    if (unionField != null) {
                        unionFields.add(unionField);
                    }
                }
            }
            if (unionFields.isEmpty()) {
                throw new UnsupportedOperationException("Carbon do not support Avro UNION with only null type");
            }
            return new Field(fieldName, "struct", unionFields);
        case BYTES:
            // BYTES annotated with the "decimal" logical type carries a specified precision and scale
            if (logicalType instanceof LogicalTypes.Decimal) {
                int precision = ((LogicalTypes.Decimal) childSchema.getLogicalType()).getPrecision();
                int scale = ((LogicalTypes.Decimal) childSchema.getLogicalType()).getScale();
                return new Field(fieldName, DataTypes.createDecimalType(precision, scale));
            } else {
                throw new UnsupportedOperationException("carbon not support " + type.toString() + " avro type yet");
            }
        case NULL:
            return null;
        default:
            throw new UnsupportedOperationException("carbon not support " + type.toString() + " avro type yet");
    }
}
Also used : StructType(org.apache.carbondata.core.metadata.datatype.StructType) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) LogicalType(org.apache.avro.LogicalType) LogicalTypes(org.apache.avro.LogicalTypes) MapType(org.apache.carbondata.core.metadata.datatype.MapType) StructField(org.apache.carbondata.core.metadata.datatype.StructField) Field(org.apache.carbondata.core.metadata.datatype.Field) BigDecimal(java.math.BigDecimal) DataType(org.apache.carbondata.core.metadata.datatype.DataType)
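
prepareFields is private and static, so the sketch below only parses a small Avro schema; the comments note the mapping the switch above would apply. The record and field names are hypothetical:

String avroSchema = "{\"type\":\"record\",\"name\":\"person\",\"fields\":["
    + "{\"name\":\"name\",\"type\":\"string\"},"
    + "{\"name\":\"age\",\"type\":\"int\"}]}";
Schema parsed = new Schema.Parser().parse(avroSchema);
// For a field whose schema is this RECORD, the RECORD case above would return
// new Field(fieldName, "struct", [StructField("name", STRING), StructField("age", INT)]),
// each sub field coming from prepareSubFields.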

Aggregations

StructField (org.apache.carbondata.core.metadata.datatype.StructField): 5
StructType (org.apache.carbondata.core.metadata.datatype.StructType): 5
ArrayList (java.util.ArrayList): 3
Field (org.apache.carbondata.core.metadata.datatype.Field): 3
ArrayType (org.apache.carbondata.core.metadata.datatype.ArrayType): 2
DataType (org.apache.carbondata.core.metadata.datatype.DataType): 2
ColumnSchema (org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema): 2
File (java.io.File): 1
IOException (java.io.IOException): 1
BigDecimal (java.math.BigDecimal): 1
List (java.util.List): 1
FieldType (org.apache.arrow.vector.types.pojo.FieldType): 1
LogicalType (org.apache.avro.LogicalType): 1
LogicalTypes (org.apache.avro.LogicalTypes): 1
Schema (org.apache.avro.Schema): 1
InvalidLoadOptionException (org.apache.carbondata.common.exceptions.sql.InvalidLoadOptionException): 1
CarbonFile (org.apache.carbondata.core.datastore.filesystem.CarbonFile): 1
DecimalType (org.apache.carbondata.core.metadata.datatype.DecimalType): 1
MapType (org.apache.carbondata.core.metadata.datatype.MapType): 1
Test (org.junit.Test): 1