Example usage of org.apache.carbondata.core.metadata.datatype.StructType in the Apache CarbonData project.
From class CarbonTableOutputFormat, method setFileHeader:
/**
 * Derives the CSV header metadata for the load from the configured input schema
 * and stores it on the load model.
 *
 * @param configuration Hadoop configuration carrying the serialized input schema
 * @param model         load model to receive the header string and column array
 * @throws IOException                   if reading the schema from the configuration fails
 * @throws UnsupportedOperationException if no input schema (or an empty one) was set
 */
private static void setFileHeader(Configuration configuration, CarbonLoadModel model) throws IOException {
  StructType inputSchema = getInputSchema(configuration);
  // Loading cannot proceed without at least one input column.
  if (inputSchema == null || inputSchema.getFields().size() == 0) {
    throw new UnsupportedOperationException("Input schema must be set");
  }
  List<StructField> fields = inputSchema.getFields();
  String[] columns = new String[fields.size()];
  for (int i = 0; i < fields.size(); i++) {
    columns[i] = fields.get(i).getFieldName();
  }
  // String.join builds the comma-separated header directly from the column
  // names, avoiding the manual trailing-separator trimming a StringBuilder
  // approach would need.
  model.setCsvHeader(String.join(",", columns));
  model.setCsvHeaderColumns(columns);
}
Example usage of org.apache.carbondata.core.metadata.datatype.StructType in the Apache CarbonData project.
From class CSVCarbonWriterTest, method testWritingAndReadingArrayStruct:
// Verifies that an array<struct> column survives a full write/read cycle:
// the schema read back from disk must describe the nested type, and every
// row's nested values must match what was written.
@Test
public void testWritingAndReadingArrayStruct() throws IOException {
String path = "./testWriteFilesArrayStruct";
// Start from a clean output directory.
FileUtils.deleteDirectory(new File(path));
Field[] fields = new Field[4];
fields[0] = new Field("id", DataTypes.STRING);
fields[1] = new Field("source", DataTypes.STRING);
fields[2] = new Field("usage", DataTypes.STRING);
// Build struct<name, type, creation-time, property> to serve as the element
// type of the "annotations" array column.
List<StructField> structFieldsList = new ArrayList<>();
structFieldsList.add(new StructField("name", DataTypes.STRING));
structFieldsList.add(new StructField("type", DataTypes.STRING));
structFieldsList.add(new StructField("creation-time", DataTypes.STRING));
structFieldsList.add(new StructField("property", DataTypes.STRING));
StructField structTypeByList = new StructField("annotation", DataTypes.createStructType(structFieldsList), structFieldsList);
List<StructField> list = new ArrayList<>();
list.add(structTypeByList);
Field arrayType = new Field("annotations", "array", list);
fields[3] = arrayType;
try {
CarbonWriterBuilder builder = CarbonWriter.builder().taskNo(5).outputPath(path);
CarbonWriter writer = builder.withCsvInput(new Schema(fields)).writtenBy("CSVCarbonWriterTest").build();
// Each row's "annotations" cell holds two struct values: members are joined
// with \u0002 and the two array elements with \u0001 — presumably the
// writer's default complex-type delimiters (TODO confirm against the SDK).
for (int i = 0; i < 15; i++) {
String[] row = new String[] { "robot" + (i % 10), String.valueOf(i), i + "." + i, "sunflowers" + (i % 10) + "\002" + "modelarts/image_classification" + "\002" + "2019-03-30 17:22:31" + "\002" + "{\"@modelarts:start_index\":0,\"@modelarts:end_index\":5}" + "\001" + "roses" + (i % 10) + "\002" + "modelarts/image_classification" + "\002" + "2019-03-30 17:22:32" + "\002" + "{\"@modelarts:start_index\":0,\"@modelarts:end_index\":5}" };
writer.write(row);
}
writer.close();
} catch (Exception e) {
e.printStackTrace();
Assert.fail();
}
// Validate the persisted schema: exactly one array column whose struct
// element exposes the four members defined above.
Schema schema = CarbonSchemaReader.readSchema(path).asOriginOrder();
assert (4 == schema.getFieldsLength());
Field[] fields1 = schema.getFields();
boolean flag = false;
for (int i = 0; i < fields1.length; i++) {
if (DataTypes.isArrayType(fields1[i].getDataType())) {
ArrayType arrayType1 = (ArrayType) fields1[i].getDataType();
assert ("annotations.annotation".equalsIgnoreCase(arrayType1.getElementName()));
assert (DataTypes.isStructType(fields1[i].getChildren().get(0).getDataType()));
assert (4 == (((StructType) fields1[i].getChildren().get(0).getDataType()).getFields()).size());
flag = true;
}
}
// Fail if no array column was found in the schema at all.
assert (flag);
// Read again
CarbonReader reader = null;
try {
reader = CarbonReader.builder(path).projection(new String[] { "id", "source", "usage", "annotations" }).build();
int i = 0;
while (reader.hasNext()) {
Object[] row = (Object[]) reader.readNextRow();
assert (4 == row.length);
assert (((String) row[0]).contains("robot"));
int value = Integer.valueOf((String) row[1]);
Float value2 = Float.valueOf((String) row[2]);
assert (value > -1 || value < 15);
assert (value2 > -1 || value2 < 15);
// row[3] is the array column: each element is itself an Object[] holding
// the four struct members in declaration order.
Object[] annotations = (Object[]) row[3];
for (int j = 0; j < annotations.length; j++) {
Object[] annotation = (Object[]) annotations[j];
assert (((String) annotation[0]).contains("sunflowers") || ((String) annotation[0]).contains("roses"));
assert (((String) annotation[1]).contains("modelarts/image_classification"));
assert (((String) annotation[2]).contains("2019-03-30 17:22:3"));
assert (((String) annotation[3]).contains("{\"@modelarts:start_index\":0,\"@modelarts:end_index\":5}"));
// Re-read the same element via the readObjects helper and check it
// yields identical member values.
Object[] annotation1 = readObjects(annotations, j);
assert (((String) annotation1[0]).contains("sunflowers") || ((String) annotation1[0]).contains("roses"));
assert (((String) annotation1[1]).contains("modelarts/image_classification"));
assert (((String) annotation1[2]).contains("2019-03-30 17:22:3"));
assert (((String) annotation1[3]).contains("{\"@modelarts:start_index\":0,\"@modelarts:end_index\":5}"));
}
i++;
}
// All 15 written rows must come back.
assert (15 == i);
reader.close();
} catch (InterruptedException e) {
e.printStackTrace();
} finally {
// Clean up the test output directory even on failure.
FileUtils.deleteDirectory(new File(path));
}
}
Example usage of org.apache.carbondata.core.metadata.datatype.StructType in the Apache CarbonData project.
From class ArrowUtils, method toArrowField:
/**
 * Converts a CarbonData {@link DataType} into an Arrow field definition.
 * Arrays map to Arrow List fields with a single "element" child, structs map
 * to Arrow Struct fields with one child per member, and all other types map
 * to childless primitive fields via {@code toArrowType}.
 *
 * @param name       name of the resulting Arrow field
 * @param dataType   Carbon data type to convert
 * @param timeZoneId time zone id forwarded to {@code toArrowType} for temporal types
 * @return the equivalent (nullable) Arrow field
 */
public static org.apache.arrow.vector.types.pojo.Field toArrowField(String name, DataType dataType, String timeZoneId) {
  // instanceof checks (rather than DataTypes.isXxxType) keep findbugs happy.
  if (dataType instanceof ArrayType) {
    // An array becomes an Arrow List whose single child describes the element type.
    DataType elementType = ((ArrayType) dataType).getElementType();
    List<org.apache.arrow.vector.types.pojo.Field> children = new ArrayList<>();
    children.add(toArrowField("element", elementType, timeZoneId));
    FieldType listType = new FieldType(true, ArrowType.List.INSTANCE, null);
    return new org.apache.arrow.vector.types.pojo.Field(name, listType, children);
  }
  if (dataType instanceof StructType) {
    // A struct becomes an Arrow Struct with one recursively converted child per member.
    List<org.apache.arrow.vector.types.pojo.Field> children = new ArrayList<>();
    for (StructField member : ((StructType) dataType).getFields()) {
      children.add(toArrowField(member.getFieldName(), member.getDataType(), timeZoneId));
    }
    FieldType structType = new FieldType(true, ArrowType.Struct.INSTANCE, null);
    return new org.apache.arrow.vector.types.pojo.Field(name, structType, children);
  }
  // Primitive types carry no children.
  FieldType primitiveType = new FieldType(true, toArrowType(dataType, timeZoneId), null);
  return new org.apache.arrow.vector.types.pojo.Field(name, primitiveType, new ArrayList<org.apache.arrow.vector.types.pojo.Field>());
}
Example usage of org.apache.carbondata.core.metadata.datatype.StructType in the Apache CarbonData project.
From class TableSchemaBuilder, method addColumn:
/**
 * Builds a {@link ColumnSchema} for the given field, registers it in the
 * matching builder collection (sort columns, measures, complex columns,
 * varchar columns or plain dimensions) and recurses into the children of
 * complex (array/struct/map) types.
 *
 * @param field               the field to convert; must not be null
 * @param parentName          fully qualified name of the parent column, or null for a top-level column
 * @param valIndex            running counter handed to getColNameForArray when synthesizing child column names
 * @param isSortColumn        whether this column is one of the table's sort columns
 * @param isComplexChild      whether this column is a child of a complex column
 * @param isInvertedIdxColumn whether the inverted-index encoding applies to this column
 * @return the newly created column schema
 */
private ColumnSchema addColumn(StructField field, String parentName, AtomicInteger valIndex, boolean isSortColumn, boolean isComplexChild, boolean isInvertedIdxColumn) {
Objects.requireNonNull(field);
if (isComplexChild) {
// if field is complex then append parent name to the child field to check
// if any other field with same name exists
checkRepeatColumnName(field, parentName);
} else {
checkRepeatColumnName(field);
}
ColumnSchema newColumn = new ColumnSchema();
// Children are stored under their parent's fully qualified ("parent.child") name.
if (parentName != null) {
newColumn.setColumnName(parentName + "." + field.getFieldName());
} else {
newColumn.setColumnName(field.getFieldName());
}
newColumn.setDataType(field.getDataType());
// String-like, temporal, binary, complex and complex-child columns are all
// modeled as dimensions; so is any sort column.
newColumn.setDimensionColumn(isSortColumn || field.getDataType() == DataTypes.STRING || field.getDataType() == DataTypes.VARCHAR || field.getDataType() == DataTypes.DATE || field.getDataType() == DataTypes.TIMESTAMP || field.getDataType() == DataTypes.BINARY || field.getDataType().isComplexType() || (isComplexChild));
if (!isComplexChild) {
newColumn.setSchemaOrdinal(ordinal++);
} else {
// child column should not be counted for schema ordinal
newColumn.setSchemaOrdinal(-1);
}
// For NonTransactionalTable, multiple sdk writer output with same column name can be placed in
// single folder for query.
// That time many places in code, columnId check will fail. To avoid that
// keep column ID as same as column name.
// Anyhow Alter table is not supported for NonTransactionalTable.
// SO, this will not have any impact.
newColumn.setColumnUniqueId(field.getFieldName());
newColumn.setColumnReferenceId(newColumn.getColumnUniqueId());
newColumn.setEncodingList(createEncoding(field.getDataType(), isInvertedIdxColumn, isComplexChild));
if (field.getDataType().isComplexType()) {
// Arrays and maps always record a single child; structs record one per member.
if (DataTypes.isArrayType(field.getDataType()) || DataTypes.isMapType(field.getDataType())) {
newColumn.setNumberOfChild(1);
} else {
newColumn.setNumberOfChild(((StructType) field.getDataType()).getFields().size());
}
}
// Decimal columns additionally carry their precision and scale.
if (DataTypes.isDecimal(field.getDataType())) {
DecimalType decimalType = (DecimalType) field.getDataType();
newColumn.setPrecision(decimalType.getPrecision());
newColumn.setScale(decimalType.getScale());
}
// Route the column into exactly one builder collection.
if (!isSortColumn) {
if (!newColumn.isDimensionColumn()) {
measures.add(newColumn);
} else if (DataTypes.isStructType(field.getDataType()) || DataTypes.isArrayType(field.getDataType()) || DataTypes.isMapType(field.getDataType()) || isComplexChild) {
complex.add(newColumn);
} else {
if (field.getDataType() == DataTypes.VARCHAR) {
varCharColumns.add(newColumn);
} else {
dimension.add(newColumn);
}
}
} else {
newColumn.setSortColumn(true);
sortColumns.add(newColumn);
}
// Recurse into children of complex types; children are never sort columns.
if (field.getDataType().isComplexType()) {
String parentFieldName = newColumn.getColumnName();
if (DataTypes.isArrayType(field.getDataType())) {
for (StructField structField : field.getChildren()) {
// Array children get a synthesized name unless an explicit element name exists.
String colName = getColNameForArray(valIndex);
if (null != ((ArrayType) field.getDataType()).getElementName()) {
colName = ((ArrayType) field.getDataType()).getElementName();
}
structField.setFieldName(colName);
addColumn(structField, parentFieldName, valIndex, false, true, isInvertedIdxColumn);
}
} else if (DataTypes.isStructType(field.getDataType()) && ((StructType) field.getDataType()).getFields().size() > 0) {
// This field has children.
if (field.getChildren() != null) {
for (StructField structField : field.getChildren()) {
addColumn(structField, parentFieldName, valIndex, false, true, isInvertedIdxColumn);
}
}
} else if (DataTypes.isMapType(field.getDataType())) {
// Map children always get synthesized names.
for (StructField structField : field.getChildren()) {
structField.setFieldName(getColNameForArray(valIndex));
addColumn(structField, parentFieldName, valIndex, false, true, isInvertedIdxColumn);
}
}
}
// todo: need more information such as long_string_columns
return newColumn;
}
Example usage of org.apache.carbondata.core.metadata.datatype.StructType in the Apache CarbonData project.
From class AvroCarbonWriter, method prepareFields:
/**
 * Maps one Avro field to a CarbonData {@link Field}, recursing into nested
 * schemas for MAP, RECORD, ARRAY and UNION types via prepareSubFields.
 *
 * @param avroField the Avro field to convert
 * @return the equivalent Carbon field, or null when the Avro type is NULL or
 *         a nested sub-schema resolves to null
 * @throws UnsupportedOperationException for Avro types Carbon does not support
 *         (e.g. non-decimal BYTES, or a UNION containing only null)
 */
private static Field prepareFields(Schema.Field avroField) {
String fieldName = avroField.name();
Schema childSchema = avroField.schema();
Schema.Type type = childSchema.getType();
// Logical types (date, timestamp, decimal) refine the physical Avro type below.
LogicalType logicalType = childSchema.getLogicalType();
switch(type) {
case BOOLEAN:
return new Field(fieldName, DataTypes.BOOLEAN);
case INT:
if (logicalType instanceof LogicalTypes.Date) {
return new Field(fieldName, DataTypes.DATE);
} else {
// which will be mapped to carbon as INT data type
return new Field(fieldName, DataTypes.INT);
}
case LONG:
if (logicalType instanceof LogicalTypes.TimestampMillis || logicalType instanceof LogicalTypes.TimestampMicros) {
return new Field(fieldName, DataTypes.TIMESTAMP);
} else {
// which will be mapped to carbon as LONG data type
return new Field(fieldName, DataTypes.LONG);
}
case DOUBLE:
return new Field(fieldName, DataTypes.DOUBLE);
case ENUM:
case STRING:
// Both Avro enums and strings map to the Carbon STRING type.
return new Field(fieldName, DataTypes.STRING);
case FLOAT:
return new Field(fieldName, DataTypes.FLOAT);
case MAP:
// recursively get the sub fields
ArrayList<StructField> mapSubFields = new ArrayList<>();
StructField mapField = prepareSubFields(fieldName, childSchema);
if (null != mapField) {
// key value field will be wrapped inside a map struct field
StructField keyValueField = mapField.getChildren().get(0);
// value dataType will be at position 1 in the fields
DataType valueType = ((StructType) keyValueField.getDataType()).getFields().get(1).getDataType();
// Avro map keys are strings, so the Carbon map type is always <STRING, valueType>.
MapType mapType = DataTypes.createMapType(DataTypes.STRING, valueType);
mapSubFields.add(keyValueField);
return new Field(fieldName, mapType, mapSubFields);
}
return null;
case RECORD:
// recursively get the sub fields
ArrayList<StructField> structSubFields = new ArrayList<>();
for (Schema.Field avroSubField : childSchema.getFields()) {
StructField structField = prepareSubFields(avroSubField.name(), avroSubField.schema());
if (structField != null) {
// Unconvertible members are silently dropped from the struct.
structSubFields.add(structField);
}
}
return new Field(fieldName, "struct", structSubFields);
case ARRAY:
// recursively get the sub fields
ArrayList<StructField> arraySubField = new ArrayList<>();
// array will have only one sub field.
StructField structField = prepareSubFields(fieldName, childSchema.getElementType());
if (structField != null) {
arraySubField.add(structField);
return new Field(fieldName, "array", arraySubField);
} else {
return null;
}
case UNION:
int i = 0;
// Get union types and store as Struct<type>
// Each non-null branch becomes a struct member named "<fieldName><index>".
ArrayList<StructField> unionFields = new ArrayList<>();
for (Schema avroSubField : avroField.schema().getTypes()) {
if (!avroSubField.getType().equals(Schema.Type.NULL)) {
StructField unionField = prepareSubFields(avroField.name() + i++, avroSubField);
if (unionField != null) {
unionFields.add(unionField);
}
}
}
if (unionFields.isEmpty()) {
throw new UnsupportedOperationException("Carbon do not support Avro UNION with only null type");
}
return new Field(fieldName, "struct", unionFields);
case BYTES:
// set to "decimal" and a specified precision and scale
if (logicalType instanceof LogicalTypes.Decimal) {
int precision = ((LogicalTypes.Decimal) childSchema.getLogicalType()).getPrecision();
int scale = ((LogicalTypes.Decimal) childSchema.getLogicalType()).getScale();
return new Field(fieldName, DataTypes.createDecimalType(precision, scale));
} else {
// Raw (non-decimal) bytes have no Carbon equivalent here.
throw new UnsupportedOperationException("carbon not support " + type.toString() + " avro type yet");
}
case NULL:
return null;
default:
throw new UnsupportedOperationException("carbon not support " + type.toString() + " avro type yet");
}
}
Aggregations