Use of org.apache.drill.exec.record.metadata.PrimitiveColumnMetadata in project drill by apache.
The class ColumnBuilder, method buildRepeatedList.
private ColumnState buildRepeatedList(ContainerState parent, ColumnMetadata columnSchema) {
assert columnSchema.type() == MinorType.LIST;
assert columnSchema.mode() == DataMode.REPEATED;
// The schema provided must be empty; the caller adds the element type after creating the repeated writer itself.
assert columnSchema.childSchema() == null;
// Build the repeated vector.
final RepeatedListVector vector = new RepeatedListVector(columnSchema.emptySchema(), parent.loader().allocator(), null);
// No inner type yet. The result set loader builds the subtype
// incrementally because it might be complex (a map or another
// repeated list). To start, use a dummy to avoid the need for if-statements
// everywhere.
final ColumnMetadata dummyElementSchema = new PrimitiveColumnMetadata(MaterializedField.create(columnSchema.name(), Types.repeated(MinorType.NULL)));
final AbstractObjectWriter dummyElement = ColumnWriterFactory.buildDummyColumnWriter(dummyElementSchema);
// Create the list writer: an array of arrays.
final AbstractObjectWriter arrayWriter = RepeatedListWriter.buildRepeatedList(columnSchema, vector, dummyElement);
// Create the list vector state that tracks the list vector lifecycle.
final RepeatedListVectorState vectorState = new RepeatedListVectorState(arrayWriter, vector);
// Build the container that tracks the array contents
final RepeatedListState listState = new RepeatedListState(parent.loader(), parent.vectorCache().childCache(columnSchema.name()));
// Bind the list state as the list event listener.
((RepeatedListWriter) arrayWriter.array()).bindListener(listState);
// Assemble everything into a column state, which propagates events down to the (one and only) child state.
return new RepeatedListColumnState(parent.loader(), arrayWriter, vectorState, listState);
}
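For reference, a minimal standalone sketch of the placeholder pattern used above: wrap a repeated NULL-typed MaterializedField in a PrimitiveColumnMetadata to stand in for the not-yet-known element type. The class name and field name below are illustrative, not part of Drill.
import org.apache.drill.common.types.TypeProtos.MinorType;
import org.apache.drill.common.types.Types;
import org.apache.drill.exec.record.MaterializedField;
import org.apache.drill.exec.record.metadata.ColumnMetadata;
import org.apache.drill.exec.record.metadata.PrimitiveColumnMetadata;

public class DummyElementSketch {

    public static void main(String[] args) {
        // Same pattern as buildRepeatedList(): a repeated NULL-typed field acts as a
        // placeholder element until the real element type is added incrementally.
        MaterializedField dummyField = MaterializedField.create("my_list", Types.repeated(MinorType.NULL));
        ColumnMetadata dummyElementSchema = new PrimitiveColumnMetadata(dummyField);
        System.out.println(dummyElementSchema.name() + " " + dummyElementSchema.type());
    }
}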
Use of org.apache.drill.exec.record.metadata.PrimitiveColumnMetadata in project drill by apache.
The class SchemaPathUtils, method addColumnMetadata.
/**
* Adds a column with the specified schema path and type to the specified {@code TupleMetadata schema}.
* When the specified {@link SchemaPath} has children, corresponding maps are created
* in the {@code TupleMetadata schema} and the last segment of the path receives the specified type.
*
* @param schema tuple schema where column should be added
* @param schemaPath schema path of the column which should be added
* @param type type of the column which should be added
* @param types map from column schema paths to their types, used to detect DICT and LIST segments
*/
public static void addColumnMetadata(TupleMetadata schema, SchemaPath schemaPath, TypeProtos.MajorType type, Map<SchemaPath, TypeProtos.MajorType> types) {
PathSegment.NameSegment colPath = schemaPath.getUnIndexed().getRootSegment();
List<String> names = new ArrayList<>(types.size());
// Used in the LIST case; defined here to avoid repeated instantiation inside the while loop
List<String> nextNames = new ArrayList<>(names.size());
ColumnMetadata colMetadata;
while (!colPath.isLastPath()) {
names.add(colPath.getPath());
colMetadata = schema.metadata(colPath.getPath());
TypeProtos.MajorType pathType = types.get(SchemaPath.getCompoundPath(names.toArray(new String[0])));
// The following types, DICT and LIST, contain a nested segment in Parquet representation
// (see ParquetReaderUtility#isLogicalListType(GroupType) and ParquetReaderUtility#isLogicalMapType(GroupType))
// which should be skipped when creating the corresponding TupleMetadata representation. Additionally,
// we need to track whether the field is a LIST in order to create the appropriate column metadata:
// a singular MAP/DICT or a MAP/DICT array.
boolean isDict = pathType != null && pathType.getMinorType() == TypeProtos.MinorType.DICT;
boolean isList = pathType != null && pathType.getMinorType() == TypeProtos.MinorType.LIST;
String name = colPath.getPath();
if (isList) {
nextNames.clear();
nextNames.addAll(names);
// Parquet's LIST group (which represents an array) has
// an inner group (bagSegment) which we want to skip here
PathSegment.NameSegment bagSegment = colPath.getChild().getNameSegment();
PathSegment.NameSegment elementSegment = bagSegment.getChild().getNameSegment();
nextNames.add(bagSegment.getPath());
nextNames.add(elementSegment.getPath());
pathType = types.get(SchemaPath.getCompoundPath(nextNames.toArray(new String[0])));
if (pathType == null && colPath.getChild().getChild().isLastPath()) {
// will be handled after the while statement
break;
}
colPath = elementSegment;
names.add(bagSegment.getPath());
names.add(elementSegment.getPath());
// Check whether LIST's element type is DICT
isDict = pathType != null && pathType.getMinorType() == TypeProtos.MinorType.DICT;
}
if (colMetadata == null) {
if (isDict) {
colMetadata = isList ? MetadataUtils.newDictArray(name) : MetadataUtils.newDict(name);
} else {
colMetadata = isList ? MetadataUtils.newMapArray(name, null) : MetadataUtils.newMap(name, null);
}
schema.addColumn(colMetadata);
}
if (isDict) {
// Parquet's MAP (which corresponds to DICT in Drill) has
// an inner group which we want to skip here
colPath = (PathSegment.NameSegment) colPath.getChild();
names.add(colPath.getPath());
}
if (!colMetadata.isMap() && !colMetadata.isDict()) {
throw new DrillRuntimeException(String.format("Expected map or dict, but was %s", colMetadata.majorType()));
}
schema = colMetadata.tupleSchema();
colPath = (PathSegment.NameSegment) colPath.getChild();
}
colMetadata = schema.metadata(colPath.getPath());
if (colMetadata == null) {
schema.addColumn(new PrimitiveColumnMetadata(MaterializedField.create(colPath.getPath(), type)));
} else if (!colMetadata.majorType().equals(type)) {
throw new DrillRuntimeException(String.format("Types mismatch: existing type: %s, new type: %s", colMetadata.majorType(), type));
}
}
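A minimal usage sketch for addColumnMetadata. The map and column names are illustrative, and the SchemaPathUtils import is an assumption since its package has moved between Drill versions. With an empty types map, the intermediate segment becomes a plain MAP column and the leaf is added as a PrimitiveColumnMetadata.
import java.util.HashMap;
import java.util.Map;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.common.types.TypeProtos;
import org.apache.drill.common.types.Types;
import org.apache.drill.exec.record.metadata.TupleMetadata;
import org.apache.drill.exec.record.metadata.TupleSchema;
import org.apache.drill.metastore.util.SchemaPathUtils; // assumed package; location varies across Drill versions

public class AddColumnMetadataSketch {

    public static void main(String[] args) {
        TupleMetadata schema = new TupleSchema();
        // No DICT/LIST parents in this simple case, so the map of parent types stays empty.
        Map<SchemaPath, TypeProtos.MajorType> parentTypes = new HashMap<>();
        SchemaPathUtils.addColumnMetadata(
            schema,
            SchemaPath.getCompoundPath("outer_map", "inner_col"),
            Types.optional(TypeProtos.MinorType.VARCHAR),
            parentTypes);
        // Result: a MAP column "outer_map" containing a nullable VARCHAR column "inner_col".
        System.out.println(schema.metadata("outer_map").tupleSchema().metadata("inner_col").type());
    }
}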
Use of org.apache.drill.exec.record.metadata.PrimitiveColumnMetadata in project drill by axbaretto.
The class TestTupleSchema, method testRequiredFixedWidthColumn.
/**
* Test a fixed-width, primitive, required column. Includes basic
* tests common to all data types. (Basic tests are not repeated for
* other types.)
*/
@Test
public void testRequiredFixedWidthColumn() {
MaterializedField field = SchemaBuilder.columnSchema("c", MinorType.INT, DataMode.REQUIRED);
ColumnMetadata col = MetadataUtils.fromField(field);
// Code may depend on the specific column class
assertTrue(col instanceof PrimitiveColumnMetadata);
// Generic checks
assertEquals(ColumnMetadata.StructureType.PRIMITIVE, col.structureType());
assertNull(col.mapSchema());
assertTrue(field.isEquivalent(col.schema()));
assertEquals(field.getName(), col.name());
assertEquals(field.getType().getMinorType(), col.type());
assertEquals(field.getDataMode(), col.mode());
assertFalse(col.isNullable());
assertFalse(col.isArray());
assertFalse(col.isVariableWidth());
assertFalse(col.isMap());
assertTrue(col.isEquivalent(col));
assertFalse(col.isVariant());
ColumnMetadata col2 = MetadataUtils.fromField(field);
assertTrue(col.isEquivalent(col2));
MaterializedField field3 = SchemaBuilder.columnSchema("d", MinorType.INT, DataMode.REQUIRED);
ColumnMetadata col3 = MetadataUtils.fromField(field3);
assertFalse(col.isEquivalent(col3));
MaterializedField field4 = SchemaBuilder.columnSchema("c", MinorType.BIGINT, DataMode.REQUIRED);
ColumnMetadata col4 = MetadataUtils.fromField(field4);
assertFalse(col.isEquivalent(col4));
MaterializedField field5 = SchemaBuilder.columnSchema("c", MinorType.INT, DataMode.OPTIONAL);
ColumnMetadata col5 = MetadataUtils.fromField(field5);
assertFalse(col.isEquivalent(col5));
ColumnMetadata col6 = col.cloneEmpty();
assertTrue(col.isEquivalent(col6));
assertEquals(4, col.expectedWidth());
col.setExpectedWidth(10);
assertEquals(4, col.expectedWidth());
assertEquals(1, col.expectedElementCount());
col.setExpectedElementCount(2);
assertEquals(1, col.expectedElementCount());
}
Use of org.apache.drill.exec.record.metadata.PrimitiveColumnMetadata in project drill by apache.
The class TestInfoSchemaWithMetastore, method testColumns.
@Test
public void testColumns() throws Exception {
BaseTableMetadata tableNoSchema = BaseTableMetadata.builder()
    .tableInfo(TableInfo.builder()
        .storagePlugin("dfs")
        .workspace("tmp")
        .name("table_no_schema")
        .type("PARQUET")
        .build())
    .metadataInfo(MetadataInfo.builder()
        .type(MetadataType.TABLE)
        .key(MetadataInfo.GENERAL_INFO_KEY)
        .build())
    .location(new Path("/tmp", "table_no_schema"))
    .metadataStatistics(Collections.emptyList())
    .columnsStatistics(Collections.emptyMap())
    .partitionKeys(Collections.emptyMap())
    .build();
TupleMetadata schema = new SchemaBuilder()
    .addNullable("bigint_col", TypeProtos.MinorType.BIGINT)
    .addDecimal("decimal_col", TypeProtos.MinorType.VARDECIMAL, TypeProtos.DataMode.OPTIONAL, 10, 2)
    .add("interval_col", TypeProtos.MinorType.INTERVALYEAR)
    .addArray("array_col", TypeProtos.MinorType.BIT)
    .addMap("struct_col")
        .addNullable("struct_bigint", TypeProtos.MinorType.BIGINT)
        .add("struct_varchar", TypeProtos.MinorType.VARCHAR)
        .addMap("nested_struct")
            .addNullable("nested_struct_boolean", TypeProtos.MinorType.BIT)
            .add("nested_struct_varchar", TypeProtos.MinorType.VARCHAR)
            .resumeMap()
        .resumeSchema()
    .buildSchema();
PrimitiveColumnMetadata varcharCol = new PrimitiveColumnMetadata("varchar_col",
    TypeProtos.MajorType.newBuilder()
        .setMinorType(TypeProtos.MinorType.VARCHAR)
        .setMode(TypeProtos.DataMode.REQUIRED)
        .build());
varcharCol.setDefaultValue("ABC");
PrimitiveColumnMetadata timestampColumn = new PrimitiveColumnMetadata("timestamp_col",
    TypeProtos.MajorType.newBuilder()
        .setMinorType(TypeProtos.MinorType.TIMESTAMP)
        .setMode(TypeProtos.DataMode.REQUIRED)
        .build());
timestampColumn.setFormat("yyyy-MM-dd HH:mm:ss");
schema.addColumn(varcharCol);
schema.addColumn(timestampColumn);
Map<SchemaPath, ColumnStatistics<?>> columnsStatistics = new HashMap<>();
columnsStatistics.put(SchemaPath.parseFromString("varchar_col"), new ColumnStatistics<>(Arrays.asList(new StatisticsHolder<>("aaa", ColumnStatisticsKind.MIN_VALUE), new StatisticsHolder<>("zzz", ColumnStatisticsKind.MAX_VALUE))));
columnsStatistics.put(SchemaPath.parseFromString("struct_col.nested_struct.nested_struct_varchar"), new ColumnStatistics<>(Arrays.asList(new StatisticsHolder<>("bbb", ColumnStatisticsKind.MIN_VALUE), new StatisticsHolder<>("ccc", ColumnStatisticsKind.MAX_VALUE))));
columnsStatistics.put(SchemaPath.parseFromString("bigint_col"), new ColumnStatistics<>(Arrays.asList(new StatisticsHolder<>(100L, ColumnStatisticsKind.NULLS_COUNT), new StatisticsHolder<>(10.5D, ColumnStatisticsKind.NDV))));
columnsStatistics.put(SchemaPath.parseFromString("struct_col.struct_bigint"), new ColumnStatistics<>(Collections.singletonList(new StatisticsHolder<>(10.5D, ColumnStatisticsKind.NON_NULL_COUNT))));
ZonedDateTime currentTime = currentUtcTime();
String tableName = "table_with_schema";
BaseTableMetadata tableWithSchema = BaseTableMetadata.builder()
    .tableInfo(TableInfo.builder()
        .storagePlugin("dfs")
        .workspace("tmp")
        .name(tableName)
        .type("PARQUET")
        .build())
    .metadataInfo(MetadataInfo.builder()
        .type(MetadataType.TABLE)
        .key(MetadataInfo.GENERAL_INFO_KEY)
        .build())
    .location(new Path("/tmp", tableName))
    .schema(schema)
    .metadataStatistics(Collections.emptyList())
    .columnsStatistics(columnsStatistics)
    .partitionKeys(Collections.emptyMap())
    .lastModifiedTime(currentTime.toInstant().toEpochMilli())
    .build();
metastore.tables().modify().overwrite(tableNoSchema.toMetadataUnit(), tableWithSchema.toMetadataUnit()).execute();
List<String> columns = Arrays.asList(
    InfoSchemaConstants.SHRD_COL_TABLE_CATALOG,
    InfoSchemaConstants.SHRD_COL_TABLE_SCHEMA,
    InfoSchemaConstants.SHRD_COL_TABLE_NAME,
    InfoSchemaConstants.COLS_COL_COLUMN_NAME,
    InfoSchemaConstants.COLS_COL_ORDINAL_POSITION,
    InfoSchemaConstants.COLS_COL_COLUMN_DEFAULT,
    InfoSchemaConstants.COLS_COL_IS_NULLABLE,
    InfoSchemaConstants.COLS_COL_DATA_TYPE,
    InfoSchemaConstants.COLS_COL_CHARACTER_MAXIMUM_LENGTH,
    InfoSchemaConstants.COLS_COL_CHARACTER_OCTET_LENGTH,
    InfoSchemaConstants.COLS_COL_NUMERIC_PRECISION,
    InfoSchemaConstants.COLS_COL_NUMERIC_PRECISION_RADIX,
    InfoSchemaConstants.COLS_COL_NUMERIC_SCALE,
    InfoSchemaConstants.COLS_COL_DATETIME_PRECISION,
    InfoSchemaConstants.COLS_COL_INTERVAL_TYPE,
    InfoSchemaConstants.COLS_COL_INTERVAL_PRECISION,
    InfoSchemaConstants.COLS_COL_COLUMN_SIZE,
    InfoSchemaConstants.COLS_COL_COLUMN_FORMAT,
    InfoSchemaConstants.COLS_COL_NUM_NULLS,
    InfoSchemaConstants.COLS_COL_MIN_VAL,
    InfoSchemaConstants.COLS_COL_MAX_VAL,
    InfoSchemaConstants.COLS_COL_NDV,
    InfoSchemaConstants.COLS_COL_EST_NUM_NON_NULLS,
    InfoSchemaConstants.COLS_COL_IS_NESTED);
client.testBuilder()
    .sqlQuery("select %s from information_schema.`columns` where table_name " + "in ('%s', '%s')", String.join(", ", columns), tableNoSchema.getTableInfo().name(), tableName)
    .unOrdered()
    .baselineColumns(columns.toArray(new String[0]))
    .baselineValues("DRILL", "dfs.tmp", tableName, "bigint_col", 1, null, "YES", "BIGINT", null, null, 0, 2, 0, null, null, null, 20, null, 100L, null, null, 10.5D, null, false)
    .baselineValues("DRILL", "dfs.tmp", tableName, "decimal_col", 2, null, "YES", "DECIMAL", null, null, 10, 10, 2, null, null, null, 12, null, null, null, null, null, null, false)
    .baselineValues("DRILL", "dfs.tmp", tableName, "interval_col", 3, null, "NO", "INTERVAL", null, null, null, null, null, null, "INTERVAL YEAR TO MONTH", 0, 9, null, null, null, null, null, null, false)
    .baselineValues("DRILL", "dfs.tmp", tableName, "array_col", 4, null, "NO", "ARRAY", null, null, null, null, null, null, null, null, 0, null, null, null, null, null, null, false)
    .baselineValues("DRILL", "dfs.tmp", tableName, "struct_col", 5, null, "NO", "STRUCT", null, null, null, null, null, null, null, null, 0, null, null, null, null, null, null, false)
    .baselineValues("DRILL", "dfs.tmp", tableName, "struct_col.struct_bigint", 5, null, "YES", "BIGINT", null, null, 0, 2, 0, null, null, null, 20, null, null, null, null, null, 10.5D, true)
    .baselineValues("DRILL", "dfs.tmp", tableName, "struct_col.struct_varchar", 5, null, "NO", "CHARACTER VARYING", 65535, 65535, null, null, null, null, null, null, 65535, null, null, null, null, null, null, true)
    .baselineValues("DRILL", "dfs.tmp", tableName, "struct_col.nested_struct", 5, null, "NO", "STRUCT", null, null, null, null, null, null, null, null, 0, null, null, null, null, null, null, true)
    .baselineValues("DRILL", "dfs.tmp", tableName, "struct_col.nested_struct.nested_struct_boolean", 5, null, "YES", "BOOLEAN", null, null, null, null, null, null, null, null, 1, null, null, null, null, null, null, true)
    .baselineValues("DRILL", "dfs.tmp", tableName, "struct_col.nested_struct.nested_struct_varchar", 5, null, "NO", "CHARACTER VARYING", 65535, 65535, null, null, null, null, null, null, 65535, null, null, "bbb", "ccc", null, null, true)
    .baselineValues("DRILL", "dfs.tmp", tableName, "varchar_col", 6, "ABC", "NO", "CHARACTER VARYING", 65535, 65535, null, null, null, null, null, null, 65535, null, null, "aaa", "zzz", null, null, false)
    .baselineValues("DRILL", "dfs.tmp", tableName, "timestamp_col", 7, null, "NO", "TIMESTAMP", null, null, null, null, null, 19, null, null, 19, "yyyy-MM-dd HH:mm:ss", null, null, null, null, null, false)
    .go();
}
Use of org.apache.drill.exec.record.metadata.PrimitiveColumnMetadata in project drill by apache.
The class HiveUtilities, method getArrayMetadata.
/**
* Returns a {@link ColumnMetadata} instance which corresponds to the specified array {@code RelDataType relDataType}.
*
* @param name name of the field
* @param relDataType the source of type information to construct the schema
* @return {@link ColumnMetadata} instance
*/
private static ColumnMetadata getArrayMetadata(String name, RelDataType relDataType) {
RelDataType componentType = relDataType.getComponentType();
ColumnMetadata childColumnMetadata = getColumnMetadata(name, componentType);
switch(componentType.getSqlTypeName()) {
case ARRAY:
// When the nested type is an array, it is placed into a repeated list
return MetadataUtils.newRepeatedList(name, childColumnMetadata);
case MAP:
case OTHER:
throw new UnsupportedOperationException(String.format("Unsupported data type: %s", relDataType.getSqlTypeName()));
default:
if (componentType.isStruct()) {
// When the nested type is a struct, it is placed into a repeated map
return MetadataUtils.newMapArray(name, childColumnMetadata.tupleSchema());
} else {
// Otherwise, create column metadata with repeated data mode
return new PrimitiveColumnMetadata(MaterializedField.create(name, Types.overrideMode(TypeInferenceUtils.getDrillMajorTypeFromCalciteType(componentType), DataMode.REPEATED)));
}
}
}
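A minimal sketch of the default branch above: force REPEATED mode onto a scalar type and wrap the resulting field as primitive column metadata. The class name, field name, and INT type here are illustrative.
import org.apache.drill.common.types.TypeProtos.DataMode;
import org.apache.drill.common.types.TypeProtos.MajorType;
import org.apache.drill.common.types.TypeProtos.MinorType;
import org.apache.drill.common.types.Types;
import org.apache.drill.exec.record.MaterializedField;
import org.apache.drill.exec.record.metadata.ColumnMetadata;
import org.apache.drill.exec.record.metadata.PrimitiveColumnMetadata;

public class RepeatedPrimitiveSketch {

    public static void main(String[] args) {
        // Override the data mode to REPEATED, as getArrayMetadata() does with the
        // Calcite-derived major type, then wrap the field as column metadata.
        MajorType repeatedInt = Types.overrideMode(Types.required(MinorType.INT), DataMode.REPEATED);
        ColumnMetadata arrayCol = new PrimitiveColumnMetadata(MaterializedField.create("int_array", repeatedInt));
        System.out.println(arrayCol.name() + " isArray=" + arrayCol.isArray());
    }
}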