use of org.apache.druid.data.input.impl.DimensionSchema in project druid by druid-io.
the class KafkaSupervisorTest method getDataSchema.
/**
 * Builds the {@link DataSchema} used by these tests for the given data source.
 *
 * <p>The schema has two string dimensions ({@code dim1}, {@code dim2}), an ISO-formatted
 * {@code timestamp} column, a single count aggregator named {@code rows}, and a uniform
 * granularity spec with HOUR / NONE granularities and no intervals.
 *
 * @param dataSource name of the data source the schema is created for
 * @return a freshly constructed schema
 */
private static DataSchema getDataSchema(String dataSource) {
  final TimestampSpec timestampSpec = new TimestampSpec("timestamp", "iso", null);

  final List<DimensionSchema> dimensionSchemas = new ArrayList<>();
  for (String dimensionName : new String[] {"dim1", "dim2"}) {
    dimensionSchemas.add(StringDimensionSchema.create(dimensionName));
  }
  final DimensionsSpec dimensionsSpec = new DimensionsSpec(dimensionSchemas);

  final AggregatorFactory[] aggregators = {new CountAggregatorFactory("rows")};

  final UniformGranularitySpec granularitySpec =
      new UniformGranularitySpec(Granularities.HOUR, Granularities.NONE, ImmutableList.of());

  // The trailing null is passed through unchanged from the original call.
  return new DataSchema(dataSource, timestampSpec, dimensionsSpec, aggregators, granularitySpec, null);
}
use of org.apache.druid.data.input.impl.DimensionSchema in project hive by apache.
the class DruidStorageHandlerUtils method getDimensionsAndAggregates.
/**
 * Splits a list of Hive columns into Druid dimensions and Druid aggregators based on each
 * column's primitive category.
 *
 * <ul>
 *   <li>BYTE, SHORT, INT, LONG columns become long-sum aggregators.</li>
 *   <li>FLOAT columns become float-sum aggregators; DOUBLE columns become double-sum aggregators.</li>
 *   <li>DECIMAL columns are rejected.</li>
 *   <li>TIMESTAMP columns are skipped, but only if they are named after the granularity column
 *       or the default Druid timestamp column.</li>
 *   <li>TIMESTAMPLOCALTZ columns are skipped, but only if they are named after the default
 *       Druid timestamp column.</li>
 *   <li>Every remaining column must belong to the string group or be BOOLEAN, and becomes a
 *       string dimension.</li>
 * </ul>
 *
 * @param columnNames column names, positionally aligned with {@code columnTypes}
 * @param columnTypes column types; each entry is cast to {@link PrimitiveTypeInfo}
 * @return a pair of (dimensions, aggregators), each in column order
 * @throws UnsupportedOperationException if a DECIMAL column is present
 * @throws IllegalArgumentException if a timestamp column has an unexpected name, or a
 *         would-be dimension is neither string-like nor boolean
 */
public static Pair<List<DimensionSchema>, AggregatorFactory[]> getDimensionsAndAggregates(List<String> columnNames, List<TypeInfo> columnTypes) {
  // By default every column that is neither a metric nor a timestamp is a dimension.
  final List<DimensionSchema> dimensions = new ArrayList<>();
  final ImmutableList.Builder<AggregatorFactory> aggregators = ImmutableList.builder();

  for (int idx = 0; idx < columnTypes.size(); idx++) {
    final PrimitiveTypeInfo typeInfo = (PrimitiveTypeInfo) columnTypes.get(idx);
    final PrimitiveObjectInspector.PrimitiveCategory category = typeInfo.getPrimitiveCategory();
    final String columnName = columnNames.get(idx);

    switch (category) {
      case BYTE:
      case SHORT:
      case INT:
      case LONG:
        aggregators.add(new LongSumAggregatorFactory(columnName, columnName));
        break;

      case FLOAT:
        aggregators.add(new FloatSumAggregatorFactory(columnName, columnName));
        break;

      case DOUBLE:
        aggregators.add(new DoubleSumAggregatorFactory(columnName, columnName));
        break;

      case DECIMAL:
        throw new UnsupportedOperationException(
            String.format("Druid does not support decimal column type cast column [%s] to double", columnName));

      case TIMESTAMP: {
        // Granularity column: contributes neither a dimension nor an aggregator.
        final boolean isKnownTimestampColumn =
            columnName.equals(Constants.DRUID_TIMESTAMP_GRANULARITY_COL_NAME)
                || columnName.equals(DruidConstants.DEFAULT_TIMESTAMP_COLUMN);
        if (!isKnownTimestampColumn) {
          throw new IllegalArgumentException("Dimension " + columnName + " does not have STRING type: " + category);
        }
        break;
      }

      case TIMESTAMPLOCALTZ: {
        // Druid timestamp column: contributes neither a dimension nor an aggregator.
        if (!columnName.equals(DruidConstants.DEFAULT_TIMESTAMP_COLUMN)) {
          throw new IllegalArgumentException("Dimension " + columnName + " does not have STRING type: " + category);
        }
        break;
      }

      default: {
        // Dimension: only string-group and boolean columns are accepted.
        final boolean isStringLike =
            PrimitiveObjectInspectorUtils.getPrimitiveGrouping(category) == PrimitiveObjectInspectorUtils.PrimitiveGrouping.STRING_GROUP
                || category == PrimitiveObjectInspector.PrimitiveCategory.BOOLEAN;
        if (!isStringLike) {
          throw new IllegalArgumentException("Dimension " + columnName + " does not have STRING type: " + category);
        }
        dimensions.add(new StringDimensionSchema(columnName));
        break;
      }
    }
  }

  final AggregatorFactory[] aggregatorArray = aggregators.build().toArray(new AggregatorFactory[0]);
  return Pair.of(dimensions, aggregatorArray);
}
use of org.apache.druid.data.input.impl.DimensionSchema in project druid by druid-io.
the class GroupByLimitPushDownMultiNodeMergeTest method setup.
/**
 * Builds the six persisted segments (A-F) that this test class queries.
 *
 * <p>Indices A-D are created with {@code makeIncIndex(false)} and hold rows with the columns
 * {@code dimA} and {@code metA}. Indices E and F are created with an explicit dimension list
 * ({@code dimA} and {@code dimB} as strings, {@code metA} as a long dimension) and hold rows
 * with {@code dimA}, {@code dimB} and {@code metA}. Each incremental index is registered in
 * {@code incrementalIndices}, persisted under {@code tmpDir}, and reloaded as a
 * {@link QueryableIndex}; the six reloaded indices are stored in {@code groupByIndices}.
 *
 * @throws Exception if creating, persisting or loading any index fails
 */
@Before
public void setup() throws Exception {
  tmpDir = FileUtils.createTempDir();

  // Segments A-D: rows carry dimA + metA only.
  final List<String> dimNames = Arrays.asList("dimA", "metA");

  final IncrementalIndex indexA = makeIncIndex(false);
  incrementalIndices.add(indexA);
  addRow(indexA, 1505260888888L, dimNames, "pomegranate", 2395L);
  addRow(indexA, 1505260800000L, dimNames, "mango", 8L);
  addRow(indexA, 1505264400000L, dimNames, "pomegranate", 5028L);
  addRow(indexA, 1505264400400L, dimNames, "mango", 7L);
  final QueryableIndex qindexA = persistAndLoad(indexA, "A");

  final IncrementalIndex indexB = makeIncIndex(false);
  incrementalIndices.add(indexB);
  addRow(indexB, 1505260800000L, dimNames, "pomegranate", 4718L);
  addRow(indexB, 1505260800000L, dimNames, "mango", 18L);
  addRow(indexB, 1505264400000L, dimNames, "pomegranate", 2698L);
  addRow(indexB, 1505264400000L, dimNames, "mango", 3L);
  final QueryableIndex qindexB = persistAndLoad(indexB, "B");

  final IncrementalIndex indexC = makeIncIndex(false);
  incrementalIndices.add(indexC);
  addRow(indexC, 1505260800000L, dimNames, "pomegranate", 2395L);
  addRow(indexC, 1605260800000L, dimNames, "mango", 8L);
  addRow(indexC, 1705264400000L, dimNames, "pomegranate", 5028L);
  addRow(indexC, 1805264400000L, dimNames, "mango", 7L);
  final QueryableIndex qindexC = persistAndLoad(indexC, "C");

  final IncrementalIndex indexD = makeIncIndex(false);
  incrementalIndices.add(indexD);
  addRow(indexD, 1505260800000L, dimNames, "pomegranate", 4718L);
  addRow(indexD, 1605260800000L, dimNames, "mango", 18L);
  addRow(indexD, 1705264400000L, dimNames, "pomegranate", 2698L);
  addRow(indexD, 1805264400000L, dimNames, "mango", 3L);
  final QueryableIndex qindexD = persistAndLoad(indexD, "D");

  // Segments E-F: explicit schema with a second string dimension and metA as a long dimension.
  final List<String> dimNames2 = Arrays.asList("dimA", "dimB", "metA");
  final List<DimensionSchema> dimensions = Arrays.asList(
      new StringDimensionSchema("dimA"),
      new StringDimensionSchema("dimB"),
      new LongDimensionSchema("metA")
  );

  final IncrementalIndex indexE = makeIncIndex(false, dimensions);
  incrementalIndices.add(indexE);
  addRow(indexE, 1505260800000L, dimNames2, "pomegranate", "raw", 5L);
  addRow(indexE, 1605260800000L, dimNames2, "mango", "ripe", 9L);
  addRow(indexE, 1705264400000L, dimNames2, "pomegranate", "raw", 3L);
  addRow(indexE, 1805264400000L, dimNames2, "mango", "ripe", 7L);
  addRow(indexE, 1805264400000L, dimNames2, "grape", "raw", 5L);
  addRow(indexE, 1805264400000L, dimNames2, "apple", "ripe", 3L);
  addRow(indexE, 1805264400000L, dimNames2, "apple", "raw", 1L);
  addRow(indexE, 1805264400000L, dimNames2, "apple", "ripe", 4L);
  addRow(indexE, 1805264400000L, dimNames2, "apple", "raw", 1L);
  addRow(indexE, 1805264400000L, dimNames2, "banana", "ripe", 4L);
  addRow(indexE, 1805264400000L, dimNames2, "orange", "raw", 9L);
  addRow(indexE, 1805264400000L, dimNames2, "peach", "ripe", 7L);
  addRow(indexE, 1805264400000L, dimNames2, "orange", "raw", 2L);
  addRow(indexE, 1805264400000L, dimNames2, "strawberry", "ripe", 10L);
  final QueryableIndex qindexE = persistAndLoad(indexE, "E");

  final IncrementalIndex indexF = makeIncIndex(false, dimensions);
  incrementalIndices.add(indexF);
  addRow(indexF, 1505260800000L, dimNames2, "kiwi", "raw", 7L);
  addRow(indexF, 1605260800000L, dimNames2, "watermelon", "ripe", 14L);
  addRow(indexF, 1705264400000L, dimNames2, "kiwi", "raw", 8L);
  addRow(indexF, 1805264400000L, dimNames2, "kiwi", "ripe", 8L);
  addRow(indexF, 1805264400000L, dimNames2, "lemon", "raw", 3L);
  addRow(indexF, 1805264400000L, dimNames2, "cherry", "ripe", 2L);
  addRow(indexF, 1805264400000L, dimNames2, "cherry", "raw", 7L);
  addRow(indexF, 1805264400000L, dimNames2, "avocado", "ripe", 12L);
  addRow(indexF, 1805264400000L, dimNames2, "cherry", "raw", 3L);
  addRow(indexF, 1805264400000L, dimNames2, "plum", "ripe", 5L);
  addRow(indexF, 1805264400000L, dimNames2, "plum", "raw", 3L);
  addRow(indexF, 1805264400000L, dimNames2, "lime", "ripe", 7L);
  final QueryableIndex qindexF = persistAndLoad(indexF, "F");

  groupByIndices = Arrays.asList(qindexA, qindexB, qindexC, qindexD, qindexE, qindexF);
  resourceCloser = Closer.create();
  setupGroupByFactory();
}

/** Adds one row with the columns {@code dimA} and {@code metA} to {@code index}. */
private static void addRow(IncrementalIndex index, long timestamp, List<String> dimNames, String dimA, long metA) throws Exception {
  final Map<String, Object> event = new HashMap<>();
  event.put("dimA", dimA);
  event.put("metA", metA);
  index.add(new MapBasedInputRow(timestamp, dimNames, event));
}

/** Adds one row with the columns {@code dimA}, {@code dimB} and {@code metA} to {@code index}. */
private static void addRow(IncrementalIndex index, long timestamp, List<String> dimNames, String dimA, String dimB, long metA) throws Exception {
  final Map<String, Object> event = new HashMap<>();
  event.put("dimA", dimA);
  event.put("dimB", dimB);
  event.put("metA", metA);
  index.add(new MapBasedInputRow(timestamp, dimNames, event));
}

/** Persists {@code index} into the sub-directory {@code dirName} of {@code tmpDir} and loads it back. */
private QueryableIndex persistAndLoad(IncrementalIndex index, String dirName) throws Exception {
  final File segmentDir = INDEX_MERGER_V9.persist(index, new File(tmpDir, dirName), new IndexSpec(), null);
  return INDEX_IO.loadIndex(segmentDir);
}
use of org.apache.druid.data.input.impl.DimensionSchema in project druid by druid-io.
the class DruidParquetAvroReadSupport method getPartialReadSchema.
/**
 * Select the columns from the parquet schema that are used in the schema of the ingestion job.
 *
 * <p>A field of the file schema is kept when it is the timestamp column, when it is a required
 * input of any aggregator, or when it is a declared dimension. If no dimensions are declared at
 * all, every field is kept. If the parse spec is an {@link AvroParseSpec} with a non-null
 * flatten spec, the full file schema is returned unchanged.
 *
 * @param context The context of the file to be read
 *
 * @return the partial schema that only contains the columns that are being used in the schema
 */
private MessageType getPartialReadSchema(InitContext context) {
  MessageType fullSchema = context.getFileSchema();
  String name = fullSchema.getName();
  HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
  ParseSpec parseSpec = config.getParser().getParseSpec();

  // NOTE(review): presumably a flatten spec may reference arbitrary fields, so nothing can be
  // pruned safely - confirm against the flattening implementation.
  if (parseSpec instanceof AvroParseSpec && ((AvroParseSpec) parseSpec).getFlattenSpec() != null) {
    return fullSchema;
  }

  // Reuse the parse spec fetched above instead of walking config -> parser -> parse spec again.
  String tsField = parseSpec.getTimestampSpec().getTimestampColumn();

  Set<String> dimensions = new HashSet<>();
  for (DimensionSchema dim : parseSpec.getDimensionsSpec().getDimensions()) {
    dimensions.add(dim.getName());
  }

  Set<String> metricsFields = new HashSet<>();
  for (AggregatorFactory agg : config.getSchema().getDataSchema().getAggregators()) {
    metricsFields.addAll(agg.requiredFields());
  }

  List<Type> partialFields = new ArrayList<>();
  for (Type type : fullSchema.getFields()) {
    String fieldName = type.getName();
    // An empty dimension set means no explicit dimension list was given, so keep every field.
    if (tsField.equals(fieldName)
        || metricsFields.contains(fieldName)
        || dimensions.isEmpty()
        || dimensions.contains(fieldName)) {
      partialFields.add(type);
    }
  }
  return new MessageType(name, partialFields);
}
use of org.apache.druid.data.input.impl.DimensionSchema in project druid by druid-io.
the class DruidParquetReadSupport method getPartialReadSchema.
/**
 * Select the columns from the parquet schema that are used in the schema of the ingestion job.
 *
 * <p>Kept fields are: the timestamp column, every field required by an aggregator, and every
 * declared dimension. When no dimensions are declared, all fields are kept. When the parse spec
 * is a {@link ParquetParseSpec} carrying a flatten spec, the file schema is returned as-is.
 *
 * @param context The context of the file to be read
 *
 * @return the partial schema that only contains the columns that are being used in the schema
 */
private MessageType getPartialReadSchema(InitContext context) {
  final MessageType fileSchema = context.getFileSchema();
  final String schemaName = fileSchema.getName();
  final HadoopDruidIndexerConfig indexerConfig = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
  final ParseSpec parseSpec = indexerConfig.getParser().getParseSpec();

  // parse the flatten spec and determine it isn't auto discovering props?
  final boolean hasFlattenSpec =
      parseSpec instanceof ParquetParseSpec && ((ParquetParseSpec) parseSpec).getFlattenSpec() != null;
  if (hasFlattenSpec) {
    return fileSchema;
  }

  final String timestampColumn = parseSpec.getTimestampSpec().getTimestampColumn();

  final List<DimensionSchema> declaredDimensions = parseSpec.getDimensionsSpec().getDimensions();
  final Set<String> dimensionNames = new HashSet<>();
  for (DimensionSchema declared : declaredDimensions) {
    dimensionNames.add(declared.getName());
  }

  final Set<String> aggregatorInputs = new HashSet<>();
  for (AggregatorFactory aggregator : indexerConfig.getSchema().getDataSchema().getAggregators()) {
    aggregatorInputs.addAll(aggregator.requiredFields());
  }

  final List<Type> selectedFields = new ArrayList<>();
  for (Type field : fileSchema.getFields()) {
    final String fieldName = field.getName();
    final boolean wanted = timestampColumn.equals(fieldName)
        || aggregatorInputs.contains(fieldName)
        || dimensionNames.isEmpty()
        || dimensionNames.contains(fieldName);
    if (wanted) {
      selectedFields.add(field);
    }
  }
  return new MessageType(schemaName, selectedFields);
}
Aggregations