Search in sources:

Example 31 with StringDimensionSchema

Use of org.apache.druid.data.input.impl.StringDimensionSchema in project druid by druid-io.

From class CompactionTaskTest, method setupClass.

@BeforeClass
public static void setupClass() {
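    // The mixed-type column is registered as a string dimension for the first four
    // months of 2017 and as a double dimension from May 2017 onwards.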
    MIXED_TYPE_COLUMN_MAP.put(Intervals.of("2017-01-01/2017-02-01"), new StringDimensionSchema(MIXED_TYPE_COLUMN));
    MIXED_TYPE_COLUMN_MAP.put(Intervals.of("2017-02-01/2017-03-01"), new StringDimensionSchema(MIXED_TYPE_COLUMN));
    MIXED_TYPE_COLUMN_MAP.put(Intervals.of("2017-03-01/2017-04-01"), new StringDimensionSchema(MIXED_TYPE_COLUMN));
    MIXED_TYPE_COLUMN_MAP.put(Intervals.of("2017-04-01/2017-05-01"), new StringDimensionSchema(MIXED_TYPE_COLUMN));
    MIXED_TYPE_COLUMN_MAP.put(Intervals.of("2017-05-01/2017-06-01"), new DoubleDimensionSchema(MIXED_TYPE_COLUMN));
    MIXED_TYPE_COLUMN_MAP.put(Intervals.of("2017-06-01/2017-07-01"), new DoubleDimensionSchema(MIXED_TYPE_COLUMN));
    MIXED_TYPE_COLUMN_MAP.put(Intervals.of("2017-06-01/2017-06-02"), new DoubleDimensionSchema(MIXED_TYPE_COLUMN));
    MIXED_TYPE_COLUMN_MAP.put(Intervals.of("2017-06-15/2017-06-16"), new DoubleDimensionSchema(MIXED_TYPE_COLUMN));
    MIXED_TYPE_COLUMN_MAP.put(Intervals.of("2017-06-30/2017-07-01"), new DoubleDimensionSchema(MIXED_TYPE_COLUMN));
    DIMENSIONS = new HashMap<>();
    AGGREGATORS = new ArrayList<>();
    DIMENSIONS.put(ColumnHolder.TIME_COLUMN_NAME, new LongDimensionSchema(ColumnHolder.TIME_COLUMN_NAME));
    DIMENSIONS.put(TIMESTAMP_COLUMN, new LongDimensionSchema(TIMESTAMP_COLUMN));
    int numUmbrellaIntervals = 6;
    for (int i = 0; i < numUmbrellaIntervals; i++) {
        final StringDimensionSchema schema = new StringDimensionSchema("string_dim_" + i, null, null);
        DIMENSIONS.put(schema.getName(), schema);
    }
    for (int i = 0; i < numUmbrellaIntervals; i++) {
        final LongDimensionSchema schema = new LongDimensionSchema("long_dim_" + i);
        DIMENSIONS.put(schema.getName(), schema);
    }
    for (int i = 0; i < numUmbrellaIntervals; i++) {
        final FloatDimensionSchema schema = new FloatDimensionSchema("float_dim_" + i);
        DIMENSIONS.put(schema.getName(), schema);
    }
    for (int i = 0; i < numUmbrellaIntervals; i++) {
        final DoubleDimensionSchema schema = new DoubleDimensionSchema("double_dim_" + i);
        DIMENSIONS.put(schema.getName(), schema);
    }
    AGGREGATORS.add(new CountAggregatorFactory("agg_0"));
    AGGREGATORS.add(new LongSumAggregatorFactory("agg_1", "long_dim_1"));
    AGGREGATORS.add(new LongMaxAggregatorFactory("agg_2", "long_dim_2"));
    AGGREGATORS.add(new FloatFirstAggregatorFactory("agg_3", "float_dim_3", null));
    AGGREGATORS.add(new DoubleLastAggregatorFactory("agg_4", "double_dim_4", null));
    for (int i = 0; i < SEGMENT_INTERVALS.size(); i++) {
        SEGMENT_MAP.put(
            new DataSegment(
                DATA_SOURCE,
                SEGMENT_INTERVALS.get(i),
                "version_" + i,
                ImmutableMap.of(),
                findDimensions(i, SEGMENT_INTERVALS.get(i)),
                AGGREGATORS.stream().map(AggregatorFactory::getName).collect(Collectors.toList()),
                new NumberedShardSpec(0, 1),
                0,
                SEGMENT_SIZE_BYTES
            ),
            new File("file_" + i)
        );
    }
    SEGMENTS = new ArrayList<>(SEGMENT_MAP.keySet());
}
Also used: FloatFirstAggregatorFactory(org.apache.druid.query.aggregation.first.FloatFirstAggregatorFactory) LongDimensionSchema(org.apache.druid.data.input.impl.LongDimensionSchema) LongSumAggregatorFactory(org.apache.druid.query.aggregation.LongSumAggregatorFactory) FloatDimensionSchema(org.apache.druid.data.input.impl.FloatDimensionSchema) DataSegment(org.apache.druid.timeline.DataSegment) StringDimensionSchema(org.apache.druid.data.input.impl.StringDimensionSchema) DoubleLastAggregatorFactory(org.apache.druid.query.aggregation.last.DoubleLastAggregatorFactory) DoubleDimensionSchema(org.apache.druid.data.input.impl.DoubleDimensionSchema) CountAggregatorFactory(org.apache.druid.query.aggregation.CountAggregatorFactory) LongMaxAggregatorFactory(org.apache.druid.query.aggregation.LongMaxAggregatorFactory) File(java.io.File) NumberedShardSpec(org.apache.druid.timeline.partition.NumberedShardSpec) BeforeClass(org.junit.BeforeClass)
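As a side note, a per-interval schema map like MIXED_TYPE_COLUMN_MAP above is typically probed by interval containment. A minimal sketch, assuming a hypothetical lookupDimensionSchema helper that is not part of the test:

// Hypothetical helper, not part of CompactionTaskTest: return the schema of the
// first registered interval that contains the segment interval (iteration order
// decides ties between overlapping keys such as the June 2017 entries above).
static DimensionSchema lookupDimensionSchema(
    Map<Interval, DimensionSchema> schemaMap,
    Interval segmentInterval
)
{
    for (Map.Entry<Interval, DimensionSchema> entry : schemaMap.entrySet()) {
        if (entry.getKey().contains(segmentInterval)) {
            return entry.getValue();
        }
    }
    return null; // no schema registered for this interval
}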

Example 32 with StringDimensionSchema

Use of org.apache.druid.data.input.impl.StringDimensionSchema in project druid by druid-io.

From class GroupByQueryHelper, method createIndexAccumulatorPair.

public static <T> Pair<IncrementalIndex, Accumulator<IncrementalIndex, T>> createIndexAccumulatorPair(final GroupByQuery query, @Nullable final GroupByQuery subquery, final GroupByQueryConfig config) {
    final GroupByQueryConfig querySpecificConfig = config.withOverrides(query);
    final Granularity gran = query.getGranularity();
    final long timeStart = query.getIntervals().get(0).getStartMillis();
    final boolean combine = subquery == null;
    long granTimeStart = timeStart;
    if (!(Granularities.ALL.equals(gran))) {
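        // e.g. with DAY granularity, a timestamp of 2017-06-15T13:00 buckets to 2017-06-15T00:00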
        granTimeStart = gran.bucketStart(timeStart);
    }
    final List<AggregatorFactory> aggs;
    if (combine) {
        aggs = Lists.transform(query.getAggregatorSpecs(), new Function<AggregatorFactory, AggregatorFactory>() {

            @Override
            public AggregatorFactory apply(AggregatorFactory input) {
                return input.getCombiningFactory();
            }
        });
    } else {
        aggs = query.getAggregatorSpecs();
    }
    final List<String> dimensions = Lists.transform(query.getDimensions(), new Function<DimensionSpec, String>() {

        @Override
        public String apply(DimensionSpec input) {
            return input.getOutputName();
        }
    });
    final IncrementalIndex index;
    final boolean sortResults = query.getContextValue(CTX_KEY_SORT_RESULTS, true);
    // All groupBy dimensions are strings, for now.
    final List<DimensionSchema> dimensionSchemas = new ArrayList<>();
    for (DimensionSpec dimension : query.getDimensions()) {
        dimensionSchemas.add(new StringDimensionSchema(dimension.getOutputName()));
    }
    final IncrementalIndexSchema indexSchema = new IncrementalIndexSchema.Builder()
        .withDimensionsSpec(new DimensionsSpec(dimensionSchemas))
        .withMetrics(aggs.toArray(new AggregatorFactory[0]))
        .withQueryGranularity(gran)
        .withMinTimestamp(granTimeStart)
        .build();
    final AppendableIndexBuilder indexBuilder;
    if (query.getContextValue("useOffheap", false)) {
        throw new UnsupportedOperationException(
            "The 'useOffheap' option is no longer available for groupBy v1. Please move to the newer groupBy engine, "
            + "which always operates off-heap, by removing any custom 'druid.query.groupBy.defaultStrategy' runtime "
            + "properties and 'groupByStrategy' query context parameters that you have set."
        );
    } else {
        indexBuilder = new OnheapIncrementalIndex.Builder();
    }
    index = indexBuilder
        .setIndexSchema(indexSchema)
        .setDeserializeComplexMetrics(false)
        .setConcurrentEventAdd(true)
        .setSortFacts(sortResults)
        .setMaxRowCount(querySpecificConfig.getMaxResults())
        .build();
    Accumulator<IncrementalIndex, T> accumulator = new Accumulator<IncrementalIndex, T>() {

        @Override
        public IncrementalIndex accumulate(IncrementalIndex accumulated, T in) {
            final MapBasedRow mapBasedRow;
            if (in instanceof MapBasedRow) {
                mapBasedRow = (MapBasedRow) in;
            } else if (in instanceof ResultRow) {
                final ResultRow row = (ResultRow) in;
                mapBasedRow = row.toMapBasedRow(combine ? query : subquery);
            } else {
                throw new ISE("Unable to accumulate something of type [%s]", in.getClass());
            }
            try {
                accumulated.add(new MapBasedInputRow(mapBasedRow.getTimestamp(), dimensions, mapBasedRow.getEvent()));
            } catch (IndexSizeExceededException e) {
                throw new ResourceLimitExceededException(e.getMessage());
            }
            return accumulated;
        }
    };
    return new Pair<>(index, accumulator);
}
Also used: Accumulator(org.apache.druid.java.util.common.guava.Accumulator) DimensionSpec(org.apache.druid.query.dimension.DimensionSpec) AppendableIndexBuilder(org.apache.druid.segment.incremental.AppendableIndexBuilder) ArrayList(java.util.ArrayList) OnheapIncrementalIndex(org.apache.druid.segment.incremental.OnheapIncrementalIndex) Granularity(org.apache.druid.java.util.common.granularity.Granularity) StringDimensionSchema(org.apache.druid.data.input.impl.StringDimensionSchema) DimensionSchema(org.apache.druid.data.input.impl.DimensionSchema) MapBasedRow(org.apache.druid.data.input.MapBasedRow) Function(com.google.common.base.Function) ISE(org.apache.druid.java.util.common.ISE) MapBasedInputRow(org.apache.druid.data.input.MapBasedInputRow) IncrementalIndexSchema(org.apache.druid.segment.incremental.IncrementalIndexSchema) Pair(org.apache.druid.java.util.common.Pair) IncrementalIndex(org.apache.druid.segment.incremental.IncrementalIndex) AggregatorFactory(org.apache.druid.query.aggregation.AggregatorFactory) ResourceLimitExceededException(org.apache.druid.query.ResourceLimitExceededException) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) IndexSizeExceededException(org.apache.druid.segment.incremental.IndexSizeExceededException)
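For orientation, the returned pair is meant to be threaded through a result sequence. A minimal usage sketch, assuming a Sequence<ResultRow> named rows that is not part of the method above:

Pair<IncrementalIndex, Accumulator<IncrementalIndex, ResultRow>> pair =
    GroupByQueryHelper.createIndexAccumulatorPair(query, null, config);
// Feed every result row into the incremental index.
IncrementalIndex index = rows.accumulate(pair.lhs, pair.rhs);
// ... read the grouped rows back out, then release the index's resources.
index.close();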

Example 33 with StringDimensionSchema

Use of org.apache.druid.data.input.impl.StringDimensionSchema in project druid by druid-io.

From class DimensionHandlerUtilsTest, method testGetHandlerFromStringCapabilities.

@Test
public void testGetHandlerFromStringCapabilities() {
    ColumnCapabilities stringCapabilities = ColumnCapabilitiesImpl
        .createSimpleSingleValueStringColumnCapabilities()
        .setHasBitmapIndexes(true)
        .setDictionaryEncoded(true)
        .setDictionaryValuesUnique(true)
        .setDictionaryValuesSorted(true);
    DimensionHandler stringHandler = DimensionHandlerUtils.getHandlerFromCapabilities(
        DIM_NAME,
        stringCapabilities,
        DimensionSchema.MultiValueHandling.SORTED_SET
    );
    Assert.assertTrue(stringHandler instanceof StringDimensionHandler);
    Assert.assertTrue(stringHandler.getDimensionSchema(stringCapabilities) instanceof StringDimensionSchema);
}
Also used: ColumnCapabilities(org.apache.druid.segment.column.ColumnCapabilities) StringDimensionSchema(org.apache.druid.data.input.impl.StringDimensionSchema) InitializedNullHandlingTest(org.apache.druid.testing.InitializedNullHandlingTest) Test(org.junit.Test)
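The converse check for a numeric column is a natural companion. A hedged sketch, assuming ColumnCapabilitiesImpl.createSimpleNumericColumnCapabilities and ColumnType.LONG are available, as in other Druid tests:

ColumnCapabilities longCapabilities =
    ColumnCapabilitiesImpl.createSimpleNumericColumnCapabilities(ColumnType.LONG);
DimensionHandler longHandler =
    DimensionHandlerUtils.getHandlerFromCapabilities(DIM_NAME, longCapabilities, null);
// A numeric column must not resolve to the string handler.
Assert.assertFalse(longHandler instanceof StringDimensionHandler);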

Example 34 with StringDimensionSchema

Use of org.apache.druid.data.input.impl.StringDimensionSchema in project druid by druid-io.

From class KafkaIndexTaskTest, method testKafkaRecordEntityInputFormat.

@Test(timeout = 60_000L)
public void testKafkaRecordEntityInputFormat() throws Exception {
    // Insert data
    insertData(Iterables.limit(records, 3));
    final KafkaIndexTask task = createTask(
        null,
        new DataSchema(
            "test_ds",
            new TimestampSpec("timestamp", "iso", null),
            new DimensionsSpec(
                Arrays.asList(
                    new StringDimensionSchema("dim1"),
                    new StringDimensionSchema("dim1t"),
                    new StringDimensionSchema("dim2"),
                    new LongDimensionSchema("dimLong"),
                    new FloatDimensionSchema("dimFloat"),
                    new StringDimensionSchema("kafka.topic"),
                    new LongDimensionSchema("kafka.offset"),
                    new StringDimensionSchema("kafka.header.encoding")
                )
            ),
            new AggregatorFactory[] {
                new DoubleSumAggregatorFactory("met1sum", "met1"),
                new CountAggregatorFactory("rows")
            },
            new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null),
            null
        ),
        new KafkaIndexTaskIOConfig(
            0,
            "sequence0",
            new SeekableStreamStartSequenceNumbers<>(topic, ImmutableMap.of(0, 0L), ImmutableSet.of()),
            new SeekableStreamEndSequenceNumbers<>(topic, ImmutableMap.of(0, 5L)),
            kafkaServer.consumerProperties(),
            KafkaSupervisorIOConfig.DEFAULT_POLL_TIMEOUT_MILLIS,
            true,
            null,
            null,
            new TestKafkaInputFormat(INPUT_FORMAT)
        )
    );
    Assert.assertTrue(task.supportsQueries());
    final ListenableFuture<TaskStatus> future = runTask(task);
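    // Busy-wait until the first three inserted records have been ingested.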
    while (countEvents(task) != 3) {
        Thread.sleep(25);
    }
    Assert.assertEquals(Status.READING, task.getRunner().getStatus());
    final QuerySegmentSpec interval = OBJECT_MAPPER.readValue("\"2008/2012\"", QuerySegmentSpec.class);
    List<ScanResultValue> scanResultValues = scanData(task, interval);
    // verify that the three inserted records were indexed, with Kafka metadata attached
    Assert.assertEquals(3, Iterables.size(scanResultValues));
    int i = 0;
    for (ScanResultValue result : scanResultValues) {
        final Map<String, Object> event = ((List<Map<String, Object>>) result.getEvents()).get(0);
        Assert.assertEquals((long) i++, event.get("kafka.offset"));
        Assert.assertEquals(topic, event.get("kafka.topic"));
        Assert.assertEquals("application/json", event.get("kafka.header.encoding"));
    }
    // insert remaining data
    insertData(Iterables.skip(records, 3));
    // Wait for task to exit
    Assert.assertEquals(TaskState.SUCCESS, future.get().getStatusCode());
    // Check metrics
    Assert.assertEquals(4, task.getRunner().getRowIngestionMeters().getProcessed());
    Assert.assertEquals(0, task.getRunner().getRowIngestionMeters().getUnparseable());
    Assert.assertEquals(0, task.getRunner().getRowIngestionMeters().getThrownAway());
}
Also used: UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) QuerySegmentSpec(org.apache.druid.query.spec.QuerySegmentSpec) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) SeekableStreamEndSequenceNumbers(org.apache.druid.indexing.seekablestream.SeekableStreamEndSequenceNumbers) DoubleSumAggregatorFactory(org.apache.druid.query.aggregation.DoubleSumAggregatorFactory) LongDimensionSchema(org.apache.druid.data.input.impl.LongDimensionSchema) FloatDimensionSchema(org.apache.druid.data.input.impl.FloatDimensionSchema) AggregatorFactory(org.apache.druid.query.aggregation.AggregatorFactory) CountAggregatorFactory(org.apache.druid.query.aggregation.CountAggregatorFactory) TaskStatus(org.apache.druid.indexer.TaskStatus) StringDimensionSchema(org.apache.druid.data.input.impl.StringDimensionSchema) DataSchema(org.apache.druid.segment.indexing.DataSchema) ScanResultValue(org.apache.druid.query.scan.ScanResultValue) SeekableStreamStartSequenceNumbers(org.apache.druid.indexing.seekablestream.SeekableStreamStartSequenceNumbers) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) Test(org.junit.Test) IndexTaskTest(org.apache.druid.indexing.common.task.IndexTaskTest)
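Incidentally, the "2008/2012" string deserialized into a QuerySegmentSpec above is interval shorthand. A functionally equivalent spec built directly, assuming Druid's standard MultipleIntervalSegmentSpec implementation:

QuerySegmentSpec interval = new MultipleIntervalSegmentSpec(
    Collections.singletonList(Intervals.of("2008/2012"))
);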

Example 35 with StringDimensionSchema

Use of org.apache.druid.data.input.impl.StringDimensionSchema in project druid by druid-io.

From class IndexGeneratorCombinerTest, method testMultipleRowsNotMerged.

@Test
public void testMultipleRowsNotMerged() throws Exception {
    long timestamp = System.currentTimeMillis();
    Bucket bucket = new Bucket(0, DateTimes.utc(timestamp), 0);
    SortableBytes keySortableBytes = new SortableBytes(bucket.toGroupKey(), new byte[0]);
    BytesWritable key = keySortableBytes.toBytesWritable();
    InputRow row1 = new MapBasedInputRow(timestamp, ImmutableList.of("host", "keywords"), ImmutableMap.of("host", "host1", "keywords", Arrays.asList("foo", "bar"), "visited", 10));
    InputRow row2 = new MapBasedInputRow(timestamp, ImmutableList.of("host", "keywords"), ImmutableMap.of("host", "host2", "keywords", Arrays.asList("foo", "bar"), "visited", 5));
    DimensionsSpec dimensionsSpec = new DimensionsSpec(Arrays.asList(new StringDimensionSchema("host"), new StringDimensionSchema("keywords")));
    Map<String, InputRowSerde.IndexSerdeTypeHelper> typeHelperMap = InputRowSerde.getTypeHelperMap(dimensionsSpec);
    List<BytesWritable> rows = Lists.newArrayList(
        new BytesWritable(InputRowSerde.toBytes(typeHelperMap, row1, aggregators).getSerializedRow()),
        new BytesWritable(InputRowSerde.toBytes(typeHelperMap, row2, aggregators).getSerializedRow())
    );
    Reducer.Context context = EasyMock.createNiceMock(Reducer.Context.class);
    Capture<BytesWritable> captureKey1 = Capture.newInstance();
    Capture<BytesWritable> captureVal1 = Capture.newInstance();
    Capture<BytesWritable> captureKey2 = Capture.newInstance();
    Capture<BytesWritable> captureVal2 = Capture.newInstance();
    context.write(EasyMock.capture(captureKey1), EasyMock.capture(captureVal1));
    context.write(EasyMock.capture(captureKey2), EasyMock.capture(captureVal2));
    EasyMock.replay(context);
    combiner.reduce(key, rows, context);
    EasyMock.verify(context);
    Assert.assertSame(key, captureKey1.getValue());
    Assert.assertSame(key, captureKey2.getValue());
    InputRow capturedRow1 = InputRowSerde.fromBytes(typeHelperMap, captureVal1.getValue().getBytes(), aggregators);
    Assert.assertEquals(Arrays.asList("host", "keywords"), capturedRow1.getDimensions());
    Assert.assertEquals(Collections.singletonList("host1"), capturedRow1.getDimension("host"));
    Assert.assertEquals(Arrays.asList("bar", "foo"), capturedRow1.getDimension("keywords"));
    Assert.assertEquals(10, capturedRow1.getMetric("visited_sum").longValue());
    Assert.assertEquals(1.0, (Double) HyperUniquesAggregatorFactory.estimateCardinality(capturedRow1.getRaw("unique_hosts"), false), 0.001);
    InputRow capturedRow2 = InputRowSerde.fromBytes(typeHelperMap, captureVal2.getValue().getBytes(), aggregators);
    Assert.assertEquals(Arrays.asList("host", "keywords"), capturedRow2.getDimensions());
    Assert.assertEquals(Collections.singletonList("host2"), capturedRow2.getDimension("host"));
    Assert.assertEquals(Arrays.asList("bar", "foo"), capturedRow2.getDimension("keywords"));
    Assert.assertEquals(5, capturedRow2.getMetric("visited_sum").longValue());
    Assert.assertEquals(1.0, (Double) HyperUniquesAggregatorFactory.estimateCardinality(capturedRow2.getRaw("unique_hosts"), false), 0.001);
}
Also used: BytesWritable(org.apache.hadoop.io.BytesWritable) StringDimensionSchema(org.apache.druid.data.input.impl.StringDimensionSchema) MapBasedInputRow(org.apache.druid.data.input.MapBasedInputRow) InputRow(org.apache.druid.data.input.InputRow) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) Reducer(org.apache.hadoop.mapreduce.Reducer) Test(org.junit.Test)
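The serde round trip the combiner depends on can also be exercised in isolation. A minimal sketch reusing the typeHelperMap and aggregators from the test above:

InputRow original = new MapBasedInputRow(
    timestamp,
    ImmutableList.of("host"),
    ImmutableMap.of("host", "host1", "visited", 10)
);
byte[] bytes = InputRowSerde.toBytes(typeHelperMap, original, aggregators).getSerializedRow();
InputRow roundTripped = InputRowSerde.fromBytes(typeHelperMap, bytes, aggregators);
// Dimensions survive intact; metrics come back pre-aggregated (e.g. visited_sum).
Assert.assertEquals(original.getDimension("host"), roundTripped.getDimension("host"));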

Aggregations

StringDimensionSchema (org.apache.druid.data.input.impl.StringDimensionSchema): 36
DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec): 30
Test (org.junit.Test): 24
LongSumAggregatorFactory (org.apache.druid.query.aggregation.LongSumAggregatorFactory): 19
LongDimensionSchema (org.apache.druid.data.input.impl.LongDimensionSchema): 15
AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory): 15
FloatDimensionSchema (org.apache.druid.data.input.impl.FloatDimensionSchema): 14
TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec): 12
MapBasedInputRow (org.apache.druid.data.input.MapBasedInputRow): 11
CountAggregatorFactory (org.apache.druid.query.aggregation.CountAggregatorFactory): 11
File (java.io.File): 8
ArrayList (java.util.ArrayList): 8
Before (org.junit.Before): 8
ImmutableList (com.google.common.collect.ImmutableList): 7
HashMap (java.util.HashMap): 7
DataSchema (org.apache.druid.segment.indexing.DataSchema): 7
UniformGranularitySpec (org.apache.druid.segment.indexing.granularity.UniformGranularitySpec): 7
List (java.util.List): 6
SupervisorStateManagerConfig (org.apache.druid.indexing.overlord.supervisor.SupervisorStateManagerConfig): 6
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper): 5