Examples with DataSchema - org.apache.druid.segment.indexing.DataSchema

Example 31 with DataSchema

use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.

the class IndexTaskSamplerSpecTest method testSerde.

@Test
public void testSerde() throws IOException {
    String json = "{\n" + "  \"type\": \"index\",\n" + "  \"samplerConfig\": {\n" + "    \"numRows\": 123,\n" + "    \"timeoutMs\": 2345\n" + "  },\n" + "  \"spec\": {\n" + "    \"dataSchema\": {\n" + "      \"dataSource\": \"sampler\",\n" + "      \"dimensionsSpec\": {},\n" + "      \"timestampSpec\": {\n" + "        \"missingValue\": \"1970\"\n" + "      }\n" + "    },\n" + "    \"ioConfig\": {\n" + "      \"type\": \"index\",\n" + "      \"inputSource\": {\n" + "        \"type\": \"local\",\n" + "        \"baseDir\": \"/tmp\",\n" + "        \"filter\": \"wikiticker-2015-09-12-sampled.json\"\n" + "      },\n" + "      \"inputFormat\": {\n" + "        \"type\": \"json\"\n" + "      }\n" + "    }\n" + "  }\n" + "}";
    Capture<InputSource> capturedInputSource = EasyMock.newCapture();
    Capture<InputFormat> capturedInputFormat = EasyMock.newCapture();
    Capture<DataSchema> capturedDataSchema = EasyMock.newCapture();
    Capture<SamplerConfig> capturedSamplerConfig = EasyMock.newCapture();
    IndexTaskSamplerSpec spec = MAPPER.readValue(json, IndexTaskSamplerSpec.class);
    EasyMock.expect(inputSourceSampler.sample(EasyMock.capture(capturedInputSource), EasyMock.capture(capturedInputFormat), EasyMock.capture(capturedDataSchema), EasyMock.capture(capturedSamplerConfig))).andReturn(new SamplerResponse(0, 0, null));
    replayAll();
    spec.sample();
    verifyAll();
    InputSource inputSource = capturedInputSource.getValue();
    Assert.assertEquals(new File("/tmp"), ((LocalInputSource) inputSource).getBaseDir());
    Assert.assertEquals("wikiticker-2015-09-12-sampled.json", ((LocalInputSource) inputSource).getFilter());
    DataSchema dataSchema = capturedDataSchema.getValue();
    Assert.assertEquals("sampler", dataSchema.getDataSource());
    Assert.assertEquals(JsonInputFormat.class, capturedInputFormat.getValue().getClass());
    SamplerConfig samplerConfig = capturedSamplerConfig.getValue();
    Assert.assertEquals(123, samplerConfig.getNumRows());
    Assert.assertEquals(2345, samplerConfig.getTimeoutMs());
}

Also used : DataSchema(org.apache.druid.segment.indexing.DataSchema) LocalInputSource(org.apache.druid.data.input.impl.LocalInputSource) InputSource(org.apache.druid.data.input.InputSource) InputFormat(org.apache.druid.data.input.InputFormat) JsonInputFormat(org.apache.druid.data.input.impl.JsonInputFormat) SamplerResponse(org.apache.druid.client.indexing.SamplerResponse) File(java.io.File) Test(org.junit.Test)

Example 32 with DataSchema

use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.

the class InputSourceSamplerTest method testWithTransformsAutoDimensions.

@Test
public void testWithTransformsAutoDimensions() throws IOException {
    final TimestampSpec timestampSpec = new TimestampSpec("t", null, null);
    final DimensionsSpec dimensionsSpec = new DimensionsSpec(null);
    final TransformSpec transformSpec = new TransformSpec(null, ImmutableList.of(new ExpressionTransform("dim1PlusBar", "concat(dim1, 'bar')", TestExprMacroTable.INSTANCE)));
    final AggregatorFactory[] aggregatorFactories = { new LongSumAggregatorFactory("met1", "met1") };
    final GranularitySpec granularitySpec = new UniformGranularitySpec(Granularities.DAY, Granularities.HOUR, true, null);
    final DataSchema dataSchema = createDataSchema(timestampSpec, dimensionsSpec, aggregatorFactories, granularitySpec, transformSpec);
    final InputSource inputSource = createInputSource(getTestRows(), dataSchema);
    final InputFormat inputFormat = createInputFormat();
    SamplerResponse response = inputSourceSampler.sample(inputSource, inputFormat, dataSchema, null);
    Assert.assertEquals(6, response.getNumRowsRead());
    Assert.assertEquals(5, response.getNumRowsIndexed());
    Assert.assertEquals(4, response.getData().size());
    List<SamplerResponseRow> data = response.getData();
    assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(0), new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>().put("__time", 1555934400000L).put("dim1", "foo").put("dim2", null).put("met1", 6L).build(), null, null), data.get(0));
    assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(3), new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>().put("__time", 1555934400000L).put("dim1", "foo2").put("dim2", null).put("met1", 4L).build(), null, null), data.get(1));
    assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(4), new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>().put("__time", 1555934400000L).put("dim1", "foo").put("dim2", "bar").put("met1", 5L).build(), null, null), data.get(2));
    assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(5), null, true, getUnparseableTimestampString()), data.get(3));
}

Also used : RecordSupplierInputSource(org.apache.druid.indexing.seekablestream.RecordSupplierInputSource) InlineInputSource(org.apache.druid.data.input.impl.InlineInputSource) InputSource(org.apache.druid.data.input.InputSource) SamplerResponse(org.apache.druid.client.indexing.SamplerResponse) LongSumAggregatorFactory(org.apache.druid.query.aggregation.LongSumAggregatorFactory) LongSumAggregatorFactory(org.apache.druid.query.aggregation.LongSumAggregatorFactory) AggregatorFactory(org.apache.druid.query.aggregation.AggregatorFactory) TransformSpec(org.apache.druid.segment.transform.TransformSpec) DataSchema(org.apache.druid.segment.indexing.DataSchema) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec) JsonInputFormat(org.apache.druid.data.input.impl.JsonInputFormat) InputFormat(org.apache.druid.data.input.InputFormat) CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) SamplerResponseRow(org.apache.druid.client.indexing.SamplerResponse.SamplerResponseRow) ExpressionTransform(org.apache.druid.segment.transform.ExpressionTransform) InitializedNullHandlingTest(org.apache.druid.testing.InitializedNullHandlingTest) Test(org.junit.Test)

Example 33 with DataSchema

use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.

the class InputSourceSamplerTest method testWithNoRollup.

@Test
public void testWithNoRollup() throws IOException {
    final TimestampSpec timestampSpec = new TimestampSpec("t", null, null);
    final DimensionsSpec dimensionsSpec = new DimensionsSpec(null);
    final AggregatorFactory[] aggregatorFactories = { new LongSumAggregatorFactory("met1", "met1") };
    final GranularitySpec granularitySpec = new UniformGranularitySpec(Granularities.DAY, Granularities.HOUR, false, null);
    final DataSchema dataSchema = createDataSchema(timestampSpec, dimensionsSpec, aggregatorFactories, granularitySpec, null);
    final InputSource inputSource = createInputSource(getTestRows(), dataSchema);
    final InputFormat inputFormat = createInputFormat();
    SamplerResponse response = inputSourceSampler.sample(inputSource, inputFormat, dataSchema, null);
    Assert.assertEquals(6, response.getNumRowsRead());
    Assert.assertEquals(5, response.getNumRowsIndexed());
    Assert.assertEquals(6, response.getData().size());
    List<SamplerResponseRow> data = response.getData();
    assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(0), new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>().put("__time", 1555934400000L).put("dim1", "foo").put("dim2", null).put("met1", 1L).build(), null, null), data.get(0));
    assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(1), new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>().put("__time", 1555934400000L).put("dim1", "foo").put("dim2", null).put("met1", 2L).build(), null, null), data.get(1));
    assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(2), new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>().put("__time", 1555934400000L).put("dim1", "foo").put("dim2", null).put("met1", 3L).build(), null, null), data.get(2));
    assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(3), new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>().put("__time", 1555934400000L).put("dim1", "foo2").put("dim2", null).put("met1", 4L).build(), null, null), data.get(3));
    assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(4), new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>().put("__time", 1555934400000L).put("dim1", "foo").put("dim2", "bar").put("met1", 5L).build(), null, null), data.get(4));
    assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(5), null, true, getUnparseableTimestampString()), data.get(5));
}

Also used : RecordSupplierInputSource(org.apache.druid.indexing.seekablestream.RecordSupplierInputSource) InlineInputSource(org.apache.druid.data.input.impl.InlineInputSource) InputSource(org.apache.druid.data.input.InputSource) SamplerResponse(org.apache.druid.client.indexing.SamplerResponse) LongSumAggregatorFactory(org.apache.druid.query.aggregation.LongSumAggregatorFactory) LongSumAggregatorFactory(org.apache.druid.query.aggregation.LongSumAggregatorFactory) AggregatorFactory(org.apache.druid.query.aggregation.AggregatorFactory) DataSchema(org.apache.druid.segment.indexing.DataSchema) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec) JsonInputFormat(org.apache.druid.data.input.impl.JsonInputFormat) InputFormat(org.apache.druid.data.input.InputFormat) CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) SamplerResponseRow(org.apache.druid.client.indexing.SamplerResponse.SamplerResponseRow) InitializedNullHandlingTest(org.apache.druid.testing.InitializedNullHandlingTest) Test(org.junit.Test)

Example 34 with DataSchema

use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.

the class MaterializedViewSupervisorSpec method createTask.

public HadoopIndexTask createTask(Interval interval, String version, List<DataSegment> segments) {
    String taskId = StringUtils.format("%s_%s_%s", TASK_PREFIX, dataSourceName, DateTimes.nowUtc());
    // generate parser
    Map<String, Object> parseSpec = new HashMap<>();
    parseSpec.put("format", "timeAndDims");
    parseSpec.put("dimensionsSpec", dimensionsSpec);
    Map<String, Object> parser = new HashMap<>();
    parser.put("type", "map");
    parser.put("parseSpec", parseSpec);
    // generate HadoopTuningConfig
    HadoopTuningConfig tuningConfigForTask = new HadoopTuningConfig(tuningConfig.getWorkingPath(), version, tuningConfig.getPartitionsSpec(), tuningConfig.getShardSpecs(), tuningConfig.getIndexSpec(), tuningConfig.getIndexSpecForIntermediatePersists(), tuningConfig.getAppendableIndexSpec(), tuningConfig.getMaxRowsInMemory(), tuningConfig.getMaxBytesInMemory(), tuningConfig.isLeaveIntermediate(), tuningConfig.isCleanupOnFailure(), tuningConfig.isOverwriteFiles(), tuningConfig.isIgnoreInvalidRows(), tuningConfig.getJobProperties(), tuningConfig.isCombineText(), tuningConfig.getUseCombiner(), tuningConfig.getMaxRowsInMemory(), tuningConfig.getNumBackgroundPersistThreads(), tuningConfig.isForceExtendableShardSpecs(), true, tuningConfig.getUserAllowedHadoopPrefix(), tuningConfig.isLogParseExceptions(), tuningConfig.getMaxParseExceptions(), tuningConfig.isUseYarnRMJobStatusFallback(), tuningConfig.getAwaitSegmentAvailabilityTimeoutMillis());
    // generate granularity
    ArbitraryGranularitySpec granularitySpec = new ArbitraryGranularitySpec(Granularities.NONE, ImmutableList.of(interval));
    // generate DataSchema
    DataSchema dataSchema = new DataSchema(dataSourceName, parser, aggregators, granularitySpec, TransformSpec.NONE, objectMapper);
    // generate DatasourceIngestionSpec
    DatasourceIngestionSpec datasourceIngestionSpec = new DatasourceIngestionSpec(baseDataSource, null, ImmutableList.of(interval), segments, null, null, null, false, null);
    // generate HadoopIOConfig
    Map<String, Object> inputSpec = new HashMap<>();
    inputSpec.put("type", "dataSource");
    inputSpec.put("ingestionSpec", datasourceIngestionSpec);
    HadoopIOConfig hadoopIOConfig = new HadoopIOConfig(inputSpec, null, null);
    // generate HadoopIngestionSpec
    HadoopIngestionSpec spec = new HadoopIngestionSpec(dataSchema, hadoopIOConfig, tuningConfigForTask);
    // generate HadoopIndexTask
    HadoopIndexTask task = new HadoopIndexTask(taskId, spec, hadoopCoordinates, hadoopDependencyCoordinates, classpathPrefix, objectMapper, context, authorizerMapper, chatHandlerProvider);
    return task;
}

Also used : DataSchema(org.apache.druid.segment.indexing.DataSchema) DatasourceIngestionSpec(org.apache.druid.indexer.hadoop.DatasourceIngestionSpec) HadoopIngestionSpec(org.apache.druid.indexer.HadoopIngestionSpec) HashMap(java.util.HashMap) HadoopTuningConfig(org.apache.druid.indexer.HadoopTuningConfig) ArbitraryGranularitySpec(org.apache.druid.segment.indexing.granularity.ArbitraryGranularitySpec) HadoopIndexTask(org.apache.druid.indexing.common.task.HadoopIndexTask) HadoopIOConfig(org.apache.druid.indexer.HadoopIOConfig)

Example 35 with DataSchema

use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.

the class KafkaIndexTaskTest method testKafkaInputFormat.

@Test(timeout = 60_000L)
public void testKafkaInputFormat() throws Exception {
    // Insert data
    insertData(Iterables.limit(records, 3));
    final KafkaIndexTask task = createTask(null, new DataSchema("test_ds", new TimestampSpec("timestamp", "iso", null), new DimensionsSpec(Arrays.asList(new StringDimensionSchema("dim1"), new StringDimensionSchema("dim1t"), new StringDimensionSchema("dim2"), new LongDimensionSchema("dimLong"), new FloatDimensionSchema("dimFloat"), new StringDimensionSchema("kafka.testheader.encoding"))), new AggregatorFactory[] { new DoubleSumAggregatorFactory("met1sum", "met1"), new CountAggregatorFactory("rows") }, new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null), null), new KafkaIndexTaskIOConfig(0, "sequence0", new SeekableStreamStartSequenceNumbers<>(topic, ImmutableMap.of(0, 0L), ImmutableSet.of()), new SeekableStreamEndSequenceNumbers<>(topic, ImmutableMap.of(0, 5L)), kafkaServer.consumerProperties(), KafkaSupervisorIOConfig.DEFAULT_POLL_TIMEOUT_MILLIS, true, null, null, KAFKA_INPUT_FORMAT));
    Assert.assertTrue(task.supportsQueries());
    final ListenableFuture<TaskStatus> future = runTask(task);
    while (countEvents(task) != 3) {
        Thread.sleep(25);
    }
    Assert.assertEquals(Status.READING, task.getRunner().getStatus());
    final QuerySegmentSpec interval = OBJECT_MAPPER.readValue("\"2008/2012\"", QuerySegmentSpec.class);
    List<ScanResultValue> scanResultValues = scanData(task, interval);
    // verify that there are no records indexed in the rollbacked time period
    Assert.assertEquals(3, Iterables.size(scanResultValues));
    int i = 0;
    for (ScanResultValue result : scanResultValues) {
        final Map<String, Object> event = ((List<Map<String, Object>>) result.getEvents()).get(0);
        Assert.assertEquals("application/json", event.get("kafka.testheader.encoding"));
        Assert.assertEquals("y", event.get("dim2"));
    }
    // insert remaining data
    insertData(Iterables.skip(records, 3));
    // Wait for task to exit
    Assert.assertEquals(TaskState.SUCCESS, future.get().getStatusCode());
    // Check metrics
    Assert.assertEquals(4, task.getRunner().getRowIngestionMeters().getProcessed());
    Assert.assertEquals(0, task.getRunner().getRowIngestionMeters().getUnparseable());
    Assert.assertEquals(0, task.getRunner().getRowIngestionMeters().getThrownAway());
}

Also used : DoubleSumAggregatorFactory(org.apache.druid.query.aggregation.DoubleSumAggregatorFactory) LongDimensionSchema(org.apache.druid.data.input.impl.LongDimensionSchema) FloatDimensionSchema(org.apache.druid.data.input.impl.FloatDimensionSchema) DoubleSumAggregatorFactory(org.apache.druid.query.aggregation.DoubleSumAggregatorFactory) AggregatorFactory(org.apache.druid.query.aggregation.AggregatorFactory) CountAggregatorFactory(org.apache.druid.query.aggregation.CountAggregatorFactory) TaskStatus(org.apache.druid.indexer.TaskStatus) StringDimensionSchema(org.apache.druid.data.input.impl.StringDimensionSchema) DataSchema(org.apache.druid.segment.indexing.DataSchema) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) CountAggregatorFactory(org.apache.druid.query.aggregation.CountAggregatorFactory) ScanResultValue(org.apache.druid.query.scan.ScanResultValue) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) SeekableStreamStartSequenceNumbers(org.apache.druid.indexing.seekablestream.SeekableStreamStartSequenceNumbers) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) QuerySegmentSpec(org.apache.druid.query.spec.QuerySegmentSpec) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) SeekableStreamEndSequenceNumbers(org.apache.druid.indexing.seekablestream.SeekableStreamEndSequenceNumbers) Test(org.junit.Test) IndexTaskTest(org.apache.druid.indexing.common.task.IndexTaskTest)

Aggregations

DataSchema (org.apache.druid.segment.indexing.DataSchema)80 UniformGranularitySpec (org.apache.druid.segment.indexing.granularity.UniformGranularitySpec)49 TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec)45 Test (org.junit.Test)44 DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec)32 AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory)25 LongSumAggregatorFactory (org.apache.druid.query.aggregation.LongSumAggregatorFactory)22 GranularitySpec (org.apache.druid.segment.indexing.granularity.GranularitySpec)19 InputSource (org.apache.druid.data.input.InputSource)17 InitializedNullHandlingTest (org.apache.druid.testing.InitializedNullHandlingTest)17 File (java.io.File)16 Map (java.util.Map)15 InputFormat (org.apache.druid.data.input.InputFormat)15 CountAggregatorFactory (org.apache.druid.query.aggregation.CountAggregatorFactory)15 SamplerResponse (org.apache.druid.client.indexing.SamplerResponse)14 SamplerResponseRow (org.apache.druid.client.indexing.SamplerResponse.SamplerResponseRow)13 CsvInputFormat (org.apache.druid.data.input.impl.CsvInputFormat)13 Interval (org.joda.time.Interval)13 ArrayList (java.util.ArrayList)12 JsonInputFormat (org.apache.druid.data.input.impl.JsonInputFormat)12