Examples with CSVParseSpec - org.apache.druid.data.input.impl.CSVParseSpec

Example 11 with CSVParseSpec

use of org.apache.druid.data.input.impl.CSVParseSpec in project druid by druid-io.

the class IndexTaskTest method testMultipleParseExceptionsFailure.

@Test
public void testMultipleParseExceptionsFailure() throws Exception {
    final File tmpDir = temporaryFolder.newFolder();
    final File tmpFile = File.createTempFile("druid", "index", tmpDir);
    try (BufferedWriter writer = Files.newWriter(tmpFile, StandardCharsets.UTF_8)) {
        writer.write("time,dim,dimLong,dimFloat,val\n");
        // unparseable
        writer.write("unparseable,a,2,3.0,1\n");
        // valid row
        writer.write("2014-01-01T00:00:10Z,a,2,3.0,1\n");
        // unparseable
        writer.write("9.0,a,2,3.0,1\n");
        // thrown away
        writer.write("3014-03-01T00:00:10Z,outsideofinterval,2,3.0,1\n");
        // unparseable
        writer.write("99999999999-01-01T00:00:10Z,b,2,3.0,1\n");
    }
    // Allow up to 3 parse exceptions, and save up to 2 parse exceptions
    final IndexTuningConfig tuningConfig = new IndexTuningConfig(null, null, null, null, null, null, null, null, null, null, new DynamicPartitionsSpec(2, null), INDEX_SPEC, null, null, false, false, null, null, null, true, 2, 5, null, null);
    final TimestampSpec timestampSpec = new TimestampSpec("time", "auto", null);
    final DimensionsSpec dimensionsSpec = new DimensionsSpec(Arrays.asList(new StringDimensionSchema("dim"), new LongDimensionSchema("dimLong"), new FloatDimensionSchema("dimFloat")));
    final List<String> columns = Arrays.asList("time", "dim", "dimLong", "dimFloat", "val");
    final IndexIngestionSpec ingestionSpec;
    List<String> expectedMessages;
    if (useInputFormatApi) {
        ingestionSpec = createIngestionSpec(jsonMapper, tmpDir, timestampSpec, dimensionsSpec, new CsvInputFormat(columns, null, null, true, 0), null, null, tuningConfig, false, false);
        expectedMessages = Arrays.asList(StringUtils.format("Timestamp[99999999999-01-01T00:00:10Z] is unparseable! Event: {time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1} (Path: %s, Record: 3, Line: 6)", tmpFile.toURI()), StringUtils.format("Timestamp[9.0] is unparseable! Event: {time=9.0, dim=a, dimLong=2, dimFloat=3.0, val=1} (Path: %s, Record: 2, Line: 4)", tmpFile.toURI()), StringUtils.format("Timestamp[unparseable] is unparseable! Event: {time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1} (Path: %s, Record: 1, Line: 2)", tmpFile.toURI()));
    } else {
        ingestionSpec = createIngestionSpec(jsonMapper, tmpDir, new CSVParseSpec(timestampSpec, dimensionsSpec, null, columns, true, 0), null, null, tuningConfig, false, false);
        expectedMessages = Arrays.asList("Timestamp[99999999999-01-01T00:00:10Z] is unparseable! Event: {time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1}", "Timestamp[9.0] is unparseable! Event: {time=9.0, dim=a, dimLong=2, dimFloat=3.0, val=1}", "Timestamp[unparseable] is unparseable! Event: {time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1}");
    }
    IndexTask indexTask = new IndexTask(null, null, ingestionSpec, null);
    TaskStatus status = runTask(indexTask).lhs;
    Assert.assertEquals(TaskState.FAILED, status.getStatusCode());
    checkTaskStatusErrorMsgForParseExceptionsExceeded(status);
    IngestionStatsAndErrorsTaskReportData reportData = getTaskReportData();
    Map<String, Object> expectedMetrics = ImmutableMap.of(RowIngestionMeters.DETERMINE_PARTITIONS, ImmutableMap.of(RowIngestionMeters.PROCESSED_WITH_ERROR, 0, RowIngestionMeters.PROCESSED, 0, RowIngestionMeters.UNPARSEABLE, 0, RowIngestionMeters.THROWN_AWAY, 0), RowIngestionMeters.BUILD_SEGMENTS, ImmutableMap.of(RowIngestionMeters.PROCESSED_WITH_ERROR, 0, RowIngestionMeters.PROCESSED, 1, RowIngestionMeters.UNPARSEABLE, 3, RowIngestionMeters.THROWN_AWAY, useInputFormatApi ? 1 : 2));
    Assert.assertEquals(expectedMetrics, reportData.getRowStats());
    List<LinkedHashMap> parseExceptionReports = (List<LinkedHashMap>) reportData.getUnparseableEvents().get(RowIngestionMeters.BUILD_SEGMENTS);
    List<String> actualMessages = parseExceptionReports.stream().map((r) -> {
        return ((List<String>) r.get("details")).get(0);
    }).collect(Collectors.toList());
    Assert.assertEquals(expectedMessages, actualMessages);
    List<String> expectedInputs = Arrays.asList("{time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1}", "{time=9.0, dim=a, dimLong=2, dimFloat=3.0, val=1}", "{time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1}");
    List<String> actualInputs = parseExceptionReports.stream().map((r) -> {
        return (String) r.get("input");
    }).collect(Collectors.toList());
    Assert.assertEquals(expectedInputs, actualInputs);
}

Also used : TaskReport(org.apache.druid.indexing.common.TaskReport) TaskToolbox(org.apache.druid.indexing.common.TaskToolbox) Arrays(java.util.Arrays) IndexSpec(org.apache.druid.segment.IndexSpec) Pair(org.apache.druid.java.util.common.Pair) Map(java.util.Map) ExpressionTransform(org.apache.druid.segment.transform.ExpressionTransform) AppenderatorsManager(org.apache.druid.segment.realtime.appenderator.AppenderatorsManager) JsonInputFormat(org.apache.druid.data.input.impl.JsonInputFormat) IAE(org.apache.druid.java.util.common.IAE) InputFormat(org.apache.druid.data.input.InputFormat) IngestionStatsAndErrorsTaskReportData(org.apache.druid.indexing.common.IngestionStatsAndErrorsTaskReportData) Set(java.util.Set) NoopSegmentHandoffNotifierFactory(org.apache.druid.segment.realtime.plumber.NoopSegmentHandoffNotifierFactory) EqualsVerifier(nl.jqno.equalsverifier.EqualsVerifier) StringDimensionSchema(org.apache.druid.data.input.impl.StringDimensionSchema) StandardCharsets(java.nio.charset.StandardCharsets) TaskState(org.apache.druid.indexer.TaskState) CountDownLatch(java.util.concurrent.CountDownLatch) PartitionIds(org.apache.druid.timeline.partition.PartitionIds) IndexTuningConfig(org.apache.druid.indexing.common.task.IndexTask.IndexTuningConfig) RowIngestionMetersFactory(org.apache.druid.segment.incremental.RowIngestionMetersFactory) SegmentLocalCacheManager(org.apache.druid.segment.loading.SegmentLocalCacheManager) SegmentId(org.apache.druid.timeline.SegmentId) TransformSpec(org.apache.druid.segment.transform.TransformSpec) Granularity(org.apache.druid.java.util.common.granularity.Granularity) SegmentLoaderConfig(org.apache.druid.segment.loading.SegmentLoaderConfig) HashBasedNumberedShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedShardSpec) RunWith(org.junit.runner.RunWith) TaskStatus(org.apache.druid.indexer.TaskStatus) ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) Interval(org.joda.time.Interval) StringInputRowParser(org.apache.druid.data.input.impl.StringInputRowParser) PartitionsSpec(org.apache.druid.indexer.partitions.PartitionsSpec) Nullable(javax.annotation.Nullable) HashPartitionFunction(org.apache.druid.timeline.partition.HashPartitionFunction) Before(org.junit.Before) BufferedWriter(java.io.BufferedWriter) GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) Test(org.junit.Test) IOException(java.io.IOException) EasyMock(org.easymock.EasyMock) File(java.io.File) Preconditions(com.google.common.base.Preconditions) Assert(org.junit.Assert) DataSchema(org.apache.druid.segment.indexing.DataSchema) CoreMatchers(org.hamcrest.CoreMatchers) ArbitraryGranularitySpec(org.apache.druid.segment.indexing.granularity.ArbitraryGranularitySpec) IndexIOConfig(org.apache.druid.indexing.common.task.IndexTask.IndexIOConfig) LocalInputSource(org.apache.druid.data.input.impl.LocalInputSource) LongDimensionSchema(org.apache.druid.data.input.impl.LongDimensionSchema) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) DefaultDimensionSpec(org.apache.druid.query.dimension.DefaultDimensionSpec) CSVParseSpec(org.apache.druid.data.input.impl.CSVParseSpec) LongSumAggregatorFactory(org.apache.druid.query.aggregation.LongSumAggregatorFactory) SelectorDimFilter(org.apache.druid.query.filter.SelectorDimFilter) Event(org.apache.druid.java.util.emitter.core.Event) DynamicPartitionsSpec(org.apache.druid.indexer.partitions.DynamicPartitionsSpec) TypeReference(com.fasterxml.jackson.core.type.TypeReference) Parameterized(org.junit.runners.Parameterized) ParseSpec(org.apache.druid.data.input.impl.ParseSpec) Sequence(org.apache.druid.java.util.common.guava.Sequence) ShardSpec(org.apache.druid.timeline.partition.ShardSpec) LocalFirehoseFactory(org.apache.druid.segment.realtime.firehose.LocalFirehoseFactory) ImmutableMap(com.google.common.collect.ImmutableMap) AggregatorFactory(org.apache.druid.query.aggregation.AggregatorFactory) NumberedShardSpec(org.apache.druid.timeline.partition.NumberedShardSpec) StringUtils(org.apache.druid.java.util.common.StringUtils) CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat) HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) Collectors(java.util.stream.Collectors) Sets(com.google.common.collect.Sets) LockGranularity(org.apache.druid.indexing.common.LockGranularity) ExprMacroTable(org.apache.druid.math.expr.ExprMacroTable) IndexIngestionSpec(org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec) List(java.util.List) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) ServiceEmitter(org.apache.druid.java.util.emitter.service.ServiceEmitter) DataSegment(org.apache.druid.timeline.DataSegment) SegmentHandoffNotifierFactory(org.apache.druid.segment.handoff.SegmentHandoffNotifierFactory) SegmentAllocateAction(org.apache.druid.indexing.common.actions.SegmentAllocateAction) Intervals(org.apache.druid.java.util.common.Intervals) HashMap(java.util.HashMap) RowIngestionMeters(org.apache.druid.segment.incremental.RowIngestionMeters) HashSet(java.util.HashSet) ImmutableList(com.google.common.collect.ImmutableList) FloatDimensionSchema(org.apache.druid.data.input.impl.FloatDimensionSchema) Files(com.google.common.io.Files) NumberedOverwriteShardSpec(org.apache.druid.timeline.partition.NumberedOverwriteShardSpec) DimensionSelector(org.apache.druid.segment.DimensionSelector) ExpectedException(org.junit.rules.ExpectedException) SegmentHandoffNotifier(org.apache.druid.segment.handoff.SegmentHandoffNotifier) NoopServiceEmitter(org.apache.druid.server.metrics.NoopServiceEmitter) QueryableIndexStorageAdapter(org.apache.druid.segment.QueryableIndexStorageAdapter) VirtualColumns(org.apache.druid.segment.VirtualColumns) WindowedStorageAdapter(org.apache.druid.segment.realtime.firehose.WindowedStorageAdapter) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) JSONParseSpec(org.apache.druid.data.input.impl.JSONParseSpec) StorageLocationConfig(org.apache.druid.segment.loading.StorageLocationConfig) Granularities(org.apache.druid.java.util.common.granularity.Granularities) TimeUnit(java.util.concurrent.TimeUnit) Rule(org.junit.Rule) SingleDimensionPartitionsSpec(org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec) Cursor(org.apache.druid.segment.Cursor) SegmentCacheManager(org.apache.druid.segment.loading.SegmentCacheManager) IndexIO(org.apache.druid.segment.IndexIO) Collections(java.util.Collections) TemporaryFolder(org.junit.rules.TemporaryFolder) IngestionStatsAndErrorsTaskReportData(org.apache.druid.indexing.common.IngestionStatsAndErrorsTaskReportData) LongDimensionSchema(org.apache.druid.data.input.impl.LongDimensionSchema) FloatDimensionSchema(org.apache.druid.data.input.impl.FloatDimensionSchema) TaskStatus(org.apache.druid.indexer.TaskStatus) BufferedWriter(java.io.BufferedWriter) StringDimensionSchema(org.apache.druid.data.input.impl.StringDimensionSchema) LinkedHashMap(java.util.LinkedHashMap) IndexIngestionSpec(org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec) DynamicPartitionsSpec(org.apache.druid.indexer.partitions.DynamicPartitionsSpec) CSVParseSpec(org.apache.druid.data.input.impl.CSVParseSpec) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) File(java.io.File) IndexTuningConfig(org.apache.druid.indexing.common.task.IndexTask.IndexTuningConfig) Test(org.junit.Test)

Example 12 with CSVParseSpec

use of org.apache.druid.data.input.impl.CSVParseSpec in project druid by druid-io.

the class IndexTaskTest method testReportParseException.

@Test
public void testReportParseException() throws Exception {
    final File tmpDir = temporaryFolder.newFolder();
    final File tmpFile = File.createTempFile("druid", "index", tmpDir);
    try (BufferedWriter writer = Files.newWriter(tmpFile, StandardCharsets.UTF_8)) {
        writer.write("time,d,val\n");
        writer.write("unparseable,a,1\n");
        writer.write("2014-01-01T00:00:10Z,a,1\n");
    }
    final TimestampSpec timestampSpec = new TimestampSpec("time", "auto", null);
    final List<String> columns = Arrays.asList("time", "dim", "val");
    // report parse exception
    final IndexTuningConfig tuningConfig = createTuningConfig(2, null, null, null, null, false, true);
    final IndexIngestionSpec indexIngestionSpec;
    List<String> expectedMessages;
    if (useInputFormatApi) {
        indexIngestionSpec = createIngestionSpec(jsonMapper, tmpDir, timestampSpec, DimensionsSpec.EMPTY, new CsvInputFormat(columns, null, null, true, 0), null, null, tuningConfig, false, false);
        expectedMessages = ImmutableList.of(StringUtils.format("Timestamp[unparseable] is unparseable! Event: {time=unparseable, d=a, val=1} (Path: %s, Record: 1, Line: 2)", tmpFile.toURI()));
    } else {
        indexIngestionSpec = createIngestionSpec(jsonMapper, tmpDir, new CSVParseSpec(timestampSpec, DimensionsSpec.EMPTY, null, columns, true, 0), null, null, tuningConfig, false, false);
        expectedMessages = ImmutableList.of("Timestamp[unparseable] is unparseable! Event: {time=unparseable, d=a, val=1}");
    }
    IndexTask indexTask = new IndexTask(null, null, indexIngestionSpec, null);
    TaskStatus status = runTask(indexTask).lhs;
    Assert.assertEquals(TaskState.FAILED, status.getStatusCode());
    checkTaskStatusErrorMsgForParseExceptionsExceeded(status);
    IngestionStatsAndErrorsTaskReportData reportData = getTaskReportData();
    List<LinkedHashMap> parseExceptionReports = (List<LinkedHashMap>) reportData.getUnparseableEvents().get(RowIngestionMeters.BUILD_SEGMENTS);
    List<String> actualMessages = parseExceptionReports.stream().map((r) -> {
        return ((List<String>) r.get("details")).get(0);
    }).collect(Collectors.toList());
    Assert.assertEquals(expectedMessages, actualMessages);
    List<String> expectedInputs = ImmutableList.of("{time=unparseable, d=a, val=1}");
    List<String> actualInputs = parseExceptionReports.stream().map((r) -> {
        return (String) r.get("input");
    }).collect(Collectors.toList());
    Assert.assertEquals(expectedInputs, actualInputs);
}

Also used : TaskReport(org.apache.druid.indexing.common.TaskReport) TaskToolbox(org.apache.druid.indexing.common.TaskToolbox) Arrays(java.util.Arrays) IndexSpec(org.apache.druid.segment.IndexSpec) Pair(org.apache.druid.java.util.common.Pair) Map(java.util.Map) ExpressionTransform(org.apache.druid.segment.transform.ExpressionTransform) AppenderatorsManager(org.apache.druid.segment.realtime.appenderator.AppenderatorsManager) JsonInputFormat(org.apache.druid.data.input.impl.JsonInputFormat) IAE(org.apache.druid.java.util.common.IAE) InputFormat(org.apache.druid.data.input.InputFormat) IngestionStatsAndErrorsTaskReportData(org.apache.druid.indexing.common.IngestionStatsAndErrorsTaskReportData) Set(java.util.Set) NoopSegmentHandoffNotifierFactory(org.apache.druid.segment.realtime.plumber.NoopSegmentHandoffNotifierFactory) EqualsVerifier(nl.jqno.equalsverifier.EqualsVerifier) StringDimensionSchema(org.apache.druid.data.input.impl.StringDimensionSchema) StandardCharsets(java.nio.charset.StandardCharsets) TaskState(org.apache.druid.indexer.TaskState) CountDownLatch(java.util.concurrent.CountDownLatch) PartitionIds(org.apache.druid.timeline.partition.PartitionIds) IndexTuningConfig(org.apache.druid.indexing.common.task.IndexTask.IndexTuningConfig) RowIngestionMetersFactory(org.apache.druid.segment.incremental.RowIngestionMetersFactory) SegmentLocalCacheManager(org.apache.druid.segment.loading.SegmentLocalCacheManager) SegmentId(org.apache.druid.timeline.SegmentId) TransformSpec(org.apache.druid.segment.transform.TransformSpec) Granularity(org.apache.druid.java.util.common.granularity.Granularity) SegmentLoaderConfig(org.apache.druid.segment.loading.SegmentLoaderConfig) HashBasedNumberedShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedShardSpec) RunWith(org.junit.runner.RunWith) TaskStatus(org.apache.druid.indexer.TaskStatus) ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) Interval(org.joda.time.Interval) StringInputRowParser(org.apache.druid.data.input.impl.StringInputRowParser) PartitionsSpec(org.apache.druid.indexer.partitions.PartitionsSpec) Nullable(javax.annotation.Nullable) HashPartitionFunction(org.apache.druid.timeline.partition.HashPartitionFunction) Before(org.junit.Before) BufferedWriter(java.io.BufferedWriter) GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) Test(org.junit.Test) IOException(java.io.IOException) EasyMock(org.easymock.EasyMock) File(java.io.File) Preconditions(com.google.common.base.Preconditions) Assert(org.junit.Assert) DataSchema(org.apache.druid.segment.indexing.DataSchema) CoreMatchers(org.hamcrest.CoreMatchers) ArbitraryGranularitySpec(org.apache.druid.segment.indexing.granularity.ArbitraryGranularitySpec) IndexIOConfig(org.apache.druid.indexing.common.task.IndexTask.IndexIOConfig) LocalInputSource(org.apache.druid.data.input.impl.LocalInputSource) LongDimensionSchema(org.apache.druid.data.input.impl.LongDimensionSchema) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) DefaultDimensionSpec(org.apache.druid.query.dimension.DefaultDimensionSpec) CSVParseSpec(org.apache.druid.data.input.impl.CSVParseSpec) LongSumAggregatorFactory(org.apache.druid.query.aggregation.LongSumAggregatorFactory) SelectorDimFilter(org.apache.druid.query.filter.SelectorDimFilter) Event(org.apache.druid.java.util.emitter.core.Event) DynamicPartitionsSpec(org.apache.druid.indexer.partitions.DynamicPartitionsSpec) TypeReference(com.fasterxml.jackson.core.type.TypeReference) Parameterized(org.junit.runners.Parameterized) ParseSpec(org.apache.druid.data.input.impl.ParseSpec) Sequence(org.apache.druid.java.util.common.guava.Sequence) ShardSpec(org.apache.druid.timeline.partition.ShardSpec) LocalFirehoseFactory(org.apache.druid.segment.realtime.firehose.LocalFirehoseFactory) ImmutableMap(com.google.common.collect.ImmutableMap) AggregatorFactory(org.apache.druid.query.aggregation.AggregatorFactory) NumberedShardSpec(org.apache.druid.timeline.partition.NumberedShardSpec) StringUtils(org.apache.druid.java.util.common.StringUtils) CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat) HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) Collectors(java.util.stream.Collectors) Sets(com.google.common.collect.Sets) LockGranularity(org.apache.druid.indexing.common.LockGranularity) ExprMacroTable(org.apache.druid.math.expr.ExprMacroTable) IndexIngestionSpec(org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec) List(java.util.List) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) ServiceEmitter(org.apache.druid.java.util.emitter.service.ServiceEmitter) DataSegment(org.apache.druid.timeline.DataSegment) SegmentHandoffNotifierFactory(org.apache.druid.segment.handoff.SegmentHandoffNotifierFactory) SegmentAllocateAction(org.apache.druid.indexing.common.actions.SegmentAllocateAction) Intervals(org.apache.druid.java.util.common.Intervals) HashMap(java.util.HashMap) RowIngestionMeters(org.apache.druid.segment.incremental.RowIngestionMeters) HashSet(java.util.HashSet) ImmutableList(com.google.common.collect.ImmutableList) FloatDimensionSchema(org.apache.druid.data.input.impl.FloatDimensionSchema) Files(com.google.common.io.Files) NumberedOverwriteShardSpec(org.apache.druid.timeline.partition.NumberedOverwriteShardSpec) DimensionSelector(org.apache.druid.segment.DimensionSelector) ExpectedException(org.junit.rules.ExpectedException) SegmentHandoffNotifier(org.apache.druid.segment.handoff.SegmentHandoffNotifier) NoopServiceEmitter(org.apache.druid.server.metrics.NoopServiceEmitter) QueryableIndexStorageAdapter(org.apache.druid.segment.QueryableIndexStorageAdapter) VirtualColumns(org.apache.druid.segment.VirtualColumns) WindowedStorageAdapter(org.apache.druid.segment.realtime.firehose.WindowedStorageAdapter) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) JSONParseSpec(org.apache.druid.data.input.impl.JSONParseSpec) StorageLocationConfig(org.apache.druid.segment.loading.StorageLocationConfig) Granularities(org.apache.druid.java.util.common.granularity.Granularities) TimeUnit(java.util.concurrent.TimeUnit) Rule(org.junit.Rule) SingleDimensionPartitionsSpec(org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec) Cursor(org.apache.druid.segment.Cursor) SegmentCacheManager(org.apache.druid.segment.loading.SegmentCacheManager) IndexIO(org.apache.druid.segment.IndexIO) Collections(java.util.Collections) TemporaryFolder(org.junit.rules.TemporaryFolder) IngestionStatsAndErrorsTaskReportData(org.apache.druid.indexing.common.IngestionStatsAndErrorsTaskReportData) TaskStatus(org.apache.druid.indexer.TaskStatus) BufferedWriter(java.io.BufferedWriter) LinkedHashMap(java.util.LinkedHashMap) IndexIngestionSpec(org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec) CSVParseSpec(org.apache.druid.data.input.impl.CSVParseSpec) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) File(java.io.File) IndexTuningConfig(org.apache.druid.indexing.common.task.IndexTask.IndexTuningConfig) Test(org.junit.Test)

Example 13 with CSVParseSpec

use of org.apache.druid.data.input.impl.CSVParseSpec in project druid by druid-io.

the class IndexTaskTest method testMultipleParseExceptionsFailureAtDeterminePartitions.

@Test
public void testMultipleParseExceptionsFailureAtDeterminePartitions() throws Exception {
    final File tmpDir = temporaryFolder.newFolder();
    final File tmpFile = File.createTempFile("druid", "index", tmpDir);
    try (BufferedWriter writer = Files.newWriter(tmpFile, StandardCharsets.UTF_8)) {
        writer.write("time,dim,dimLong,dimFloat,val\n");
        // unparseable
        writer.write("unparseable,a,2,3.0,1\n");
        // valid row
        writer.write("2014-01-01T00:00:10Z,a,2,3.0,1\n");
        // unparseable
        writer.write("9.0,a,2,3.0,1\n");
        // thrown away
        writer.write("3014-03-01T00:00:10Z,outsideofinterval,2,3.0,1\n");
        // unparseable
        writer.write("99999999999-01-01T00:00:10Z,b,2,3.0,1\n");
    }
    // Allow up to 3 parse exceptions, and save up to 2 parse exceptions
    final IndexTuningConfig tuningConfig = new IndexTuningConfig(null, null, null, null, null, null, null, null, null, null, new HashedPartitionsSpec(2, null, null), INDEX_SPEC, null, null, true, false, null, null, null, true, 2, 5, null, null);
    final TimestampSpec timestampSpec = new TimestampSpec("time", "auto", null);
    final DimensionsSpec dimensionsSpec = new DimensionsSpec(Arrays.asList(new StringDimensionSchema("dim"), new LongDimensionSchema("dimLong"), new FloatDimensionSchema("dimFloat")));
    final List<String> columns = Arrays.asList("time", "dim", "dimLong", "dimFloat", "val");
    final IndexIngestionSpec ingestionSpec;
    List<String> expectedMessages;
    if (useInputFormatApi) {
        ingestionSpec = createIngestionSpec(jsonMapper, tmpDir, timestampSpec, dimensionsSpec, new CsvInputFormat(columns, null, null, true, 0), null, null, tuningConfig, false, false);
        expectedMessages = Arrays.asList(StringUtils.format("Timestamp[99999999999-01-01T00:00:10Z] is unparseable! Event: {time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1} (Path: %s, Record: 3, Line: 6)", tmpFile.toURI()), StringUtils.format("Timestamp[9.0] is unparseable! Event: {time=9.0, dim=a, dimLong=2, dimFloat=3.0, val=1} (Path: %s, Record: 2, Line: 4)", tmpFile.toURI()), StringUtils.format("Timestamp[unparseable] is unparseable! Event: {time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1} (Path: %s, Record: 1, Line: 2)", tmpFile.toURI()));
    } else {
        ingestionSpec = createIngestionSpec(jsonMapper, tmpDir, new CSVParseSpec(timestampSpec, dimensionsSpec, null, columns, true, 0), null, null, tuningConfig, false, false);
        expectedMessages = Arrays.asList("Timestamp[99999999999-01-01T00:00:10Z] is unparseable! Event: {time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1}", "Timestamp[9.0] is unparseable! Event: {time=9.0, dim=a, dimLong=2, dimFloat=3.0, val=1}", "Timestamp[unparseable] is unparseable! Event: {time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1}");
    }
    IndexTask indexTask = new IndexTask(null, null, ingestionSpec, null);
    TaskStatus status = runTask(indexTask).lhs;
    Assert.assertEquals(TaskState.FAILED, status.getStatusCode());
    checkTaskStatusErrorMsgForParseExceptionsExceeded(status);
    IngestionStatsAndErrorsTaskReportData reportData = getTaskReportData();
    Map<String, Object> expectedMetrics = ImmutableMap.of(RowIngestionMeters.DETERMINE_PARTITIONS, ImmutableMap.of(RowIngestionMeters.PROCESSED_WITH_ERROR, 0, RowIngestionMeters.PROCESSED, 1, RowIngestionMeters.UNPARSEABLE, 3, RowIngestionMeters.THROWN_AWAY, useInputFormatApi ? 1 : 2), RowIngestionMeters.BUILD_SEGMENTS, ImmutableMap.of(RowIngestionMeters.PROCESSED_WITH_ERROR, 0, RowIngestionMeters.PROCESSED, 0, RowIngestionMeters.UNPARSEABLE, 0, RowIngestionMeters.THROWN_AWAY, 0));
    Assert.assertEquals(expectedMetrics, reportData.getRowStats());
    List<LinkedHashMap> parseExceptionReports = (List<LinkedHashMap>) reportData.getUnparseableEvents().get(RowIngestionMeters.DETERMINE_PARTITIONS);
    List<String> actualMessages = parseExceptionReports.stream().map((r) -> {
        return ((List<String>) r.get("details")).get(0);
    }).collect(Collectors.toList());
    Assert.assertEquals(expectedMessages, actualMessages);
    List<String> expectedInputs = Arrays.asList("{time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1}", "{time=9.0, dim=a, dimLong=2, dimFloat=3.0, val=1}", "{time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1}");
    List<String> actualInputs = parseExceptionReports.stream().map((r) -> {
        return (String) r.get("input");
    }).collect(Collectors.toList());
    Assert.assertEquals(expectedInputs, actualInputs);
}

Also used : TaskReport(org.apache.druid.indexing.common.TaskReport) TaskToolbox(org.apache.druid.indexing.common.TaskToolbox) Arrays(java.util.Arrays) IndexSpec(org.apache.druid.segment.IndexSpec) Pair(org.apache.druid.java.util.common.Pair) Map(java.util.Map) ExpressionTransform(org.apache.druid.segment.transform.ExpressionTransform) AppenderatorsManager(org.apache.druid.segment.realtime.appenderator.AppenderatorsManager) JsonInputFormat(org.apache.druid.data.input.impl.JsonInputFormat) IAE(org.apache.druid.java.util.common.IAE) InputFormat(org.apache.druid.data.input.InputFormat) IngestionStatsAndErrorsTaskReportData(org.apache.druid.indexing.common.IngestionStatsAndErrorsTaskReportData) Set(java.util.Set) NoopSegmentHandoffNotifierFactory(org.apache.druid.segment.realtime.plumber.NoopSegmentHandoffNotifierFactory) EqualsVerifier(nl.jqno.equalsverifier.EqualsVerifier) StringDimensionSchema(org.apache.druid.data.input.impl.StringDimensionSchema) StandardCharsets(java.nio.charset.StandardCharsets) TaskState(org.apache.druid.indexer.TaskState) CountDownLatch(java.util.concurrent.CountDownLatch) PartitionIds(org.apache.druid.timeline.partition.PartitionIds) IndexTuningConfig(org.apache.druid.indexing.common.task.IndexTask.IndexTuningConfig) RowIngestionMetersFactory(org.apache.druid.segment.incremental.RowIngestionMetersFactory) SegmentLocalCacheManager(org.apache.druid.segment.loading.SegmentLocalCacheManager) SegmentId(org.apache.druid.timeline.SegmentId) TransformSpec(org.apache.druid.segment.transform.TransformSpec) Granularity(org.apache.druid.java.util.common.granularity.Granularity) SegmentLoaderConfig(org.apache.druid.segment.loading.SegmentLoaderConfig) HashBasedNumberedShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedShardSpec) RunWith(org.junit.runner.RunWith) TaskStatus(org.apache.druid.indexer.TaskStatus) ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) Interval(org.joda.time.Interval) StringInputRowParser(org.apache.druid.data.input.impl.StringInputRowParser) PartitionsSpec(org.apache.druid.indexer.partitions.PartitionsSpec) Nullable(javax.annotation.Nullable) HashPartitionFunction(org.apache.druid.timeline.partition.HashPartitionFunction) Before(org.junit.Before) BufferedWriter(java.io.BufferedWriter) GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) Test(org.junit.Test) IOException(java.io.IOException) EasyMock(org.easymock.EasyMock) File(java.io.File) Preconditions(com.google.common.base.Preconditions) Assert(org.junit.Assert) DataSchema(org.apache.druid.segment.indexing.DataSchema) CoreMatchers(org.hamcrest.CoreMatchers) ArbitraryGranularitySpec(org.apache.druid.segment.indexing.granularity.ArbitraryGranularitySpec) IndexIOConfig(org.apache.druid.indexing.common.task.IndexTask.IndexIOConfig) LocalInputSource(org.apache.druid.data.input.impl.LocalInputSource) LongDimensionSchema(org.apache.druid.data.input.impl.LongDimensionSchema) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) DefaultDimensionSpec(org.apache.druid.query.dimension.DefaultDimensionSpec) CSVParseSpec(org.apache.druid.data.input.impl.CSVParseSpec) LongSumAggregatorFactory(org.apache.druid.query.aggregation.LongSumAggregatorFactory) SelectorDimFilter(org.apache.druid.query.filter.SelectorDimFilter) Event(org.apache.druid.java.util.emitter.core.Event) DynamicPartitionsSpec(org.apache.druid.indexer.partitions.DynamicPartitionsSpec) TypeReference(com.fasterxml.jackson.core.type.TypeReference) Parameterized(org.junit.runners.Parameterized) ParseSpec(org.apache.druid.data.input.impl.ParseSpec) Sequence(org.apache.druid.java.util.common.guava.Sequence) ShardSpec(org.apache.druid.timeline.partition.ShardSpec) LocalFirehoseFactory(org.apache.druid.segment.realtime.firehose.LocalFirehoseFactory) ImmutableMap(com.google.common.collect.ImmutableMap) AggregatorFactory(org.apache.druid.query.aggregation.AggregatorFactory) NumberedShardSpec(org.apache.druid.timeline.partition.NumberedShardSpec) StringUtils(org.apache.druid.java.util.common.StringUtils) CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat) HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) Collectors(java.util.stream.Collectors) Sets(com.google.common.collect.Sets) LockGranularity(org.apache.druid.indexing.common.LockGranularity) ExprMacroTable(org.apache.druid.math.expr.ExprMacroTable) IndexIngestionSpec(org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec) List(java.util.List) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) ServiceEmitter(org.apache.druid.java.util.emitter.service.ServiceEmitter) DataSegment(org.apache.druid.timeline.DataSegment) SegmentHandoffNotifierFactory(org.apache.druid.segment.handoff.SegmentHandoffNotifierFactory) SegmentAllocateAction(org.apache.druid.indexing.common.actions.SegmentAllocateAction) Intervals(org.apache.druid.java.util.common.Intervals) HashMap(java.util.HashMap) RowIngestionMeters(org.apache.druid.segment.incremental.RowIngestionMeters) HashSet(java.util.HashSet) ImmutableList(com.google.common.collect.ImmutableList) FloatDimensionSchema(org.apache.druid.data.input.impl.FloatDimensionSchema) Files(com.google.common.io.Files) NumberedOverwriteShardSpec(org.apache.druid.timeline.partition.NumberedOverwriteShardSpec) DimensionSelector(org.apache.druid.segment.DimensionSelector) ExpectedException(org.junit.rules.ExpectedException) SegmentHandoffNotifier(org.apache.druid.segment.handoff.SegmentHandoffNotifier) NoopServiceEmitter(org.apache.druid.server.metrics.NoopServiceEmitter) QueryableIndexStorageAdapter(org.apache.druid.segment.QueryableIndexStorageAdapter) VirtualColumns(org.apache.druid.segment.VirtualColumns) WindowedStorageAdapter(org.apache.druid.segment.realtime.firehose.WindowedStorageAdapter) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) JSONParseSpec(org.apache.druid.data.input.impl.JSONParseSpec) StorageLocationConfig(org.apache.druid.segment.loading.StorageLocationConfig) Granularities(org.apache.druid.java.util.common.granularity.Granularities) TimeUnit(java.util.concurrent.TimeUnit) Rule(org.junit.Rule) SingleDimensionPartitionsSpec(org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec) Cursor(org.apache.druid.segment.Cursor) SegmentCacheManager(org.apache.druid.segment.loading.SegmentCacheManager) IndexIO(org.apache.druid.segment.IndexIO) Collections(java.util.Collections) TemporaryFolder(org.junit.rules.TemporaryFolder) HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) IngestionStatsAndErrorsTaskReportData(org.apache.druid.indexing.common.IngestionStatsAndErrorsTaskReportData) LongDimensionSchema(org.apache.druid.data.input.impl.LongDimensionSchema) FloatDimensionSchema(org.apache.druid.data.input.impl.FloatDimensionSchema) TaskStatus(org.apache.druid.indexer.TaskStatus) BufferedWriter(java.io.BufferedWriter) StringDimensionSchema(org.apache.druid.data.input.impl.StringDimensionSchema) LinkedHashMap(java.util.LinkedHashMap) IndexIngestionSpec(org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec) CSVParseSpec(org.apache.druid.data.input.impl.CSVParseSpec) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) File(java.io.File) IndexTuningConfig(org.apache.druid.indexing.common.task.IndexTask.IndexTuningConfig) Test(org.junit.Test)

Example 14 with CSVParseSpec

use of org.apache.druid.data.input.impl.CSVParseSpec in project druid by druid-io.

the class IndexTaskTest method testIgnoreParseException.

@Test
public void testIgnoreParseException() throws Exception {
    final File tmpDir = temporaryFolder.newFolder();
    final File tmpFile = File.createTempFile("druid", "index", tmpDir);
    try (BufferedWriter writer = Files.newWriter(tmpFile, StandardCharsets.UTF_8)) {
        writer.write("time,d,val\n");
        writer.write("unparseable,a,1\n");
        writer.write("2014-01-01T00:00:10Z,a,1\n");
    }
    final TimestampSpec timestampSpec = new TimestampSpec("time", "auto", null);
    final List<String> columns = Arrays.asList("time", "dim", "val");
    // ignore parse exception
    final IndexTuningConfig tuningConfig = createTuningConfig(2, null, null, null, null, false, false);
    // GranularitySpec.intervals and numShards must be null to verify reportParseException=false is respected both in
    // IndexTask.determineShardSpecs() and IndexTask.generateAndPublishSegments()
    final IndexIngestionSpec parseExceptionIgnoreSpec;
    if (useInputFormatApi) {
        parseExceptionIgnoreSpec = createIngestionSpec(jsonMapper, tmpDir, timestampSpec, DimensionsSpec.EMPTY, new CsvInputFormat(columns, null, null, true, 0), null, null, tuningConfig, false, false);
    } else {
        parseExceptionIgnoreSpec = createIngestionSpec(jsonMapper, tmpDir, new CSVParseSpec(timestampSpec, DimensionsSpec.EMPTY, null, columns, true, 0), null, null, tuningConfig, false, false);
    }
    IndexTask indexTask = new IndexTask(null, null, parseExceptionIgnoreSpec, null);
    final List<DataSegment> segments = runTask(indexTask).rhs;
    Assert.assertEquals(Collections.singletonList("d"), segments.get(0).getDimensions());
    Assert.assertEquals(Collections.singletonList("val"), segments.get(0).getMetrics());
    Assert.assertEquals(Intervals.of("2014/P1D"), segments.get(0).getInterval());
}

Also used : IndexIngestionSpec(org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec) CSVParseSpec(org.apache.druid.data.input.impl.CSVParseSpec) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat) File(java.io.File) DataSegment(org.apache.druid.timeline.DataSegment) BufferedWriter(java.io.BufferedWriter) IndexTuningConfig(org.apache.druid.indexing.common.task.IndexTask.IndexTuningConfig) Test(org.junit.Test)

Example 15 with CSVParseSpec

use of org.apache.druid.data.input.impl.CSVParseSpec in project druid by druid-io.

the class IndexGeneratorJobTest method constructFeed.

@Parameterized.Parameters(name = "useCombiner={0}, partitionType={1}, interval={2}, shardInfoForEachSegment={3}, " + "data={4}, inputFormatName={5}, inputRowParser={6}, maxRowsInMemory={7}, " + "maxBytesInMemory={8}, aggs={9}, datasourceName={10}, forceExtendableShardSpecs={11}")
public static Collection<Object[]> constructFeed() {
    final Object[][] baseConstructors = new Object[][] { { false, "single", "2014-10-22T00:00:00Z/P2D", new String[][][] { { { null, "c.example.com" }, { "c.example.com", "e.example.com" }, { "e.example.com", "g.example.com" }, { "g.example.com", "i.example.com" }, { "i.example.com", null } }, { { null, "c.example.com" }, { "c.example.com", "e.example.com" }, { "e.example.com", "g.example.com" }, { "g.example.com", "i.example.com" }, { "i.example.com", null } } }, ImmutableList.of("2014102200,a.example.com,100", "2014102200,b.exmaple.com,50", "2014102200,c.example.com,200", "2014102200,d.example.com,250", "2014102200,e.example.com,123", "2014102200,f.example.com,567", "2014102200,g.example.com,11", "2014102200,h.example.com,251", "2014102200,i.example.com,963", "2014102200,j.example.com,333", "2014102300,a.example.com,100", "2014102300,b.exmaple.com,50", "2014102300,c.example.com,200", "2014102300,d.example.com,250", "2014102300,e.example.com,123", "2014102300,f.example.com,567", "2014102300,g.example.com,11", "2014102300,h.example.com,251", "2014102300,i.example.com,963", "2014102300,j.example.com,333"), null, new StringInputRowParser(new CSVParseSpec(new TimestampSpec("timestamp", "yyyyMMddHH", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host"))), null, ImmutableList.of("timestamp", "host", "visited_num"), false, 0), null), null, null, AGGS1, "website" }, { false, "hashed", "2014-10-22T00:00:00Z/P1D", new Integer[][][] { { { 0, 4 }, { 1, 4 }, { 2, 4 }, { 3, 4 } } }, ImmutableList.of("2014102200,a.example.com,100", "2014102201,b.exmaple.com,50", "2014102202,c.example.com,200", "2014102203,d.example.com,250", "2014102204,e.example.com,123", "2014102205,f.example.com,567", "2014102206,g.example.com,11", "2014102207,h.example.com,251", "2014102208,i.example.com,963", "2014102209,j.example.com,333", "2014102210,k.example.com,253", "2014102211,l.example.com,321", "2014102212,m.example.com,3125", "2014102213,n.example.com,234", "2014102214,o.example.com,325", "2014102215,p.example.com,3533", "2014102216,q.example.com,500", "2014102216,q.example.com,87"), null, new HadoopyStringInputRowParser(new CSVParseSpec(new TimestampSpec("timestamp", "yyyyMMddHH", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host"))), null, ImmutableList.of("timestamp", "host", "visited_num"), false, 0)), null, null, AGGS1, "website" }, { true, "hashed", "2014-10-22T00:00:00Z/P1D", new Integer[][][] { { { 0, 4 }, { 1, 4 }, { 2, 4 }, { 3, 4 } } }, ImmutableList.of("2014102200,a.example.com,100", "2014102201,b.exmaple.com,50", "2014102202,c.example.com,200", "2014102203,d.example.com,250", "2014102204,e.example.com,123", "2014102205,f.example.com,567", "2014102206,g.example.com,11", "2014102207,h.example.com,251", "2014102208,i.example.com,963", "2014102209,j.example.com,333", "2014102210,k.example.com,253", "2014102211,l.example.com,321", "2014102212,m.example.com,3125", "2014102213,n.example.com,234", "2014102214,o.example.com,325", "2014102215,p.example.com,3533", "2014102216,q.example.com,500", "2014102216,q.example.com,87"), null, new StringInputRowParser(new CSVParseSpec(new TimestampSpec("timestamp", "yyyyMMddHH", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host"))), null, ImmutableList.of("timestamp", "host", "visited_num"), false, 0), null), null, null, AGGS1, "website" }, { false, "single", "2014-10-22T00:00:00Z/P2D", new String[][][] { { { null, "c.example.com" }, { "c.example.com", "e.example.com" }, { "e.example.com", "g.example.com" }, { "g.example.com", "i.example.com" }, { "i.example.com", null } }, { { null, "c.example.com" }, { "c.example.com", "e.example.com" }, { "e.example.com", "g.example.com" }, { "g.example.com", "i.example.com" }, { "i.example.com", null } } }, ImmutableList.of("2014102200,a.example.com,100", "2014102200,b.exmaple.com,50", "2014102200,c.example.com,200", "2014102200,d.example.com,250", "2014102200,e.example.com,123", "2014102200,f.example.com,567", "2014102200,g.example.com,11", "2014102200,h.example.com,251", "2014102200,i.example.com,963", "2014102200,j.example.com,333", "2014102300,a.example.com,100", "2014102300,b.exmaple.com,50", "2014102300,c.example.com,200", "2014102300,d.example.com,250", "2014102300,e.example.com,123", "2014102300,f.example.com,567", "2014102300,g.example.com,11", "2014102300,h.example.com,251", "2014102300,i.example.com,963", "2014102300,j.example.com,333"), SequenceFileInputFormat.class.getName(), new HadoopyStringInputRowParser(new CSVParseSpec(new TimestampSpec("timestamp", "yyyyMMddHH", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host"))), null, ImmutableList.of("timestamp", "host", "visited_num"), false, 0)), null, null, AGGS1, "website" }, { // Tests that new indexes inherit the dimension order from previous index
    false, "hashed", "2014-10-22T00:00:00Z/P1D", new Integer[][][] { { // use a single partition, dimension order inheritance is not supported across partitions
    { 0, 1 } } }, ImmutableList.of("{\"ts\":\"2014102200\", \"X\":\"x.example.com\"}", "{\"ts\":\"2014102201\", \"Y\":\"y.example.com\"}", "{\"ts\":\"2014102202\", \"M\":\"m.example.com\"}", "{\"ts\":\"2014102203\", \"Q\":\"q.example.com\"}", "{\"ts\":\"2014102204\", \"B\":\"b.example.com\"}", "{\"ts\":\"2014102205\", \"F\":\"f.example.com\"}"), null, new StringInputRowParser(new JSONParseSpec(new TimestampSpec("ts", "yyyyMMddHH", null), DimensionsSpec.EMPTY, null, null, null), null), // force 1 row max per index for easier testing
    1, null, AGGS2, "inherit_dims" }, { // Tests that pre-specified dim order is maintained across indexes.
    false, "hashed", "2014-10-22T00:00:00Z/P1D", new Integer[][][] { { { 0, 1 } } }, ImmutableList.of("{\"ts\":\"2014102200\", \"X\":\"x.example.com\"}", "{\"ts\":\"2014102201\", \"Y\":\"y.example.com\"}", "{\"ts\":\"2014102202\", \"M\":\"m.example.com\"}", "{\"ts\":\"2014102203\", \"Q\":\"q.example.com\"}", "{\"ts\":\"2014102204\", \"B\":\"b.example.com\"}", "{\"ts\":\"2014102205\", \"F\":\"f.example.com\"}"), null, new StringInputRowParser(new JSONParseSpec(new TimestampSpec("ts", "yyyyMMddHH", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("B", "F", "M", "Q", "X", "Y"))), null, null, null), null), // force 1 row max per index for easier testing
    1, null, AGGS2, "inherit_dims2" } };
    // Run each baseConstructor with/without forceExtendableShardSpecs.
    final List<Object[]> constructors = new ArrayList<>();
    for (Object[] baseConstructor : baseConstructors) {
        for (int forceExtendableShardSpecs = 0; forceExtendableShardSpecs < 2; forceExtendableShardSpecs++) {
            final Object[] fullConstructor = new Object[baseConstructor.length + 1];
            System.arraycopy(baseConstructor, 0, fullConstructor, 0, baseConstructor.length);
            fullConstructor[baseConstructor.length] = forceExtendableShardSpecs == 0;
            constructors.add(fullConstructor);
        }
    }
    return constructors;
}

Also used : SequenceFileInputFormat(org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat) ArrayList(java.util.ArrayList) CSVParseSpec(org.apache.druid.data.input.impl.CSVParseSpec) StringInputRowParser(org.apache.druid.data.input.impl.StringInputRowParser) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) JSONParseSpec(org.apache.druid.data.input.impl.JSONParseSpec)

Aggregations

CSVParseSpec (org.apache.druid.data.input.impl.CSVParseSpec)16 TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec)15 DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec)12 StringInputRowParser (org.apache.druid.data.input.impl.StringInputRowParser)12 Test (org.junit.Test)11 BufferedWriter (java.io.BufferedWriter)9 File (java.io.File)9 CsvInputFormat (org.apache.druid.data.input.impl.CsvInputFormat)9 IndexIngestionSpec (org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec)9 IndexTuningConfig (org.apache.druid.indexing.common.task.IndexTask.IndexTuningConfig)9 DataSegment (org.apache.druid.timeline.DataSegment)9 ArrayList (java.util.ArrayList)8 JSONParseSpec (org.apache.druid.data.input.impl.JSONParseSpec)7 AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory)7 LongSumAggregatorFactory (org.apache.druid.query.aggregation.LongSumAggregatorFactory)7 DataSchema (org.apache.druid.segment.indexing.DataSchema)7 UniformGranularitySpec (org.apache.druid.segment.indexing.granularity.UniformGranularitySpec)7 Before (org.junit.Before)7 ImmutableMap (com.google.common.collect.ImmutableMap)6 HashSet (java.util.HashSet)6