Use of org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec in project druid by druid-io.
From the class IndexTaskTest, method testReportParseException.
@Test
public void testReportParseException() throws Exception {
final File tmpDir = temporaryFolder.newFolder();
final File tmpFile = File.createTempFile("druid", "index", tmpDir);
try (BufferedWriter writer = Files.newWriter(tmpFile, StandardCharsets.UTF_8)) {
writer.write("time,d,val\n");
writer.write("unparseable,a,1\n");
writer.write("2014-01-01T00:00:10Z,a,1\n");
}
final TimestampSpec timestampSpec = new TimestampSpec("time", "auto", null);
final List<String> columns = Arrays.asList("time", "dim", "val");
// report parse exception
final IndexTuningConfig tuningConfig = createTuningConfig(2, null, null, null, null, false, true);
final IndexIngestionSpec indexIngestionSpec;
List<String> expectedMessages;
if (useInputFormatApi) {
indexIngestionSpec = createIngestionSpec(jsonMapper, tmpDir, timestampSpec, DimensionsSpec.EMPTY, new CsvInputFormat(columns, null, null, true, 0), null, null, tuningConfig, false, false);
expectedMessages = ImmutableList.of(StringUtils.format("Timestamp[unparseable] is unparseable! Event: {time=unparseable, d=a, val=1} (Path: %s, Record: 1, Line: 2)", tmpFile.toURI()));
} else {
indexIngestionSpec = createIngestionSpec(jsonMapper, tmpDir, new CSVParseSpec(timestampSpec, DimensionsSpec.EMPTY, null, columns, true, 0), null, null, tuningConfig, false, false);
expectedMessages = ImmutableList.of("Timestamp[unparseable] is unparseable! Event: {time=unparseable, d=a, val=1}");
}
IndexTask indexTask = new IndexTask(null, null, indexIngestionSpec, null);
TaskStatus status = runTask(indexTask).lhs;
Assert.assertEquals(TaskState.FAILED, status.getStatusCode());
checkTaskStatusErrorMsgForParseExceptionsExceeded(status);
IngestionStatsAndErrorsTaskReportData reportData = getTaskReportData();
List<LinkedHashMap> parseExceptionReports = (List<LinkedHashMap>) reportData.getUnparseableEvents().get(RowIngestionMeters.BUILD_SEGMENTS);
List<String> actualMessages = parseExceptionReports.stream().map((r) -> {
return ((List<String>) r.get("details")).get(0);
}).collect(Collectors.toList());
Assert.assertEquals(expectedMessages, actualMessages);
List<String> expectedInputs = ImmutableList.of("{time=unparseable, d=a, val=1}");
List<String> actualInputs = parseExceptionReports.stream().map((r) -> {
return (String) r.get("input");
}).collect(Collectors.toList());
Assert.assertEquals(expectedInputs, actualInputs);
}
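The last two assertions above repeat an extraction pattern used throughout these snippets: take the first "details" message and the raw "input" string from each saved parse-exception entry. A minimal consolidation of that pattern, sketched here with hypothetical method names that are not part of the Druid test class:
private static List<String> firstDetailMessages(List<LinkedHashMap> parseExceptionReports) {
  // first message in the "details" list of each report entry
  return parseExceptionReports.stream()
      .map(r -> ((List<String>) r.get("details")).get(0))
      .collect(Collectors.toList());
}
private static List<String> rawInputs(List<LinkedHashMap> parseExceptionReports) {
  // raw "input" string of each report entry
  return parseExceptionReports.stream()
      .map(r -> (String) r.get("input"))
      .collect(Collectors.toList());
}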
Use of org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec in project druid by druid-io.
From the class IndexTaskTest, method testMultipleParseExceptionsSuccess.
@Test
public void testMultipleParseExceptionsSuccess() throws Exception {
final File tmpDir = temporaryFolder.newFolder();
final File tmpFile = File.createTempFile("druid", "index", tmpDir);
try (BufferedWriter writer = Files.newWriter(tmpFile, StandardCharsets.UTF_8)) {
// unparseable time
writer.write("{\"time\":\"unparseable\",\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}\n");
// valid row
writer.write("{\"time\":\"2014-01-01T00:00:10Z\",\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}\n");
// row with invalid long dimension
writer.write("{\"time\":\"2014-01-01T00:00:10Z\",\"dim\":\"b\",\"dimLong\":\"notnumber\",\"dimFloat\":3.0,\"val\":1}\n");
// row with invalid float dimension
writer.write("{\"time\":\"2014-01-01T00:00:10Z\",\"dim\":\"b\",\"dimLong\":2,\"dimFloat\":\"notnumber\",\"val\":1}\n");
// row with invalid metric
writer.write("{\"time\":\"2014-01-01T00:00:10Z\",\"dim\":\"b\",\"dimLong\":2,\"dimFloat\":4.0,\"val\":\"notnumber\"}\n");
// invalid JSON
writer.write("{\"time\":9.0x,\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}\n");
// thrown away
writer.write("{\"time\":\"3014-03-01T00:00:10Z\",\"dim\":\"outsideofinterval\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}\n");
// unparseable time
writer.write("{\"time\":\"99999999999-01-01T00:00:10Z\",\"dim\":\"b\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}\n");
// invalid JSON
writer.write("this is not JSON\n");
}
final IndexTuningConfig tuningConfig = new IndexTuningConfig(null, null, null, null, null, null, null, null, null, null, new HashedPartitionsSpec(2, null, null), INDEX_SPEC, null, null, true, false, null, null, null, true, 7, 7, null, null);
final TimestampSpec timestampSpec = new TimestampSpec("time", "auto", null);
final DimensionsSpec dimensionsSpec = new DimensionsSpec(Arrays.asList(new StringDimensionSchema("dim"), new LongDimensionSchema("dimLong"), new FloatDimensionSchema("dimFloat")));
final IndexIngestionSpec ingestionSpec;
if (useInputFormatApi) {
ingestionSpec = createIngestionSpec(jsonMapper, tmpDir, timestampSpec, dimensionsSpec, new JsonInputFormat(null, null, null), null, null, tuningConfig, false, false);
} else {
ingestionSpec = createIngestionSpec(jsonMapper, tmpDir, new JSONParseSpec(timestampSpec, dimensionsSpec, null, null, null), null, null, tuningConfig, false, false);
}
IndexTask indexTask = new IndexTask(null, null, ingestionSpec, null);
TaskStatus status = runTask(indexTask).lhs;
Assert.assertEquals(TaskState.SUCCESS, status.getStatusCode());
Assert.assertEquals(null, status.getErrorMsg());
IngestionStatsAndErrorsTaskReportData reportData = getTaskReportData();
Map<String, Object> expectedMetrics = ImmutableMap.of(
    RowIngestionMeters.DETERMINE_PARTITIONS,
    ImmutableMap.of(
        RowIngestionMeters.PROCESSED_WITH_ERROR, 0,
        RowIngestionMeters.PROCESSED, 4,
        RowIngestionMeters.UNPARSEABLE, 4,
        RowIngestionMeters.THROWN_AWAY, 1
    ),
    RowIngestionMeters.BUILD_SEGMENTS,
    ImmutableMap.of(
        RowIngestionMeters.PROCESSED_WITH_ERROR, 3,
        RowIngestionMeters.PROCESSED, 1,
        RowIngestionMeters.UNPARSEABLE, 4,
        RowIngestionMeters.THROWN_AWAY, 1
    )
);
Assert.assertEquals(expectedMetrics, reportData.getRowStats());
List<LinkedHashMap> parseExceptionReports = (List<LinkedHashMap>) reportData.getUnparseableEvents().get(RowIngestionMeters.BUILD_SEGMENTS);
List<String> expectedMessages;
if (useInputFormatApi) {
expectedMessages = Arrays.asList(StringUtils.format("Unable to parse row [this is not JSON] (Path: %s, Record: 6, Line: 9)", tmpFile.toURI()), StringUtils.format("Timestamp[99999999999-01-01T00:00:10Z] is unparseable! Event: {time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1} (Path: %s, Record: 6, Line: 8)", tmpFile.toURI()), StringUtils.format("Unable to parse row [{\"time\":9.0x,\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}] (Path: %s, Record: 5, Line: 6)", tmpFile.toURI()), "Unable to parse value[notnumber] for field[val]", "could not convert value [notnumber] to float", "could not convert value [notnumber] to long", StringUtils.format("Timestamp[unparseable] is unparseable! Event: {time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1} (Path: %s, Record: 1, Line: 1)", tmpFile.toURI()));
} else {
expectedMessages = Arrays.asList("Unable to parse row [this is not JSON]", "Timestamp[99999999999-01-01T00:00:10Z] is unparseable! Event: {time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1}", "Unable to parse row [{\"time\":9.0x,\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}]", "Unable to parse value[notnumber] for field[val]", "could not convert value [notnumber] to float", "could not convert value [notnumber] to long", "Timestamp[unparseable] is unparseable! Event: {time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1}");
}
List<String> actualMessages = parseExceptionReports.stream().map((r) -> {
return ((List<String>) r.get("details")).get(0);
}).collect(Collectors.toList());
Assert.assertEquals(expectedMessages, actualMessages);
List<String> expectedInputs = Arrays.asList("this is not JSON", "{time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1}", "{\"time\":9.0x,\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}", "{time=2014-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=4.0, val=notnumber}", "{time=2014-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=notnumber, val=1}", "{time=2014-01-01T00:00:10Z, dim=b, dimLong=notnumber, dimFloat=3.0, val=1}", "{time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1}");
List<String> actualInputs = parseExceptionReports.stream().map((r) -> {
return (String) r.get("input");
}).collect(Collectors.toList());
Assert.assertEquals(expectedInputs, actualInputs);
parseExceptionReports = (List<LinkedHashMap>) reportData.getUnparseableEvents().get(RowIngestionMeters.DETERMINE_PARTITIONS);
if (useInputFormatApi) {
expectedMessages = Arrays.asList(StringUtils.format("Unable to parse row [this is not JSON] (Path: %s, Record: 6, Line: 9)", tmpFile.toURI()), StringUtils.format("Timestamp[99999999999-01-01T00:00:10Z] is unparseable! Event: {time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1} (Path: %s, Record: 6, Line: 8)", tmpFile.toURI()), StringUtils.format("Unable to parse row [{\"time\":9.0x,\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}] (Path: %s, Record: 5, Line: 6)", tmpFile.toURI()), StringUtils.format("Timestamp[unparseable] is unparseable! Event: {time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1} (Path: %s, Record: 1, Line: 1)", tmpFile.toURI()));
} else {
expectedMessages = Arrays.asList("Unable to parse row [this is not JSON]", "Timestamp[99999999999-01-01T00:00:10Z] is unparseable! Event: {time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1}", "Unable to parse row [{\"time\":9.0x,\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}]", "Timestamp[unparseable] is unparseable! Event: {time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1}");
}
actualMessages = parseExceptionReports.stream().map((r) -> {
return ((List<String>) r.get("details")).get(0);
}).collect(Collectors.toList());
Assert.assertEquals(expectedMessages, actualMessages);
expectedInputs = Arrays.asList("this is not JSON", "{time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1}", "{\"time\":9.0x,\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}", "{time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1}");
actualInputs = parseExceptionReports.stream().map((r) -> {
return (String) r.get("input");
}).collect(Collectors.toList());
Assert.assertEquals(expectedInputs, actualInputs);
}
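A reading aid for the expectedMetrics asserted above, not part of the test: all nine rows written to the file are accounted for in each phase. The metrics suggest that determinePartitions validates only the timestamp and interval, so the three rows whose only problem is a bad dimension or metric value still count as processed there and move to processedWithError during buildSegments.
// determinePartitions: 4 processed, 0 with error, 4 unparseable (two bad timestamps, two malformed JSON lines), 1 thrown away (outside the interval)
int determinePartitionsTotal = 4 + 0 + 4 + 1;
// buildSegments: 1 processed, 3 with error (bad long, bad float, bad metric), 4 unparseable, 1 thrown away
int buildSegmentsTotal = 1 + 3 + 4 + 1;
assert determinePartitionsTotal == 9 && buildSegmentsTotal == 9;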
Use of org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec in project druid by druid-io.
From the class IndexTaskTest, method testMultipleParseExceptionsFailureAtDeterminePartitions.
@Test
public void testMultipleParseExceptionsFailureAtDeterminePartitions() throws Exception {
final File tmpDir = temporaryFolder.newFolder();
final File tmpFile = File.createTempFile("druid", "index", tmpDir);
try (BufferedWriter writer = Files.newWriter(tmpFile, StandardCharsets.UTF_8)) {
writer.write("time,dim,dimLong,dimFloat,val\n");
// unparseable
writer.write("unparseable,a,2,3.0,1\n");
// valid row
writer.write("2014-01-01T00:00:10Z,a,2,3.0,1\n");
// unparseable
writer.write("9.0,a,2,3.0,1\n");
// thrown away
writer.write("3014-03-01T00:00:10Z,outsideofinterval,2,3.0,1\n");
// unparseable
writer.write("99999999999-01-01T00:00:10Z,b,2,3.0,1\n");
}
// Allow up to 2 parse exceptions, and save up to 5 parse exceptions
final IndexTuningConfig tuningConfig = new IndexTuningConfig(null, null, null, null, null, null, null, null, null, null, new HashedPartitionsSpec(2, null, null), INDEX_SPEC, null, null, true, false, null, null, null, true, 2, 5, null, null);
final TimestampSpec timestampSpec = new TimestampSpec("time", "auto", null);
final DimensionsSpec dimensionsSpec = new DimensionsSpec(Arrays.asList(new StringDimensionSchema("dim"), new LongDimensionSchema("dimLong"), new FloatDimensionSchema("dimFloat")));
final List<String> columns = Arrays.asList("time", "dim", "dimLong", "dimFloat", "val");
final IndexIngestionSpec ingestionSpec;
List<String> expectedMessages;
if (useInputFormatApi) {
ingestionSpec = createIngestionSpec(jsonMapper, tmpDir, timestampSpec, dimensionsSpec, new CsvInputFormat(columns, null, null, true, 0), null, null, tuningConfig, false, false);
expectedMessages = Arrays.asList(StringUtils.format("Timestamp[99999999999-01-01T00:00:10Z] is unparseable! Event: {time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1} (Path: %s, Record: 3, Line: 6)", tmpFile.toURI()), StringUtils.format("Timestamp[9.0] is unparseable! Event: {time=9.0, dim=a, dimLong=2, dimFloat=3.0, val=1} (Path: %s, Record: 2, Line: 4)", tmpFile.toURI()), StringUtils.format("Timestamp[unparseable] is unparseable! Event: {time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1} (Path: %s, Record: 1, Line: 2)", tmpFile.toURI()));
} else {
ingestionSpec = createIngestionSpec(jsonMapper, tmpDir, new CSVParseSpec(timestampSpec, dimensionsSpec, null, columns, true, 0), null, null, tuningConfig, false, false);
expectedMessages = Arrays.asList("Timestamp[99999999999-01-01T00:00:10Z] is unparseable! Event: {time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1}", "Timestamp[9.0] is unparseable! Event: {time=9.0, dim=a, dimLong=2, dimFloat=3.0, val=1}", "Timestamp[unparseable] is unparseable! Event: {time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1}");
}
IndexTask indexTask = new IndexTask(null, null, ingestionSpec, null);
TaskStatus status = runTask(indexTask).lhs;
Assert.assertEquals(TaskState.FAILED, status.getStatusCode());
checkTaskStatusErrorMsgForParseExceptionsExceeded(status);
IngestionStatsAndErrorsTaskReportData reportData = getTaskReportData();
Map<String, Object> expectedMetrics = ImmutableMap.of(
    RowIngestionMeters.DETERMINE_PARTITIONS,
    ImmutableMap.of(
        RowIngestionMeters.PROCESSED_WITH_ERROR, 0,
        RowIngestionMeters.PROCESSED, 1,
        RowIngestionMeters.UNPARSEABLE, 3,
        RowIngestionMeters.THROWN_AWAY, useInputFormatApi ? 1 : 2
    ),
    RowIngestionMeters.BUILD_SEGMENTS,
    ImmutableMap.of(
        RowIngestionMeters.PROCESSED_WITH_ERROR, 0,
        RowIngestionMeters.PROCESSED, 0,
        RowIngestionMeters.UNPARSEABLE, 0,
        RowIngestionMeters.THROWN_AWAY, 0
    )
);
Assert.assertEquals(expectedMetrics, reportData.getRowStats());
List<LinkedHashMap> parseExceptionReports = (List<LinkedHashMap>) reportData.getUnparseableEvents().get(RowIngestionMeters.DETERMINE_PARTITIONS);
List<String> actualMessages = parseExceptionReports.stream().map((r) -> {
return ((List<String>) r.get("details")).get(0);
}).collect(Collectors.toList());
Assert.assertEquals(expectedMessages, actualMessages);
List<String> expectedInputs = Arrays.asList("{time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1}", "{time=9.0, dim=a, dimLong=2, dimFloat=3.0, val=1}", "{time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1}");
List<String> actualInputs = parseExceptionReports.stream().map((r) -> {
return (String) r.get("input");
}).collect(Collectors.toList());
Assert.assertEquals(expectedInputs, actualInputs);
}
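A similar accounting note for the expectedMetrics above, not part of the test: the file holds a header line plus five data rows, and determinePartitions sees one valid row, three rows with unparseable timestamps, and one row outside the ingestion interval. The extra thrown-away row under the legacy parser path is presumably the CSV header line, which that path appears to count as thrown away rather than skipping before the meters see it; this is an inference, not something the test states.
int dataRows = 1 /* processed */ + 3 /* unparseable */ + 1 /* thrown away, out of interval */; // = 5
int legacyParserThrownAway = 2; // out-of-interval row plus (presumably) the header row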
Use of org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec in project druid by druid-io.
From the class IndexTaskTest, method testIgnoreParseException.
@Test
public void testIgnoreParseException() throws Exception {
final File tmpDir = temporaryFolder.newFolder();
final File tmpFile = File.createTempFile("druid", "index", tmpDir);
try (BufferedWriter writer = Files.newWriter(tmpFile, StandardCharsets.UTF_8)) {
writer.write("time,d,val\n");
writer.write("unparseable,a,1\n");
writer.write("2014-01-01T00:00:10Z,a,1\n");
}
final TimestampSpec timestampSpec = new TimestampSpec("time", "auto", null);
final List<String> columns = Arrays.asList("time", "dim", "val");
// ignore parse exception
final IndexTuningConfig tuningConfig = createTuningConfig(2, null, null, null, null, false, false);
// GranularitySpec.intervals and numShards must be null to verify reportParseException=false is respected both in
// IndexTask.determineShardSpecs() and IndexTask.generateAndPublishSegments()
final IndexIngestionSpec parseExceptionIgnoreSpec;
if (useInputFormatApi) {
parseExceptionIgnoreSpec = createIngestionSpec(jsonMapper, tmpDir, timestampSpec, DimensionsSpec.EMPTY, new CsvInputFormat(columns, null, null, true, 0), null, null, tuningConfig, false, false);
} else {
parseExceptionIgnoreSpec = createIngestionSpec(jsonMapper, tmpDir, new CSVParseSpec(timestampSpec, DimensionsSpec.EMPTY, null, columns, true, 0), null, null, tuningConfig, false, false);
}
IndexTask indexTask = new IndexTask(null, null, parseExceptionIgnoreSpec, null);
final List<DataSegment> segments = runTask(indexTask).rhs;
Assert.assertEquals(Collections.singletonList("d"), segments.get(0).getDimensions());
Assert.assertEquals(Collections.singletonList("val"), segments.get(0).getMetrics());
Assert.assertEquals(Intervals.of("2014/P1D"), segments.get(0).getInterval());
}
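Compared with testReportParseException earlier on this page, the only intended difference is the final boolean passed to the createTuningConfig helper, which per the tests' own comments toggles reportParseException. Both calls appear verbatim above; side by side:
final IndexTuningConfig reportConfig = createTuningConfig(2, null, null, null, null, false, true);  // report parse exception: the task fails on the bad row
final IndexTuningConfig ignoreConfig = createTuningConfig(2, null, null, null, null, false, false); // ignore parse exception: the bad row is dropped and a segment covers only the valid row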
Use of org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec in project druid by druid-io.
From the class TaskLifecycleTest, method testIndexTaskFailure.
@Test
public void testIndexTaskFailure() throws Exception {
final Task indexTask = new IndexTask(
    null,
    null,
    new IndexIngestionSpec(
        // data schema: datasource "foo" with a single doubleSum metric and DAY granularity over 2010-01-01/P1D
        new DataSchema("foo", null, new AggregatorFactory[] { new DoubleSumAggregatorFactory("met", "met") }, new UniformGranularitySpec(Granularities.DAY, null, ImmutableList.of(Intervals.of("2010-01-01/P1D"))), null, mapper),
        // IO config: a mock input source that (per its name) throws while reading, which is what drives the task to FAILED
        new IndexIOConfig(null, new MockExceptionInputSource(), new NoopInputFormat(), false, false),
        // tuning config
        new IndexTuningConfig(null, 10000, null, 10, null, null, null, null, null, null, null, indexSpec, null, 3, false, null, null, null, null, null, null, null, null, null)
    ),
    null
);
final TaskStatus status = runTask(indexTask);
Assert.assertEquals("statusCode", TaskState.FAILED, status.getStatusCode());
Assert.assertEquals(taskLocation, status.getLocation());
Assert.assertEquals("num segments published", 0, mdc.getPublished().size());
Assert.assertEquals("num segments nuked", 0, mdc.getNuked().size());
}
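A note on the two harnesses used on this page, drawn only from the snippets themselves: TaskLifecycleTest's runTask returns the TaskStatus directly and publication is checked through the mock metadata coordinator (mdc), while IndexTaskTest's runTask returns a pair whose lhs is the TaskStatus and whose rhs is the published segments. A sketch of the pair style; the Pair type is assumed from the lhs/rhs accesses above:
Pair<TaskStatus, List<DataSegment>> result = runTask(indexTask); // IndexTaskTest helper, assumed signature
TaskStatus taskStatus = result.lhs;
List<DataSegment> publishedSegments = result.rhs;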