Use of org.apache.druid.data.input.impl.JsonInputFormat in project druid by druid-io.
The class OssInputSourceTest, method testCreateSplitsWithEmptyObjectsIteratingOnlyNonEmptyObjects:
@Test
public void testCreateSplitsWithEmptyObjectsIteratingOnlyNonEmptyObjects() {
  EasyMock.reset(OSSCLIENT);
  expectListObjects(PREFIXES.get(0), ImmutableList.of(EXPECTED_URIS.get(0)), CONTENT);
  expectListObjects(PREFIXES.get(1), ImmutableList.of(EXPECTED_URIS.get(1)), new byte[0]);
  EasyMock.replay(OSSCLIENT);
  OssInputSource inputSource = new OssInputSource(OSSCLIENT, INPUT_DATA_CONFIG, null, PREFIXES, null, null);
  Stream<InputSplit<List<CloudObjectLocation>>> splits = inputSource.createSplits(
      new JsonInputFormat(JSONPathSpec.DEFAULT, null, null),
      null
  );
  Assert.assertEquals(
      ImmutableList.of(ImmutableList.of(new CloudObjectLocation(EXPECTED_URIS.get(0)))),
      splits.map(InputSplit::get).collect(Collectors.toList())
  );
  EasyMock.verify(OSSCLIENT);
}
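For reference, a minimal sketch of what the flattenSpec argument (passed above as JSONPathSpec.DEFAULT) looks like when filled in. The field name "nested" and the path "$.foo.bar" are purely illustrative and not part of the test; JSONPathSpec and JSONPathFieldSpec come from org.apache.druid.java.util.common.parsers, InputFormat from org.apache.druid.data.input, and the remaining two JsonInputFormat constructor arguments are left null exactly as in the tests on this page.

  // Illustrative only: extract one nested field in addition to discovered top-level fields.
  JSONPathSpec flattenSpec = new JSONPathSpec(
      true, // useFieldDiscovery
      ImmutableList.of(JSONPathFieldSpec.createNestedField("nested", "$.foo.bar"))
  );
  InputFormat format = new JsonInputFormat(flattenSpec, null, null);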
Use of org.apache.druid.data.input.impl.JsonInputFormat in project druid by druid-io.
The class SinglePhaseParallelIndexingTest, method testIngestBothExplicitAndImplicitDims:
@Test
public void testIngestBothExplicitAndImplicitDims() throws IOException {
  final Interval interval = Intervals.of("2017-12/P1M");
  for (int i = 0; i < 5; i++) {
    try (final Writer writer = Files.newBufferedWriter(
        new File(inputDir, "test_" + i + ".json").toPath(),
        StandardCharsets.UTF_8
    )) {
      writer.write(getObjectMapper().writeValueAsString(ImmutableMap.of(
          "ts", StringUtils.format("2017-12-%d", 24 + i),
          "implicitDim", "implicit_" + i,
          "explicitDim", "explicit_" + i
      )));
      writer.write(getObjectMapper().writeValueAsString(ImmutableMap.of(
          "ts", StringUtils.format("2017-12-%d", 25 + i),
          "implicitDim", "implicit_" + i,
          "explicitDim", "explicit_" + i
      )));
    }
  }
  final ParallelIndexSupervisorTask task = new ParallelIndexSupervisorTask(
      null,
      null,
      null,
      new ParallelIndexIngestionSpec(
          new DataSchema(
              "dataSource",
              DEFAULT_TIMESTAMP_SPEC,
              DimensionsSpec.builder()
                            .setDefaultSchemaDimensions(ImmutableList.of("ts", "explicitDim"))
                            .setIncludeAllDimensions(true)
                            .build(),
              new AggregatorFactory[]{new CountAggregatorFactory("cnt")},
              new UniformGranularitySpec(Granularities.DAY, Granularities.MINUTE, Collections.singletonList(interval)),
              null
          ),
          new ParallelIndexIOConfig(
              null,
              new SettableSplittableLocalInputSource(inputDir, "*.json", true),
              new JsonInputFormat(new JSONPathSpec(true, null), null, null),
              false,
              null
          ),
          AbstractParallelIndexSupervisorTaskTest.DEFAULT_TUNING_CONFIG_FOR_PARALLEL_INDEXING
      ),
      null
  );
  task.addToContext(Tasks.FORCE_TIME_CHUNK_LOCK_KEY, lockGranularity == LockGranularity.TIME_CHUNK);
  Assert.assertEquals(TaskState.SUCCESS, getIndexingServiceClient().runAndWait(task).getStatusCode());
  Set<DataSegment> segments = getIndexingServiceClient().getPublishedSegments(task);
  for (DataSegment segment : segments) {
    Assert.assertEquals(ImmutableList.of("ts", "explicitDim", "implicitDim"), segment.getDimensions());
  }
}
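To make the generated data concrete: for i = 0 the loop above writes test_0.json containing two JSON objects back to back, with no separator between them,

  {"ts":"2017-12-24","implicitDim":"implicit_0","explicitDim":"explicit_0"}{"ts":"2017-12-25","implicitDim":"implicit_0","explicitDim":"explicit_0"}

so the test relies on the JSON input format being able to read multiple concatenated JSON objects from one file. Only ts and explicitDim are declared in the DimensionsSpec; implicitDim is picked up at ingest time because setIncludeAllDimensions(true) turns on discovery of undeclared fields, which is why the assertion expects every published segment to list its dimensions as ts, explicitDim, implicitDim.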
Use of org.apache.druid.data.input.impl.JsonInputFormat in project druid by druid-io.
The class S3InputSourceTest, method testCreateSplitsWithEmptyObjectsIteratingOnlyNonEmptyObjects:
@Test
public void testCreateSplitsWithEmptyObjectsIteratingOnlyNonEmptyObjects() {
  EasyMock.reset(S3_CLIENT);
  expectListObjects(PREFIXES.get(0), ImmutableList.of(EXPECTED_URIS.get(0)), CONTENT);
  expectListObjects(PREFIXES.get(1), ImmutableList.of(EXPECTED_URIS.get(1)), new byte[0]);
  EasyMock.replay(S3_CLIENT);
  S3InputSource inputSource =
      new S3InputSource(SERVICE, SERVER_SIDE_ENCRYPTING_AMAZON_S3_BUILDER, INPUT_DATA_CONFIG, null, PREFIXES, null, null);
  Stream<InputSplit<List<CloudObjectLocation>>> splits = inputSource.createSplits(
      new JsonInputFormat(JSONPathSpec.DEFAULT, null, null),
      null
  );
  Assert.assertEquals(
      ImmutableList.of(ImmutableList.of(new CloudObjectLocation(EXPECTED_URIS.get(0)))),
      splits.map(InputSplit::get).collect(Collectors.toList())
  );
  EasyMock.verify(S3_CLIENT);
}
Use of org.apache.druid.data.input.impl.JsonInputFormat in project druid by druid-io.
The class S3InputSourceTest, method testWithUrisSplit:
@Test
public void testWithUrisSplit() {
  S3InputSource inputSource =
      new S3InputSource(SERVICE, SERVER_SIDE_ENCRYPTING_AMAZON_S3_BUILDER, INPUT_DATA_CONFIG, EXPECTED_URIS, null, null, null);
  Stream<InputSplit<List<CloudObjectLocation>>> splits = inputSource.createSplits(
      new JsonInputFormat(JSONPathSpec.DEFAULT, null, null),
      null
  );
  Assert.assertEquals(EXPECTED_COORDS, splits.map(InputSplit::get).collect(Collectors.toList()));
}
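As a rough sketch of how these splits are consumed downstream (not part of the test): a splittable input source such as S3InputSource can be narrowed to a single split via withSplit, which is essentially what parallel ingestion does when handing each split to a sub-task. The variable names below are illustrative, and InputSource is org.apache.druid.data.input.InputSource.

  Stream<InputSplit<List<CloudObjectLocation>>> freshSplits = inputSource.createSplits(
      new JsonInputFormat(JSONPathSpec.DEFAULT, null, null),
      null // no SplitHintSpec, so default split sizing applies
  );
  freshSplits.forEach(split -> {
      // Each narrowed source reads only the objects belonging to this split.
      InputSource perSplitSource = inputSource.withSplit(split);
  });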
Use of org.apache.druid.data.input.impl.JsonInputFormat in project druid by druid-io.
The class IndexTaskTest, method testMultipleParseExceptionsSuccess:
@Test
public void testMultipleParseExceptionsSuccess() throws Exception {
  final File tmpDir = temporaryFolder.newFolder();
  final File tmpFile = File.createTempFile("druid", "index", tmpDir);
  try (BufferedWriter writer = Files.newWriter(tmpFile, StandardCharsets.UTF_8)) {
    // unparseable time
    writer.write("{\"time\":\"unparseable\",\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}\n");
    // valid row
    writer.write("{\"time\":\"2014-01-01T00:00:10Z\",\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}\n");
    // row with invalid long dimension
    writer.write("{\"time\":\"2014-01-01T00:00:10Z\",\"dim\":\"b\",\"dimLong\":\"notnumber\",\"dimFloat\":3.0,\"val\":1}\n");
    // row with invalid float dimension
    writer.write("{\"time\":\"2014-01-01T00:00:10Z\",\"dim\":\"b\",\"dimLong\":2,\"dimFloat\":\"notnumber\",\"val\":1}\n");
    // row with invalid metric
    writer.write("{\"time\":\"2014-01-01T00:00:10Z\",\"dim\":\"b\",\"dimLong\":2,\"dimFloat\":4.0,\"val\":\"notnumber\"}\n");
    // invalid JSON
    writer.write("{\"time\":9.0x,\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}\n");
    // thrown away
    writer.write("{\"time\":\"3014-03-01T00:00:10Z\",\"dim\":\"outsideofinterval\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}\n");
    // unparseable time
    writer.write("{\"time\":\"99999999999-01-01T00:00:10Z\",\"dim\":\"b\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}\n");
    // invalid JSON
    writer.write("this is not JSON\n");
  }
  final IndexTuningConfig tuningConfig = new IndexTuningConfig(
      null, null, null, null, null, null, null, null, null, null,
      new HashedPartitionsSpec(2, null, null),
      INDEX_SPEC,
      null, null, true, false, null, null, null, true, 7, 7, null, null
  );
  final TimestampSpec timestampSpec = new TimestampSpec("time", "auto", null);
  final DimensionsSpec dimensionsSpec = new DimensionsSpec(
      Arrays.asList(
          new StringDimensionSchema("dim"),
          new LongDimensionSchema("dimLong"),
          new FloatDimensionSchema("dimFloat")
      )
  );
  final IndexIngestionSpec ingestionSpec;
  if (useInputFormatApi) {
    ingestionSpec = createIngestionSpec(
        jsonMapper,
        tmpDir,
        timestampSpec,
        dimensionsSpec,
        new JsonInputFormat(null, null, null),
        null,
        null,
        tuningConfig,
        false,
        false
    );
  } else {
    ingestionSpec = createIngestionSpec(
        jsonMapper,
        tmpDir,
        new JSONParseSpec(timestampSpec, dimensionsSpec, null, null, null),
        null,
        null,
        tuningConfig,
        false,
        false
    );
  }
  IndexTask indexTask = new IndexTask(null, null, ingestionSpec, null);
  TaskStatus status = runTask(indexTask).lhs;
  Assert.assertEquals(TaskState.SUCCESS, status.getStatusCode());
  Assert.assertEquals(null, status.getErrorMsg());
  IngestionStatsAndErrorsTaskReportData reportData = getTaskReportData();
  Map<String, Object> expectedMetrics = ImmutableMap.of(
      RowIngestionMeters.DETERMINE_PARTITIONS,
      ImmutableMap.of(
          RowIngestionMeters.PROCESSED_WITH_ERROR, 0,
          RowIngestionMeters.PROCESSED, 4,
          RowIngestionMeters.UNPARSEABLE, 4,
          RowIngestionMeters.THROWN_AWAY, 1
      ),
      RowIngestionMeters.BUILD_SEGMENTS,
      ImmutableMap.of(
          RowIngestionMeters.PROCESSED_WITH_ERROR, 3,
          RowIngestionMeters.PROCESSED, 1,
          RowIngestionMeters.UNPARSEABLE, 4,
          RowIngestionMeters.THROWN_AWAY, 1
      )
  );
  Assert.assertEquals(expectedMetrics, reportData.getRowStats());
  List<LinkedHashMap> parseExceptionReports =
      (List<LinkedHashMap>) reportData.getUnparseableEvents().get(RowIngestionMeters.BUILD_SEGMENTS);
  List<String> expectedMessages;
  if (useInputFormatApi) {
    expectedMessages = Arrays.asList(
        StringUtils.format("Unable to parse row [this is not JSON] (Path: %s, Record: 6, Line: 9)", tmpFile.toURI()),
        StringUtils.format("Timestamp[99999999999-01-01T00:00:10Z] is unparseable! Event: {time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1} (Path: %s, Record: 6, Line: 8)", tmpFile.toURI()),
        StringUtils.format("Unable to parse row [{\"time\":9.0x,\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}] (Path: %s, Record: 5, Line: 6)", tmpFile.toURI()),
        "Unable to parse value[notnumber] for field[val]",
        "could not convert value [notnumber] to float",
        "could not convert value [notnumber] to long",
        StringUtils.format("Timestamp[unparseable] is unparseable! Event: {time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1} (Path: %s, Record: 1, Line: 1)", tmpFile.toURI())
    );
  } else {
    expectedMessages = Arrays.asList(
        "Unable to parse row [this is not JSON]",
        "Timestamp[99999999999-01-01T00:00:10Z] is unparseable! Event: {time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1}",
        "Unable to parse row [{\"time\":9.0x,\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}]",
        "Unable to parse value[notnumber] for field[val]",
        "could not convert value [notnumber] to float",
        "could not convert value [notnumber] to long",
        "Timestamp[unparseable] is unparseable! Event: {time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1}"
    );
  }
  List<String> actualMessages = parseExceptionReports
      .stream()
      .map(r -> ((List<String>) r.get("details")).get(0))
      .collect(Collectors.toList());
  Assert.assertEquals(expectedMessages, actualMessages);
  List<String> expectedInputs = Arrays.asList(
      "this is not JSON",
      "{time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1}",
      "{\"time\":9.0x,\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}",
      "{time=2014-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=4.0, val=notnumber}",
      "{time=2014-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=notnumber, val=1}",
      "{time=2014-01-01T00:00:10Z, dim=b, dimLong=notnumber, dimFloat=3.0, val=1}",
      "{time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1}"
  );
  List<String> actualInputs = parseExceptionReports
      .stream()
      .map(r -> (String) r.get("input"))
      .collect(Collectors.toList());
  Assert.assertEquals(expectedInputs, actualInputs);
  parseExceptionReports =
      (List<LinkedHashMap>) reportData.getUnparseableEvents().get(RowIngestionMeters.DETERMINE_PARTITIONS);
  if (useInputFormatApi) {
    expectedMessages = Arrays.asList(
        StringUtils.format("Unable to parse row [this is not JSON] (Path: %s, Record: 6, Line: 9)", tmpFile.toURI()),
        StringUtils.format("Timestamp[99999999999-01-01T00:00:10Z] is unparseable! Event: {time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1} (Path: %s, Record: 6, Line: 8)", tmpFile.toURI()),
        StringUtils.format("Unable to parse row [{\"time\":9.0x,\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}] (Path: %s, Record: 5, Line: 6)", tmpFile.toURI()),
        StringUtils.format("Timestamp[unparseable] is unparseable! Event: {time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1} (Path: %s, Record: 1, Line: 1)", tmpFile.toURI())
    );
  } else {
    expectedMessages = Arrays.asList(
        "Unable to parse row [this is not JSON]",
        "Timestamp[99999999999-01-01T00:00:10Z] is unparseable! Event: {time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1}",
        "Unable to parse row [{\"time\":9.0x,\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}]",
        "Timestamp[unparseable] is unparseable! Event: {time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1}"
    );
  }
  actualMessages = parseExceptionReports
      .stream()
      .map(r -> ((List<String>) r.get("details")).get(0))
      .collect(Collectors.toList());
  Assert.assertEquals(expectedMessages, actualMessages);
  expectedInputs = Arrays.asList(
      "this is not JSON",
      "{time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1}",
      "{\"time\":9.0x,\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}",
      "{time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1}"
  );
  actualInputs = parseExceptionReports
      .stream()
      .map(r -> (String) r.get("input"))
      .collect(Collectors.toList());
  Assert.assertEquals(expectedInputs, actualInputs);
}
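A quick accounting of why the expected metrics come out as they do: the test writes nine rows. In the buildSegments phase, one row is fully valid (processed = 1); the rows with the bad long dimension, bad float dimension, and bad metric parse as JSON but fail numeric conversion (processedWithError = 3); the two rows with unparseable timestamps plus the two invalid-JSON lines cannot be parsed at all (unparseable = 4); and the 3014 row falls outside the ingestion interval (thrownAway = 1). In the determinePartitions phase the same nine rows yield processed = 4 with no errors, presumably because that phase does not convert the dimension and metric values, so only the four unparseable rows and the out-of-interval row are counted against it.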