use of org.apache.druid.data.input.impl.JSONParseSpec in project druid by druid-io.
the class ProtobufParserBenchmark method setup.
/**
 * Builds the benchmark fixtures: a parse spec with an explicit flatten spec, a parse spec without
 * one, the protobuf decoder, the input payloads, and one {@code ProtobufInputRowParser} per parse spec.
 */
@Setup
public void setup() {
  // Parse spec carrying a flatten spec with one ROOT field and two PATH fields.
  final TimestampSpec nestedTimestamp = new TimestampSpec("timestamp", "iso", null);
  final DimensionsSpec nestedDimensions = new DimensionsSpec(
      Lists.newArrayList(
          new StringDimensionSchema("event"),
          new StringDimensionSchema("id"),
          new StringDimensionSchema("someOtherId"),
          new StringDimensionSchema("isValid")
      )
  );
  final JSONPathSpec nestedFlattenSpec = new JSONPathSpec(
      true,
      Lists.newArrayList(
          new JSONPathFieldSpec(JSONPathFieldType.ROOT, "eventType", "eventType"),
          new JSONPathFieldSpec(JSONPathFieldType.PATH, "foobar", "$.foo.bar"),
          new JSONPathFieldSpec(JSONPathFieldType.PATH, "bar0", "$.bar[0].bar")
      )
  );
  nestedParseSpec = new JSONParseSpec(nestedTimestamp, nestedDimensions, nestedFlattenSpec, null, null);

  // Same timestamp and dimensions, but no flatten spec at all.
  final TimestampSpec flatTimestamp = new TimestampSpec("timestamp", "iso", null);
  final DimensionsSpec flatDimensions = new DimensionsSpec(
      Lists.newArrayList(
          new StringDimensionSchema("event"),
          new StringDimensionSchema("id"),
          new StringDimensionSchema("someOtherId"),
          new StringDimensionSchema("isValid")
      )
  );
  flatParseSpec = new JSONParseSpec(flatTimestamp, flatDimensions, null, null, null);

  decoder = new FileBasedProtobufBytesDecoder("prototest.desc", "ProtoTestEvent");

  protoFilePath = "ProtoFile";
  protoInputs = getProtoInputs(protoFilePath);

  // Both parsers share the single decoder created above.
  nestedParser = new ProtobufInputRowParser(nestedParseSpec, decoder, null, null);
  flatParser = new ProtobufInputRowParser(flatParseSpec, decoder, null, null);
}
use of org.apache.druid.data.input.impl.JSONParseSpec in project druid by druid-io.
the class IndexTaskTest method testMultipleParseExceptionsSuccess.
/**
 * Writes nine input lines (valid, unparseable, partially invalid and out-of-interval rows), runs an
 * {@code IndexTask} over them, and verifies that the task still succeeds while the row stats and
 * the saved parse-exception reports of both phases match the expected values.
 */
@Test
public void testMultipleParseExceptionsSuccess() throws Exception {
  final File tmpDir = temporaryFolder.newFolder();
  final File tmpFile = File.createTempFile("druid", "index", tmpDir);

  try (BufferedWriter writer = Files.newWriter(tmpFile, StandardCharsets.UTF_8)) {
    // unparseable time
    writer.write("{\"time\":\"unparseable\",\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}\n");
    // valid row
    writer.write("{\"time\":\"2014-01-01T00:00:10Z\",\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}\n");
    // row with invalid long dimension
    writer.write("{\"time\":\"2014-01-01T00:00:10Z\",\"dim\":\"b\",\"dimLong\":\"notnumber\",\"dimFloat\":3.0,\"val\":1}\n");
    // row with invalid float dimension
    writer.write("{\"time\":\"2014-01-01T00:00:10Z\",\"dim\":\"b\",\"dimLong\":2,\"dimFloat\":\"notnumber\",\"val\":1}\n");
    // row with invalid metric
    writer.write("{\"time\":\"2014-01-01T00:00:10Z\",\"dim\":\"b\",\"dimLong\":2,\"dimFloat\":4.0,\"val\":\"notnumber\"}\n");
    // invalid JSON
    writer.write("{\"time\":9.0x,\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}\n");
    // thrown away
    writer.write("{\"time\":\"3014-03-01T00:00:10Z\",\"dim\":\"outsideofinterval\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}\n");
    // unparseable time
    writer.write("{\"time\":\"99999999999-01-01T00:00:10Z\",\"dim\":\"b\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}\n");
    // invalid JSON
    writer.write("this is not JSON\n");
  }

  final IndexTuningConfig tuningConfig = new IndexTuningConfig(
      null, null, null, null, null,
      null, null, null, null, null,
      new HashedPartitionsSpec(2, null, null),
      INDEX_SPEC,
      null, null,
      true, false,
      null, null, null,
      true,
      7, 7,
      null, null
  );
  final TimestampSpec timestampSpec = new TimestampSpec("time", "auto", null);
  final DimensionsSpec dimensionsSpec = new DimensionsSpec(
      Arrays.asList(
          new StringDimensionSchema("dim"),
          new LongDimensionSchema("dimLong"),
          new FloatDimensionSchema("dimFloat")
      )
  );

  // The same scenario is exercised through either the InputFormat API or the legacy parse spec.
  final IndexIngestionSpec ingestionSpec = useInputFormatApi
      ? createIngestionSpec(
          jsonMapper,
          tmpDir,
          timestampSpec,
          dimensionsSpec,
          new JsonInputFormat(null, null, null),
          null,
          null,
          tuningConfig,
          false,
          false
      )
      : createIngestionSpec(
          jsonMapper,
          tmpDir,
          new JSONParseSpec(timestampSpec, dimensionsSpec, null, null, null),
          null,
          null,
          tuningConfig,
          false,
          false
      );

  final IndexTask indexTask = new IndexTask(null, null, ingestionSpec, null);
  final TaskStatus taskStatus = runTask(indexTask).lhs;
  Assert.assertEquals(TaskState.SUCCESS, taskStatus.getStatusCode());
  Assert.assertNull(taskStatus.getErrorMsg());

  final IngestionStatsAndErrorsTaskReportData report = getTaskReportData();

  final Map<String, Object> expectedMetrics = ImmutableMap.of(
      RowIngestionMeters.DETERMINE_PARTITIONS,
      ImmutableMap.of(
          RowIngestionMeters.PROCESSED_WITH_ERROR, 0,
          RowIngestionMeters.PROCESSED, 4,
          RowIngestionMeters.UNPARSEABLE, 4,
          RowIngestionMeters.THROWN_AWAY, 1
      ),
      RowIngestionMeters.BUILD_SEGMENTS,
      ImmutableMap.of(
          RowIngestionMeters.PROCESSED_WITH_ERROR, 3,
          RowIngestionMeters.PROCESSED, 1,
          RowIngestionMeters.UNPARSEABLE, 4,
          RowIngestionMeters.THROWN_AWAY, 1
      )
  );
  Assert.assertEquals(expectedMetrics, report.getRowStats());

  // ---- Parse exceptions recorded under the BUILD_SEGMENTS key ----
  final List<LinkedHashMap> buildSegmentsReports =
      (List<LinkedHashMap>) report.getUnparseableEvents().get(RowIngestionMeters.BUILD_SEGMENTS);

  final List<String> expectedBuildMessages;
  if (useInputFormatApi) {
    // The InputFormat path appends the source location to most messages.
    expectedBuildMessages = Arrays.asList(
        StringUtils.format("Unable to parse row [this is not JSON] (Path: %s, Record: 6, Line: 9)", tmpFile.toURI()),
        StringUtils.format(
            "Timestamp[99999999999-01-01T00:00:10Z] is unparseable! Event: {time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1} (Path: %s, Record: 6, Line: 8)",
            tmpFile.toURI()
        ),
        StringUtils.format(
            "Unable to parse row [{\"time\":9.0x,\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}] (Path: %s, Record: 5, Line: 6)",
            tmpFile.toURI()
        ),
        "Unable to parse value[notnumber] for field[val]",
        "could not convert value [notnumber] to float",
        "could not convert value [notnumber] to long",
        StringUtils.format(
            "Timestamp[unparseable] is unparseable! Event: {time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1} (Path: %s, Record: 1, Line: 1)",
            tmpFile.toURI()
        )
    );
  } else {
    expectedBuildMessages = Arrays.asList(
        "Unable to parse row [this is not JSON]",
        "Timestamp[99999999999-01-01T00:00:10Z] is unparseable! Event: {time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1}",
        "Unable to parse row [{\"time\":9.0x,\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}]",
        "Unable to parse value[notnumber] for field[val]",
        "could not convert value [notnumber] to float",
        "could not convert value [notnumber] to long",
        "Timestamp[unparseable] is unparseable! Event: {time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1}"
    );
  }
  // Only the first entry of each report's "details" list is compared.
  final List<String> actualBuildMessages = buildSegmentsReports
      .stream()
      .map(r -> ((List<String>) r.get("details")).get(0))
      .collect(Collectors.toList());
  Assert.assertEquals(expectedBuildMessages, actualBuildMessages);

  final List<String> expectedBuildInputs = Arrays.asList(
      "this is not JSON",
      "{time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1}",
      "{\"time\":9.0x,\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}",
      "{time=2014-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=4.0, val=notnumber}",
      "{time=2014-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=notnumber, val=1}",
      "{time=2014-01-01T00:00:10Z, dim=b, dimLong=notnumber, dimFloat=3.0, val=1}",
      "{time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1}"
  );
  final List<String> actualBuildInputs = buildSegmentsReports
      .stream()
      .map(r -> (String) r.get("input"))
      .collect(Collectors.toList());
  Assert.assertEquals(expectedBuildInputs, actualBuildInputs);

  // ---- Parse exceptions recorded under the DETERMINE_PARTITIONS key ----
  final List<LinkedHashMap> determinePartitionsReports =
      (List<LinkedHashMap>) report.getUnparseableEvents().get(RowIngestionMeters.DETERMINE_PARTITIONS);

  final List<String> expectedPartitionMessages;
  if (useInputFormatApi) {
    expectedPartitionMessages = Arrays.asList(
        StringUtils.format("Unable to parse row [this is not JSON] (Path: %s, Record: 6, Line: 9)", tmpFile.toURI()),
        StringUtils.format(
            "Timestamp[99999999999-01-01T00:00:10Z] is unparseable! Event: {time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1} (Path: %s, Record: 6, Line: 8)",
            tmpFile.toURI()
        ),
        StringUtils.format(
            "Unable to parse row [{\"time\":9.0x,\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}] (Path: %s, Record: 5, Line: 6)",
            tmpFile.toURI()
        ),
        StringUtils.format(
            "Timestamp[unparseable] is unparseable! Event: {time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1} (Path: %s, Record: 1, Line: 1)",
            tmpFile.toURI()
        )
    );
  } else {
    expectedPartitionMessages = Arrays.asList(
        "Unable to parse row [this is not JSON]",
        "Timestamp[99999999999-01-01T00:00:10Z] is unparseable! Event: {time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1}",
        "Unable to parse row [{\"time\":9.0x,\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}]",
        "Timestamp[unparseable] is unparseable! Event: {time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1}"
    );
  }
  final List<String> actualPartitionMessages = determinePartitionsReports
      .stream()
      .map(r -> ((List<String>) r.get("details")).get(0))
      .collect(Collectors.toList());
  Assert.assertEquals(expectedPartitionMessages, actualPartitionMessages);

  final List<String> expectedPartitionInputs = Arrays.asList(
      "this is not JSON",
      "{time=99999999999-01-01T00:00:10Z, dim=b, dimLong=2, dimFloat=3.0, val=1}",
      "{\"time\":9.0x,\"dim\":\"a\",\"dimLong\":2,\"dimFloat\":3.0,\"val\":1}",
      "{time=unparseable, dim=a, dimLong=2, dimFloat=3.0, val=1}"
  );
  final List<String> actualPartitionInputs = determinePartitionsReports
      .stream()
      .map(r -> (String) r.get("input"))
      .collect(Collectors.toList());
  Assert.assertEquals(expectedPartitionInputs, actualPartitionInputs);
}
use of org.apache.druid.data.input.impl.JSONParseSpec in project druid by druid-io.
the class IndexGeneratorJobTest method constructFeed.
/**
 * Supplies the parameter sets for this parameterized test.
 *
 * <p>Six base parameter sets are declared in the positional order given by the
 * {@code @Parameters} name template (useCombiner, partitionType, interval, shardInfoForEachSegment,
 * data, inputFormatName, inputRowParser, maxRowsInMemory, maxBytesInMemory, aggs, datasourceName).
 * Each base set is then emitted twice with a trailing {@code forceExtendableShardSpecs} value:
 * first {@code true}, then {@code false}.
 *
 * @return twelve parameter arrays, two per base set
 */
@Parameterized.Parameters(
    name = "useCombiner={0}, partitionType={1}, interval={2}, shardInfoForEachSegment={3}, "
           + "data={4}, inputFormatName={5}, inputRowParser={6}, maxRowsInMemory={7}, "
           + "maxBytesInMemory={8}, aggs={9}, datasourceName={10}, forceExtendableShardSpecs={11}"
)
public static Collection<Object[]> constructFeed() {
  // CSV rows covering two days, ten hosts per day (used by both "single" partition cases).
  final ImmutableList<String> twoDayCsvRows = ImmutableList.of(
      "2014102200,a.example.com,100",
      "2014102200,b.exmaple.com,50",
      "2014102200,c.example.com,200",
      "2014102200,d.example.com,250",
      "2014102200,e.example.com,123",
      "2014102200,f.example.com,567",
      "2014102200,g.example.com,11",
      "2014102200,h.example.com,251",
      "2014102200,i.example.com,963",
      "2014102200,j.example.com,333",
      "2014102300,a.example.com,100",
      "2014102300,b.exmaple.com,50",
      "2014102300,c.example.com,200",
      "2014102300,d.example.com,250",
      "2014102300,e.example.com,123",
      "2014102300,f.example.com,567",
      "2014102300,g.example.com,11",
      "2014102300,h.example.com,251",
      "2014102300,i.example.com,963",
      "2014102300,j.example.com,333"
  );

  // CSV rows spread over the hours of a single day (used by both "hashed" CSV cases).
  final ImmutableList<String> hourlyCsvRows = ImmutableList.of(
      "2014102200,a.example.com,100",
      "2014102201,b.exmaple.com,50",
      "2014102202,c.example.com,200",
      "2014102203,d.example.com,250",
      "2014102204,e.example.com,123",
      "2014102205,f.example.com,567",
      "2014102206,g.example.com,11",
      "2014102207,h.example.com,251",
      "2014102208,i.example.com,963",
      "2014102209,j.example.com,333",
      "2014102210,k.example.com,253",
      "2014102211,l.example.com,321",
      "2014102212,m.example.com,3125",
      "2014102213,n.example.com,234",
      "2014102214,o.example.com,325",
      "2014102215,p.example.com,3533",
      "2014102216,q.example.com,500",
      "2014102216,q.example.com,87"
  );

  // JSON rows, each introducing a different dimension (used by both dimension-order cases).
  final ImmutableList<String> jsonRows = ImmutableList.of(
      "{\"ts\":\"2014102200\", \"X\":\"x.example.com\"}",
      "{\"ts\":\"2014102201\", \"Y\":\"y.example.com\"}",
      "{\"ts\":\"2014102202\", \"M\":\"m.example.com\"}",
      "{\"ts\":\"2014102203\", \"Q\":\"q.example.com\"}",
      "{\"ts\":\"2014102204\", \"B\":\"b.example.com\"}",
      "{\"ts\":\"2014102205\", \"F\":\"f.example.com\"}"
  );

  final Object[][] baseConstructors = new Object[][] {
      {
          false,
          "single",
          "2014-10-22T00:00:00Z/P2D",
          new String[][][] {
              {
                  { null, "c.example.com" },
                  { "c.example.com", "e.example.com" },
                  { "e.example.com", "g.example.com" },
                  { "g.example.com", "i.example.com" },
                  { "i.example.com", null }
              },
              {
                  { null, "c.example.com" },
                  { "c.example.com", "e.example.com" },
                  { "e.example.com", "g.example.com" },
                  { "g.example.com", "i.example.com" },
                  { "i.example.com", null }
              }
          },
          twoDayCsvRows,
          null,
          new StringInputRowParser(
              new CSVParseSpec(
                  new TimestampSpec("timestamp", "yyyyMMddHH", null),
                  new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host"))),
                  null,
                  ImmutableList.of("timestamp", "host", "visited_num"),
                  false,
                  0
              ),
              null
          ),
          null,
          null,
          AGGS1,
          "website"
      },
      {
          false,
          "hashed",
          "2014-10-22T00:00:00Z/P1D",
          new Integer[][][] {
              {
                  { 0, 4 },
                  { 1, 4 },
                  { 2, 4 },
                  { 3, 4 }
              }
          },
          hourlyCsvRows,
          null,
          new HadoopyStringInputRowParser(
              new CSVParseSpec(
                  new TimestampSpec("timestamp", "yyyyMMddHH", null),
                  new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host"))),
                  null,
                  ImmutableList.of("timestamp", "host", "visited_num"),
                  false,
                  0
              )
          ),
          null,
          null,
          AGGS1,
          "website"
      },
      {
          true,
          "hashed",
          "2014-10-22T00:00:00Z/P1D",
          new Integer[][][] {
              {
                  { 0, 4 },
                  { 1, 4 },
                  { 2, 4 },
                  { 3, 4 }
              }
          },
          hourlyCsvRows,
          null,
          new StringInputRowParser(
              new CSVParseSpec(
                  new TimestampSpec("timestamp", "yyyyMMddHH", null),
                  new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host"))),
                  null,
                  ImmutableList.of("timestamp", "host", "visited_num"),
                  false,
                  0
              ),
              null
          ),
          null,
          null,
          AGGS1,
          "website"
      },
      {
          false,
          "single",
          "2014-10-22T00:00:00Z/P2D",
          new String[][][] {
              {
                  { null, "c.example.com" },
                  { "c.example.com", "e.example.com" },
                  { "e.example.com", "g.example.com" },
                  { "g.example.com", "i.example.com" },
                  { "i.example.com", null }
              },
              {
                  { null, "c.example.com" },
                  { "c.example.com", "e.example.com" },
                  { "e.example.com", "g.example.com" },
                  { "g.example.com", "i.example.com" },
                  { "i.example.com", null }
              }
          },
          twoDayCsvRows,
          SequenceFileInputFormat.class.getName(),
          new HadoopyStringInputRowParser(
              new CSVParseSpec(
                  new TimestampSpec("timestamp", "yyyyMMddHH", null),
                  new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host"))),
                  null,
                  ImmutableList.of("timestamp", "host", "visited_num"),
                  false,
                  0
              )
          ),
          null,
          null,
          AGGS1,
          "website"
      },
      {
          // Checks that a new index inherits its dimension order from the previous index.
          false,
          "hashed",
          "2014-10-22T00:00:00Z/P1D",
          new Integer[][][] {
              {
                  // Single partition only: dimension order inheritance is not supported across partitions.
                  { 0, 1 }
              }
          },
          jsonRows,
          null,
          new StringInputRowParser(
              new JSONParseSpec(
                  new TimestampSpec("ts", "yyyyMMddHH", null),
                  DimensionsSpec.EMPTY,
                  null,
                  null,
                  null
              ),
              null
          ),
          // Cap each index at one row to make the test easier to reason about.
          1,
          null,
          AGGS2,
          "inherit_dims"
      },
      {
          // Checks that an explicitly specified dimension order is kept across indexes.
          false,
          "hashed",
          "2014-10-22T00:00:00Z/P1D",
          new Integer[][][] {
              {
                  { 0, 1 }
              }
          },
          jsonRows,
          null,
          new StringInputRowParser(
              new JSONParseSpec(
                  new TimestampSpec("ts", "yyyyMMddHH", null),
                  new DimensionsSpec(
                      DimensionsSpec.getDefaultSchemas(ImmutableList.of("B", "F", "M", "Q", "X", "Y"))
                  ),
                  null,
                  null,
                  null
              ),
              null
          ),
          // Cap each index at one row to make the test easier to reason about.
          1,
          null,
          AGGS2,
          "inherit_dims2"
      }
  };

  // Emit every base set twice: once with forceExtendableShardSpecs = true, then with false.
  final List<Object[]> constructors = new ArrayList<>();
  for (Object[] baseConstructor : baseConstructors) {
    final int baseLength = baseConstructor.length;
    for (boolean forceExtendable : new boolean[] { true, false }) {
      final Object[] withFlag = new Object[baseLength + 1];
      System.arraycopy(baseConstructor, 0, withFlag, 0, baseLength);
      withFlag[baseLength] = forceExtendable;
      constructors.add(withFlag);
    }
  }
  return constructors;
}
use of org.apache.druid.data.input.impl.JSONParseSpec in project druid by druid-io.
the class StreamChunkParserTest method testBothParserAndInputFormatParseProperlyUsingInputFormat.
/**
 * Builds a {@code StreamChunkParser} with both a parser and an input format supplied, parses
 * through it, and asserts that the tracking input format reports it was used.
 */
@Test
public void testBothParserAndInputFormatParseProperlyUsingInputFormat() throws IOException {
  final JSONParseSpec parseSpec = new JSONParseSpec(
      TIMESTAMP_SPEC,
      DimensionsSpec.EMPTY,
      JSONPathSpec.DEFAULT,
      Collections.emptyMap(),
      false
  );
  final InputRowParser<ByteBuffer> parser = new StringInputRowParser(parseSpec, StringUtils.UTF8_STRING);

  final TrackingJsonInputFormat inputFormat = new TrackingJsonInputFormat(
      JSONPathSpec.DEFAULT,
      Collections.emptyMap()
  );

  final InputRowSchema rowSchema = new InputRowSchema(TIMESTAMP_SPEC, DimensionsSpec.EMPTY, ColumnsFilter.all());
  final StreamChunkParser<ByteEntity> chunkParser = new StreamChunkParser<>(
      parser,
      inputFormat,
      rowSchema,
      TransformSpec.NONE,
      temporaryFolder.newFolder(),
      row -> true,
      rowIngestionMeters,
      parseExceptionHandler
  );

  parseAndAssertResult(chunkParser);

  // The tracking format records whether it was actually exercised.
  Assert.assertTrue(inputFormat.props.used);
}
use of org.apache.druid.data.input.impl.JSONParseSpec in project druid by druid-io.
the class RealtimePlumberSchoolTest method setUp.
/**
 * Creates the per-test fixtures: a temp directory, two data schemas that differ only in segment
 * granularity (HOUR vs. YEAR), the EasyMock collaborators (switched to replay mode), the tuning
 * config, and finally the {@code RealtimePlumberSchool} and the plumber it yields for the first schema.
 */
@Before
public void setUp() throws Exception {
  tmpDir = FileUtils.createTempDir();

  final ObjectMapper mapper = new DefaultObjectMapper();

  // Schema with HOUR segment granularity.
  final StringInputRowParser hourlyParser = new StringInputRowParser(
      new JSONParseSpec(new TimestampSpec("timestamp", "auto", null), DimensionsSpec.EMPTY, null, null, null),
      null
  );
  final AggregatorFactory[] hourlyAggregators = new AggregatorFactory[] { new CountAggregatorFactory("rows") };
  schema = new DataSchema(
      "test",
      mapper.convertValue(hourlyParser, Map.class),
      hourlyAggregators,
      new UniformGranularitySpec(Granularities.HOUR, Granularities.NONE, null),
      null,
      mapper
  );

  // Schema with YEAR segment granularity; otherwise identical to the one above.
  final StringInputRowParser yearlyParser = new StringInputRowParser(
      new JSONParseSpec(new TimestampSpec("timestamp", "auto", null), DimensionsSpec.EMPTY, null, null, null),
      null
  );
  final AggregatorFactory[] yearlyAggregators = new AggregatorFactory[] { new CountAggregatorFactory("rows") };
  schema2 = new DataSchema(
      "test",
      mapper.convertValue(yearlyParser, Map.class),
      yearlyAggregators,
      new UniformGranularitySpec(Granularities.YEAR, Granularities.NONE, null),
      null,
      mapper
  );

  // Record phase: announceSegment may be called any number of times.
  announcer = EasyMock.createMock(DataSegmentAnnouncer.class);
  announcer.announceSegment(EasyMock.anyObject());
  EasyMock.expectLastCall().anyTimes();

  segmentPublisher = EasyMock.createNiceMock(SegmentPublisher.class);
  dataSegmentPusher = EasyMock.createNiceMock(DataSegmentPusher.class);
  handoffNotifierFactory = EasyMock.createNiceMock(SegmentHandoffNotifierFactory.class);
  handoffNotifier = EasyMock.createNiceMock(SegmentHandoffNotifier.class);

  // The factory always hands back the same notifier, which accepts every callback registration.
  EasyMock
      .expect(handoffNotifierFactory.createSegmentHandoffNotifier(EasyMock.anyString()))
      .andReturn(handoffNotifier)
      .anyTimes();
  EasyMock
      .expect(
          handoffNotifier.registerSegmentHandoffCallback(
              EasyMock.anyObject(),
              EasyMock.anyObject(),
              EasyMock.anyObject()
          )
      )
      .andReturn(true)
      .anyTimes();

  emitter = EasyMock.createMock(ServiceEmitter.class);

  EasyMock.replay(
      announcer,
      segmentPublisher,
      dataSegmentPusher,
      handoffNotifierFactory,
      handoffNotifier,
      emitter
  );

  tuningConfig = new RealtimeTuningConfig(
      null,
      1,
      null, null, null, null, null,
      new IntervalStartVersioningPolicy(),
      rejectionPolicy,
      null, null, null, null,
      0,
      0,
      false,
      null, null, null, null
  );

  realtimePlumberSchool = new RealtimePlumberSchool(
      emitter,
      new DefaultQueryRunnerFactoryConglomerate(new HashMap<>()),
      dataSegmentPusher,
      announcer,
      segmentPublisher,
      handoffNotifierFactory,
      DirectQueryProcessingPool.INSTANCE,
      NoopJoinableFactory.INSTANCE,
      TestHelper.getTestIndexMergerV9(segmentWriteOutMediumFactory),
      TestHelper.getTestIndexIO(),
      MapCache.create(0),
      FireDepartmentTest.NO_CACHE_CONFIG,
      new CachePopulatorStats(),
      TestHelper.makeJsonMapper()
  );

  metrics = new FireDepartmentMetrics();
  plumber = (RealtimePlumber) realtimePlumberSchool.findPlumber(schema, tuningConfig, metrics);
}
Aggregations