use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
the class InputSourceSamplerTest method testWithTimestampSpec.
/**
 * Samples the test rows using a schema that only configures the timestamp column ("t") and a
 * {@link DimensionsSpec} built from {@code null}, then verifies the read/indexed counters and each
 * of the six returned rows. The last row is expected to be flagged as unparseable.
 */
@Test
public void testWithTimestampSpec() throws IOException {
  final DataSchema schema = createDataSchema(
      new TimestampSpec("t", null, null),
      new DimensionsSpec(null),
      null,
      null,
      null
  );
  final InputSource source = createInputSource(getTestRows(), schema);
  final SamplerResponse response = inputSourceSampler.sample(source, createInputFormat(), schema, null);

  // Six rows are read, but only five make it into the index.
  Assert.assertEquals(6, response.getNumRowsRead());
  Assert.assertEquals(5, response.getNumRowsIndexed());

  final List<SamplerResponseRow> rows = response.getData();
  Assert.assertEquals(6, rows.size());

  // Rows 0-4 are parsed; every input column is surfaced as-is (met1 stays a String).
  assertEqualsSamplerResponseRow(
      new SamplerResponseRow(
          getRawColumns().get(0),
          new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>()
              .put("__time", 1555934400000L)
              .put("dim2", null)
              .put("dim1", "foo")
              .put("met1", "1")
              .build(),
          null,
          null
      ),
      rows.get(0)
  );
  assertEqualsSamplerResponseRow(
      new SamplerResponseRow(
          getRawColumns().get(1),
          new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>()
              .put("__time", 1555934400000L)
              .put("dim2", null)
              .put("dim1", "foo")
              .put("met1", "2")
              .build(),
          null,
          null
      ),
      rows.get(1)
  );
  assertEqualsSamplerResponseRow(
      new SamplerResponseRow(
          getRawColumns().get(2),
          new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>()
              .put("__time", 1555934460000L)
              .put("dim2", null)
              .put("dim1", "foo")
              .put("met1", "3")
              .build(),
          null,
          null
      ),
      rows.get(2)
  );
  assertEqualsSamplerResponseRow(
      new SamplerResponseRow(
          getRawColumns().get(3),
          new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>()
              .put("__time", 1555934400000L)
              .put("dim2", null)
              .put("dim1", "foo2")
              .put("met1", "4")
              .build(),
          null,
          null
      ),
      rows.get(3)
  );
  assertEqualsSamplerResponseRow(
      new SamplerResponseRow(
          getRawColumns().get(4),
          new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>()
              .put("__time", 1555934400000L)
              .put("dim2", "bar")
              .put("dim1", "foo")
              .put("met1", "5")
              .build(),
          null,
          null
      ),
      rows.get(4)
  );

  // Row 5 has no parsed columns and carries the unparseable-timestamp message instead.
  assertEqualsSamplerResponseRow(
      new SamplerResponseRow(getRawColumns().get(5), null, true, getUnparseableTimestampString()),
      rows.get(5)
  );
}
use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
the class InputSourceSamplerTest method testWithTransformsDimensionsSpec.
/**
 * Samples the test rows with an expression transform that produces "dim1PlusBar", a single explicit
 * dimension of that name, and a long-sum aggregator over "met1". The response is expected to hold
 * three rows: two indexed rows plus the row with the unparseable timestamp.
 */
@Test
public void testWithTransformsDimensionsSpec() throws IOException {
  final TimestampSpec tsSpec = new TimestampSpec("t", null, null);
  final DimensionsSpec dimsSpec = new DimensionsSpec(
      ImmutableList.of(StringDimensionSchema.create("dim1PlusBar"))
  );
  final TransformSpec transforms = new TransformSpec(
      null,
      ImmutableList.of(
          new ExpressionTransform("dim1PlusBar", "concat(dim1 + 'bar')", TestExprMacroTable.INSTANCE)
      )
  );
  final AggregatorFactory[] aggregators = {new LongSumAggregatorFactory("met1", "met1")};
  // NOTE(review): the `true` flag is presumably "rollup", which would explain why six input rows
  // collapse into three response rows below - confirm against UniformGranularitySpec.
  final GranularitySpec granularity = new UniformGranularitySpec(
      Granularities.DAY,
      Granularities.HOUR,
      true,
      null
  );
  final DataSchema schema = createDataSchema(tsSpec, dimsSpec, aggregators, granularity, transforms);

  final InputSource source = createInputSource(getTestRows(), schema);
  final SamplerResponse response = inputSourceSampler.sample(source, createInputFormat(), schema, null);

  Assert.assertEquals(6, response.getNumRowsRead());
  Assert.assertEquals(5, response.getNumRowsIndexed());

  final List<SamplerResponseRow> rows = response.getData();
  Assert.assertEquals(3, rows.size());

  // First response row: dim1PlusBar = "foobar", met1 summed to 11.
  assertEqualsSamplerResponseRow(
      new SamplerResponseRow(
          getRawColumns().get(0),
          new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>()
              .put("__time", 1555934400000L)
              .put("dim1PlusBar", "foobar")
              .put("met1", 11L)
              .build(),
          null,
          null
      ),
      rows.get(0)
  );
  // Second response row: dim1PlusBar = "foo2bar", met1 = 4.
  assertEqualsSamplerResponseRow(
      new SamplerResponseRow(
          getRawColumns().get(3),
          new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>()
              .put("__time", 1555934400000L)
              .put("dim1PlusBar", "foo2bar")
              .put("met1", 4L)
              .build(),
          null,
          null
      ),
      rows.get(1)
  );
  // Last response row: the input whose timestamp could not be parsed.
  assertEqualsSamplerResponseRow(
      new SamplerResponseRow(getRawColumns().get(5), null, true, getUnparseableTimestampString()),
      rows.get(2)
  );
}
use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
the class InputSourceSamplerTest method testIndexParseException.
/**
 * Appends one extra input row whose "met1" value is not a number and verifies that the sampler
 * reports it as an unparseable row, carrying the original raw columns and the expected parse
 * exception message. The expected message differs between the CSV parser type and the others
 * because the CSV variant also lists {@code dim2=null} in the event.
 */
@Test
public void testIndexParseException() throws IOException {
  final TimestampSpec tsSpec = new TimestampSpec("t", null, null);
  final DimensionsSpec dimsSpec = new DimensionsSpec(
      ImmutableList.of(StringDimensionSchema.create("dim1PlusBar"))
  );
  final TransformSpec transforms = new TransformSpec(
      null,
      ImmutableList.of(
          new ExpressionTransform("dim1PlusBar", "concat(dim1 + 'bar')", TestExprMacroTable.INSTANCE)
      )
  );
  final AggregatorFactory[] aggregators = {new LongSumAggregatorFactory("met1", "met1")};
  final GranularitySpec granularity = new UniformGranularitySpec(
      Granularities.DAY,
      Granularities.HOUR,
      true,
      null
  );
  final DataSchema schema = createDataSchema(tsSpec, dimsSpec, aggregators, granularity, transforms);

  // Add an invalid row ("met1" is not numeric) so that indexing raises a parse exception.
  final Map<String, Object> badRowColumns = ImmutableMap.of(
      "t", "2019-04-22T12:00",
      "dim1", "foo2",
      "met1", "invalidNumber"
  );
  final boolean csvParser = ParserType.STR_CSV.equals(parserType);
  final List<String> inputRows = Lists.newArrayList(getTestRows());
  final String badRow;
  if (csvParser) {
    badRow = "2019-04-22T12:00,foo2,,invalidNumber";
  } else {
    badRow = OBJECT_MAPPER.writeValueAsString(badRowColumns);
  }
  inputRows.add(badRow);

  final InputSource source = createInputSource(inputRows, schema);
  final SamplerResponse response = inputSourceSampler.sample(source, createInputFormat(), schema, null);

  // Seven rows are read (six standard rows plus the bad one); five are indexed.
  Assert.assertEquals(7, response.getNumRowsRead());
  Assert.assertEquals(5, response.getNumRowsIndexed());

  final List<SamplerResponseRow> rows = response.getData();
  Assert.assertEquals(4, rows.size());

  assertEqualsSamplerResponseRow(
      new SamplerResponseRow(
          getRawColumns().get(0),
          new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>()
              .put("__time", 1555934400000L)
              .put("dim1PlusBar", "foobar")
              .put("met1", 11L)
              .build(),
          null,
          null
      ),
      rows.get(0)
  );
  assertEqualsSamplerResponseRow(
      new SamplerResponseRow(
          getRawColumns().get(3),
          new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>()
              .put("__time", 1555934400000L)
              .put("dim1PlusBar", "foo2bar")
              .put("met1", 4L)
              .build(),
          null,
          null
      ),
      rows.get(1)
  );
  assertEqualsSamplerResponseRow(
      new SamplerResponseRow(getRawColumns().get(5), null, true, getUnparseableTimestampString()),
      rows.get(2)
  );

  // The last row failed while indexing: check that both its raw columns and the exception
  // message match what is expected for the active parser type.
  final String expectedParseError;
  if (csvParser) {
    expectedParseError = "Found unparseable columns in row: [SamplerInputRow{row=TransformedInputRow{row=MapBasedInputRow{timestamp=2019-04-22T12:00:00.000Z, event={t=2019-04-22T12:00, dim1=foo2, dim2=null, met1=invalidNumber}, dimensions=[dim1PlusBar]}}}], exceptions: [Unable to parse value[invalidNumber] for field[met1]]";
  } else {
    expectedParseError = "Found unparseable columns in row: [SamplerInputRow{row=TransformedInputRow{row=MapBasedInputRow{timestamp=2019-04-22T12:00:00.000Z, event={t=2019-04-22T12:00, dim1=foo2, met1=invalidNumber}, dimensions=[dim1PlusBar]}}}], exceptions: [Unable to parse value[invalidNumber] for field[met1]]";
  }
  assertEqualsSamplerResponseRow(
      new SamplerResponseRow(badRowColumns, null, true, expectedParseError),
      rows.get(3)
  );
}
use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
the class KafkaIndexTaskTest method testKafkaRecordEntityInputFormat.
/**
 * Runs a Kafka index task whose input format is a {@code TestKafkaInputFormat} wrapping
 * {@code INPUT_FORMAT} and checks that the Kafka-specific columns declared in the schema
 * ("kafka.topic", "kafka.offset", "kafka.header.encoding") show up in scan results while the
 * task is still reading. The remaining records are then published and the final ingestion
 * metrics are verified.
 */
@Test(timeout = 60_000L)
public void testKafkaRecordEntityInputFormat() throws Exception {
  // Publish only the first three records so the task stays in the READING state.
  insertData(Iterables.limit(records, 3));

  final DataSchema schema = new DataSchema(
      "test_ds",
      new TimestampSpec("timestamp", "iso", null),
      new DimensionsSpec(
          Arrays.asList(
              new StringDimensionSchema("dim1"),
              new StringDimensionSchema("dim1t"),
              new StringDimensionSchema("dim2"),
              new LongDimensionSchema("dimLong"),
              new FloatDimensionSchema("dimFloat"),
              new StringDimensionSchema("kafka.topic"),
              new LongDimensionSchema("kafka.offset"),
              new StringDimensionSchema("kafka.header.encoding")
          )
      ),
      new AggregatorFactory[]{
          new DoubleSumAggregatorFactory("met1sum", "met1"),
          new CountAggregatorFactory("rows")
      },
      new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null),
      null
  );
  final KafkaIndexTaskIOConfig ioConfig = new KafkaIndexTaskIOConfig(
      0,
      "sequence0",
      new SeekableStreamStartSequenceNumbers<>(topic, ImmutableMap.of(0, 0L), ImmutableSet.of()),
      new SeekableStreamEndSequenceNumbers<>(topic, ImmutableMap.of(0, 5L)),
      kafkaServer.consumerProperties(),
      KafkaSupervisorIOConfig.DEFAULT_POLL_TIMEOUT_MILLIS,
      true,
      null,
      null,
      new TestKafkaInputFormat(INPUT_FORMAT)
  );
  final KafkaIndexTask task = createTask(null, schema, ioConfig);
  Assert.assertTrue(task.supportsQueries());

  final ListenableFuture<TaskStatus> statusFuture = runTask(task);

  // Poll until all three published records are visible; the test-level timeout bounds this loop.
  while (countEvents(task) != 3) {
    Thread.sleep(25);
  }
  Assert.assertEquals(Status.READING, task.getRunner().getStatus());

  final QuerySegmentSpec querySpec = OBJECT_MAPPER.readValue("\"2008/2012\"", QuerySegmentSpec.class);
  final List<ScanResultValue> scanResults = scanData(task, querySpec);

  // Exactly one scan result is expected per record published so far.
  Assert.assertEquals(3, Iterables.size(scanResults));

  // Offsets are expected to appear in order, starting from 0.
  long expectedOffset = 0;
  for (ScanResultValue scanResult : scanResults) {
    final Map<String, Object> firstEvent = ((List<Map<String, Object>>) scanResult.getEvents()).get(0);
    Assert.assertEquals(expectedOffset, firstEvent.get("kafka.offset"));
    Assert.assertEquals(topic, firstEvent.get("kafka.topic"));
    Assert.assertEquals("application/json", firstEvent.get("kafka.header.encoding"));
    expectedOffset++;
  }

  // Publish the remaining records so the task can reach its end offsets.
  insertData(Iterables.skip(records, 3));

  // Wait for the task to finish.
  Assert.assertEquals(TaskState.SUCCESS, statusFuture.get().getStatusCode());

  // Verify ingestion metrics.
  Assert.assertEquals(4, task.getRunner().getRowIngestionMeters().getProcessed());
  Assert.assertEquals(0, task.getRunner().getRowIngestionMeters().getUnparseable());
  Assert.assertEquals(0, task.getRunner().getRowIngestionMeters().getThrownAway());
}
use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
the class JobHelperTest method setup.
/**
 * Builds the {@link HadoopDruidIndexerConfig} shared by the tests in this class and registers three
 * properties on {@code HadoopDruidIndexerConfig.PROPERTIES}.
 *
 * The ingestion spec describes a CSV source with columns "timestamp", "host" and "visited_num",
 * a long-sum aggregator over "visited_num", and a tuning config whose job-properties map holds
 * two "fs.s3.*" entries.
 */
@Before
public void setup() throws Exception {
  // NOTE(review): tmpDir is created with newFile() rather than newFolder() - confirm this is intended.
  tmpDir = temporaryFolder.newFile();
  dataFile = temporaryFolder.newFile();

  final CSVParseSpec parseSpec = new CSVParseSpec(
      new TimestampSpec("timestamp", "yyyyMMddHH", null),
      new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host"))),
      null,
      ImmutableList.of("timestamp", "host", "visited_num"),
      false,
      0
  );
  final StringInputRowParser rowParser = new StringInputRowParser(parseSpec, null);

  final DataSchema schema = new DataSchema(
      "website",
      HadoopDruidIndexerConfig.JSON_MAPPER.convertValue(rowParser, Map.class),
      new AggregatorFactory[]{new LongSumAggregatorFactory("visited_num", "visited_num")},
      new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, ImmutableList.of(this.interval)),
      null,
      HadoopDruidIndexerConfig.JSON_MAPPER
  );

  final HadoopIOConfig ioConfig = new HadoopIOConfig(
      ImmutableMap.of("paths", dataFile.getCanonicalPath(), "type", "static"),
      null,
      tmpDir.getCanonicalPath()
  );

  final HadoopTuningConfig tuningConfig = new HadoopTuningConfig(
      tmpDir.getCanonicalPath(),
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      false,
      false,
      false,
      false,
      // Map of job properties
      ImmutableMap.of(
          "fs.s3.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem",
          "fs.s3.awsAccessKeyId", "THISISMYACCESSKEY"
      ),
      false,
      false,
      null,
      null,
      false,
      false,
      null,
      null,
      null,
      null,
      null
  );

  config = new HadoopDruidIndexerConfig(new HadoopIngestionSpec(schema, ioConfig, tuningConfig));

  // Register the properties that the tests in this class rely on.
  HadoopDruidIndexerConfig.PROPERTIES.setProperty(VALID_DRUID_PROP, "true");
  HadoopDruidIndexerConfig.PROPERTIES.setProperty(VALID_HADOOP_PREFIX + VALID_HADOOP_PROP, "true");
  HadoopDruidIndexerConfig.PROPERTIES.setProperty(INVALID_PROP, "true");
}
Aggregations