use of org.apache.druid.segment.indexing.granularity.UniformGranularitySpec in project druid by druid-io.
the class InputSourceSamplerTest method testWithTransformsDimensionsSpec.
@Test
public void testWithTransformsDimensionsSpec() throws IOException {
final TimestampSpec timestampSpec = new TimestampSpec("t", null, null);
final DimensionsSpec dimensionsSpec = new DimensionsSpec(ImmutableList.of(StringDimensionSchema.create("dim1PlusBar")));
final TransformSpec transformSpec = new TransformSpec(null, ImmutableList.of(new ExpressionTransform("dim1PlusBar", "concat(dim1 + 'bar')", TestExprMacroTable.INSTANCE)));
final AggregatorFactory[] aggregatorFactories = { new LongSumAggregatorFactory("met1", "met1") };
final GranularitySpec granularitySpec = new UniformGranularitySpec(Granularities.DAY, Granularities.HOUR, true, null);
final DataSchema dataSchema = createDataSchema(timestampSpec, dimensionsSpec, aggregatorFactories, granularitySpec, transformSpec);
final InputSource inputSource = createInputSource(getTestRows(), dataSchema);
final InputFormat inputFormat = createInputFormat();
SamplerResponse response = inputSourceSampler.sample(inputSource, inputFormat, dataSchema, null);
Assert.assertEquals(6, response.getNumRowsRead());
Assert.assertEquals(5, response.getNumRowsIndexed());
Assert.assertEquals(3, response.getData().size());
List<SamplerResponseRow> data = response.getData();
assertEqualsSamplerResponseRow(
    new SamplerResponseRow(
        getRawColumns().get(0),
        new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>()
            .put("__time", 1555934400000L)
            .put("dim1PlusBar", "foobar")
            .put("met1", 11L)
            .build(),
        null,
        null),
    data.get(0));
assertEqualsSamplerResponseRow(
    new SamplerResponseRow(
        getRawColumns().get(3),
        new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>()
            .put("__time", 1555934400000L)
            .put("dim1PlusBar", "foo2bar")
            .put("met1", 4L)
            .build(),
        null,
        null),
    data.get(1));
assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(5), null, true, getUnparseableTimestampString()), data.get(2));
}
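As a side note, the four-argument UniformGranularitySpec constructor used above takes the segment granularity, the query granularity, a rollup flag, and optional input intervals. A minimal standalone sketch (not from the Druid test suite; the interval value is made up, and it assumes the sortedBucketIntervals() accessor on GranularitySpec) of how the spec buckets an explicit interval:
@Test
public void sketchUniformGranularitySpecBuckets() {
    // Hypothetical example: DAY segment buckets, HOUR query granularity, rollup on.
    final UniformGranularitySpec spec = new UniformGranularitySpec(
        Granularities.DAY,   // segment granularity: one segment bucket per day
        Granularities.HOUR,  // query granularity: timestamps truncated to the hour at ingest
        true,                // rollup enabled
        ImmutableList.of(Intervals.of("2019-04-22/2019-04-24")));
    // A two-day input interval should produce two DAY buckets.
    Assert.assertEquals(2, Iterables.size(spec.sortedBucketIntervals()));
    Assert.assertTrue(spec.isRollup());
}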
use of org.apache.druid.segment.indexing.granularity.UniformGranularitySpec in project druid by druid-io.
the class InputSourceSamplerTest method testIndexParseException.
@Test
public void testIndexParseException() throws IOException {
final TimestampSpec timestampSpec = new TimestampSpec("t", null, null);
final DimensionsSpec dimensionsSpec = new DimensionsSpec(ImmutableList.of(StringDimensionSchema.create("dim1PlusBar")));
final TransformSpec transformSpec = new TransformSpec(null, ImmutableList.of(new ExpressionTransform("dim1PlusBar", "concat(dim1 + 'bar')", TestExprMacroTable.INSTANCE)));
final AggregatorFactory[] aggregatorFactories = { new LongSumAggregatorFactory("met1", "met1") };
final GranularitySpec granularitySpec = new UniformGranularitySpec(Granularities.DAY, Granularities.HOUR, true, null);
final DataSchema dataSchema = createDataSchema(timestampSpec, dimensionsSpec, aggregatorFactories, granularitySpec, transformSpec);
//
// add an invalid row to cause a parse exception during indexing
//
Map<String, Object> rawColumns4ParseExceptionRow = ImmutableMap.of("t", "2019-04-22T12:00", "dim1", "foo2", "met1", "invalidNumber");
final List<String> inputTestRows = Lists.newArrayList(getTestRows());
inputTestRows.add(ParserType.STR_CSV.equals(parserType) ? "2019-04-22T12:00,foo2,,invalidNumber" : OBJECT_MAPPER.writeValueAsString(rawColumns4ParseExceptionRow));
final InputSource inputSource = createInputSource(inputTestRows, dataSchema);
final InputFormat inputFormat = createInputFormat();
SamplerResponse response = inputSourceSampler.sample(inputSource, inputFormat, dataSchema, null);
Assert.assertEquals(7, response.getNumRowsRead());
Assert.assertEquals(5, response.getNumRowsIndexed());
Assert.assertEquals(4, response.getData().size());
List<SamplerResponseRow> data = response.getData();
assertEqualsSamplerResponseRow(
    new SamplerResponseRow(
        getRawColumns().get(0),
        new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>()
            .put("__time", 1555934400000L)
            .put("dim1PlusBar", "foobar")
            .put("met1", 11L)
            .build(),
        null,
        null),
    data.get(0));
assertEqualsSamplerResponseRow(
    new SamplerResponseRow(
        getRawColumns().get(3),
        new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>()
            .put("__time", 1555934400000L)
            .put("dim1PlusBar", "foo2bar")
            .put("met1", 4L)
            .build(),
        null,
        null),
    data.get(1));
assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(5), null, true, getUnparseableTimestampString()), data.get(2));
//
// the last row hits a parse exception during indexing; check that its rawColumns and exception message match the expected values
//
String indexParseExceptionMessage = ParserType.STR_CSV.equals(parserType)
    ? "Found unparseable columns in row: [SamplerInputRow{row=TransformedInputRow{row=MapBasedInputRow{timestamp=2019-04-22T12:00:00.000Z, event={t=2019-04-22T12:00, dim1=foo2, dim2=null, met1=invalidNumber}, dimensions=[dim1PlusBar]}}}], exceptions: [Unable to parse value[invalidNumber] for field[met1]]"
    : "Found unparseable columns in row: [SamplerInputRow{row=TransformedInputRow{row=MapBasedInputRow{timestamp=2019-04-22T12:00:00.000Z, event={t=2019-04-22T12:00, dim1=foo2, met1=invalidNumber}, dimensions=[dim1PlusBar]}}}], exceptions: [Unable to parse value[invalidNumber] for field[met1]]";
assertEqualsSamplerResponseRow(
    new SamplerResponseRow(rawColumns4ParseExceptionRow, null, true, indexParseExceptionMessage),
    data.get(3));
}
use of org.apache.druid.segment.indexing.granularity.UniformGranularitySpec in project druid by druid-io.
the class KafkaIndexTaskTest method testKafkaRecordEntityInputFormat.
@Test(timeout = 60_000L)
public void testKafkaRecordEntityInputFormat() throws Exception {
// Insert data
insertData(Iterables.limit(records, 3));
final KafkaIndexTask task = createTask(
    null,
    new DataSchema(
        "test_ds",
        new TimestampSpec("timestamp", "iso", null),
        new DimensionsSpec(
            Arrays.asList(
                new StringDimensionSchema("dim1"),
                new StringDimensionSchema("dim1t"),
                new StringDimensionSchema("dim2"),
                new LongDimensionSchema("dimLong"),
                new FloatDimensionSchema("dimFloat"),
                new StringDimensionSchema("kafka.topic"),
                new LongDimensionSchema("kafka.offset"),
                new StringDimensionSchema("kafka.header.encoding"))),
        new AggregatorFactory[] {
            new DoubleSumAggregatorFactory("met1sum", "met1"),
            new CountAggregatorFactory("rows")
        },
        new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, null),
        null),
    new KafkaIndexTaskIOConfig(
        0,
        "sequence0",
        new SeekableStreamStartSequenceNumbers<>(topic, ImmutableMap.of(0, 0L), ImmutableSet.of()),
        new SeekableStreamEndSequenceNumbers<>(topic, ImmutableMap.of(0, 5L)),
        kafkaServer.consumerProperties(),
        KafkaSupervisorIOConfig.DEFAULT_POLL_TIMEOUT_MILLIS,
        true,
        null,
        null,
        new TestKafkaInputFormat(INPUT_FORMAT)));
Assert.assertTrue(task.supportsQueries());
final ListenableFuture<TaskStatus> future = runTask(task);
while (countEvents(task) != 3) {
Thread.sleep(25);
}
Assert.assertEquals(Status.READING, task.getRunner().getStatus());
final QuerySegmentSpec interval = OBJECT_MAPPER.readValue("\"2008/2012\"", QuerySegmentSpec.class);
List<ScanResultValue> scanResultValues = scanData(task, interval);
// verify that the three records read so far are queryable and carry the Kafka record metadata (topic, offset, header) as dimensions
Assert.assertEquals(3, Iterables.size(scanResultValues));
int i = 0;
for (ScanResultValue result : scanResultValues) {
final Map<String, Object> event = ((List<Map<String, Object>>) result.getEvents()).get(0);
Assert.assertEquals((long) i++, event.get("kafka.offset"));
Assert.assertEquals(topic, event.get("kafka.topic"));
Assert.assertEquals("application/json", event.get("kafka.header.encoding"));
}
// insert remaining data
insertData(Iterables.skip(records, 3));
// Wait for task to exit
Assert.assertEquals(TaskState.SUCCESS, future.get().getStatusCode());
// Check metrics
Assert.assertEquals(4, task.getRunner().getRowIngestionMeters().getProcessed());
Assert.assertEquals(0, task.getRunner().getRowIngestionMeters().getUnparseable());
Assert.assertEquals(0, task.getRunner().getRowIngestionMeters().getThrownAway());
}
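The DataSchema above uses the three-argument UniformGranularitySpec constructor (segment granularity, query granularity, input intervals) with null intervals, since a streaming task does not know its time range up front. A minimal sketch, not taken from KafkaIndexTaskTest, assuming inputIntervals() returns an empty list and rollup defaults to true when left unspecified:
// Sketch only: the three-argument form used in the DataSchema above leaves input intervals unset,
// which is the usual shape for streaming ingestion.
final UniformGranularitySpec streamingSpec = new UniformGranularitySpec(
    Granularities.DAY,   // segment granularity: daily segments
    Granularities.NONE,  // query granularity: timestamps kept as-is
    null);               // no fixed input intervals; buckets follow the arriving data
Assert.assertTrue(streamingSpec.inputIntervals().isEmpty()); // assumption: null intervals surface as an empty list
Assert.assertTrue(streamingSpec.isRollup());                 // assumption: rollup defaults to true when unspecified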
use of org.apache.druid.segment.indexing.granularity.UniformGranularitySpec in project druid by druid-io.
the class DetermineHashedPartitionsJob method run.
@Override
public boolean run() {
try {
/*
* Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
* in the final segment.
*/
startTime = System.currentTimeMillis();
groupByJob = Job.getInstance(new Configuration(), StringUtils.format("%s-determine_partitions_hashed-%s", config.getDataSource(), config.getIntervals()));
JobHelper.injectSystemProperties(groupByJob.getConfiguration(), config);
config.addJobProperties(groupByJob);
groupByJob.setMapperClass(DetermineCardinalityMapper.class);
groupByJob.setMapOutputKeyClass(LongWritable.class);
groupByJob.setMapOutputValueClass(BytesWritable.class);
groupByJob.setReducerClass(DetermineCardinalityReducer.class);
groupByJob.setOutputKeyClass(NullWritable.class);
groupByJob.setOutputValueClass(NullWritable.class);
groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
groupByJob.setPartitionerClass(DetermineHashedPartitionsPartitioner.class);
if (config.getInputIntervals().isEmpty()) {
groupByJob.setNumReduceTasks(1);
} else {
groupByJob.setNumReduceTasks(Iterators.size(config.getSegmentGranularIntervals().iterator()));
}
JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()), JobHelper.distributedClassPath(config.makeIntermediatePath()), groupByJob);
config.addInputPaths(groupByJob);
config.intoConfiguration(groupByJob);
FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());
groupByJob.submit();
log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(), groupByJob.getTrackingURL());
// Store the jobId in the file
if (groupByJob.getJobID() != null) {
JobHelper.writeJobIdToFile(config.getHadoopJobIdFileName(), groupByJob.getJobID().toString());
}
try {
if (!groupByJob.waitForCompletion(true)) {
log.error("Job failed: %s", groupByJob.getJobID());
failureCause = Utils.getFailureMessage(groupByJob, HadoopDruidIndexerConfig.JSON_MAPPER);
return false;
}
} catch (IOException ioe) {
if (!Utils.checkAppSuccessForJobIOException(ioe, groupByJob, config.isUseYarnRMJobStatusFallback())) {
throw ioe;
}
}
/*
* Load partitions and intervals determined by the previous job.
*/
log.info("Job completed, loading up partitions for intervals[%s].", config.getSegmentGranularIntervals());
FileSystem fileSystem = null;
if (config.getInputIntervals().isEmpty()) {
final Path intervalInfoPath = config.makeIntervalInfoPath();
fileSystem = intervalInfoPath.getFileSystem(groupByJob.getConfiguration());
if (!Utils.exists(groupByJob, fileSystem, intervalInfoPath)) {
throw new ISE("Path[%s] didn't exist!?", intervalInfoPath);
}
List<Interval> intervals = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(Utils.openInputStream(groupByJob, intervalInfoPath), new TypeReference<List<Interval>>() {
});
config.setGranularitySpec(new UniformGranularitySpec(config.getGranularitySpec().getSegmentGranularity(), config.getGranularitySpec().getQueryGranularity(), config.getGranularitySpec().isRollup(), intervals));
log.info("Determined Intervals for Job [%s].", config.getSegmentGranularIntervals());
}
Map<Long, List<HadoopyShardSpec>> shardSpecs = new TreeMap<>(DateTimeComparator.getInstance());
PartitionsSpec partitionsSpec = config.getPartitionsSpec();
if (!(partitionsSpec instanceof HashedPartitionsSpec)) {
throw new ISE("%s is expected, but got %s", HashedPartitionsSpec.class.getName(), partitionsSpec.getClass().getName());
}
HashPartitionFunction partitionFunction = ((HashedPartitionsSpec) partitionsSpec).getPartitionFunction();
int shardCount = 0;
for (Interval segmentGranularity : config.getSegmentGranularIntervals()) {
DateTime bucket = segmentGranularity.getStart();
final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
if (fileSystem == null) {
fileSystem = partitionInfoPath.getFileSystem(groupByJob.getConfiguration());
}
if (Utils.exists(groupByJob, fileSystem, partitionInfoPath)) {
final Long numRows = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(Utils.openInputStream(groupByJob, partitionInfoPath), Long.class);
log.info("Found approximately [%,d] rows in data.", numRows);
final int numberOfShards = (int) Math.ceil((double) numRows / config.getTargetPartitionSize());
log.info("Creating [%,d] shards", numberOfShards);
List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(numberOfShards);
for (int i = 0; i < numberOfShards; ++i) {
actualSpecs.add(new HadoopyShardSpec(new HashBasedNumberedShardSpec(i, numberOfShards, i, numberOfShards, null, partitionFunction, HadoopDruidIndexerConfig.JSON_MAPPER), shardCount++));
log.info("DateTime[%s], partition[%d], spec[%s]", bucket, i, actualSpecs.get(i));
}
shardSpecs.put(bucket.getMillis(), actualSpecs);
} else {
log.info("Path[%s] didn't exist!?", partitionInfoPath);
}
}
config.setShardSpecs(shardSpecs);
log.info("DetermineHashedPartitionsJob took %d millis", (System.currentTimeMillis() - startTime));
return true;
} catch (Exception e) {
throw new RuntimeException(e);
}
}
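The shard count per interval above is a single division: the approximate row count written by the grouping job is divided by the configured target partition size and rounded up. An illustrative calculation with made-up numbers:
// Illustrative only; mirrors the arithmetic in the loop above with hypothetical values.
long numRows = 12_500_000L;          // approximate rows found for one segment interval
int targetPartitionSize = 5_000_000; // configured target rows per shard
int numberOfShards = (int) Math.ceil((double) numRows / targetPartitionSize);
// 12,500,000 / 5,000,000 = 2.5, rounded up to 3 shards for this interval.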
use of org.apache.druid.segment.indexing.granularity.UniformGranularitySpec in project druid by druid-io.
the class HadoopIngestionSpecTest method testGranularitySpec.
@Test
public void testGranularitySpec() {
final HadoopIngestionSpec schema;
try {
schema = jsonReadWriteRead(
    "{\n"
    + " \"dataSchema\": {\n"
    + " \"dataSource\": \"foo\",\n"
    + " \"metricsSpec\": [],\n"
    + " \"granularitySpec\": {\n"
    + " \"type\": \"uniform\",\n"
    + " \"segmentGranularity\": \"hour\",\n"
    + " \"intervals\": [\"2012-01-01/P1D\"]\n"
    + " }\n"
    + " }\n"
    + "}",
    HadoopIngestionSpec.class);
} catch (Exception e) {
throw new RuntimeException(e);
}
final UniformGranularitySpec granularitySpec = (UniformGranularitySpec) schema.getDataSchema().getGranularitySpec();
Assert.assertEquals("getIntervals", Collections.singletonList(Intervals.of("2012-01-01/P1D")), granularitySpec.inputIntervals());
Assert.assertEquals("getSegmentGranularity", Granularities.HOUR, granularitySpec.getSegmentGranularity());
}
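For comparison, a minimal sketch of the equivalent programmatic construction (not part of the test; it assumes the three-argument constructor of segment granularity, query granularity, and input intervals):
// Sketch only: building the same spec the JSON above deserializes to.
final UniformGranularitySpec expected = new UniformGranularitySpec(
    Granularities.HOUR,
    null, // query granularity omitted, as in the JSON, so the default applies
    Collections.singletonList(Intervals.of("2012-01-01/P1D")));
Assert.assertEquals(Granularities.HOUR, expected.getSegmentGranularity());
Assert.assertEquals(Collections.singletonList(Intervals.of("2012-01-01/P1D")), expected.inputIntervals());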