Use of io.druid.indexer.hadoop.DatasourceIngestionSpec in project druid by druid-io.
In the class HadoopIngestionSpec, the method updateSegmentListIfDatasourcePathSpecIsUsed:
public static HadoopIngestionSpec updateSegmentListIfDatasourcePathSpecIsUsed(
    HadoopIngestionSpec spec,
    ObjectMapper jsonMapper,
    UsedSegmentLister segmentLister
) throws IOException
{
  String dataSource = "dataSource";
  String type = "type";
  String multi = "multi";
  String children = "children";
  String segments = "segments";
  String ingestionSpec = "ingestionSpec";

  // Locate the "dataSource" pathSpec, either at the top level or as a child of a "multi" pathSpec.
  Map<String, Object> pathSpec = spec.getIOConfig().getPathSpec();
  Map<String, Object> datasourcePathSpec = null;
  if (pathSpec.get(type).equals(dataSource)) {
    datasourcePathSpec = pathSpec;
  } else if (pathSpec.get(type).equals(multi)) {
    List<Map<String, Object>> childPathSpecs = (List<Map<String, Object>>) pathSpec.get(children);
    for (Map<String, Object> childPathSpec : childPathSpecs) {
      if (childPathSpec.get(type).equals(dataSource)) {
        datasourcePathSpec = childPathSpec;
        break;
      }
    }
  }

  if (datasourcePathSpec != null) {
    Map<String, Object> ingestionSpecMap = (Map<String, Object>) datasourcePathSpec.get(ingestionSpec);
    DatasourceIngestionSpec ingestionSpecObj = jsonMapper.convertValue(ingestionSpecMap, DatasourceIngestionSpec.class);
    List<DataSegment> segmentsList = segmentLister.getUsedSegmentsForIntervals(
        ingestionSpecObj.getDataSource(),
        ingestionSpecObj.getIntervals()
    );

    if (ingestionSpecObj.getSegments() != null) {
      // Ensure that the user-supplied segment list matches the segmentsList obtained from the db.
      // This safety check lets users do test-and-set style batch delta ingestion, where the delta
      // ingestion task only runs if the current state of the system is the same as when they
      // submitted the task.
      List<DataSegment> userSuppliedSegmentsList = ingestionSpecObj.getSegments();
      if (segmentsList.size() == userSuppliedSegmentsList.size()) {
        Set<DataSegment> segmentsSet = new HashSet<>(segmentsList);
        for (DataSegment userSegment : userSuppliedSegmentsList) {
          if (!segmentsSet.contains(userSegment)) {
            throw new IOException("user supplied segments list did not match with segments list obtained from db");
          }
        }
      } else {
        throw new IOException("user supplied segments list did not match with segments list obtained from db");
      }
    }

    // Build a versioned timeline so only the latest version of each segment is used, then replace
    // the pathSpec's "segments" entry with the windowed segments covering the requested intervals.
    VersionedIntervalTimeline<String, DataSegment> timeline = new VersionedIntervalTimeline<>(Ordering.natural());
    for (DataSegment segment : segmentsList) {
      timeline.add(segment.getInterval(), segment.getVersion(), segment.getShardSpec().createChunk(segment));
    }
    final List<WindowedDataSegment> windowedSegments = Lists.newArrayList();
    for (Interval interval : ingestionSpecObj.getIntervals()) {
      final List<TimelineObjectHolder<String, DataSegment>> timeLineSegments = timeline.lookup(interval);
      for (TimelineObjectHolder<String, DataSegment> holder : timeLineSegments) {
        for (PartitionChunk<DataSegment> chunk : holder.getObject()) {
          windowedSegments.add(new WindowedDataSegment(chunk.getObject(), holder.getInterval()));
        }
      }
      datasourcePathSpec.put(segments, windowedSegments);
    }
  }
  return spec;
}
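For illustration only (not part of the Druid source): a minimal sketch of how a caller might shape the "multi" pathSpec that the method above walks, and how it might invoke the method. The class and method names, the datasource name, the interval, and the import paths are assumptions.

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import io.druid.indexer.HadoopIngestionSpec;
import io.druid.indexer.path.UsedSegmentLister; // package assumed

import java.io.IOException;
import java.util.Map;

public class UpdateSegmentListSketch
{
  // Shape of the pathSpec the method above recognizes: a "multi" spec whose children include a
  // "dataSource" child carrying an "ingestionSpec" (datasource name and interval are made up).
  public static Map<String, Object> examplePathSpec()
  {
    Map<String, Object> ingestionSpec = ImmutableMap.<String, Object>of(
        "dataSource", "wikipedia",
        "intervals", ImmutableList.of("2014-01-01/2014-01-02")
    );
    Map<String, Object> datasourceChild = ImmutableMap.<String, Object>of(
        "type", "dataSource",
        "ingestionSpec", ingestionSpec
    );
    Map<String, Object> staticChild = ImmutableMap.<String, Object>of(
        "type", "static",
        "paths", "/some/hdfs/path"
    );
    return ImmutableMap.<String, Object>of(
        "type", "multi",
        "children", ImmutableList.of(staticChild, datasourceChild)
    );
  }

  // Resolves the segment list for a spec whose ioConfig carries a pathSpec like the one above;
  // afterwards the "dataSource" child's "segments" entry holds the WindowedDataSegments to read.
  public static HadoopIngestionSpec resolveSegments(
      HadoopIngestionSpec spec,
      ObjectMapper jsonMapper,
      UsedSegmentLister segmentLister // e.g. an implementation backed by the metadata store
  ) throws IOException
  {
    return HadoopIngestionSpec.updateSegmentListIfDatasourcePathSpecIsUsed(spec, jsonMapper, segmentLister);
  }
}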
Use of io.druid.indexer.hadoop.DatasourceIngestionSpec in project druid by druid-io.
In the class HadoopIngestionSpecUpdateDatasourcePathSpecSegmentsTest, the method testupdateSegmentListIfDatasourcePathSpecIsUsedWithMultiplePathSpec:
@Test
public void testupdateSegmentListIfDatasourcePathSpecIsUsedWithMultiplePathSpec() throws Exception
{
  PathSpec pathSpec = new MultiplePathSpec(ImmutableList.of(
      new StaticPathSpec("/xyz", null),
      new DatasourcePathSpec(jsonMapper, null, new DatasourceIngestionSpec(testDatasource, testDatasourceInterval, null, null, null, null, null, null, false), null)
  ));
  HadoopDruidIndexerConfig config = testRunUpdateSegmentListIfDatasourcePathSpecIsUsed(pathSpec, testDatasourceInterval);
  Assert.assertEquals(
      ImmutableList.of(WindowedDataSegment.of(SEGMENT)),
      ((DatasourcePathSpec) ((MultiplePathSpec) config.getPathSpec()).getChildren().get(1)).getSegments()
  );
}
Use of io.druid.indexer.hadoop.DatasourceIngestionSpec in project druid by druid-io.
In the class HadoopIngestionSpecUpdateDatasourcePathSpecSegmentsTest, the method testupdateSegmentListIfDatasourcePathSpecIsUsedWithJustDatasourcePathSpecAndPartialInterval:
@Test
public void testupdateSegmentListIfDatasourcePathSpecIsUsedWithJustDatasourcePathSpecAndPartialInterval() throws Exception
{
  PathSpec pathSpec = new DatasourcePathSpec(jsonMapper, null, new DatasourceIngestionSpec(testDatasource, testDatasourceIntervalPartial, null, null, null, null, null, null, false), null);
  HadoopDruidIndexerConfig config = testRunUpdateSegmentListIfDatasourcePathSpecIsUsed(pathSpec, testDatasourceIntervalPartial);
  Assert.assertEquals(
      ImmutableList.of(new WindowedDataSegment(SEGMENT, testDatasourceIntervalPartial)),
      ((DatasourcePathSpec) config.getPathSpec()).getSegments()
  );
}
Use of io.druid.indexer.hadoop.DatasourceIngestionSpec in project druid by druid-io.
In the class HadoopIngestionSpecUpdateDatasourcePathSpecSegmentsTest, the method testupdateSegmentListIfDatasourcePathSpecWithMatchingUserSegments:
@Test
public void testupdateSegmentListIfDatasourcePathSpecWithMatchingUserSegments() throws Exception
{
  PathSpec pathSpec = new DatasourcePathSpec(jsonMapper, null, new DatasourceIngestionSpec(testDatasource, testDatasourceInterval, null, ImmutableList.<DataSegment>of(SEGMENT), null, null, null, null, false), null);
  HadoopDruidIndexerConfig config = testRunUpdateSegmentListIfDatasourcePathSpecIsUsed(pathSpec, testDatasourceInterval);
  Assert.assertEquals(
      ImmutableList.of(WindowedDataSegment.of(SEGMENT)),
      ((DatasourcePathSpec) config.getPathSpec()).getSegments()
  );
}
Use of io.druid.indexer.hadoop.DatasourceIngestionSpec in project druid by druid-io.
In the class DatasourcePathSpec, the method addInputPaths:
@Override
public Job addInputPaths(HadoopDruidIndexerConfig config, Job job) throws IOException
{
  if (segments == null || segments.isEmpty()) {
    if (ingestionSpec.isIgnoreWhenNoSegments()) {
      logger.warn("No segments found for ingestionSpec [%s]", ingestionSpec);
      return job;
    } else {
      throw new ISE("No segments found for ingestion spec [%s]", ingestionSpec);
    }
  }
  logger.info("Found total [%d] segments for [%s] in interval [%s]", segments.size(), ingestionSpec.getDataSource(), ingestionSpec.getIntervals());

  DatasourceIngestionSpec updatedIngestionSpec = ingestionSpec;

  // If no dimensions were specified, fall back to the parser's custom dimensions, or derive them
  // from the segments themselves minus any excluded dimensions.
  if (updatedIngestionSpec.getDimensions() == null) {
    List<String> dims;
    if (config.getParser().getParseSpec().getDimensionsSpec().hasCustomDimensions()) {
      dims = config.getParser().getParseSpec().getDimensionsSpec().getDimensionNames();
    } else {
      Set<String> dimSet = Sets.newHashSet(
          Iterables.concat(
              Iterables.transform(
                  segments,
                  new Function<WindowedDataSegment, Iterable<String>>()
                  {
                    @Override
                    public Iterable<String> apply(WindowedDataSegment dataSegment)
                    {
                      return dataSegment.getSegment().getDimensions();
                    }
                  }
              )
          )
      );
      dims = Lists.newArrayList(
          Sets.difference(dimSet, config.getParser().getParseSpec().getDimensionsSpec().getDimensionExclusions())
      );
    }
    updatedIngestionSpec = updatedIngestionSpec.withDimensions(dims);
  }

  // If no metrics were specified, use the names of the aggregators from the data schema.
  if (updatedIngestionSpec.getMetrics() == null) {
    Set<String> metrics = Sets.newHashSet();
    final AggregatorFactory[] cols = config.getSchema().getDataSchema().getAggregators();
    if (cols != null) {
      for (AggregatorFactory col : cols) {
        metrics.add(col.getName());
      }
    }
    updatedIngestionSpec = updatedIngestionSpec.withMetrics(Lists.newArrayList(metrics));
  }

  updatedIngestionSpec = updatedIngestionSpec.withQueryGranularity(config.getGranularitySpec().getQueryGranularity());

  // Hand the resolved spec, the segment list, and the max split size to DatasourceInputFormat via
  // the job configuration; the input path is a dummy because the input format ignores it.
  job.getConfiguration().set(DatasourceInputFormat.CONF_DRUID_SCHEMA, mapper.writeValueAsString(updatedIngestionSpec));
  job.getConfiguration().set(DatasourceInputFormat.CONF_INPUT_SEGMENTS, mapper.writeValueAsString(segments));
  job.getConfiguration().set(DatasourceInputFormat.CONF_MAX_SPLIT_SIZE, String.valueOf(maxSplitSize));
  MultipleInputs.addInputPath(job, new Path("/dummy/tobe/ignored"), DatasourceInputFormat.class);
  return job;
}
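For illustration only: a minimal sketch, assuming code on the Hadoop side wants to recover what addInputPaths stored, showing how the three configuration keys set above can be read back with the same ObjectMapper. The class and method names are hypothetical; the configuration keys and value types come from the code above.

import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import io.druid.indexer.hadoop.DatasourceIngestionSpec;
import io.druid.indexer.hadoop.DatasourceInputFormat;
import io.druid.indexer.hadoop.WindowedDataSegment;
import org.apache.hadoop.conf.Configuration;

import java.io.IOException;
import java.util.List;

public class DatasourceConfReadBackSketch
{
  // Deserializes the (possibly dimension/metric-defaulted) DatasourceIngestionSpec written above.
  public static DatasourceIngestionSpec readSchema(Configuration conf, ObjectMapper mapper) throws IOException
  {
    return mapper.readValue(conf.get(DatasourceInputFormat.CONF_DRUID_SCHEMA), DatasourceIngestionSpec.class);
  }

  // Deserializes the windowed segment list that the input splits are built from.
  public static List<WindowedDataSegment> readSegments(Configuration conf, ObjectMapper mapper) throws IOException
  {
    return mapper.readValue(
        conf.get(DatasourceInputFormat.CONF_INPUT_SEGMENTS),
        new TypeReference<List<WindowedDataSegment>>() {}
    );
  }

  // The max split size is stored as a plain string (assumes the key is present).
  public static long readMaxSplitSize(Configuration conf)
  {
    return Long.parseLong(conf.get(DatasourceInputFormat.CONF_MAX_SPLIT_SIZE));
  }
}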