Use of io.druid.indexer.hadoop.WindowedDataSegment in project druid by druid-io.
In class HadoopIngestionSpec, method updateSegmentListIfDatasourcePathSpecIsUsed:
public static HadoopIngestionSpec updateSegmentListIfDatasourcePathSpecIsUsed(
    HadoopIngestionSpec spec,
    ObjectMapper jsonMapper,
    UsedSegmentLister segmentLister
) throws IOException {
  String dataSource = "dataSource";
  String type = "type";
  String multi = "multi";
  String children = "children";
  String segments = "segments";
  String ingestionSpec = "ingestionSpec";
  Map<String, Object> pathSpec = spec.getIOConfig().getPathSpec();
  Map<String, Object> datasourcePathSpec = null;
  if (pathSpec.get(type).equals(dataSource)) {
    datasourcePathSpec = pathSpec;
  } else if (pathSpec.get(type).equals(multi)) {
    List<Map<String, Object>> childPathSpecs = (List<Map<String, Object>>) pathSpec.get(children);
    for (Map<String, Object> childPathSpec : childPathSpecs) {
      if (childPathSpec.get(type).equals(dataSource)) {
        datasourcePathSpec = childPathSpec;
        break;
      }
    }
  }
  if (datasourcePathSpec != null) {
    Map<String, Object> ingestionSpecMap = (Map<String, Object>) datasourcePathSpec.get(ingestionSpec);
    DatasourceIngestionSpec ingestionSpecObj = jsonMapper.convertValue(ingestionSpecMap, DatasourceIngestionSpec.class);
    List<DataSegment> segmentsList = segmentLister.getUsedSegmentsForIntervals(
        ingestionSpecObj.getDataSource(),
        ingestionSpecObj.getIntervals()
    );
    if (ingestionSpecObj.getSegments() != null) {
      //ensure that the user-supplied segment list matches the segment list obtained from the db.
      //This safety check lets users do test-and-set style batch delta ingestion: the delta
      //ingestion task only runs if the current state of the system is the same as when they
      //submitted the task.
      List<DataSegment> userSuppliedSegmentsList = ingestionSpecObj.getSegments();
      if (segmentsList.size() == userSuppliedSegmentsList.size()) {
        Set<DataSegment> segmentsSet = new HashSet<>(segmentsList);
        for (DataSegment userSegment : userSuppliedSegmentsList) {
          if (!segmentsSet.contains(userSegment)) {
            throw new IOException("user supplied segments list did not match with segments list obtained from db");
          }
        }
      } else {
        throw new IOException("user supplied segments list did not match with segments list obtained from db");
      }
    }
    VersionedIntervalTimeline<String, DataSegment> timeline = new VersionedIntervalTimeline<>(Ordering.natural());
    for (DataSegment segment : segmentsList) {
      timeline.add(segment.getInterval(), segment.getVersion(), segment.getShardSpec().createChunk(segment));
    }
    final List<WindowedDataSegment> windowedSegments = Lists.newArrayList();
    for (Interval interval : ingestionSpecObj.getIntervals()) {
      final List<TimelineObjectHolder<String, DataSegment>> timeLineSegments = timeline.lookup(interval);
      for (TimelineObjectHolder<String, DataSegment> holder : timeLineSegments) {
        for (PartitionChunk<DataSegment> chunk : holder.getObject()) {
          windowedSegments.add(new WindowedDataSegment(chunk.getObject(), holder.getInterval()));
        }
      }
      datasourcePathSpec.put(segments, windowedSegments);
    }
  }
  return spec;
}
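For orientation, here is a minimal sketch (with hypothetical values) of the two pathSpec map shapes the lookup above recognizes: a top-level "dataSource" pathSpec, and a "multi" pathSpec whose children include a "dataSource" entry. The "xyz" datasource name, the interval string, and the static path are placeholders, not values from the method above.

// Hypothetical pathSpec maps; keys mirror the string constants used in the method above.
Map<String, Object> datasourceOnly = ImmutableMap.<String, Object>of(
    "type", "dataSource",
    "ingestionSpec", ImmutableMap.of("dataSource", "xyz", "interval", "2014-10-22/2014-10-23")
);

Map<String, Object> multiWithDatasourceChild = ImmutableMap.<String, Object>of(
    "type", "multi",
    "children", ImmutableList.of(
        datasourceOnly,
        ImmutableMap.<String, Object>of("type", "static", "paths", "/path/to/delta/files")
    )
);
// In either shape, updateSegmentListIfDatasourcePathSpecIsUsed locates the "dataSource" map
// and sets its "segments" entry to the WindowedDataSegment list it builds from the timeline.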
Use of io.druid.indexer.hadoop.WindowedDataSegment in project druid by druid-io.
In class HadoopIngestionSpecUpdateDatasourcePathSpecSegmentsTest, method testupdateSegmentListIfDatasourcePathSpecIsUsedWithJustDatasourcePathSpecAndPartialInterval:
@Test
public void testupdateSegmentListIfDatasourcePathSpecIsUsedWithJustDatasourcePathSpecAndPartialInterval() throws Exception {
  PathSpec pathSpec = new DatasourcePathSpec(
      jsonMapper,
      null,
      new DatasourceIngestionSpec(testDatasource, testDatasourceIntervalPartial, null, null, null, null, null, null, false),
      null
  );
  HadoopDruidIndexerConfig config = testRunUpdateSegmentListIfDatasourcePathSpecIsUsed(pathSpec, testDatasourceIntervalPartial);
  Assert.assertEquals(
      ImmutableList.of(new WindowedDataSegment(SEGMENT, testDatasourceIntervalPartial)),
      ((DatasourcePathSpec) config.getPathSpec()).getSegments()
  );
}
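The test above covers the case where only part of a segment's interval is requested; below is a small hedged sketch of the resulting wrapper. SEGMENT and the interval string are placeholders standing in for a real DataSegment and a sub-interval of it.

// Hypothetical values; WindowedDataSegment pairs an existing DataSegment with the
// interval (window) of it that a path spec should cover.
Interval partial = Interval.parse("2014-10-22T02:00:00Z/2014-10-22T04:00:00Z");
WindowedDataSegment windowed = new WindowedDataSegment(SEGMENT, partial);
DataSegment underlying = windowed.getSegment(); // the full segment; the window records which part this spec covers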
Use of io.druid.indexer.hadoop.WindowedDataSegment in project druid by druid-io.
In class DatasourcePathSpec, method addInputPaths:
@Override
public Job addInputPaths(HadoopDruidIndexerConfig config, Job job) throws IOException {
  if (segments == null || segments.isEmpty()) {
    if (ingestionSpec.isIgnoreWhenNoSegments()) {
      logger.warn("No segments found for ingestionSpec [%s]", ingestionSpec);
      return job;
    } else {
      throw new ISE("No segments found for ingestion spec [%s]", ingestionSpec);
    }
  }
  logger.info("Found total [%d] segments for [%s] in interval [%s]", segments.size(), ingestionSpec.getDataSource(), ingestionSpec.getIntervals());
  DatasourceIngestionSpec updatedIngestionSpec = ingestionSpec;
  if (updatedIngestionSpec.getDimensions() == null) {
    List<String> dims;
    if (config.getParser().getParseSpec().getDimensionsSpec().hasCustomDimensions()) {
      dims = config.getParser().getParseSpec().getDimensionsSpec().getDimensionNames();
    } else {
      Set<String> dimSet = Sets.newHashSet(Iterables.concat(Iterables.transform(segments, new Function<WindowedDataSegment, Iterable<String>>() {

        @Override
        public Iterable<String> apply(WindowedDataSegment dataSegment) {
          return dataSegment.getSegment().getDimensions();
        }
      })));
      dims = Lists.newArrayList(Sets.difference(dimSet, config.getParser().getParseSpec().getDimensionsSpec().getDimensionExclusions()));
    }
    updatedIngestionSpec = updatedIngestionSpec.withDimensions(dims);
  }
  if (updatedIngestionSpec.getMetrics() == null) {
    Set<String> metrics = Sets.newHashSet();
    final AggregatorFactory[] cols = config.getSchema().getDataSchema().getAggregators();
    if (cols != null) {
      for (AggregatorFactory col : cols) {
        metrics.add(col.getName());
      }
    }
    updatedIngestionSpec = updatedIngestionSpec.withMetrics(Lists.newArrayList(metrics));
  }
  updatedIngestionSpec = updatedIngestionSpec.withQueryGranularity(config.getGranularitySpec().getQueryGranularity());
  job.getConfiguration().set(DatasourceInputFormat.CONF_DRUID_SCHEMA, mapper.writeValueAsString(updatedIngestionSpec));
  job.getConfiguration().set(DatasourceInputFormat.CONF_INPUT_SEGMENTS, mapper.writeValueAsString(segments));
  job.getConfiguration().set(DatasourceInputFormat.CONF_MAX_SPLIT_SIZE, String.valueOf(maxSplitSize));
  MultipleInputs.addInputPath(job, new Path("/dummy/tobe/ignored"), DatasourceInputFormat.class);
  return job;
}
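A hedged usage sketch: assuming an already-built HadoopDruidIndexerConfig and DatasourcePathSpec (both placeholders here, not defined in the snippet above), the method is called with a fresh Hadoop Job and leaves the serialized schema, segment list, and split size in the job configuration under the DatasourceInputFormat keys it sets above.

// config and datasourcePathSpec are assumed to exist; they are not defined in this sketch.
Job job = Job.getInstance(new Configuration(), "druid-delta-ingestion");
job = datasourcePathSpec.addInputPaths(config, job);

// The values written by addInputPaths can be read back from the job configuration.
String schemaJson = job.getConfiguration().get(DatasourceInputFormat.CONF_DRUID_SCHEMA);
String segmentsJson = job.getConfiguration().get(DatasourceInputFormat.CONF_INPUT_SEGMENTS);
String maxSplitSize = job.getConfiguration().get(DatasourceInputFormat.CONF_MAX_SPLIT_SIZE);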
Use of io.druid.indexer.hadoop.WindowedDataSegment in project druid by druid-io.
In class BatchDeltaIngestionTest, method testReindexing:
@Test
public void testReindexing() throws Exception {
  List<WindowedDataSegment> segments = ImmutableList.of(new WindowedDataSegment(SEGMENT, INTERVAL_FULL));
  HadoopDruidIndexerConfig config = makeHadoopDruidIndexerConfig(
      ImmutableMap.<String, Object>of(
          "type", "dataSource",
          "ingestionSpec", ImmutableMap.of("dataSource", "xyz", "interval", INTERVAL_FULL),
          "segments", segments
      ),
      temporaryFolder.newFolder()
  );
  List<ImmutableMap<String, Object>> expectedRows = ImmutableList.of(
      ImmutableMap.<String, Object>of("time", DateTime.parse("2014-10-22T00:00:00.000Z"), "host", ImmutableList.of("a.example.com"), "visited_sum", 100L, "unique_hosts", 1.0d),
      ImmutableMap.<String, Object>of("time", DateTime.parse("2014-10-22T01:00:00.000Z"), "host", ImmutableList.of("b.example.com"), "visited_sum", 150L, "unique_hosts", 1.0d),
      ImmutableMap.<String, Object>of("time", DateTime.parse("2014-10-22T02:00:00.000Z"), "host", ImmutableList.of("c.example.com"), "visited_sum", 200L, "unique_hosts", 1.0d)
  );
  testIngestion(config, expectedRows, Iterables.getOnlyElement(segments));
}
Use of io.druid.indexer.hadoop.WindowedDataSegment in project druid by druid-io.
In class BatchDeltaIngestionTest, method testDeltaIngestion:
@Test
public void testDeltaIngestion() throws Exception {
  File tmpDir = temporaryFolder.newFolder();
  File dataFile1 = new File(tmpDir, "data1");
  FileUtils.writeLines(dataFile1, ImmutableList.of("2014102200,a.example.com,a.example.com,90", "2014102201,b.example.com,b.example.com,25"));
  File dataFile2 = new File(tmpDir, "data2");
  FileUtils.writeLines(dataFile2, ImmutableList.of("2014102202,c.example.com,c.example.com,70"));
  //using a hadoop glob path to test that it continues to work with hadoop MultipleInputs usage and is not
  //affected by https://issues.apache.org/jira/browse/MAPREDUCE-5061
  String inputPath = tmpDir.getPath() + "/{data1,data2}";
  List<WindowedDataSegment> segments = ImmutableList.of(new WindowedDataSegment(SEGMENT, INTERVAL_FULL));
  HadoopDruidIndexerConfig config = makeHadoopDruidIndexerConfig(
      ImmutableMap.<String, Object>of(
          "type", "multi",
          "children", ImmutableList.of(
              ImmutableMap.<String, Object>of(
                  "type", "dataSource",
                  "ingestionSpec", ImmutableMap.of("dataSource", "xyz", "interval", INTERVAL_FULL),
                  "segments", segments
              ),
              ImmutableMap.<String, Object>of("type", "static", "paths", inputPath)
          )
      ),
      temporaryFolder.newFolder()
  );
  //each expected visited_sum is the value carried by the existing segment (100, 150, 200 per hour,
  //as in testReindexing) plus the corresponding delta row written above (90, 25, 70)
  List<ImmutableMap<String, Object>> expectedRows = ImmutableList.of(
      ImmutableMap.<String, Object>of("time", DateTime.parse("2014-10-22T00:00:00.000Z"), "host", ImmutableList.of("a.example.com"), "visited_sum", 190L, "unique_hosts", 1.0d),
      ImmutableMap.<String, Object>of("time", DateTime.parse("2014-10-22T01:00:00.000Z"), "host", ImmutableList.of("b.example.com"), "visited_sum", 175L, "unique_hosts", 1.0d),
      ImmutableMap.<String, Object>of("time", DateTime.parse("2014-10-22T02:00:00.000Z"), "host", ImmutableList.of("c.example.com"), "visited_sum", 270L, "unique_hosts", 1.0d)
  );
  testIngestion(config, expectedRows, Iterables.getOnlyElement(segments));
}
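The "multi" map built in this test has the same shape that updateSegmentListIfDatasourcePathSpecIsUsed (shown at the top of this section) inspects. A hedged sketch of wiring the two together, where spec, jsonMapper, and segmentLister are placeholders for objects not defined in this test:

// spec is assumed to be a HadoopIngestionSpec whose ioConfig pathSpec is a "multi" map like the
// one above; jsonMapper and segmentLister are assumed to exist and are not defined here.
HadoopIngestionSpec updated =
    HadoopIngestionSpec.updateSegmentListIfDatasourcePathSpecIsUsed(spec, jsonMapper, segmentLister);
// After the call, the "dataSource" child of the pathSpec carries the WindowedDataSegment list
// that DatasourcePathSpec.addInputPaths later serializes into the Hadoop job configuration.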