Use of org.apache.druid.indexer.hadoop.DatasourceIngestionSpec in project druid by druid-io.
The class MaterializedViewSupervisorSpec, method createTask:
public HadoopIndexTask createTask(Interval interval, String version, List<DataSegment> segments)
{
  String taskId = StringUtils.format("%s_%s_%s", TASK_PREFIX, dataSourceName, DateTimes.nowUtc());

  // generate parser
  Map<String, Object> parseSpec = new HashMap<>();
  parseSpec.put("format", "timeAndDims");
  parseSpec.put("dimensionsSpec", dimensionsSpec);
  Map<String, Object> parser = new HashMap<>();
  parser.put("type", "map");
  parser.put("parseSpec", parseSpec);

  // generate HadoopTuningConfig
  HadoopTuningConfig tuningConfigForTask = new HadoopTuningConfig(
      tuningConfig.getWorkingPath(),
      version,
      tuningConfig.getPartitionsSpec(),
      tuningConfig.getShardSpecs(),
      tuningConfig.getIndexSpec(),
      tuningConfig.getIndexSpecForIntermediatePersists(),
      tuningConfig.getAppendableIndexSpec(),
      tuningConfig.getMaxRowsInMemory(),
      tuningConfig.getMaxBytesInMemory(),
      tuningConfig.isLeaveIntermediate(),
      tuningConfig.isCleanupOnFailure(),
      tuningConfig.isOverwriteFiles(),
      tuningConfig.isIgnoreInvalidRows(),
      tuningConfig.getJobProperties(),
      tuningConfig.isCombineText(),
      tuningConfig.getUseCombiner(),
      tuningConfig.getMaxRowsInMemory(),
      tuningConfig.getNumBackgroundPersistThreads(),
      tuningConfig.isForceExtendableShardSpecs(),
      true,
      tuningConfig.getUserAllowedHadoopPrefix(),
      tuningConfig.isLogParseExceptions(),
      tuningConfig.getMaxParseExceptions(),
      tuningConfig.isUseYarnRMJobStatusFallback(),
      tuningConfig.getAwaitSegmentAvailabilityTimeoutMillis()
  );

  // generate granularity
  ArbitraryGranularitySpec granularitySpec = new ArbitraryGranularitySpec(Granularities.NONE, ImmutableList.of(interval));
  // generate DataSchema
  DataSchema dataSchema = new DataSchema(dataSourceName, parser, aggregators, granularitySpec, TransformSpec.NONE, objectMapper);
  // generate DatasourceIngestionSpec
  DatasourceIngestionSpec datasourceIngestionSpec = new DatasourceIngestionSpec(baseDataSource, null, ImmutableList.of(interval), segments, null, null, null, false, null);

  // generate HadoopIOConfig
  Map<String, Object> inputSpec = new HashMap<>();
  inputSpec.put("type", "dataSource");
  inputSpec.put("ingestionSpec", datasourceIngestionSpec);
  HadoopIOConfig hadoopIOConfig = new HadoopIOConfig(inputSpec, null, null);

  // generate HadoopIngestionSpec
  HadoopIngestionSpec spec = new HadoopIngestionSpec(dataSchema, hadoopIOConfig, tuningConfigForTask);

  // generate HadoopIndexTask
  HadoopIndexTask task = new HadoopIndexTask(taskId, spec, hadoopCoordinates, hadoopDependencyCoordinates, classpathPrefix, objectMapper, context, authorizerMapper, chatHandlerProvider);
  return task;
}
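For reference, a minimal sketch of just the DatasourceIngestionSpec construction and the "dataSource" inputSpec map that createTask assembles around it. The data source name, interval string, the Intervals utility, and the wrapper class and method names are illustrative assumptions, not part of the original; the positional nulls and the boolean simply mirror the call above.

import com.google.common.collect.ImmutableList;
import java.util.HashMap;
import java.util.Map;
import org.apache.druid.indexer.hadoop.DatasourceIngestionSpec;
import org.apache.druid.java.util.common.Intervals;

public class DatasourceInputSpecSketch
{
  // Sketch only: mirrors the DatasourceIngestionSpec construction in createTask with placeholder values.
  public static Map<String, Object> buildDataSourceInputSpec()
  {
    DatasourceIngestionSpec reingestSpec = new DatasourceIngestionSpec(
        "wikipedia",                                               // dataSource to read existing segments from (placeholder)
        null,                                                      // optional field left null, exactly as in createTask
        ImmutableList.of(Intervals.of("2024-01-01/2024-01-02")),   // intervals to re-ingest (placeholder)
        null,                                                      // segments: createTask passes an explicit list; null is also accepted (checked later via getSegments() != null)
        null, null, null,                                          // remaining optional fields left null, as above
        false,                                                     // same flag passed as false in createTask
        null
    );

    Map<String, Object> inputSpec = new HashMap<>();
    inputSpec.put("type", "dataSource");
    inputSpec.put("ingestionSpec", reingestSpec);
    return inputSpec;
  }
}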
Use of org.apache.druid.indexer.hadoop.DatasourceIngestionSpec in project druid by druid-io.
The class HadoopIngestionSpec, method updateSegmentListIfDatasourcePathSpecIsUsed:
public static void updateSegmentListIfDatasourcePathSpecIsUsed(
    HadoopIngestionSpec spec,
    ObjectMapper jsonMapper,
    UsedSegmentsRetriever segmentsRetriever
) throws IOException
{
  String dataSource = "dataSource";
  String type = "type";
  String multi = "multi";
  String children = "children";
  String segments = "segments";
  String ingestionSpec = "ingestionSpec";

  // collect all "dataSource" pathSpecs, including those nested one level inside a "multi" pathSpec
  Map<String, Object> pathSpec = spec.getIOConfig().getPathSpec();
  List<Map<String, Object>> datasourcePathSpecs = new ArrayList<>();
  if (pathSpec.get(type).equals(dataSource)) {
    datasourcePathSpecs.add(pathSpec);
  } else if (pathSpec.get(type).equals(multi)) {
    List<Map<String, Object>> childPathSpecs = (List<Map<String, Object>>) pathSpec.get(children);
    for (Map<String, Object> childPathSpec : childPathSpecs) {
      if (childPathSpec.get(type).equals(dataSource)) {
        datasourcePathSpecs.add(childPathSpec);
      }
    }
  }

  for (Map<String, Object> datasourcePathSpec : datasourcePathSpecs) {
    Map<String, Object> ingestionSpecMap = (Map<String, Object>) datasourcePathSpec.get(ingestionSpec);
    DatasourceIngestionSpec ingestionSpecObj = jsonMapper.convertValue(ingestionSpecMap, DatasourceIngestionSpec.class);
    Collection<DataSegment> usedVisibleSegments = segmentsRetriever.retrieveUsedSegmentsForIntervals(
        ingestionSpecObj.getDataSource(),
        ingestionSpecObj.getIntervals(),
        Segments.ONLY_VISIBLE
    );

    if (ingestionSpecObj.getSegments() != null) {
      // Ensure that the user-supplied segment list matches the usedVisibleSegments obtained from the db.
      // This safety check lets users do test-and-set style batch delta ingestion, where the delta
      // ingestion task only runs if the current state of the system is the same as when they submitted
      // the task.
      List<DataSegment> userSuppliedSegmentsList = ingestionSpecObj.getSegments();
      if (usedVisibleSegments.size() == userSuppliedSegmentsList.size()) {
        Set<DataSegment> segmentsSet = new HashSet<>(usedVisibleSegments);
        for (DataSegment userSegment : userSuppliedSegmentsList) {
          if (!segmentsSet.contains(userSegment)) {
            throw new IOException("user supplied segments list did not match with segments list obtained from db");
          }
        }
      } else {
        throw new IOException("user supplied segments list did not match with segments list obtained from db");
      }
    }

    final VersionedIntervalTimeline<String, DataSegment> timeline = VersionedIntervalTimeline.forSegments(usedVisibleSegments);
    final List<WindowedDataSegment> windowedSegments = new ArrayList<>();
    for (Interval interval : ingestionSpecObj.getIntervals()) {
      final List<TimelineObjectHolder<String, DataSegment>> timeLineSegments = timeline.lookup(interval);
      for (TimelineObjectHolder<String, DataSegment> holder : timeLineSegments) {
        for (PartitionChunk<DataSegment> chunk : holder.getObject()) {
          windowedSegments.add(new WindowedDataSegment(chunk.getObject(), holder.getInterval()));
        }
      }
      datasourcePathSpec.put(segments, windowedSegments);
    }
  }
}
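Note that the traversal above only descends one level into a "multi" pathSpec. A sketch of the kind of pathSpec map the method is written to handle; the keys match the string constants in the method, but the concrete values, the "static"/"paths" child, and the wrapper class name are illustrative placeholders.

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import java.util.HashMap;
import java.util.Map;

public class MultiPathSpecSketch
{
  // Sketch only: a "multi" pathSpec with one "dataSource" child and one unrelated child.
  public static Map<String, Object> buildMultiPathSpec()
  {
    Map<String, Object> datasourceChild = new HashMap<>();
    datasourceChild.put("type", "dataSource");
    datasourceChild.put("ingestionSpec", ImmutableMap.of(
        "dataSource", "wikipedia",
        "intervals", ImmutableList.of("2024-01-01/2024-01-02")
    ));

    Map<String, Object> otherChild = new HashMap<>();
    otherChild.put("type", "static");                // any non-"dataSource" child is simply skipped
    otherChild.put("paths", "/example/input/path");  // placeholder

    Map<String, Object> multiPathSpec = new HashMap<>();
    multiPathSpec.put("type", "multi");
    multiPathSpec.put("children", ImmutableList.of(datasourceChild, otherChild));
    // Only datasourceChild would be collected into datasourcePathSpecs; the method then sets its
    // "segments" entry to the WindowedDataSegment list built from the timeline lookup.
    return multiPathSpec;
  }
}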
Use of org.apache.druid.indexer.hadoop.DatasourceIngestionSpec in project druid by druid-io.
The class DatasourcePathSpec, method addInputPaths:
@Override
public Job addInputPaths(HadoopDruidIndexerConfig config, Job job) throws IOException
{
  if (segments == null || segments.isEmpty()) {
    if (ingestionSpec.isIgnoreWhenNoSegments()) {
      logger.warn("No segments found for ingestionSpec [%s]", ingestionSpec);
      return job;
    } else {
      throw new ISE("No segments found for ingestion spec [%s]", ingestionSpec);
    }
  }

  logger.info("Found total [%d] segments for [%s] in interval [%s]", segments.size(), ingestionSpec.getDataSource(), ingestionSpec.getIntervals());

  DatasourceIngestionSpec updatedIngestionSpec = ingestionSpec;

  // if no dimensions were specified, derive them from the parser's dimensionsSpec or from the segments being ingested
  if (updatedIngestionSpec.getDimensions() == null) {
    List<String> dims;
    if (config.getParser().getParseSpec().getDimensionsSpec().hasCustomDimensions()) {
      dims = config.getParser().getParseSpec().getDimensionsSpec().getDimensionNames();
    } else {
      Set<String> dimSet = Sets.newHashSet(Iterables.concat(Iterables.transform(
          segments,
          new Function<WindowedDataSegment, Iterable<String>>()
          {
            @Override
            public Iterable<String> apply(WindowedDataSegment dataSegment)
            {
              return dataSegment.getSegment().getDimensions();
            }
          }
      )));
      dims = Lists.newArrayList(Sets.difference(dimSet, config.getParser().getParseSpec().getDimensionsSpec().getDimensionExclusions()));
    }
    updatedIngestionSpec = updatedIngestionSpec.withDimensions(dims);
  }

  // if no metrics were specified, derive them from the job's aggregators
  if (updatedIngestionSpec.getMetrics() == null) {
    Set<String> metrics = new HashSet<>();
    final AggregatorFactory[] cols = config.getSchema().getDataSchema().getAggregators();
    if (cols != null) {
      if (useNewAggs) {
        for (AggregatorFactory col : cols) {
          metrics.addAll(col.requiredFields());
        }
      } else {
        for (AggregatorFactory col : cols) {
          metrics.add(col.getName());
        }
      }
    }
    updatedIngestionSpec = updatedIngestionSpec.withMetrics(Lists.newArrayList(metrics));
  }

  updatedIngestionSpec = updatedIngestionSpec.withQueryGranularity(config.getGranularitySpec().getQueryGranularity());

  // propagate the transformSpec from the overall job config
  updatedIngestionSpec = updatedIngestionSpec.withTransformSpec(config.getSchema().getDataSchema().getTransformSpec());

  DatasourceInputFormat.addDataSource(job.getConfiguration(), updatedIngestionSpec, segments, maxSplitSize);
  MultipleInputs.addInputPath(job, new Path("/dummy/tobe/ignored"), DatasourceInputFormat.class);
  return job;
}
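One detail worth spelling out is the useNewAggs branch when metrics are not set explicitly: with it off, the metric names put into the spec are the aggregator output names, while with it on, they are the aggregators' required input fields. A small sketch of that difference, assuming LongSumAggregatorFactory as the example aggregator and illustrative column names:

import java.util.HashSet;
import java.util.Set;
import org.apache.druid.query.aggregation.AggregatorFactory;
import org.apache.druid.query.aggregation.LongSumAggregatorFactory;

public class MetricsDerivationSketch
{
  // Sketch only: shows which column names the two branches in addInputPaths would collect
  // for a single longSum aggregator named "added_sum" over input column "added".
  public static void main(String[] args)
  {
    AggregatorFactory[] cols = new AggregatorFactory[]{new LongSumAggregatorFactory("added_sum", "added")};

    Set<String> withoutNewAggs = new HashSet<>();  // useNewAggs == false
    Set<String> withNewAggs = new HashSet<>();     // useNewAggs == true
    for (AggregatorFactory col : cols) {
      withoutNewAggs.add(col.getName());           // collects "added_sum" (the stored metric column)
      withNewAggs.addAll(col.requiredFields());    // collects "added" (the aggregator's input column)
    }

    System.out.println("useNewAggs=false -> " + withoutNewAggs);  // [added_sum]
    System.out.println("useNewAggs=true  -> " + withNewAggs);     // [added]
  }
}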