Use of org.apache.druid.indexer.hadoop.DatasourceIngestionSpec in project druid by druid-io.
The class MaterializedViewSupervisorSpec, method createTask:
public HadoopIndexTask createTask(Interval interval, String version, List<DataSegment> segments)
{
  String taskId = StringUtils.format("%s_%s_%s", TASK_PREFIX, dataSourceName, DateTimes.nowUtc());

  // generate parser
  Map<String, Object> parseSpec = new HashMap<>();
  parseSpec.put("format", "timeAndDims");
  parseSpec.put("dimensionsSpec", dimensionsSpec);
  Map<String, Object> parser = new HashMap<>();
  parser.put("type", "map");
  parser.put("parseSpec", parseSpec);

  // generate HadoopTuningConfig
  HadoopTuningConfig tuningConfigForTask = new HadoopTuningConfig(
      tuningConfig.getWorkingPath(),
      version,
      tuningConfig.getPartitionsSpec(),
      tuningConfig.getShardSpecs(),
      tuningConfig.getIndexSpec(),
      tuningConfig.getIndexSpecForIntermediatePersists(),
      tuningConfig.getAppendableIndexSpec(),
      tuningConfig.getMaxRowsInMemory(),
      tuningConfig.getMaxBytesInMemory(),
      tuningConfig.isLeaveIntermediate(),
      tuningConfig.isCleanupOnFailure(),
      tuningConfig.isOverwriteFiles(),
      tuningConfig.isIgnoreInvalidRows(),
      tuningConfig.getJobProperties(),
      tuningConfig.isCombineText(),
      tuningConfig.getUseCombiner(),
      tuningConfig.getMaxRowsInMemory(),
      tuningConfig.getNumBackgroundPersistThreads(),
      tuningConfig.isForceExtendableShardSpecs(),
      true,
      tuningConfig.getUserAllowedHadoopPrefix(),
      tuningConfig.isLogParseExceptions(),
      tuningConfig.getMaxParseExceptions(),
      tuningConfig.isUseYarnRMJobStatusFallback(),
      tuningConfig.getAwaitSegmentAvailabilityTimeoutMillis()
  );

  // generate granularity
  ArbitraryGranularitySpec granularitySpec = new ArbitraryGranularitySpec(Granularities.NONE, ImmutableList.of(interval));
  // generate DataSchema
  DataSchema dataSchema = new DataSchema(dataSourceName, parser, aggregators, granularitySpec, TransformSpec.NONE, objectMapper);
  // generate DatasourceIngestionSpec
  DatasourceIngestionSpec datasourceIngestionSpec = new DatasourceIngestionSpec(baseDataSource, null, ImmutableList.of(interval), segments, null, null, null, false, null);

  // generate HadoopIOConfig
  Map<String, Object> inputSpec = new HashMap<>();
  inputSpec.put("type", "dataSource");
  inputSpec.put("ingestionSpec", datasourceIngestionSpec);
  HadoopIOConfig hadoopIOConfig = new HadoopIOConfig(inputSpec, null, null);

  // generate HadoopIngestionSpec
  HadoopIngestionSpec spec = new HadoopIngestionSpec(dataSchema, hadoopIOConfig, tuningConfigForTask);

  // generate HadoopIndexTask
  HadoopIndexTask task = new HadoopIndexTask(taskId, spec, hadoopCoordinates, hadoopDependencyCoordinates, classpathPrefix, objectMapper, context, authorizerMapper, chatHandlerProvider);
  return task;
}
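For reference, a minimal sketch of just the DatasourceIngestionSpec construction and the "dataSource" inputSpec map that createTask assembles around it. The data source name, interval string, the Intervals utility, and the wrapper class and method names are illustrative assumptions, not part of the original; the positional nulls and the boolean simply mirror the call above.

import com.google.common.collect.ImmutableList;
import java.util.HashMap;
import java.util.Map;
import org.apache.druid.indexer.hadoop.DatasourceIngestionSpec;
import org.apache.druid.java.util.common.Intervals;

public class DatasourceInputSpecSketch
{
  // Sketch only: mirrors the DatasourceIngestionSpec construction in createTask with placeholder values.
  public static Map<String, Object> buildDataSourceInputSpec()
  {
    DatasourceIngestionSpec reingestSpec = new DatasourceIngestionSpec(
        "wikipedia",                                               // dataSource to read existing segments from (placeholder)
        null,                                                      // optional field left null, exactly as in createTask
        ImmutableList.of(Intervals.of("2024-01-01/2024-01-02")),   // intervals to re-ingest (placeholder)
        null,                                                      // segments: createTask passes an explicit list; null is also accepted (checked later via getSegments() != null)
        null, null, null,                                          // remaining optional fields left null, as above
        false,                                                     // same flag passed as false in createTask
        null
    );

    Map<String, Object> inputSpec = new HashMap<>();
    inputSpec.put("type", "dataSource");
    inputSpec.put("ingestionSpec", reingestSpec);
    return inputSpec;
  }
}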
Use of org.apache.druid.indexer.hadoop.DatasourceIngestionSpec in project druid by druid-io.
The class HadoopIngestionSpec, method updateSegmentListIfDatasourcePathSpecIsUsed:
public static void updateSegmentListIfDatasourcePathSpecIsUsed(
    HadoopIngestionSpec spec,
    ObjectMapper jsonMapper,
    UsedSegmentsRetriever segmentsRetriever
) throws IOException
{
  String dataSource = "dataSource";
  String type = "type";
  String multi = "multi";
  String children = "children";
  String segments = "segments";
  String ingestionSpec = "ingestionSpec";

  // collect all "dataSource" pathSpecs, including those nested one level inside a "multi" pathSpec
  Map<String, Object> pathSpec = spec.getIOConfig().getPathSpec();
  List<Map<String, Object>> datasourcePathSpecs = new ArrayList<>();
  if (pathSpec.get(type).equals(dataSource)) {
    datasourcePathSpecs.add(pathSpec);
  } else if (pathSpec.get(type).equals(multi)) {
    List<Map<String, Object>> childPathSpecs = (List<Map<String, Object>>) pathSpec.get(children);
    for (Map<String, Object> childPathSpec : childPathSpecs) {
      if (childPathSpec.get(type).equals(dataSource)) {
        datasourcePathSpecs.add(childPathSpec);
      }
    }
  }

  for (Map<String, Object> datasourcePathSpec : datasourcePathSpecs) {
    Map<String, Object> ingestionSpecMap = (Map<String, Object>) datasourcePathSpec.get(ingestionSpec);
    DatasourceIngestionSpec ingestionSpecObj = jsonMapper.convertValue(ingestionSpecMap, DatasourceIngestionSpec.class);
    Collection<DataSegment> usedVisibleSegments = segmentsRetriever.retrieveUsedSegmentsForIntervals(
        ingestionSpecObj.getDataSource(),
        ingestionSpecObj.getIntervals(),
        Segments.ONLY_VISIBLE
    );

    if (ingestionSpecObj.getSegments() != null) {
      // Ensure that the user-supplied segment list matches the usedVisibleSegments obtained from the db.
      // This safety check lets users do test-and-set style batch delta ingestion, where the delta
      // ingestion task only runs if the current state of the system is the same as when they submitted
      // the task.
      List<DataSegment> userSuppliedSegmentsList = ingestionSpecObj.getSegments();
      if (usedVisibleSegments.size() == userSuppliedSegmentsList.size()) {
        Set<DataSegment> segmentsSet = new HashSet<>(usedVisibleSegments);
        for (DataSegment userSegment : userSuppliedSegmentsList) {
          if (!segmentsSet.contains(userSegment)) {
            throw new IOException("user supplied segments list did not match with segments list obtained from db");
          }
        }
      } else {
        throw new IOException("user supplied segments list did not match with segments list obtained from db");
      }
    }

    final VersionedIntervalTimeline<String, DataSegment> timeline = VersionedIntervalTimeline.forSegments(usedVisibleSegments);
    final List<WindowedDataSegment> windowedSegments = new ArrayList<>();
    for (Interval interval : ingestionSpecObj.getIntervals()) {
      final List<TimelineObjectHolder<String, DataSegment>> timeLineSegments = timeline.lookup(interval);
      for (TimelineObjectHolder<String, DataSegment> holder : timeLineSegments) {
        for (PartitionChunk<DataSegment> chunk : holder.getObject()) {
          windowedSegments.add(new WindowedDataSegment(chunk.getObject(), holder.getInterval()));
        }
      }
      datasourcePathSpec.put(segments, windowedSegments);
    }
  }
}
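Note that the traversal above only descends one level into a "multi" pathSpec. A sketch of the kind of pathSpec map the method is written to handle; the keys match the string constants in the method, but the concrete values, the "static"/"paths" child, and the wrapper class name are illustrative placeholders.

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import java.util.HashMap;
import java.util.Map;

public class MultiPathSpecSketch
{
  // Sketch only: a "multi" pathSpec with one "dataSource" child and one unrelated child.
  public static Map<String, Object> buildMultiPathSpec()
  {
    Map<String, Object> datasourceChild = new HashMap<>();
    datasourceChild.put("type", "dataSource");
    datasourceChild.put("ingestionSpec", ImmutableMap.of(
        "dataSource", "wikipedia",
        "intervals", ImmutableList.of("2024-01-01/2024-01-02")
    ));

    Map<String, Object> otherChild = new HashMap<>();
    otherChild.put("type", "static");                // any non-"dataSource" child is simply skipped
    otherChild.put("paths", "/example/input/path");  // placeholder

    Map<String, Object> multiPathSpec = new HashMap<>();
    multiPathSpec.put("type", "multi");
    multiPathSpec.put("children", ImmutableList.of(datasourceChild, otherChild));
    // Only datasourceChild would be collected into datasourcePathSpecs; the method then sets its
    // "segments" entry to the WindowedDataSegment list built from the timeline lookup.
    return multiPathSpec;
  }
}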
Use of org.apache.druid.indexer.hadoop.DatasourceIngestionSpec in project druid by druid-io.
The class DatasourcePathSpec, method addInputPaths:
@Override
public Job addInputPaths(HadoopDruidIndexerConfig config, Job job) throws IOException
{
  if (segments == null || segments.isEmpty()) {
    if (ingestionSpec.isIgnoreWhenNoSegments()) {
      logger.warn("No segments found for ingestionSpec [%s]", ingestionSpec);
      return job;
    } else {
      throw new ISE("No segments found for ingestion spec [%s]", ingestionSpec);
    }
  }

  logger.info("Found total [%d] segments for [%s] in interval [%s]", segments.size(), ingestionSpec.getDataSource(), ingestionSpec.getIntervals());

  DatasourceIngestionSpec updatedIngestionSpec = ingestionSpec;

  // if no dimensions were specified, derive them from the parser's dimensionsSpec or from the segments being ingested
  if (updatedIngestionSpec.getDimensions() == null) {
    List<String> dims;
    if (config.getParser().getParseSpec().getDimensionsSpec().hasCustomDimensions()) {
      dims = config.getParser().getParseSpec().getDimensionsSpec().getDimensionNames();
    } else {
      Set<String> dimSet = Sets.newHashSet(Iterables.concat(Iterables.transform(
          segments,
          new Function<WindowedDataSegment, Iterable<String>>()
          {
            @Override
            public Iterable<String> apply(WindowedDataSegment dataSegment)
            {
              return dataSegment.getSegment().getDimensions();
            }
          }
      )));
      dims = Lists.newArrayList(Sets.difference(dimSet, config.getParser().getParseSpec().getDimensionsSpec().getDimensionExclusions()));
    }
    updatedIngestionSpec = updatedIngestionSpec.withDimensions(dims);
  }

  // if no metrics were specified, derive them from the job's aggregators
  if (updatedIngestionSpec.getMetrics() == null) {
    Set<String> metrics = new HashSet<>();
    final AggregatorFactory[] cols = config.getSchema().getDataSchema().getAggregators();
    if (cols != null) {
      if (useNewAggs) {
        for (AggregatorFactory col : cols) {
          metrics.addAll(col.requiredFields());
        }
      } else {
        for (AggregatorFactory col : cols) {
          metrics.add(col.getName());
        }
      }
    }
    updatedIngestionSpec = updatedIngestionSpec.withMetrics(Lists.newArrayList(metrics));
  }

  updatedIngestionSpec = updatedIngestionSpec.withQueryGranularity(config.getGranularitySpec().getQueryGranularity());

  // propagate the transformSpec from the overall job config
  updatedIngestionSpec = updatedIngestionSpec.withTransformSpec(config.getSchema().getDataSchema().getTransformSpec());

  DatasourceInputFormat.addDataSource(job.getConfiguration(), updatedIngestionSpec, segments, maxSplitSize);
  MultipleInputs.addInputPath(job, new Path("/dummy/tobe/ignored"), DatasourceInputFormat.class);
  return job;
}
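One detail worth spelling out is the useNewAggs branch when metrics are not set explicitly: with it off, the metric names put into the spec are the aggregator output names, while with it on, they are the aggregators' required input fields. A small sketch of that difference, assuming LongSumAggregatorFactory as the example aggregator and illustrative column names:

import java.util.HashSet;
import java.util.Set;
import org.apache.druid.query.aggregation.AggregatorFactory;
import org.apache.druid.query.aggregation.LongSumAggregatorFactory;

public class MetricsDerivationSketch
{
  // Sketch only: shows which column names the two branches in addInputPaths would collect
  // for a single longSum aggregator named "added_sum" over input column "added".
  public static void main(String[] args)
  {
    AggregatorFactory[] cols = new AggregatorFactory[]{new LongSumAggregatorFactory("added_sum", "added")};

    Set<String> withoutNewAggs = new HashSet<>();  // useNewAggs == false
    Set<String> withNewAggs = new HashSet<>();     // useNewAggs == true
    for (AggregatorFactory col : cols) {
      withoutNewAggs.add(col.getName());           // collects "added_sum" (the stored metric column)
      withNewAggs.addAll(col.requiredFields());    // collects "added" (the aggregator's input column)
    }

    System.out.println("useNewAggs=false -> " + withoutNewAggs);  // [added_sum]
    System.out.println("useNewAggs=true  -> " + withNewAggs);     // [added]
  }
}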