Example 6 with ShardSpec

Use of io.druid.timeline.partition.ShardSpec in project druid by druid-io.

In the class IndexTaskTest, the method runTask:

private final List<DataSegment> runTask(final IndexTask indexTask) throws Exception {
    final List<DataSegment> segments = Lists.newArrayList();
    indexTask.run(new TaskToolbox(null, null, new TaskActionClient() {

        @Override
        public <RetType> RetType submit(TaskAction<RetType> taskAction) throws IOException {
            if (taskAction instanceof LockListAction) {
                return (RetType) Arrays.asList(new TaskLock("", "", null, new DateTime().toString()));
            }
            if (taskAction instanceof LockAcquireAction) {
                return (RetType) new TaskLock("groupId", "test", ((LockAcquireAction) taskAction).getInterval(), new DateTime().toString());
            }
            if (taskAction instanceof SegmentTransactionalInsertAction) {
                return (RetType) new SegmentPublishResult(((SegmentTransactionalInsertAction) taskAction).getSegments(), true);
            }
            if (taskAction instanceof SegmentAllocateAction) {
                SegmentAllocateAction action = (SegmentAllocateAction) taskAction;
                Interval interval = action.getPreferredSegmentGranularity().bucket(action.getTimestamp());
                ShardSpec shardSpec = new NumberedShardSpec(segmentAllocatePartitionCounter++, 0);
                return (RetType) new SegmentIdentifier(action.getDataSource(), interval, "latestVersion", shardSpec);
            }
            return null;
        }
    }, null, new DataSegmentPusher() {

        @Deprecated
        @Override
        public String getPathForHadoop(String dataSource) {
            return getPathForHadoop();
        }

        @Override
        public String getPathForHadoop() {
            return null;
        }

        @Override
        public DataSegment push(File file, DataSegment segment) throws IOException {
            segments.add(segment);
            return segment;
        }
    }, null, null, null, null, null, null, null, null, null, jsonMapper, temporaryFolder.newFolder(), indexMerger, indexIO, null, null, indexMergerV9));
    Collections.sort(segments);
    return segments;
}
Also used : LockListAction(io.druid.indexing.common.actions.LockListAction) DataSegmentPusher(io.druid.segment.loading.DataSegmentPusher) SegmentIdentifier(io.druid.segment.realtime.appenderator.SegmentIdentifier) TaskAction(io.druid.indexing.common.actions.TaskAction) SegmentTransactionalInsertAction(io.druid.indexing.common.actions.SegmentTransactionalInsertAction) DataSegment(io.druid.timeline.DataSegment) DateTime(org.joda.time.DateTime) NoneShardSpec(io.druid.timeline.partition.NoneShardSpec) ShardSpec(io.druid.timeline.partition.ShardSpec) NumberedShardSpec(io.druid.timeline.partition.NumberedShardSpec) HashBasedNumberedShardSpec(io.druid.timeline.partition.HashBasedNumberedShardSpec) TaskToolbox(io.druid.indexing.common.TaskToolbox) SegmentPublishResult(io.druid.indexing.overlord.SegmentPublishResult) TaskActionClient(io.druid.indexing.common.actions.TaskActionClient) TaskLock(io.druid.indexing.common.TaskLock) SegmentAllocateAction(io.druid.indexing.common.actions.SegmentAllocateAction) LockAcquireAction(io.druid.indexing.common.actions.LockAcquireAction) File(java.io.File) NumberedShardSpec(io.druid.timeline.partition.NumberedShardSpec) HashBasedNumberedShardSpec(io.druid.timeline.partition.HashBasedNumberedShardSpec) Interval(org.joda.time.Interval)
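
The stub TaskActionClient above fakes segment allocation by handing out consecutive NumberedShardSpecs. A minimal sketch of that pattern is shown below; the class name CountingAllocator is made up for illustration, and the sketch assumes only the NumberedShardSpec(partitionNum, partitions) and SegmentIdentifier(dataSource, interval, version, shardSpec) constructors already visible in the snippet.

import io.druid.segment.realtime.appenderator.SegmentIdentifier;
import io.druid.timeline.partition.NumberedShardSpec;
import io.druid.timeline.partition.ShardSpec;
import org.joda.time.Interval;

class CountingAllocator {

    private int partitionCounter = 0;

    // Hands out the next partition number for the requested interval, mirroring how the
    // test stub above answers SegmentAllocateAction requests.
    SegmentIdentifier allocate(String dataSource, Interval interval, String version) {
        ShardSpec shardSpec = new NumberedShardSpec(partitionCounter++, 0);
        return new SegmentIdentifier(dataSource, interval, version, shardSpec);
    }
}

The second constructor argument is the partition count recorded in the spec; the test passes 0 because it allocates segments one at a time and never fixes the total in advance.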

Example 7 with ShardSpec

Use of io.druid.timeline.partition.ShardSpec in project druid by druid-io.

In the class DeterminePartitionsJob, the method run:

public boolean run() {
    try {
        if (!(config.getPartitionsSpec() instanceof SingleDimensionPartitionsSpec)) {
            throw new ISE("DeterminePartitionsJob can only be run for SingleDimensionPartitionsSpec, partitionSpec found [%s]", config.getPartitionsSpec());
        }
        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            final Job groupByJob = Job.getInstance(new Configuration(), String.format("%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals()));
            JobHelper.injectSystemProperties(groupByJob);
            config.addJobProperties(groupByJob);
            groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
            groupByJob.setMapOutputKeyClass(BytesWritable.class);
            groupByJob.setMapOutputValueClass(NullWritable.class);
            groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setOutputKeyClass(BytesWritable.class);
            groupByJob.setOutputValueClass(NullWritable.class);
            groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
            JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()), JobHelper.distributedClassPath(config.makeIntermediatePath()), groupByJob);
            config.addInputPaths(groupByJob);
            config.intoConfiguration(groupByJob);
            FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());
            groupByJob.submit();
            log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(), groupByJob.getTrackingURL());
            if (!groupByJob.waitForCompletion(true)) {
                log.error("Job failed: %s", groupByJob.getJobID());
                return false;
            }
        } else {
            log.info("Skipping group-by job.");
        }
        /*
       * Read grouped data and determine appropriate partitions.
       */
        final Job dimSelectionJob = Job.getInstance(new Configuration(), String.format("%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals()));
        dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");
        JobHelper.injectSystemProperties(dimSelectionJob);
        config.addJobProperties(dimSelectionJob);
        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            // Read grouped data from the groupByJob.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
            dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
            FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
        } else {
            // Directly read the source data, since we assume it's already grouped.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
            config.addInputPaths(dimSelectionJob);
        }
        SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
        dimSelectionJob.setMapOutputValueClass(Text.class);
        dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
        dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
        dimSelectionJob.setOutputKeyClass(BytesWritable.class);
        dimSelectionJob.setOutputValueClass(Text.class);
        dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
        dimSelectionJob.setPartitionerClass(DeterminePartitionsDimSelectionPartitioner.class);
        dimSelectionJob.setNumReduceTasks(config.getGranularitySpec().bucketIntervals().get().size());
        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()), JobHelper.distributedClassPath(config.makeIntermediatePath()), dimSelectionJob);
        config.intoConfiguration(dimSelectionJob);
        FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());
        dimSelectionJob.submit();
        log.info("Job %s submitted, status available at: %s", dimSelectionJob.getJobName(), dimSelectionJob.getTrackingURL());
        if (!dimSelectionJob.waitForCompletion(true)) {
            log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
            return false;
        }
        /*
       * Load partitions determined by the previous job.
       */
        log.info("Job completed, loading up partitions for intervals[%s].", config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        Map<Long, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap();
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals().get()) {
            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
            }
            if (Utils.exists(dimSelectionJob, fileSystem, partitionInfoPath)) {
                List<ShardSpec> specs = config.JSON_MAPPER.readValue(Utils.openInputStream(dimSelectionJob, partitionInfoPath), new TypeReference<List<ShardSpec>>() {
                });
                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
                for (int i = 0; i < specs.size(); ++i) {
                    actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
                    log.info("DateTime[%s], partition[%d], spec[%s]", segmentGranularity, i, actualSpecs.get(i));
                }
                shardSpecs.put(segmentGranularity.getStartMillis(), actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);
        return true;
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) SingleDimensionPartitionsSpec(io.druid.indexer.partitions.SingleDimensionPartitionsSpec) SingleDimensionShardSpec(io.druid.timeline.partition.SingleDimensionShardSpec) NoneShardSpec(io.druid.timeline.partition.NoneShardSpec) ShardSpec(io.druid.timeline.partition.ShardSpec) InvalidJobConfException(org.apache.hadoop.mapred.InvalidJobConfException) IOException(java.io.IOException) FileSystem(org.apache.hadoop.fs.FileSystem) ISE(io.druid.java.util.common.ISE) List(java.util.List) Job(org.apache.hadoop.mapreduce.Job) Interval(org.joda.time.Interval)
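
The loading step near the end reads each interval's partition file back as a JSON list of ShardSpecs. A stripped-down sketch of just that deserialization is below; PartitionInfoReader is a hypothetical helper name, and the sketch assumes an ObjectMapper that has Druid's shard spec subtypes registered, as config.JSON_MAPPER does in the snippet.

import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import io.druid.timeline.partition.ShardSpec;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;

class PartitionInfoReader {

    // The dim-selection reducer writes one JSON array of ShardSpec objects per segment interval;
    // a TypeReference is needed so Jackson keeps the element type instead of producing raw maps.
    static List<ShardSpec> readSpecs(ObjectMapper jsonMapper, InputStream in) throws IOException {
        return jsonMapper.readValue(in, new TypeReference<List<ShardSpec>>() {});
    }
}

Each deserialized spec is then wrapped in a HadoopyShardSpec with a globally increasing shard count, which is exactly what the loop over specs in the snippet does.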

Example 8 with ShardSpec

Use of io.druid.timeline.partition.ShardSpec in project druid by druid-io.

In the class SchemalessIndexTest, the method makeAppendedMMappedIndex:

private static QueryableIndex makeAppendedMMappedIndex(Iterable<Pair<String, AggregatorFactory[]>> files, final List<Interval> intervals) {
    try {
        File tmpFile = File.createTempFile("yay", "boo");
        tmpFile.delete();
        File mergedFile = new File(tmpFile, "merged");
        mergedFile.mkdirs();
        mergedFile.deleteOnExit();
        List<File> filesToMap = makeFilesToMap(tmpFile, files);
        VersionedIntervalTimeline<Integer, File> timeline = new VersionedIntervalTimeline<Integer, File>(Ordering.natural().nullsFirst());
        ShardSpec noneShardSpec = NoneShardSpec.instance();
        for (int i = 0; i < intervals.size(); i++) {
            timeline.add(intervals.get(i), i, noneShardSpec.createChunk(filesToMap.get(i)));
        }
        final List<IndexableAdapter> adapters = Lists.newArrayList(Iterables.concat(// TimelineObjectHolder is actually an iterable of iterable of indexable adapters
        Iterables.transform(timeline.lookup(new Interval("1000-01-01/3000-01-01")), new Function<TimelineObjectHolder<Integer, File>, Iterable<IndexableAdapter>>() {

            @Override
            public Iterable<IndexableAdapter> apply(final TimelineObjectHolder<Integer, File> timelineObjectHolder) {
                return Iterables.transform(timelineObjectHolder.getObject(), // Each chunk can be used to build the actual IndexableAdapter
                new Function<PartitionChunk<File>, IndexableAdapter>() {

                    @Override
                    public IndexableAdapter apply(PartitionChunk<File> chunk) {
                        try {
                            return new RowboatFilteringIndexAdapter(new QueryableIndexIndexableAdapter(INDEX_IO.loadIndex(chunk.getObject())), new Predicate<Rowboat>() {

                                @Override
                                public boolean apply(Rowboat input) {
                                    return timelineObjectHolder.getInterval().contains(input.getTimestamp());
                                }
                            });
                        } catch (IOException e) {
                            throw Throwables.propagate(e);
                        }
                    }
                });
            }
        })));
        return INDEX_IO.loadIndex(INDEX_MERGER.append(adapters, null, mergedFile, indexSpec));
    } catch (IOException e) {
        throw Throwables.propagate(e);
    }
}
Also used : IOException(java.io.IOException) ShardSpec(io.druid.timeline.partition.ShardSpec) NoneShardSpec(io.druid.timeline.partition.NoneShardSpec) Function(com.google.common.base.Function) TimelineObjectHolder(io.druid.timeline.TimelineObjectHolder) VersionedIntervalTimeline(io.druid.timeline.VersionedIntervalTimeline) PartitionChunk(io.druid.timeline.partition.PartitionChunk) File(java.io.File) Interval(org.joda.time.Interval)
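
The interesting part of this example is how the timeline is populated: each file becomes a single-chunk partition via NoneShardSpec.createChunk, keyed by its list index as the version. A minimal sketch of just that step is below; TimelineBuilder is an illustrative name, and the sketch assumes the same VersionedIntervalTimeline and NoneShardSpec APIs used in the snippet.

import com.google.common.collect.Ordering;
import io.druid.timeline.VersionedIntervalTimeline;
import io.druid.timeline.partition.NoneShardSpec;
import io.druid.timeline.partition.ShardSpec;
import org.joda.time.Interval;

import java.io.File;
import java.util.List;

class TimelineBuilder {

    static VersionedIntervalTimeline<Integer, File> build(List<Interval> intervals, List<File> files) {
        VersionedIntervalTimeline<Integer, File> timeline =
                new VersionedIntervalTimeline<Integer, File>(Ordering.natural().nullsFirst());
        ShardSpec noneShardSpec = NoneShardSpec.instance();
        // The list index doubles as the version, so a later interval shadows an earlier one
        // wherever they overlap when the timeline is looked up.
        for (int i = 0; i < intervals.size(); i++) {
            timeline.add(intervals.get(i), i, noneShardSpec.createChunk(files.get(i)));
        }
        return timeline;
    }
}

A subsequent timeline.lookup over a broad interval, as in the snippet, then yields only the visible chunks, which are mapped to IndexableAdapters and appended into one merged index.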

Example 9 with ShardSpec

Use of io.druid.timeline.partition.ShardSpec in project druid by druid-io.

In the class IndexTask, the method generateAndPublishSegments:

private boolean generateAndPublishSegments(final TaskToolbox toolbox, final DataSchema dataSchema, final Map<Interval, List<ShardSpec>> shardSpecs, final String version, final FirehoseFactory firehoseFactory) throws IOException, InterruptedException {
    final GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
    final FireDepartment fireDepartmentForMetrics = new FireDepartment(dataSchema, new RealtimeIOConfig(null, null, null), null);
    final FireDepartmentMetrics fireDepartmentMetrics = fireDepartmentForMetrics.getMetrics();
    final Map<String, ShardSpec> sequenceNameToShardSpecMap = Maps.newHashMap();
    if (toolbox.getMonitorScheduler() != null) {
        toolbox.getMonitorScheduler().addMonitor(new RealtimeMetricsMonitor(ImmutableList.of(fireDepartmentForMetrics), ImmutableMap.of(DruidMetrics.TASK_ID, new String[] { getId() })));
    }
    final SegmentAllocator segmentAllocator;
    if (ingestionSchema.getIOConfig().isAppendToExisting()) {
        segmentAllocator = new ActionBasedSegmentAllocator(toolbox.getTaskActionClient(), dataSchema);
    } else {
        segmentAllocator = new SegmentAllocator() {

            @Override
            public SegmentIdentifier allocate(DateTime timestamp, String sequenceName, String previousSegmentId) throws IOException {
                Optional<Interval> interval = granularitySpec.bucketInterval(timestamp);
                if (!interval.isPresent()) {
                    throw new ISE("Could not find interval for timestamp [%s]", timestamp);
                }
                ShardSpec shardSpec = sequenceNameToShardSpecMap.get(sequenceName);
                if (shardSpec == null) {
                    throw new ISE("Could not find ShardSpec for sequenceName [%s]", sequenceName);
                }
                return new SegmentIdentifier(getDataSource(), interval.get(), version, shardSpec);
            }
        };
    }
    try (final Appenderator appenderator = newAppenderator(fireDepartmentMetrics, toolbox, dataSchema);
        final FiniteAppenderatorDriver driver = newDriver(appenderator, toolbox, segmentAllocator, fireDepartmentMetrics);
        final Firehose firehose = firehoseFactory.connect(dataSchema.getParser())) {
        final Supplier<Committer> committerSupplier = Committers.supplierFromFirehose(firehose);
        final Map<Interval, ShardSpecLookup> shardSpecLookups = Maps.newHashMap();
        if (driver.startJob() != null) {
            driver.clear();
        }
        try {
            while (firehose.hasMore()) {
                try {
                    final InputRow inputRow = firehose.nextRow();
                    final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
                    if (!optInterval.isPresent()) {
                        fireDepartmentMetrics.incrementThrownAway();
                        continue;
                    }
                    final Interval interval = optInterval.get();
                    if (!shardSpecLookups.containsKey(interval)) {
                        final List<ShardSpec> intervalShardSpecs = shardSpecs.get(interval);
                        if (intervalShardSpecs == null || intervalShardSpecs.isEmpty()) {
                            throw new ISE("Failed to get shardSpec for interval[%s]", interval);
                        }
                        shardSpecLookups.put(interval, intervalShardSpecs.get(0).getLookup(intervalShardSpecs));
                    }
                    final ShardSpec shardSpec = shardSpecLookups.get(interval).getShardSpec(inputRow.getTimestampFromEpoch(), inputRow);
                    final String sequenceName = String.format("index_%s_%s_%d", interval, version, shardSpec.getPartitionNum());
                    if (!sequenceNameToShardSpecMap.containsKey(sequenceName)) {
                        final ShardSpec shardSpecForPublishing = ingestionSchema.getTuningConfig().isForceExtendableShardSpecs() || ingestionSchema.getIOConfig().isAppendToExisting() ? new NumberedShardSpec(shardSpec.getPartitionNum(), shardSpecs.get(interval).size()) : shardSpec;
                        sequenceNameToShardSpecMap.put(sequenceName, shardSpecForPublishing);
                    }
                    final SegmentIdentifier identifier = driver.add(inputRow, sequenceName, committerSupplier);
                    if (identifier == null) {
                        throw new ISE("Could not allocate segment for row with timestamp[%s]", inputRow.getTimestamp());
                    }
                    fireDepartmentMetrics.incrementProcessed();
                } catch (ParseException e) {
                    if (ingestionSchema.getTuningConfig().isReportParseExceptions()) {
                        throw e;
                    } else {
                        fireDepartmentMetrics.incrementUnparseable();
                    }
                }
            }
        } finally {
            driver.persist(committerSupplier.get());
        }
        final TransactionalSegmentPublisher publisher = new TransactionalSegmentPublisher() {

            @Override
            public boolean publishSegments(Set<DataSegment> segments, Object commitMetadata) throws IOException {
                final SegmentTransactionalInsertAction action = new SegmentTransactionalInsertAction(segments, null, null);
                return toolbox.getTaskActionClient().submit(action).isSuccess();
            }
        };
        final SegmentsAndMetadata published = driver.finish(publisher, committerSupplier.get());
        if (published == null) {
            log.error("Failed to publish segments, aborting!");
            return false;
        } else {
            log.info("Published segments[%s]", Joiner.on(", ").join(Iterables.transform(published.getSegments(), new Function<DataSegment, String>() {

                @Override
                public String apply(DataSegment input) {
                    return input.getIdentifier();
                }
            })));
            return true;
        }
    }
}
Also used : RealtimeIOConfig(io.druid.segment.indexing.RealtimeIOConfig) SortedSet(java.util.SortedSet) Set(java.util.Set) SegmentIdentifier(io.druid.segment.realtime.appenderator.SegmentIdentifier) ShardSpecLookup(io.druid.timeline.partition.ShardSpecLookup) SegmentTransactionalInsertAction(io.druid.indexing.common.actions.SegmentTransactionalInsertAction) DataSegment(io.druid.timeline.DataSegment) NoneShardSpec(io.druid.timeline.partition.NoneShardSpec) ShardSpec(io.druid.timeline.partition.ShardSpec) NumberedShardSpec(io.druid.timeline.partition.NumberedShardSpec) HashBasedNumberedShardSpec(io.druid.timeline.partition.HashBasedNumberedShardSpec) DateTime(org.joda.time.DateTime) FireDepartment(io.druid.segment.realtime.FireDepartment) TransactionalSegmentPublisher(io.druid.segment.realtime.appenderator.TransactionalSegmentPublisher) ActionBasedSegmentAllocator(io.druid.indexing.appenderator.ActionBasedSegmentAllocator) ISE(io.druid.java.util.common.ISE) NumberedShardSpec(io.druid.timeline.partition.NumberedShardSpec) HashBasedNumberedShardSpec(io.druid.timeline.partition.HashBasedNumberedShardSpec) Optional(com.google.common.base.Optional) Firehose(io.druid.data.input.Firehose) SegmentsAndMetadata(io.druid.segment.realtime.appenderator.SegmentsAndMetadata) IOException(java.io.IOException) FireDepartmentMetrics(io.druid.segment.realtime.FireDepartmentMetrics) Appenderator(io.druid.segment.realtime.appenderator.Appenderator) GranularitySpec(io.druid.segment.indexing.granularity.GranularitySpec) ActionBasedSegmentAllocator(io.druid.indexing.appenderator.ActionBasedSegmentAllocator) SegmentAllocator(io.druid.segment.realtime.appenderator.SegmentAllocator) FiniteAppenderatorDriver(io.druid.segment.realtime.appenderator.FiniteAppenderatorDriver) InputRow(io.druid.data.input.InputRow) RealtimeMetricsMonitor(io.druid.segment.realtime.RealtimeMetricsMonitor) Committer(io.druid.data.input.Committer) ParseException(io.druid.java.util.common.parsers.ParseException) Interval(org.joda.time.Interval)
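
The row-routing logic in the middle of the method is worth isolating: a ShardSpecLookup built from the interval's specs picks the target shard, and the sequence name ties rows for that shard to one appenderator segment. The sketch below is illustrative (SequenceNamer is a made-up helper) and uses only the getLookup, getShardSpec, and getPartitionNum calls already present in the snippet.

import io.druid.data.input.InputRow;
import io.druid.timeline.partition.ShardSpec;
import io.druid.timeline.partition.ShardSpecLookup;
import org.joda.time.Interval;

import java.util.List;

class SequenceNamer {

    // Routes a row to one of the interval's shard specs and derives the sequence name that the
    // appenderator driver uses to group rows into the same segment.
    static String sequenceNameFor(List<ShardSpec> intervalShardSpecs, Interval interval, String version, InputRow row) {
        ShardSpecLookup lookup = intervalShardSpecs.get(0).getLookup(intervalShardSpecs);
        ShardSpec shardSpec = lookup.getShardSpec(row.getTimestampFromEpoch(), row);
        return String.format("index_%s_%s_%d", interval, version, shardSpec.getPartitionNum());
    }
}

In the actual method the lookup is cached per interval in shardSpecLookups, and the spec stored for publishing may be swapped for a NumberedShardSpec when extendable shard specs are forced or the task appends to existing data.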

Example 10 with ShardSpec

Use of io.druid.timeline.partition.ShardSpec in project druid by druid-io.

In the class IndexTask, the method determineShardSpecs:

/**
   * Determines the number of shards for each interval using a hash of queryGranularity timestamp + all dimensions (i.e.
   * hash-based partitioning). In the future we may want to also support single-dimension partitioning.
   */
private Map<Interval, List<ShardSpec>> determineShardSpecs(final TaskToolbox toolbox, final FirehoseFactory firehoseFactory) throws IOException {
    final ObjectMapper jsonMapper = toolbox.getObjectMapper();
    final GranularitySpec granularitySpec = ingestionSchema.getDataSchema().getGranularitySpec();
    final Granularity queryGranularity = granularitySpec.getQueryGranularity();
    final boolean determineNumPartitions = ingestionSchema.getTuningConfig().getNumShards() == null;
    final boolean determineIntervals = !ingestionSchema.getDataSchema().getGranularitySpec().bucketIntervals().isPresent();
    final Map<Interval, List<ShardSpec>> shardSpecs = Maps.newHashMap();
    // if we were given the number of shards per interval and the intervals, we don't need to scan the data
    if (!determineNumPartitions && !determineIntervals) {
        log.info("numShards and intervals provided, skipping determine partition scan");
        final SortedSet<Interval> intervals = ingestionSchema.getDataSchema().getGranularitySpec().bucketIntervals().get();
        final int numShards = ingestionSchema.getTuningConfig().getNumShards();
        for (Interval interval : intervals) {
            final List<ShardSpec> intervalShardSpecs = Lists.newArrayListWithCapacity(numShards);
            if (numShards > 1) {
                for (int i = 0; i < numShards; i++) {
                    intervalShardSpecs.add(new HashBasedNumberedShardSpec(i, numShards, null, jsonMapper));
                }
            } else {
                intervalShardSpecs.add(NoneShardSpec.instance());
            }
            shardSpecs.put(interval, intervalShardSpecs);
        }
        return shardSpecs;
    }
    // determine intervals containing data and prime HLL collectors
    final Map<Interval, Optional<HyperLogLogCollector>> hllCollectors = Maps.newHashMap();
    int thrownAway = 0;
    log.info("Determining intervals and shardSpecs");
    long determineShardSpecsStartMillis = System.currentTimeMillis();
    try (final Firehose firehose = firehoseFactory.connect(ingestionSchema.getDataSchema().getParser())) {
        while (firehose.hasMore()) {
            final InputRow inputRow = firehose.nextRow();
            final Interval interval;
            if (determineIntervals) {
                interval = granularitySpec.getSegmentGranularity().bucket(inputRow.getTimestamp());
            } else {
                final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
                if (!optInterval.isPresent()) {
                    thrownAway++;
                    continue;
                }
                interval = optInterval.get();
            }
            if (!determineNumPartitions) {
                // we don't need to determine the number of partitions here, but we still need to
                // record the interval, so add an Optional.absent() and don't instantiate a HLL collector
                if (!hllCollectors.containsKey(interval)) {
                    hllCollectors.put(interval, Optional.<HyperLogLogCollector>absent());
                }
                continue;
            }
            if (!hllCollectors.containsKey(interval)) {
                hllCollectors.put(interval, Optional.of(HyperLogLogCollector.makeLatestCollector()));
            }
            List<Object> groupKey = Rows.toGroupKey(queryGranularity.bucketStart(inputRow.getTimestamp()).getMillis(), inputRow);
            hllCollectors.get(interval).get().add(hashFunction.hashBytes(jsonMapper.writeValueAsBytes(groupKey)).asBytes());
        }
    }
    if (thrownAway > 0) {
        log.warn("Unable to to find a matching interval for [%,d] events", thrownAway);
    }
    final ImmutableSortedMap<Interval, Optional<HyperLogLogCollector>> sortedMap = ImmutableSortedMap.copyOf(hllCollectors, Comparators.intervalsByStartThenEnd());
    for (final Map.Entry<Interval, Optional<HyperLogLogCollector>> entry : sortedMap.entrySet()) {
        final Interval interval = entry.getKey();
        final Optional<HyperLogLogCollector> collector = entry.getValue();
        final int numShards;
        if (determineNumPartitions) {
            final long numRows = new Double(collector.get().estimateCardinality()).longValue();
            numShards = (int) Math.ceil((double) numRows / ingestionSchema.getTuningConfig().getTargetPartitionSize());
            log.info("Estimated [%,d] rows of data for interval [%s], creating [%,d] shards", numRows, interval, numShards);
        } else {
            numShards = ingestionSchema.getTuningConfig().getNumShards();
            log.info("Creating [%,d] shards for interval [%s]", numShards, interval);
        }
        final List<ShardSpec> intervalShardSpecs = Lists.newArrayListWithCapacity(numShards);
        if (numShards > 1) {
            for (int i = 0; i < numShards; i++) {
                intervalShardSpecs.add(new HashBasedNumberedShardSpec(i, numShards, null, jsonMapper));
            }
        } else {
            intervalShardSpecs.add(NoneShardSpec.instance());
        }
        shardSpecs.put(interval, intervalShardSpecs);
    }
    log.info("Found intervals and shardSpecs in %,dms", System.currentTimeMillis() - determineShardSpecsStartMillis);
    return shardSpecs;
}
Also used : Granularity(io.druid.java.util.common.granularity.Granularity) NoneShardSpec(io.druid.timeline.partition.NoneShardSpec) ShardSpec(io.druid.timeline.partition.ShardSpec) NumberedShardSpec(io.druid.timeline.partition.NumberedShardSpec) HashBasedNumberedShardSpec(io.druid.timeline.partition.HashBasedNumberedShardSpec) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) HashBasedNumberedShardSpec(io.druid.timeline.partition.HashBasedNumberedShardSpec) Optional(com.google.common.base.Optional) Firehose(io.druid.data.input.Firehose) HyperLogLogCollector(io.druid.hll.HyperLogLogCollector) GranularitySpec(io.druid.segment.indexing.granularity.GranularitySpec) InputRow(io.druid.data.input.InputRow) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableSortedMap(com.google.common.collect.ImmutableSortedMap) Interval(org.joda.time.Interval)
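
The shard-count decision at the end of the method is simple arithmetic over the HLL cardinality estimate: rows are split into shards of at most targetPartitionSize, and a single shard collapses to NoneShardSpec. A minimal sketch under those assumptions is below; ShardSpecPlanner is an illustrative name, and the HashBasedNumberedShardSpec constructor arguments mirror the ones used in the snippet.

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import io.druid.timeline.partition.HashBasedNumberedShardSpec;
import io.druid.timeline.partition.NoneShardSpec;
import io.druid.timeline.partition.ShardSpec;

import java.util.List;

class ShardSpecPlanner {

    // Rounds the estimated row count up to whole shards of at most targetPartitionSize rows each.
    static List<ShardSpec> planShards(long estimatedRows, int targetPartitionSize, ObjectMapper jsonMapper) {
        int numShards = (int) Math.ceil((double) estimatedRows / targetPartitionSize);
        List<ShardSpec> specs = Lists.newArrayListWithCapacity(Math.max(numShards, 1));
        if (numShards > 1) {
            for (int i = 0; i < numShards; i++) {
                // null partitionDimensions means the hash covers all dimensions, matching the snippet.
                specs.add(new HashBasedNumberedShardSpec(i, numShards, null, jsonMapper));
            }
        } else {
            specs.add(NoneShardSpec.instance());
        }
        return specs;
    }
}

For example, an estimated 4,200,000 rows with a target partition size of 1,000,000 yields five HashBasedNumberedShardSpecs for the interval.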

Aggregations

ShardSpec (io.druid.timeline.partition.ShardSpec): 17
Interval (org.joda.time.Interval): 9
NumberedShardSpec (io.druid.timeline.partition.NumberedShardSpec): 7
NoneShardSpec (io.druid.timeline.partition.NoneShardSpec): 6
HashBasedNumberedShardSpec (io.druid.timeline.partition.HashBasedNumberedShardSpec): 5
List (java.util.List): 5
Optional (com.google.common.base.Optional): 4
IOException (java.io.IOException): 4
Map (java.util.Map): 4
Test (org.junit.Test): 4
ImmutableList (com.google.common.collect.ImmutableList): 3
ImmutableMap (com.google.common.collect.ImmutableMap): 3
DataSegment (io.druid.timeline.DataSegment): 3
PartitionChunk (io.druid.timeline.partition.PartitionChunk): 3
SingleDimensionShardSpec (io.druid.timeline.partition.SingleDimensionShardSpec): 3
DateTime (org.joda.time.DateTime): 3
Function (com.google.common.base.Function): 2
QueryableDruidServer (io.druid.client.selector.QueryableDruidServer): 2
ServerSelector (io.druid.client.selector.ServerSelector): 2
Firehose (io.druid.data.input.Firehose): 2