
Example 36 with InputRow

use of io.druid.data.input.InputRow in project druid by druid-io.

In the class IndexMergerTest, the method persistAndLoad:

private QueryableIndex persistAndLoad(List<DimensionSchema> schema, InputRow... rows) throws IOException {
    IncrementalIndex toPersist = IncrementalIndexTest.createIndex(null, new DimensionsSpec(schema, null, null));
    for (InputRow row : rows) {
        toPersist.add(row);
    }
    final File tempDir = temporaryFolder.newFolder();
    return closer.closeLater(INDEX_IO.loadIndex(INDEX_MERGER.persist(toPersist, tempDir, indexSpec)));
}
Also used : IncrementalIndex(io.druid.segment.incremental.IncrementalIndex) OnheapIncrementalIndex(io.druid.segment.incremental.OnheapIncrementalIndex) MapBasedInputRow(io.druid.data.input.MapBasedInputRow) InputRow(io.druid.data.input.InputRow) DimensionsSpec(io.druid.data.input.impl.DimensionsSpec) File(java.io.File)
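A minimal caller sketch for persistAndLoad (hypothetical dimension names and values, not from the test; assumes DimensionsSpec.getDefaultSchemas for building string dimension schemas, plus the test class's temporaryFolder, closer, INDEX_IO and INDEX_MERGER fields):

InputRow row1 = new MapBasedInputRow(
    1485907200000L, // 2017-02-01T00:00:00Z as epoch millis
    Arrays.asList("dim1", "dim2"),
    ImmutableMap.<String, Object>of("dim1", "a", "dim2", "b"));
InputRow row2 = new MapBasedInputRow(
    1485910800000L, // one hour later
    Arrays.asList("dim1", "dim2"),
    ImmutableMap.<String, Object>of("dim1", "c", "dim2", "d"));
QueryableIndex index = persistAndLoad(
    DimensionsSpec.getDefaultSchemas(Arrays.asList("dim1", "dim2")),
    row1, row2);

Also used : MapBasedInputRow(io.druid.data.input.MapBasedInputRow) DimensionsSpec(io.druid.data.input.impl.DimensionsSpec) QueryableIndex(io.druid.segment.QueryableIndex) ImmutableMap(com.google.common.collect.ImmutableMap) Arrays(java.util.Arrays)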

Example 37 with InputRow

use of io.druid.data.input.InputRow in project druid by druid-io.

In the class IndexMergerV9CompatibilityTest, the method setUp:

@Before
public void setUp() throws IOException {
    toPersist = new OnheapIncrementalIndex(JodaUtils.MIN_INSTANT, Granularities.NONE, DEFAULT_AGG_FACTORIES, 1000000);
    toPersist.getMetadata().put("key", "value");
    for (InputRow event : events) {
        toPersist.add(event);
    }
    tmpDir = Files.createTempDir();
    persistTmpDir = new File(tmpDir, "persistDir");
    INDEX_MERGER.persist(toPersist, persistTmpDir, INDEX_SPEC);
}
Also used : OnheapIncrementalIndex(io.druid.segment.incremental.OnheapIncrementalIndex) MapBasedInputRow(io.druid.data.input.MapBasedInputRow) InputRow(io.druid.data.input.InputRow) File(java.io.File) Before(org.junit.Before)
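The events field iterated above is a pre-built collection of InputRow objects and DEFAULT_AGG_FACTORIES is the test's aggregator array; a hypothetical sketch of how such fixtures could look (field names mirror the test, the rows and values themselves are illustrative):

private static final AggregatorFactory[] DEFAULT_AGG_FACTORIES = new AggregatorFactory[]{
    new CountAggregatorFactory("count")
};

private static final List<InputRow> events = ImmutableList.<InputRow>of(
    new MapBasedInputRow(
        new DateTime("2011-01-01T00:00:00Z"),
        Arrays.asList("dim1"),
        ImmutableMap.<String, Object>of("dim1", "x")),
    new MapBasedInputRow(
        new DateTime("2011-01-01T01:00:00Z"),
        Arrays.asList("dim1"),
        ImmutableMap.<String, Object>of("dim1", "y")));

Also used : AggregatorFactory(io.druid.query.aggregation.AggregatorFactory) CountAggregatorFactory(io.druid.query.aggregation.CountAggregatorFactory) ImmutableList(com.google.common.collect.ImmutableList) ImmutableMap(com.google.common.collect.ImmutableMap) DateTime(org.joda.time.DateTime) Arrays(java.util.Arrays) List(java.util.List)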

Example 38 with InputRow

use of io.druid.data.input.InputRow in project druid by druid-io.

In the class IndexTask, the method generateAndPublishSegments:

private boolean generateAndPublishSegments(final TaskToolbox toolbox, final DataSchema dataSchema, final Map<Interval, List<ShardSpec>> shardSpecs, final String version, final FirehoseFactory firehoseFactory) throws IOException, InterruptedException {
    final GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
    final FireDepartment fireDepartmentForMetrics = new FireDepartment(dataSchema, new RealtimeIOConfig(null, null, null), null);
    final FireDepartmentMetrics fireDepartmentMetrics = fireDepartmentForMetrics.getMetrics();
    final Map<String, ShardSpec> sequenceNameToShardSpecMap = Maps.newHashMap();
    if (toolbox.getMonitorScheduler() != null) {
        toolbox.getMonitorScheduler().addMonitor(new RealtimeMetricsMonitor(ImmutableList.of(fireDepartmentForMetrics), ImmutableMap.of(DruidMetrics.TASK_ID, new String[] { getId() })));
    }
    final SegmentAllocator segmentAllocator;
    if (ingestionSchema.getIOConfig().isAppendToExisting()) {
        segmentAllocator = new ActionBasedSegmentAllocator(toolbox.getTaskActionClient(), dataSchema);
    } else {
        segmentAllocator = new SegmentAllocator() {

            @Override
            public SegmentIdentifier allocate(DateTime timestamp, String sequenceName, String previousSegmentId) throws IOException {
                Optional<Interval> interval = granularitySpec.bucketInterval(timestamp);
                if (!interval.isPresent()) {
                    throw new ISE("Could not find interval for timestamp [%s]", timestamp);
                }
                ShardSpec shardSpec = sequenceNameToShardSpecMap.get(sequenceName);
                if (shardSpec == null) {
                    throw new ISE("Could not find ShardSpec for sequenceName [%s]", sequenceName);
                }
                return new SegmentIdentifier(getDataSource(), interval.get(), version, shardSpec);
            }
        };
    }
    try (final Appenderator appenderator = newAppenderator(fireDepartmentMetrics, toolbox, dataSchema);
        final FiniteAppenderatorDriver driver = newDriver(appenderator, toolbox, segmentAllocator, fireDepartmentMetrics);
        final Firehose firehose = firehoseFactory.connect(dataSchema.getParser())) {
        final Supplier<Committer> committerSupplier = Committers.supplierFromFirehose(firehose);
        final Map<Interval, ShardSpecLookup> shardSpecLookups = Maps.newHashMap();
        if (driver.startJob() != null) {
            driver.clear();
        }
        try {
            while (firehose.hasMore()) {
                try {
                    final InputRow inputRow = firehose.nextRow();
                    final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
                    if (!optInterval.isPresent()) {
                        fireDepartmentMetrics.incrementThrownAway();
                        continue;
                    }
                    final Interval interval = optInterval.get();
                    if (!shardSpecLookups.containsKey(interval)) {
                        final List<ShardSpec> intervalShardSpecs = shardSpecs.get(interval);
                        if (intervalShardSpecs == null || intervalShardSpecs.isEmpty()) {
                            throw new ISE("Failed to get shardSpec for interval[%s]", interval);
                        }
                        shardSpecLookups.put(interval, intervalShardSpecs.get(0).getLookup(intervalShardSpecs));
                    }
                    final ShardSpec shardSpec = shardSpecLookups.get(interval).getShardSpec(inputRow.getTimestampFromEpoch(), inputRow);
                    final String sequenceName = String.format("index_%s_%s_%d", interval, version, shardSpec.getPartitionNum());
                    if (!sequenceNameToShardSpecMap.containsKey(sequenceName)) {
                        final ShardSpec shardSpecForPublishing = ingestionSchema.getTuningConfig().isForceExtendableShardSpecs() || ingestionSchema.getIOConfig().isAppendToExisting() ? new NumberedShardSpec(shardSpec.getPartitionNum(), shardSpecs.get(interval).size()) : shardSpec;
                        sequenceNameToShardSpecMap.put(sequenceName, shardSpecForPublishing);
                    }
                    final SegmentIdentifier identifier = driver.add(inputRow, sequenceName, committerSupplier);
                    if (identifier == null) {
                        throw new ISE("Could not allocate segment for row with timestamp[%s]", inputRow.getTimestamp());
                    }
                    fireDepartmentMetrics.incrementProcessed();
                } catch (ParseException e) {
                    if (ingestionSchema.getTuningConfig().isReportParseExceptions()) {
                        throw e;
                    } else {
                        fireDepartmentMetrics.incrementUnparseable();
                    }
                }
            }
        } finally {
            driver.persist(committerSupplier.get());
        }
        final TransactionalSegmentPublisher publisher = new TransactionalSegmentPublisher() {

            @Override
            public boolean publishSegments(Set<DataSegment> segments, Object commitMetadata) throws IOException {
                final SegmentTransactionalInsertAction action = new SegmentTransactionalInsertAction(segments, null, null);
                return toolbox.getTaskActionClient().submit(action).isSuccess();
            }
        };
        final SegmentsAndMetadata published = driver.finish(publisher, committerSupplier.get());
        if (published == null) {
            log.error("Failed to publish segments, aborting!");
            return false;
        } else {
            log.info("Published segments[%s]", Joiner.on(", ").join(Iterables.transform(published.getSegments(), new Function<DataSegment, String>() {

                @Override
                public String apply(DataSegment input) {
                    return input.getIdentifier();
                }
            })));
            return true;
        }
    }
}
Also used : RealtimeIOConfig(io.druid.segment.indexing.RealtimeIOConfig) SortedSet(java.util.SortedSet) Set(java.util.Set) SegmentIdentifier(io.druid.segment.realtime.appenderator.SegmentIdentifier) ShardSpecLookup(io.druid.timeline.partition.ShardSpecLookup) SegmentTransactionalInsertAction(io.druid.indexing.common.actions.SegmentTransactionalInsertAction) DataSegment(io.druid.timeline.DataSegment) NoneShardSpec(io.druid.timeline.partition.NoneShardSpec) ShardSpec(io.druid.timeline.partition.ShardSpec) NumberedShardSpec(io.druid.timeline.partition.NumberedShardSpec) HashBasedNumberedShardSpec(io.druid.timeline.partition.HashBasedNumberedShardSpec) DateTime(org.joda.time.DateTime) FireDepartment(io.druid.segment.realtime.FireDepartment) TransactionalSegmentPublisher(io.druid.segment.realtime.appenderator.TransactionalSegmentPublisher) ActionBasedSegmentAllocator(io.druid.indexing.appenderator.ActionBasedSegmentAllocator) ISE(io.druid.java.util.common.ISE) NumberedShardSpec(io.druid.timeline.partition.NumberedShardSpec) HashBasedNumberedShardSpec(io.druid.timeline.partition.HashBasedNumberedShardSpec) Optional(com.google.common.base.Optional) Firehose(io.druid.data.input.Firehose) SegmentsAndMetadata(io.druid.segment.realtime.appenderator.SegmentsAndMetadata) IOException(java.io.IOException) FireDepartmentMetrics(io.druid.segment.realtime.FireDepartmentMetrics) Appenderator(io.druid.segment.realtime.appenderator.Appenderator) GranularitySpec(io.druid.segment.indexing.granularity.GranularitySpec) ActionBasedSegmentAllocator(io.druid.indexing.appenderator.ActionBasedSegmentAllocator) SegmentAllocator(io.druid.segment.realtime.appenderator.SegmentAllocator) FiniteAppenderatorDriver(io.druid.segment.realtime.appenderator.FiniteAppenderatorDriver) InputRow(io.druid.data.input.InputRow) RealtimeMetricsMonitor(io.druid.segment.realtime.RealtimeMetricsMonitor) Committer(io.druid.data.input.Committer) ParseException(io.druid.java.util.common.parsers.ParseException) Interval(org.joda.time.Interval)
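The sequence name built above ties together a row's interval, the task version, and the shard's partition number, so rows hashing to the same shard are routed to the same appenderator sequence. A small illustration of the format with made-up values (assumes UTC so the interval prints with Z offsets):

final DateTime start = new DateTime("2017-02-01T00:00:00Z", DateTimeZone.UTC);
final Interval interval = new Interval(start, start.plusDays(1));
final String version = "2017-02-01T10:00:00.000Z";
final String sequenceName = String.format("index_%s_%s_%d", interval, version, 0);
// -> "index_2017-02-01T00:00:00.000Z/2017-02-02T00:00:00.000Z_2017-02-01T10:00:00.000Z_0"

Also used : DateTime(org.joda.time.DateTime) DateTimeZone(org.joda.time.DateTimeZone) Interval(org.joda.time.Interval)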

Example 39 with InputRow

use of io.druid.data.input.InputRow in project druid by druid-io.

In the class IndexTask, the method determineShardSpecs:

/**
   * Determines the number of shards for each interval using a hash of the queryGranularity-bucketed timestamp + all
   * dimensions (i.e., hash-based partitioning). In the future we may want to also support single-dimension partitioning.
   */
private Map<Interval, List<ShardSpec>> determineShardSpecs(final TaskToolbox toolbox, final FirehoseFactory firehoseFactory) throws IOException {
    final ObjectMapper jsonMapper = toolbox.getObjectMapper();
    final GranularitySpec granularitySpec = ingestionSchema.getDataSchema().getGranularitySpec();
    final Granularity queryGranularity = granularitySpec.getQueryGranularity();
    final boolean determineNumPartitions = ingestionSchema.getTuningConfig().getNumShards() == null;
    final boolean determineIntervals = !ingestionSchema.getDataSchema().getGranularitySpec().bucketIntervals().isPresent();
    final Map<Interval, List<ShardSpec>> shardSpecs = Maps.newHashMap();
    // if we were given number of shards per interval and the intervals, we don't need to scan the data
    if (!determineNumPartitions && !determineIntervals) {
        log.info("numShards and intervals provided, skipping determine partition scan");
        final SortedSet<Interval> intervals = ingestionSchema.getDataSchema().getGranularitySpec().bucketIntervals().get();
        final int numShards = ingestionSchema.getTuningConfig().getNumShards();
        for (Interval interval : intervals) {
            final List<ShardSpec> intervalShardSpecs = Lists.newArrayListWithCapacity(numShards);
            if (numShards > 1) {
                for (int i = 0; i < numShards; i++) {
                    intervalShardSpecs.add(new HashBasedNumberedShardSpec(i, numShards, null, jsonMapper));
                }
            } else {
                intervalShardSpecs.add(NoneShardSpec.instance());
            }
            shardSpecs.put(interval, intervalShardSpecs);
        }
        return shardSpecs;
    }
    // determine intervals containing data and prime HLL collectors
    final Map<Interval, Optional<HyperLogLogCollector>> hllCollectors = Maps.newHashMap();
    int thrownAway = 0;
    log.info("Determining intervals and shardSpecs");
    long determineShardSpecsStartMillis = System.currentTimeMillis();
    try (final Firehose firehose = firehoseFactory.connect(ingestionSchema.getDataSchema().getParser())) {
        while (firehose.hasMore()) {
            final InputRow inputRow = firehose.nextRow();
            final Interval interval;
            if (determineIntervals) {
                interval = granularitySpec.getSegmentGranularity().bucket(inputRow.getTimestamp());
            } else {
                final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
                if (!optInterval.isPresent()) {
                    thrownAway++;
                    continue;
                }
                interval = optInterval.get();
            }
            if (!determineNumPartitions) {
                // we don't need to determine the number of partitions, but we still need to know which
                // intervals contain data, so add an Optional.absent() placeholder
                // for the interval and don't instantiate a HLL collector
                if (!hllCollectors.containsKey(interval)) {
                    hllCollectors.put(interval, Optional.<HyperLogLogCollector>absent());
                }
                continue;
            }
            if (!hllCollectors.containsKey(interval)) {
                hllCollectors.put(interval, Optional.of(HyperLogLogCollector.makeLatestCollector()));
            }
            List<Object> groupKey = Rows.toGroupKey(queryGranularity.bucketStart(inputRow.getTimestamp()).getMillis(), inputRow);
            hllCollectors.get(interval).get().add(hashFunction.hashBytes(jsonMapper.writeValueAsBytes(groupKey)).asBytes());
        }
    }
    if (thrownAway > 0) {
        log.warn("Unable to to find a matching interval for [%,d] events", thrownAway);
    }
    final ImmutableSortedMap<Interval, Optional<HyperLogLogCollector>> sortedMap = ImmutableSortedMap.copyOf(hllCollectors, Comparators.intervalsByStartThenEnd());
    for (final Map.Entry<Interval, Optional<HyperLogLogCollector>> entry : sortedMap.entrySet()) {
        final Interval interval = entry.getKey();
        final Optional<HyperLogLogCollector> collector = entry.getValue();
        final int numShards;
        if (determineNumPartitions) {
            final long numRows = new Double(collector.get().estimateCardinality()).longValue();
            numShards = (int) Math.ceil((double) numRows / ingestionSchema.getTuningConfig().getTargetPartitionSize());
            log.info("Estimated [%,d] rows of data for interval [%s], creating [%,d] shards", numRows, interval, numShards);
        } else {
            numShards = ingestionSchema.getTuningConfig().getNumShards();
            log.info("Creating [%,d] shards for interval [%s]", numShards, interval);
        }
        final List<ShardSpec> intervalShardSpecs = Lists.newArrayListWithCapacity(numShards);
        if (numShards > 1) {
            for (int i = 0; i < numShards; i++) {
                intervalShardSpecs.add(new HashBasedNumberedShardSpec(i, numShards, null, jsonMapper));
            }
        } else {
            intervalShardSpecs.add(NoneShardSpec.instance());
        }
        shardSpecs.put(interval, intervalShardSpecs);
    }
    log.info("Found intervals and shardSpecs in %,dms", System.currentTimeMillis() - determineShardSpecsStartMillis);
    return shardSpecs;
}
Also used : Granularity(io.druid.java.util.common.granularity.Granularity) NoneShardSpec(io.druid.timeline.partition.NoneShardSpec) ShardSpec(io.druid.timeline.partition.ShardSpec) NumberedShardSpec(io.druid.timeline.partition.NumberedShardSpec) HashBasedNumberedShardSpec(io.druid.timeline.partition.HashBasedNumberedShardSpec) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) HashBasedNumberedShardSpec(io.druid.timeline.partition.HashBasedNumberedShardSpec) Optional(com.google.common.base.Optional) Firehose(io.druid.data.input.Firehose) HyperLogLogCollector(io.druid.hll.HyperLogLogCollector) GranularitySpec(io.druid.segment.indexing.granularity.GranularitySpec) InputRow(io.druid.data.input.InputRow) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableSortedMap(com.google.common.collect.ImmutableSortedMap) Interval(org.joda.time.Interval)
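The shard count per interval follows directly from the HLL cardinality estimate and the configured target partition size (the estimateCardinality lines above). A worked sketch with hypothetical numbers:

// Hypothetical figures: ~7.5M distinct rows estimated for the interval,
// target of 5M rows per partition.
final long numRows = 7_500_000L;            // from HyperLogLogCollector.estimateCardinality()
final int targetPartitionSize = 5_000_000;  // from the tuning config
final int numShards = (int) Math.ceil((double) numRows / targetPartitionSize); // -> 2
// Each of the 2 shards then becomes a HashBasedNumberedShardSpec(i, 2, null, jsonMapper).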

Example 40 with InputRow

use of io.druid.data.input.InputRow in project druid by druid-io.

In the class YeOldePlumberSchool, the method findPlumber:

@Override
public Plumber findPlumber(final DataSchema schema, final RealtimeTuningConfig config, final FireDepartmentMetrics metrics) {
    // There can be only one.
    final Sink theSink = new Sink(interval, schema, config.getShardSpec(), version, config.getMaxRowsInMemory(), config.isReportParseExceptions());
    // Temporary directory to hold spilled segments.
    final File persistDir = new File(tmpSegmentDir, theSink.getSegment().getIdentifier());
    // Set of spilled segments. Will be merged at the end.
    final Set<File> spilled = Sets.newHashSet();
    // IndexMerger implementation.
    final IndexMerger theIndexMerger = config.getBuildV9Directly() ? indexMergerV9 : indexMerger;
    return new Plumber() {

        @Override
        public Object startJob() {
            return null;
        }

        @Override
        public int add(InputRow row, Supplier<Committer> committerSupplier) throws IndexSizeExceededException {
            Sink sink = getSink(row.getTimestampFromEpoch());
            if (sink == null) {
                return -1;
            }
            final int numRows = sink.add(row);
            if (!sink.canAppendRow()) {
                persist(committerSupplier.get());
            }
            return numRows;
        }

        private Sink getSink(long timestamp) {
            if (theSink.getInterval().contains(timestamp)) {
                return theSink;
            } else {
                return null;
            }
        }

        @Override
        public <T> QueryRunner<T> getQueryRunner(Query<T> query) {
            throw new UnsupportedOperationException("Don't query me, bro.");
        }

        @Override
        public void persist(Committer committer) {
            spillIfSwappable();
            committer.run();
        }

        @Override
        public void finishJob() {
            // The segment we will upload
            File fileToUpload = null;
            try {
                // User should have persisted everything by now.
                Preconditions.checkState(!theSink.swappable(), "All data must be persisted before finishing the job!");
                if (spilled.size() == 0) {
                    throw new IllegalStateException("Nothing indexed?");
                } else if (spilled.size() == 1) {
                    fileToUpload = Iterables.getOnlyElement(spilled);
                } else {
                    List<QueryableIndex> indexes = Lists.newArrayList();
                    for (final File oneSpill : spilled) {
                        indexes.add(indexIO.loadIndex(oneSpill));
                    }
                    fileToUpload = new File(tmpSegmentDir, "merged");
                    theIndexMerger.mergeQueryableIndex(indexes, schema.getGranularitySpec().isRollup(), schema.getAggregators(), fileToUpload, config.getIndexSpec());
                }
                // Map merged segment so we can extract dimensions
                final QueryableIndex mappedSegment = indexIO.loadIndex(fileToUpload);
                final DataSegment segmentToUpload = theSink.getSegment().withDimensions(ImmutableList.copyOf(mappedSegment.getAvailableDimensions())).withBinaryVersion(SegmentUtils.getVersionFromDir(fileToUpload));
                dataSegmentPusher.push(fileToUpload, segmentToUpload);
                log.info("Uploaded segment[%s]", segmentToUpload.getIdentifier());
            } catch (Exception e) {
                log.warn(e, "Failed to merge and upload");
                throw Throwables.propagate(e);
            } finally {
                try {
                    if (fileToUpload != null) {
                        log.info("Deleting Index File[%s]", fileToUpload);
                        FileUtils.deleteDirectory(fileToUpload);
                    }
                } catch (IOException e) {
                    log.warn(e, "Error deleting directory[%s]", fileToUpload);
                }
            }
        }

        private void spillIfSwappable() {
            if (theSink.swappable()) {
                final FireHydrant indexToPersist = theSink.swap();
                final int rowsToPersist = indexToPersist.getIndex().size();
                final File dirToPersist = getSpillDir(indexToPersist.getCount());
                log.info("Spilling index[%d] with rows[%d] to: %s", indexToPersist.getCount(), rowsToPersist, dirToPersist);
                try {
                    theIndexMerger.persist(indexToPersist.getIndex(), dirToPersist, config.getIndexSpec());
                    indexToPersist.swapSegment(null);
                    metrics.incrementRowOutputCount(rowsToPersist);
                    spilled.add(dirToPersist);
                } catch (Exception e) {
                    log.warn(e, "Failed to spill index[%d]", indexToPersist.getCount());
                    throw Throwables.propagate(e);
                }
            }
        }

        private File getSpillDir(final int n) {
            return new File(persistDir, String.format("spill%d", n));
        }
    };
}
Also used : IndexMerger(io.druid.segment.IndexMerger) Query(io.druid.query.Query) IOException(java.io.IOException) DataSegment(io.druid.timeline.DataSegment) IndexSizeExceededException(io.druid.segment.incremental.IndexSizeExceededException) IOException(java.io.IOException) Sink(io.druid.segment.realtime.plumber.Sink) QueryableIndex(io.druid.segment.QueryableIndex) InputRow(io.druid.data.input.InputRow) Plumber(io.druid.segment.realtime.plumber.Plumber) Supplier(com.google.common.base.Supplier) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) Committer(io.druid.data.input.Committer) FireHydrant(io.druid.segment.realtime.FireHydrant) File(java.io.File)
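A hypothetical driver loop for the Plumber returned above (plumberSchool, schema, config, metrics, firehose and committerSupplier are assumed to be set up elsewhere; this sketches the general add/persist/finish lifecycle, not the exact caller in Druid):

void runIndexing() throws IOException {
    final Plumber plumber = plumberSchool.findPlumber(schema, config, metrics);
    plumber.startJob();
    while (firehose.hasMore()) {
        final InputRow row = firehose.nextRow();
        // add() returns -1 when the row falls outside the sink's interval
        if (plumber.add(row, committerSupplier) == -1) {
            metrics.incrementThrownAway();
        } else {
            metrics.incrementProcessed();
        }
    }
    plumber.persist(committerSupplier.get());
    plumber.finishJob();
}

Also used : Plumber(io.druid.segment.realtime.plumber.Plumber) InputRow(io.druid.data.input.InputRow) Firehose(io.druid.data.input.Firehose) Committer(io.druid.data.input.Committer) IOException(java.io.IOException)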

Aggregations

InputRow (io.druid.data.input.InputRow) 81
Test (org.junit.Test) 35
MapBasedInputRow (io.druid.data.input.MapBasedInputRow) 24
BenchmarkDataGenerator (io.druid.benchmark.datagen.BenchmarkDataGenerator) 22
File (java.io.File) 18
Setup (org.openjdk.jmh.annotations.Setup) 15
HyperUniquesSerde (io.druid.query.aggregation.hyperloglog.HyperUniquesSerde) 14
Firehose (io.druid.data.input.Firehose) 12
OnheapIncrementalIndex (io.druid.segment.incremental.OnheapIncrementalIndex) 12
IndexSpec (io.druid.segment.IndexSpec) 11
ArrayList (java.util.ArrayList) 11
IncrementalIndex (io.druid.segment.incremental.IncrementalIndex) 10
DateTime (org.joda.time.DateTime) 10
QueryableIndex (io.druid.segment.QueryableIndex) 9
IOException (java.io.IOException) 9
BenchmarkColumnSchema (io.druid.benchmark.datagen.BenchmarkColumnSchema) 8
Interval (org.joda.time.Interval) 8
ParseException (io.druid.java.util.common.parsers.ParseException) 7
AggregatorFactory (io.druid.query.aggregation.AggregatorFactory) 6
DataSegment (io.druid.timeline.DataSegment) 5