Example 6 with SegmentIdentifier

Use of io.druid.segment.realtime.appenderator.SegmentIdentifier in project druid by druid-io.

The class SegmentAllocateActionTest, method testManySegmentsSameInterval.

@Test
public void testManySegmentsSameInterval() throws Exception {
    final Task task = new NoopTask(null, 0, 0, null, null, null);
    taskActionTestKit.getTaskLockbox().add(task);
    final SegmentIdentifier id1 = allocate(task, PARTY_TIME, Granularities.NONE, Granularities.HOUR, "s1", null);
    final SegmentIdentifier id2 = allocate(task, PARTY_TIME, Granularities.NONE, Granularities.HOUR, "s1", id1.getIdentifierAsString());
    final SegmentIdentifier id3 = allocate(task, PARTY_TIME, Granularities.NONE, Granularities.HOUR, "s1", id2.getIdentifierAsString());
    final TaskLock partyLock = Iterables.getOnlyElement(FluentIterable.from(taskActionTestKit.getTaskLockbox().findLocksForTask(task)).filter(new Predicate<TaskLock>() {

        @Override
        public boolean apply(TaskLock input) {
            return input.getInterval().contains(PARTY_TIME);
        }
    }));
    assertSameIdentifier(id1, new SegmentIdentifier(DATA_SOURCE, Granularities.HOUR.bucket(PARTY_TIME), partyLock.getVersion(), new NumberedShardSpec(0, 0)));
    assertSameIdentifier(id2, new SegmentIdentifier(DATA_SOURCE, Granularities.HOUR.bucket(PARTY_TIME), partyLock.getVersion(), new NumberedShardSpec(1, 0)));
    assertSameIdentifier(id3, new SegmentIdentifier(DATA_SOURCE, Granularities.HOUR.bucket(PARTY_TIME), partyLock.getVersion(), new NumberedShardSpec(2, 0)));
}
Also used : Task(io.druid.indexing.common.task.Task) NoopTask(io.druid.indexing.common.task.NoopTask) SegmentIdentifier(io.druid.segment.realtime.appenderator.SegmentIdentifier) TaskLock(io.druid.indexing.common.TaskLock) NumberedShardSpec(io.druid.timeline.partition.NumberedShardSpec) Predicate(com.google.common.base.Predicate) Test(org.junit.Test)
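
The allocate(...) helper used in this test is not reproduced on this page. A minimal sketch of what it presumably looks like, assuming SegmentAllocateAction takes (dataSource, timestamp, queryGranularity, preferredSegmentGranularity, sequenceName, previousSegmentId) and that the test kit exposes a task action toolbox:

private SegmentIdentifier allocate(final Task task, final DateTime timestamp, final Granularity queryGranularity, final Granularity preferredSegmentGranularity, final String sequenceName, final String sequencePreviousId) throws Exception {
    // Hypothetical wrapper: build the action and perform it directly against the
    // test kit's toolbox, the way a TaskActionClient would submit it.
    final SegmentAllocateAction action = new SegmentAllocateAction(DATA_SOURCE, timestamp, queryGranularity, preferredSegmentGranularity, sequenceName, sequencePreviousId);
    return action.perform(task, taskActionTestKit.getTaskActionToolbox());
}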

Example 7 with SegmentIdentifier

Use of io.druid.segment.realtime.appenderator.SegmentIdentifier in project druid by druid-io.

The class SegmentAllocateActionTest, method testAddToExistingNumberedShardSpecsSameGranularity.

@Test
public void testAddToExistingNumberedShardSpecsSameGranularity() throws Exception {
    final Task task = new NoopTask(null, 0, 0, null, null, null);
    taskActionTestKit.getMetadataStorageCoordinator().announceHistoricalSegments(ImmutableSet.of(
        DataSegment.builder()
            .dataSource(DATA_SOURCE)
            .interval(Granularities.HOUR.bucket(PARTY_TIME))
            .version(PARTY_TIME.toString())
            .shardSpec(new NumberedShardSpec(0, 2))
            .build(),
        DataSegment.builder()
            .dataSource(DATA_SOURCE)
            .interval(Granularities.HOUR.bucket(PARTY_TIME))
            .version(PARTY_TIME.toString())
            .shardSpec(new NumberedShardSpec(1, 2))
            .build()));
    taskActionTestKit.getTaskLockbox().add(task);
    final SegmentIdentifier id1 = allocate(task, PARTY_TIME, Granularities.NONE, Granularities.HOUR, "s1", null);
    final SegmentIdentifier id2 = allocate(task, PARTY_TIME, Granularities.NONE, Granularities.HOUR, "s1", id1.getIdentifierAsString());
    assertSameIdentifier(id1, new SegmentIdentifier(DATA_SOURCE, Granularities.HOUR.bucket(PARTY_TIME), PARTY_TIME.toString(), new NumberedShardSpec(2, 2)));
    assertSameIdentifier(id2, new SegmentIdentifier(DATA_SOURCE, Granularities.HOUR.bucket(PARTY_TIME), PARTY_TIME.toString(), new NumberedShardSpec(3, 2)));
}
Also used : Task(io.druid.indexing.common.task.Task) NoopTask(io.druid.indexing.common.task.NoopTask) SegmentIdentifier(io.druid.segment.realtime.appenderator.SegmentIdentifier) NumberedShardSpec(io.druid.timeline.partition.NumberedShardSpec) Test(org.junit.Test)
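
assertSameIdentifier(...) is likewise a local test helper that this page omits. A plausible sketch, assuming SegmentIdentifier implements equals() and that the shard spec needs explicit checks:

private void assertSameIdentifier(final SegmentIdentifier expected, final SegmentIdentifier actual) {
    // Hypothetical helper: equals() is assumed to compare dataSource, interval,
    // version and partition number; the shard spec class is checked separately
    // because ShardSpec implementations of this era do not all override equals().
    Assert.assertEquals(expected, actual);
    Assert.assertEquals(expected.getShardSpec().getPartitionNum(), actual.getShardSpec().getPartitionNum());
    Assert.assertEquals(expected.getShardSpec().getClass(), actual.getShardSpec().getClass());
}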

Example 8 with SegmentIdentifier

Use of io.druid.segment.realtime.appenderator.SegmentIdentifier in project druid by druid-io.

The class IndexTask, method generateAndPublishSegments.

private boolean generateAndPublishSegments(final TaskToolbox toolbox, final DataSchema dataSchema, final Map<Interval, List<ShardSpec>> shardSpecs, final String version, final FirehoseFactory firehoseFactory) throws IOException, InterruptedException {
    final GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
    final FireDepartment fireDepartmentForMetrics = new FireDepartment(dataSchema, new RealtimeIOConfig(null, null, null), null);
    final FireDepartmentMetrics fireDepartmentMetrics = fireDepartmentForMetrics.getMetrics();
    final Map<String, ShardSpec> sequenceNameToShardSpecMap = Maps.newHashMap();
    if (toolbox.getMonitorScheduler() != null) {
        toolbox.getMonitorScheduler().addMonitor(new RealtimeMetricsMonitor(ImmutableList.of(fireDepartmentForMetrics), ImmutableMap.of(DruidMetrics.TASK_ID, new String[] { getId() })));
    }
    final SegmentAllocator segmentAllocator;
    if (ingestionSchema.getIOConfig().isAppendToExisting()) {
        segmentAllocator = new ActionBasedSegmentAllocator(toolbox.getTaskActionClient(), dataSchema);
    } else {
        segmentAllocator = new SegmentAllocator() {

            @Override
            public SegmentIdentifier allocate(DateTime timestamp, String sequenceName, String previousSegmentId) throws IOException {
                Optional<Interval> interval = granularitySpec.bucketInterval(timestamp);
                if (!interval.isPresent()) {
                    throw new ISE("Could not find interval for timestamp [%s]", timestamp);
                }
                ShardSpec shardSpec = sequenceNameToShardSpecMap.get(sequenceName);
                if (shardSpec == null) {
                    throw new ISE("Could not find ShardSpec for sequenceName [%s]", sequenceName);
                }
                return new SegmentIdentifier(getDataSource(), interval.get(), version, shardSpec);
            }
        };
    }
    try (final Appenderator appenderator = newAppenderator(fireDepartmentMetrics, toolbox, dataSchema);
        final FiniteAppenderatorDriver driver = newDriver(appenderator, toolbox, segmentAllocator, fireDepartmentMetrics);
        final Firehose firehose = firehoseFactory.connect(dataSchema.getParser())) {
        final Supplier<Committer> committerSupplier = Committers.supplierFromFirehose(firehose);
        final Map<Interval, ShardSpecLookup> shardSpecLookups = Maps.newHashMap();
        if (driver.startJob() != null) {
            driver.clear();
        }
        try {
            while (firehose.hasMore()) {
                try {
                    final InputRow inputRow = firehose.nextRow();
                    final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
                    if (!optInterval.isPresent()) {
                        fireDepartmentMetrics.incrementThrownAway();
                        continue;
                    }
                    final Interval interval = optInterval.get();
                    if (!shardSpecLookups.containsKey(interval)) {
                        final List<ShardSpec> intervalShardSpecs = shardSpecs.get(interval);
                        if (intervalShardSpecs == null || intervalShardSpecs.isEmpty()) {
                            throw new ISE("Failed to get shardSpec for interval[%s]", interval);
                        }
                        shardSpecLookups.put(interval, intervalShardSpecs.get(0).getLookup(intervalShardSpecs));
                    }
                    final ShardSpec shardSpec = shardSpecLookups.get(interval).getShardSpec(inputRow.getTimestampFromEpoch(), inputRow);
                    final String sequenceName = String.format("index_%s_%s_%d", interval, version, shardSpec.getPartitionNum());
                    if (!sequenceNameToShardSpecMap.containsKey(sequenceName)) {
                        // Publish with a NumberedShardSpec when appending or when extendable shard
                        // specs are forced; otherwise publish with the shard spec from the lookup.
                        final ShardSpec shardSpecForPublishing = ingestionSchema.getTuningConfig().isForceExtendableShardSpecs() || ingestionSchema.getIOConfig().isAppendToExisting()
                            ? new NumberedShardSpec(shardSpec.getPartitionNum(), shardSpecs.get(interval).size())
                            : shardSpec;
                        sequenceNameToShardSpecMap.put(sequenceName, shardSpecForPublishing);
                    }
                    final SegmentIdentifier identifier = driver.add(inputRow, sequenceName, committerSupplier);
                    if (identifier == null) {
                        throw new ISE("Could not allocate segment for row with timestamp[%s]", inputRow.getTimestamp());
                    }
                    fireDepartmentMetrics.incrementProcessed();
                } catch (ParseException e) {
                    if (ingestionSchema.getTuningConfig().isReportParseExceptions()) {
                        throw e;
                    } else {
                        fireDepartmentMetrics.incrementUnparseable();
                    }
                }
            }
        } finally {
            driver.persist(committerSupplier.get());
        }
        final TransactionalSegmentPublisher publisher = new TransactionalSegmentPublisher() {

            @Override
            public boolean publishSegments(Set<DataSegment> segments, Object commitMetadata) throws IOException {
                final SegmentTransactionalInsertAction action = new SegmentTransactionalInsertAction(segments, null, null);
                return toolbox.getTaskActionClient().submit(action).isSuccess();
            }
        };
        final SegmentsAndMetadata published = driver.finish(publisher, committerSupplier.get());
        if (published == null) {
            log.error("Failed to publish segments, aborting!");
            return false;
        } else {
            log.info("Published segments[%s]", Joiner.on(", ").join(Iterables.transform(published.getSegments(), new Function<DataSegment, String>() {

                @Override
                public String apply(DataSegment input) {
                    return input.getIdentifier();
                }
            })));
            return true;
        }
    }
}
Also used : RealtimeIOConfig(io.druid.segment.indexing.RealtimeIOConfig) SortedSet(java.util.SortedSet) Set(java.util.Set) SegmentIdentifier(io.druid.segment.realtime.appenderator.SegmentIdentifier) ShardSpecLookup(io.druid.timeline.partition.ShardSpecLookup) SegmentTransactionalInsertAction(io.druid.indexing.common.actions.SegmentTransactionalInsertAction) DataSegment(io.druid.timeline.DataSegment) NoneShardSpec(io.druid.timeline.partition.NoneShardSpec) ShardSpec(io.druid.timeline.partition.ShardSpec) NumberedShardSpec(io.druid.timeline.partition.NumberedShardSpec) HashBasedNumberedShardSpec(io.druid.timeline.partition.HashBasedNumberedShardSpec) DateTime(org.joda.time.DateTime) FireDepartment(io.druid.segment.realtime.FireDepartment) TransactionalSegmentPublisher(io.druid.segment.realtime.appenderator.TransactionalSegmentPublisher) ActionBasedSegmentAllocator(io.druid.indexing.appenderator.ActionBasedSegmentAllocator) ISE(io.druid.java.util.common.ISE) Optional(com.google.common.base.Optional) Firehose(io.druid.data.input.Firehose) SegmentsAndMetadata(io.druid.segment.realtime.appenderator.SegmentsAndMetadata) IOException(java.io.IOException) FireDepartmentMetrics(io.druid.segment.realtime.FireDepartmentMetrics) Appenderator(io.druid.segment.realtime.appenderator.Appenderator) GranularitySpec(io.druid.segment.indexing.granularity.GranularitySpec) SegmentAllocator(io.druid.segment.realtime.appenderator.SegmentAllocator) FiniteAppenderatorDriver(io.druid.segment.realtime.appenderator.FiniteAppenderatorDriver) InputRow(io.druid.data.input.InputRow) RealtimeMetricsMonitor(io.druid.segment.realtime.RealtimeMetricsMonitor) Committer(io.druid.data.input.Committer) ParseException(io.druid.java.util.common.parsers.ParseException) Interval(org.joda.time.Interval)
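
On the appendToExisting path above, ActionBasedSegmentAllocator delegates allocation to the overlord instead of computing a shard spec locally. A sketch of what its allocate(...) presumably does, assuming the same SegmentAllocateAction constructor as in the test examples:

public class ActionBasedSegmentAllocator implements SegmentAllocator {

    private final TaskActionClient taskActionClient;
    private final DataSchema dataSchema;

    public ActionBasedSegmentAllocator(TaskActionClient taskActionClient, DataSchema dataSchema) {
        this.taskActionClient = taskActionClient;
        this.dataSchema = dataSchema;
    }

    @Override
    public SegmentIdentifier allocate(DateTime timestamp, String sequenceName, String previousSegmentId) throws IOException {
        // Submit a SegmentAllocateAction so the overlord assigns the partition number
        // under the task's lock, coordinating with any other tasks appending to the
        // same datasource and interval.
        return taskActionClient.submit(new SegmentAllocateAction(dataSchema.getDataSource(), timestamp, dataSchema.getGranularitySpec().getQueryGranularity(), dataSchema.getGranularitySpec().getSegmentGranularity(), sequenceName, previousSegmentId));
    }
}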

Example 9 with SegmentIdentifier

Use of io.druid.segment.realtime.appenderator.SegmentIdentifier in project druid by druid-io.

The class ActionBasedUsedSegmentChecker, method findUsedSegments.

@Override
public Set<DataSegment> findUsedSegments(Set<SegmentIdentifier> identifiers) throws IOException {
    // Group by dataSource
    final Map<String, Set<SegmentIdentifier>> identifiersByDataSource = Maps.newTreeMap();
    for (SegmentIdentifier identifier : identifiers) {
        if (!identifiersByDataSource.containsKey(identifier.getDataSource())) {
            identifiersByDataSource.put(identifier.getDataSource(), Sets.<SegmentIdentifier>newHashSet());
        }
        identifiersByDataSource.get(identifier.getDataSource()).add(identifier);
    }
    final Set<DataSegment> retVal = Sets.newHashSet();
    for (Map.Entry<String, Set<SegmentIdentifier>> entry : identifiersByDataSource.entrySet()) {
        final List<Interval> intervals = JodaUtils.condenseIntervals(Iterables.transform(entry.getValue(), new Function<SegmentIdentifier, Interval>() {

            @Override
            public Interval apply(SegmentIdentifier input) {
                return input.getInterval();
            }
        }));
        final List<DataSegment> usedSegmentsForIntervals = taskActionClient.submit(new SegmentListUsedAction(entry.getKey(), null, intervals));
        for (DataSegment segment : usedSegmentsForIntervals) {
            if (identifiers.contains(SegmentIdentifier.fromDataSegment(segment))) {
                retVal.add(segment);
            }
        }
    }
    return retVal;
}
Also used : Set(java.util.Set) SegmentIdentifier(io.druid.segment.realtime.appenderator.SegmentIdentifier) DataSegment(io.druid.timeline.DataSegment) Function(com.google.common.base.Function) SegmentListUsedAction(io.druid.indexing.common.actions.SegmentListUsedAction) Map(java.util.Map) Interval(org.joda.time.Interval)
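
The grouping loop above predates Java 8 collection idioms. On a newer JDK the same dataSource grouping could be written more compactly; a behavior-preserving sketch, not the project's code:

// TreeMap preserves the sorted-key behavior of Maps.newTreeMap();
// computeIfAbsent replaces the containsKey/put dance.
final Map<String, Set<SegmentIdentifier>> identifiersByDataSource = new TreeMap<>();
for (SegmentIdentifier identifier : identifiers) {
    identifiersByDataSource.computeIfAbsent(identifier.getDataSource(), k -> new HashSet<>()).add(identifier);
}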

Example 10 with SegmentIdentifier

Use of io.druid.segment.realtime.appenderator.SegmentIdentifier in project hive by apache.

The class DruidRecordWriter, method getSegmentIdentifierAndMaybePush.

/**
   * Computes the segment identifier and pushes the current open segment when needed.
   * The push occurs if the max partition size is reached or the event belongs to the
   * next interval. Note that this function assumes timestamps are pseudo-sorted: it
   * will close the open segment and move to the next segment granularity bucket as
   * soon as an event from the next interval appears. The sorting is done by the
   * previous stage.
   *
   * @return the segment identifier for the given truncatedTime; may push the current open segment as a side effect.
   */
private SegmentIdentifier getSegmentIdentifierAndMaybePush(long truncatedTime) {
    final Granularity segmentGranularity = dataSchema.getGranularitySpec().getSegmentGranularity();
    final Interval interval = new Interval(new DateTime(truncatedTime), segmentGranularity.increment(new DateTime(truncatedTime)));
    SegmentIdentifier retVal;
    if (currentOpenSegment == null) {
        retVal = new SegmentIdentifier(dataSchema.getDataSource(), interval, tuningConfig.getVersioningPolicy().getVersion(interval), new LinearShardSpec(0));
        currentOpenSegment = retVal;
        return retVal;
    } else if (currentOpenSegment.getInterval().equals(interval)) {
        retVal = currentOpenSegment;
        int rowCount = appenderator.getRowCount(retVal);
        if (rowCount < maxPartitionSize) {
            return retVal;
        } else {
            retVal = new SegmentIdentifier(dataSchema.getDataSource(), interval, tuningConfig.getVersioningPolicy().getVersion(interval), new LinearShardSpec(currentOpenSegment.getShardSpec().getPartitionNum() + 1));
            pushSegments(Lists.newArrayList(currentOpenSegment));
            LOG.info("Creating new partition for segment {}, partition num {}", retVal.getIdentifierAsString(), retVal.getShardSpec().getPartitionNum());
            currentOpenSegment = retVal;
            return retVal;
        }
    } else {
        retVal = new SegmentIdentifier(dataSchema.getDataSource(), interval, tuningConfig.getVersioningPolicy().getVersion(interval), new LinearShardSpec(0));
        pushSegments(Lists.newArrayList(currentOpenSegment));
        LOG.info("Creating segment {}", retVal.getIdentifierAsString());
        currentOpenSegment = retVal;
        return retVal;
    }
}
Also used : SegmentIdentifier(io.druid.segment.realtime.appenderator.SegmentIdentifier) LinearShardSpec(io.druid.timeline.partition.LinearShardSpec) Granularity(com.metamx.common.Granularity) DateTime(org.joda.time.DateTime) Interval(org.joda.time.Interval)
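
The write path that drives this method is not shown here. A hypothetical calling pattern per incoming row (row and committerSupplier are assumed names): truncate the row's timestamp to the segment granularity, resolve the open segment, then hand the row to the appenderator.

// Hypothetical caller, executed once per row:
final long truncatedTime = segmentGranularity.truncate(new DateTime(row.getTimestampFromEpoch())).getMillis();
final SegmentIdentifier identifier = getSegmentIdentifierAndMaybePush(truncatedTime);
// Appenderator.add(...) may throw if the segment is full or no longer writable.
appenderator.add(identifier, row, committerSupplier);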

Aggregations

SegmentIdentifier (io.druid.segment.realtime.appenderator.SegmentIdentifier) 20
Test (org.junit.Test) 11
NoopTask (io.druid.indexing.common.task.NoopTask) 10
Task (io.druid.indexing.common.task.Task) 10
NumberedShardSpec (io.druid.timeline.partition.NumberedShardSpec) 10
DataSegment (io.druid.timeline.DataSegment) 8
Interval (org.joda.time.Interval) 6
TaskLock (io.druid.indexing.common.TaskLock) 5
LinearShardSpec (io.druid.timeline.partition.LinearShardSpec) 4
DateTime (org.joda.time.DateTime) 4
Predicate (com.google.common.base.Predicate) 3
SegmentTransactionalInsertAction (io.druid.indexing.common.actions.SegmentTransactionalInsertAction) 3
SegmentsAndMetadata (io.druid.segment.realtime.appenderator.SegmentsAndMetadata) 3
Set (java.util.Set) 3
Committer (io.druid.data.input.Committer) 2
InputRow (io.druid.data.input.InputRow) 2
SegmentListUsedAction (io.druid.indexing.common.actions.SegmentListUsedAction) 2
TaskActionClient (io.druid.indexing.common.actions.TaskActionClient) 2
ISE (io.druid.java.util.common.ISE) 2
ParseException (io.druid.java.util.common.parsers.ParseException) 2