Example 11 with SegmentIdentifier

Use of io.druid.segment.realtime.appenderator.SegmentIdentifier in project hive by apache.

The class DruidRecordWriter, method pushSegments:

private void pushSegments(List<SegmentIdentifier> segmentsToPush) {
    try {
        SegmentsAndMetadata segmentsAndMetadata = appenderator.push(segmentsToPush, committerSupplier.get()).get();
        final HashSet<String> pushedSegmentIdentifierHashSet = new HashSet<>();
        for (DataSegment pushedSegment : segmentsAndMetadata.getSegments()) {
            pushedSegmentIdentifierHashSet.add(SegmentIdentifier.fromDataSegment(pushedSegment).getIdentifierAsString());
            final Path segmentDescriptorOutputPath = DruidStorageHandlerUtils.makeSegmentDescriptorOutputPath(pushedSegment, segmentsDescriptorDir);
            DruidStorageHandlerUtils.writeSegmentDescriptor(fileSystem, pushedSegment, segmentDescriptorOutputPath);
            LOG.info(String.format("Pushed the segment [%s] and persisted the descriptor located at [%s]", pushedSegment, segmentDescriptorOutputPath));
        }
        final HashSet<String> toPushSegmentsHashSet = new HashSet<>(FluentIterable.from(segmentsToPush).transform(new Function<SegmentIdentifier, String>() {

            @Nullable
            @Override
            public String apply(@Nullable SegmentIdentifier input) {
                return input.getIdentifierAsString();
            }
        }).toList());
        if (!pushedSegmentIdentifierHashSet.equals(toPushSegmentsHashSet)) {
            throw new IllegalStateException(String.format("was asked to publish [%s] but was able to publish only [%s]", Joiner.on(", ").join(toPushSegmentsHashSet), Joiner.on(", ").join(pushedSegmentIdentifierHashSet)));
        }
        for (SegmentIdentifier dataSegmentId : segmentsToPush) {
            LOG.info("Dropping segment {}", dataSegmentId.toString());
            appenderator.drop(dataSegmentId).get();
        }
        LOG.info(String.format("Published [%,d] segments.", segmentsToPush.size()));
    } catch (InterruptedException e) {
        LOG.error(String.format("got interrupted, failed to push  [%,d] segments.", segmentsToPush.size()), e);
        Thread.currentThread().interrupt();
    } catch (IOException | ExecutionException e) {
        LOG.error(String.format("Failed to push  [%,d] segments.", segmentsToPush.size()), e);
        Throwables.propagate(e);
    }
}
Also used: Path(org.apache.hadoop.fs.Path) SegmentIdentifier(io.druid.segment.realtime.appenderator.SegmentIdentifier) SegmentsAndMetadata(io.druid.segment.realtime.appenderator.SegmentsAndMetadata) IOException(java.io.IOException) DataSegment(io.druid.timeline.DataSegment) ExecutionException(java.util.concurrent.ExecutionException) Nullable(javax.annotation.Nullable) HashSet(java.util.HashSet)
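
The anonymous Guava Function in this example predates Java 8. As a minimal sketch (not the project's code), the same identifier-set verification could be written with streams, assuming only the SegmentIdentifier and SegmentsAndMetadata APIs shown above:

// Sketch only: stream-based equivalent of the set comparison in pushSegments.
// Assumes java.util.Set and java.util.stream.Collectors are imported.
final Set<String> requested = segmentsToPush.stream()
        .map(SegmentIdentifier::getIdentifierAsString)
        .collect(Collectors.toSet());
final Set<String> pushed = segmentsAndMetadata.getSegments().stream()
        .map(segment -> SegmentIdentifier.fromDataSegment(segment).getIdentifierAsString())
        .collect(Collectors.toSet());
if (!pushed.equals(requested)) {
    throw new IllegalStateException(String.format(
        "was asked to publish [%s] but was able to publish only [%s]",
        String.join(", ", requested), String.join(", ", pushed)));
}

Comparing identifier strings rather than DataSegment objects presumably sidesteps any field-level differences between the pushed segments and their in-memory counterparts; only identity matters for the check.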

Example 12 with SegmentIdentifier

Use of io.druid.segment.realtime.appenderator.SegmentIdentifier in project druid by druid-io.

The class SegmentAllocateAction, method perform:

@Override
public SegmentIdentifier perform(final Task task, final TaskActionToolbox toolbox) throws IOException {
    int attempt = 0;
    while (true) {
        attempt++;
        if (!task.getDataSource().equals(dataSource)) {
            throw new IAE("Task dataSource must match action dataSource, [%s] != [%s].", task.getDataSource(), dataSource);
        }
        final IndexerMetadataStorageCoordinator msc = toolbox.getIndexerMetadataStorageCoordinator();
        // 1) if something overlaps our timestamp, use that
        // 2) otherwise try preferredSegmentGranularity & going progressively smaller
        final List<Interval> tryIntervals = Lists.newArrayList();
        final Interval rowInterval = queryGranularity.bucket(timestamp);
        final Set<DataSegment> usedSegmentsForRow = ImmutableSet.copyOf(msc.getUsedSegmentsForInterval(dataSource, rowInterval));
        if (usedSegmentsForRow.isEmpty()) {
            // No existing segments for this row, but there might still be nearby ones that conflict with our
            // preferred segment granularity. Try that first, and then progressively smaller ones if it fails.
            for (Granularity gran : Granularity.granularitiesFinerThan(preferredSegmentGranularity)) {
                tryIntervals.add(gran.bucket(timestamp));
            }
        } else {
            // Existing segment(s) exist for this row; use the interval of the first one.
            tryIntervals.add(usedSegmentsForRow.iterator().next().getInterval());
        }
        for (final Interval tryInterval : tryIntervals) {
            if (tryInterval.contains(rowInterval)) {
                log.debug("Trying to allocate pending segment for rowInterval[%s], segmentInterval[%s].", rowInterval, tryInterval);
                final TaskLock tryLock = toolbox.getTaskLockbox().tryLock(task, tryInterval).orNull();
                if (tryLock != null) {
                    final SegmentIdentifier identifier = msc.allocatePendingSegment(dataSource, sequenceName, previousSegmentId, tryInterval, tryLock.getVersion());
                    if (identifier != null) {
                        return identifier;
                    } else {
                        log.debug("Could not allocate pending segment for rowInterval[%s], segmentInterval[%s].", rowInterval, tryInterval);
                    }
                } else {
                    log.debug("Could not acquire lock for rowInterval[%s], segmentInterval[%s].", rowInterval, tryInterval);
                }
            }
        }
        if (!ImmutableSet.copyOf(msc.getUsedSegmentsForInterval(dataSource, rowInterval)).equals(usedSegmentsForRow)) {
            if (attempt < MAX_ATTEMPTS) {
                final long shortRandomSleep = 50 + (long) (Math.random() * 450);
                log.debug("Used segment set changed for rowInterval[%s]. Retrying segment allocation in %,dms (attempt = %,d).", rowInterval, shortRandomSleep, attempt);
                try {
                    Thread.sleep(shortRandomSleep);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    throw Throwables.propagate(e);
                }
            } else {
                log.error("Used segment set changed for rowInterval[%s]. Not trying again (attempt = %,d).", rowInterval, attempt);
                return null;
            }
        } else {
            return null;
        }
    }
}
Also used: IndexerMetadataStorageCoordinator(io.druid.indexing.overlord.IndexerMetadataStorageCoordinator) TaskLock(io.druid.indexing.common.TaskLock) SegmentIdentifier(io.druid.segment.realtime.appenderator.SegmentIdentifier) IAE(io.druid.java.util.common.IAE) Granularity(io.druid.java.util.common.granularity.Granularity) DataSegment(io.druid.timeline.DataSegment) Interval(org.joda.time.Interval)
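
The sleep before retrying is deliberately jittered: a 50 ms floor plus up to 450 ms of randomness, so competing allocators that all observe a changed used-segment set do not retry in lockstep. A standalone sketch of that shape (simplified: the real loop retries only when the used-segment set changed, and attemptOnce/maxAttempts are hypothetical names, not Druid API):

// Sketch of the jittered retry pattern used by perform(); names are hypothetical.
static <T> T retryWithJitter(java.util.function.Supplier<T> attemptOnce, int maxAttempts)
        throws InterruptedException {
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        final T result = attemptOnce.get();
        if (result != null) {
            return result;
        }
        // 50-500 ms, mirroring shortRandomSleep above.
        Thread.sleep(50 + (long) (Math.random() * 450));
    }
    // Out of attempts; callers treat null as allocation failure.
    return null;
}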

Example 13 with SegmentIdentifier

Use of io.druid.segment.realtime.appenderator.SegmentIdentifier in project druid by druid-io.

The class IndexerSQLMetadataStorageCoordinator, method allocatePendingSegment:

@Override
public SegmentIdentifier allocatePendingSegment(final String dataSource, final String sequenceName, final String previousSegmentId, final Interval interval, final String maxVersion) throws IOException {
    Preconditions.checkNotNull(dataSource, "dataSource");
    Preconditions.checkNotNull(sequenceName, "sequenceName");
    Preconditions.checkNotNull(interval, "interval");
    Preconditions.checkNotNull(maxVersion, "maxVersion");
    final String previousSegmentIdNotNull = previousSegmentId == null ? "" : previousSegmentId;
    return connector.retryTransaction(new TransactionCallback<SegmentIdentifier>() {

        @Override
        public SegmentIdentifier inTransaction(Handle handle, TransactionStatus transactionStatus) throws Exception {
            final List<byte[]> existingBytes = handle
                .createQuery(
                    String.format(
                        "SELECT payload FROM %s WHERE "
                        + "dataSource = :dataSource AND "
                        + "sequence_name = :sequence_name AND "
                        + "sequence_prev_id = :sequence_prev_id",
                        dbTables.getPendingSegmentsTable()
                    )
                )
                .bind("dataSource", dataSource)
                .bind("sequence_name", sequenceName)
                .bind("sequence_prev_id", previousSegmentIdNotNull)
                .map(ByteArrayMapper.FIRST)
                .list();
            if (!existingBytes.isEmpty()) {
                final SegmentIdentifier existingIdentifier = jsonMapper.readValue(Iterables.getOnlyElement(existingBytes), SegmentIdentifier.class);
                if (existingIdentifier.getInterval().getStartMillis() == interval.getStartMillis() && existingIdentifier.getInterval().getEndMillis() == interval.getEndMillis()) {
                    log.info("Found existing pending segment [%s] for sequence[%s] (previous = [%s]) in DB", existingIdentifier.getIdentifierAsString(), sequenceName, previousSegmentIdNotNull);
                    return existingIdentifier;
                } else {
                    log.warn("Cannot use existing pending segment [%s] for sequence[%s] (previous = [%s]) in DB, " + "does not match requested interval[%s]", existingIdentifier.getIdentifierAsString(), sequenceName, previousSegmentIdNotNull, interval);
                    return null;
                }
            }
            // Make up a pending segment based on existing segments and pending segments in the DB. This works
            // assuming that all tasks inserting segments at a particular point in time are going through the
            // allocatePendingSegment flow. This should be assured through some other mechanism (like task locks).
            final SegmentIdentifier newIdentifier;
            final List<TimelineObjectHolder<String, DataSegment>> existingChunks = getTimelineForIntervalsWithHandle(handle, dataSource, ImmutableList.of(interval)).lookup(interval);
            if (existingChunks.size() > 1) {
                // Not possible to expand more than one chunk with a single segment.
                log.warn("Cannot allocate new segment for dataSource[%s], interval[%s], maxVersion[%s]: already have [%,d] chunks.", dataSource, interval, maxVersion, existingChunks.size());
                return null;
            } else {
                SegmentIdentifier max = null;
                if (!existingChunks.isEmpty()) {
                    TimelineObjectHolder<String, DataSegment> existingHolder = Iterables.getOnlyElement(existingChunks);
                    for (PartitionChunk<DataSegment> existing : existingHolder.getObject()) {
                        if (max == null || max.getShardSpec().getPartitionNum() < existing.getObject().getShardSpec().getPartitionNum()) {
                            max = SegmentIdentifier.fromDataSegment(existing.getObject());
                        }
                    }
                }
                final List<SegmentIdentifier> pendings = getPendingSegmentsForIntervalWithHandle(handle, dataSource, interval);
                for (SegmentIdentifier pending : pendings) {
                    if (max == null || pending.getVersion().compareTo(max.getVersion()) > 0 || (pending.getVersion().equals(max.getVersion()) && pending.getShardSpec().getPartitionNum() > max.getShardSpec().getPartitionNum())) {
                        max = pending;
                    }
                }
                if (max == null) {
                    newIdentifier = new SegmentIdentifier(dataSource, interval, maxVersion, new NumberedShardSpec(0, 0));
                } else if (!max.getInterval().equals(interval) || max.getVersion().compareTo(maxVersion) > 0) {
                    log.warn("Cannot allocate new segment for dataSource[%s], interval[%s], maxVersion[%s]: conflicting segment[%s].", dataSource, interval, maxVersion, max.getIdentifierAsString());
                    return null;
                } else if (max.getShardSpec() instanceof LinearShardSpec) {
                    newIdentifier = new SegmentIdentifier(dataSource, max.getInterval(), max.getVersion(), new LinearShardSpec(max.getShardSpec().getPartitionNum() + 1));
                } else if (max.getShardSpec() instanceof NumberedShardSpec) {
                    newIdentifier = new SegmentIdentifier(dataSource, max.getInterval(), max.getVersion(), new NumberedShardSpec(max.getShardSpec().getPartitionNum() + 1, ((NumberedShardSpec) max.getShardSpec()).getPartitions()));
                } else {
                    log.warn("Cannot allocate new segment for dataSource[%s], interval[%s], maxVersion[%s]: ShardSpec class[%s] used by [%s].", dataSource, interval, maxVersion, max.getShardSpec().getClass(), max.getIdentifierAsString());
                    return null;
                }
            }
            // SELECT -> INSERT can fail due to races; callers must be prepared to retry.
            // Avoiding ON DUPLICATE KEY since it's not portable.
            // Avoiding try/catch since it may cause inadvertent transaction-splitting.
            // UNIQUE key for the row, ensuring sequences do not fork in two directions.
            // Using a single column instead of (sequence_name, sequence_prev_id) as some MySQL storage engines
            // have difficulty with large unique keys (see https://github.com/druid-io/druid/issues/2319)
            final String sequenceNamePrevIdSha1 = BaseEncoding.base16().encode(
                Hashing.sha1()
                    .newHasher()
                    .putBytes(StringUtils.toUtf8(sequenceName))
                    .putByte((byte) 0xff)
                    .putBytes(StringUtils.toUtf8(previousSegmentIdNotNull))
                    .hash()
                    .asBytes()
            );
            handle.createStatement(
                String.format(
                    "INSERT INTO %1$s (id, dataSource, created_date, start, %2$send%2$s, "
                    + "sequence_name, sequence_prev_id, sequence_name_prev_id_sha1, payload) "
                    + "VALUES (:id, :dataSource, :created_date, :start, :end, "
                    + ":sequence_name, :sequence_prev_id, :sequence_name_prev_id_sha1, :payload)",
                    dbTables.getPendingSegmentsTable(),
                    connector.getQuoteString()
                )
            )
                .bind("id", newIdentifier.getIdentifierAsString())
                .bind("dataSource", dataSource)
                .bind("created_date", new DateTime().toString())
                .bind("start", interval.getStart().toString())
                .bind("end", interval.getEnd().toString())
                .bind("sequence_name", sequenceName)
                .bind("sequence_prev_id", previousSegmentIdNotNull)
                .bind("sequence_name_prev_id_sha1", sequenceNamePrevIdSha1)
                .bind("payload", jsonMapper.writeValueAsBytes(newIdentifier))
                .execute();
            log.info("Allocated pending segment [%s] for sequence[%s] (previous = [%s]) in DB", newIdentifier.getIdentifierAsString(), sequenceName, previousSegmentIdNotNull);
            return newIdentifier;
        }
    }, ALLOCATE_SEGMENT_QUIET_TRIES, SQLMetadataConnector.DEFAULT_MAX_TRIES);
}
Also used: SegmentIdentifier(io.druid.segment.realtime.appenderator.SegmentIdentifier) LinearShardSpec(io.druid.timeline.partition.LinearShardSpec) TransactionStatus(org.skife.jdbi.v2.TransactionStatus) DataSegment(io.druid.timeline.DataSegment) SQLException(java.sql.SQLException) IOException(java.io.IOException) CallbackFailedException(org.skife.jdbi.v2.exceptions.CallbackFailedException) DateTime(org.joda.time.DateTime) Handle(org.skife.jdbi.v2.Handle) TimelineObjectHolder(io.druid.timeline.TimelineObjectHolder) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) PartitionChunk(io.druid.timeline.partition.PartitionChunk) NumberedShardSpec(io.druid.timeline.partition.NumberedShardSpec)
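
The sequence_name_prev_id_sha1 column compresses the (sequence_name, sequence_prev_id) pair into a single fixed-width UNIQUE key, because some MySQL storage engines have trouble with large composite unique keys (druid issue #2319, cited in the comment above). A minimal sketch of that derivation using plain Guava, with StandardCharsets standing in for Druid's StringUtils.toUtf8 helper:

// Sketch: fixed-width unique key for a (sequenceName, previousSegmentId) pair.
// Assumes com.google.common.hash.Hashing, com.google.common.io.BaseEncoding,
// and java.nio.charset.StandardCharsets are imported.
// The 0xff separator byte can never occur inside UTF-8 text, so
// ("ab", "c") and ("a", "bc") produce different hashes.
static String sequenceKeySha1(String sequenceName, String previousSegmentIdNotNull) {
    return BaseEncoding.base16().encode(
        Hashing.sha1().newHasher()
            .putBytes(sequenceName.getBytes(StandardCharsets.UTF_8))
            .putByte((byte) 0xff)
            .putBytes(previousSegmentIdNotNull.getBytes(StandardCharsets.UTF_8))
            .hash()
            .asBytes()
    );
}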

Example 14 with SegmentIdentifier

Use of io.druid.segment.realtime.appenderator.SegmentIdentifier in project druid by druid-io.

The class IndexerSQLMetadataStorageCoordinator, method getPendingSegmentsForIntervalWithHandle:

private List<SegmentIdentifier> getPendingSegmentsForIntervalWithHandle(final Handle handle, final String dataSource, final Interval interval) throws IOException {
    final List<SegmentIdentifier> identifiers = Lists.newArrayList();
    final ResultIterator<byte[]> dbSegments = handle
        .createQuery(
            String.format(
                "SELECT payload FROM %1$s WHERE dataSource = :dataSource AND start <= :end and %2$send%2$s >= :start",
                dbTables.getPendingSegmentsTable(),
                connector.getQuoteString()
            )
        )
        .bind("dataSource", dataSource)
        .bind("start", interval.getStart().toString())
        .bind("end", interval.getEnd().toString())
        .map(ByteArrayMapper.FIRST)
        .iterator();
    while (dbSegments.hasNext()) {
        final byte[] payload = dbSegments.next();
        final SegmentIdentifier identifier = jsonMapper.readValue(payload, SegmentIdentifier.class);
        if (interval.overlaps(identifier.getInterval())) {
            identifiers.add(identifier);
        }
    }
    dbSegments.close();
    return identifiers;
}
Also used: SegmentIdentifier(io.druid.segment.realtime.appenderator.SegmentIdentifier)
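
Note the two-stage filtering: the SQL WHERE clause is the standard interval-intersection test (start <= :end AND end >= :start) over ISO-8601 strings, but its non-strict comparisons also admit intervals that merely abut the query interval; the Java-side interval.overlaps(...) check then discards those. A sketch of the strict test the in-memory check relies on, assuming Joda-Time's half-open intervals:

// Sketch: strict overlap, unlike the SQL's inclusive bounds.
// An abutting pending segment (its start equal to the query's end) passes
// the SQL predicate but fails here, and is correctly dropped.
static boolean strictlyOverlaps(Interval a, Interval b) {
    return a.getStartMillis() < b.getEndMillis()
        && b.getStartMillis() < a.getEndMillis();
}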

Example 15 with SegmentIdentifier

Use of io.druid.segment.realtime.appenderator.SegmentIdentifier in project druid by druid-io.

The class ActionBasedUsedSegmentCheckerTest, method testBasic:

@Test
public void testBasic() throws IOException {
    final TaskActionClient taskActionClient = EasyMock.createMock(TaskActionClient.class);
    EasyMock.expect(taskActionClient.submit(new SegmentListUsedAction("bar", null, ImmutableList.of(new Interval("2002/P1D"))))).andReturn(ImmutableList.of(DataSegment.builder().dataSource("bar").interval(new Interval("2002/P1D")).shardSpec(new LinearShardSpec(0)).version("b").build(), DataSegment.builder().dataSource("bar").interval(new Interval("2002/P1D")).shardSpec(new LinearShardSpec(1)).version("b").build()));
    EasyMock.expect(taskActionClient.submit(new SegmentListUsedAction("foo", null, ImmutableList.of(new Interval("2000/P1D"), new Interval("2001/P1D"))))).andReturn(ImmutableList.of(DataSegment.builder().dataSource("foo").interval(new Interval("2000/P1D")).shardSpec(new LinearShardSpec(0)).version("a").build(), DataSegment.builder().dataSource("foo").interval(new Interval("2000/P1D")).shardSpec(new LinearShardSpec(1)).version("a").build(), DataSegment.builder().dataSource("foo").interval(new Interval("2001/P1D")).shardSpec(new LinearShardSpec(1)).version("b").build(), DataSegment.builder().dataSource("foo").interval(new Interval("2002/P1D")).shardSpec(new LinearShardSpec(1)).version("b").build()));
    EasyMock.replay(taskActionClient);
    final UsedSegmentChecker checker = new ActionBasedUsedSegmentChecker(taskActionClient);
    final Set<DataSegment> segments = checker.findUsedSegments(
        ImmutableSet.of(
            new SegmentIdentifier("foo", new Interval("2000/P1D"), "a", new LinearShardSpec(1)),
            new SegmentIdentifier("foo", new Interval("2001/P1D"), "b", new LinearShardSpec(0)),
            new SegmentIdentifier("bar", new Interval("2002/P1D"), "b", new LinearShardSpec(0))
        )
    );
    Assert.assertEquals(
        ImmutableSet.of(
            DataSegment.builder().dataSource("foo").interval(new Interval("2000/P1D")).shardSpec(new LinearShardSpec(1)).version("a").build(),
            DataSegment.builder().dataSource("bar").interval(new Interval("2002/P1D")).shardSpec(new LinearShardSpec(0)).version("b").build()
        ),
        segments
    );
    EasyMock.verify(taskActionClient);
}
Also used: TaskActionClient(io.druid.indexing.common.actions.TaskActionClient) SegmentIdentifier(io.druid.segment.realtime.appenderator.SegmentIdentifier) LinearShardSpec(io.druid.timeline.partition.LinearShardSpec) UsedSegmentChecker(io.druid.segment.realtime.appenderator.UsedSegmentChecker) SegmentListUsedAction(io.druid.indexing.common.actions.SegmentListUsedAction) DataSegment(io.druid.timeline.DataSegment) Interval(org.joda.time.Interval) Test(org.junit.Test)

Aggregations

SegmentIdentifier (io.druid.segment.realtime.appenderator.SegmentIdentifier): 20
Test (org.junit.Test): 11
NoopTask (io.druid.indexing.common.task.NoopTask): 10
Task (io.druid.indexing.common.task.Task): 10
NumberedShardSpec (io.druid.timeline.partition.NumberedShardSpec): 10
DataSegment (io.druid.timeline.DataSegment): 8
Interval (org.joda.time.Interval): 6
TaskLock (io.druid.indexing.common.TaskLock): 5
LinearShardSpec (io.druid.timeline.partition.LinearShardSpec): 4
DateTime (org.joda.time.DateTime): 4
Predicate (com.google.common.base.Predicate): 3
SegmentTransactionalInsertAction (io.druid.indexing.common.actions.SegmentTransactionalInsertAction): 3
SegmentsAndMetadata (io.druid.segment.realtime.appenderator.SegmentsAndMetadata): 3
Set (java.util.Set): 3
Committer (io.druid.data.input.Committer): 2
InputRow (io.druid.data.input.InputRow): 2
SegmentListUsedAction (io.druid.indexing.common.actions.SegmentListUsedAction): 2
TaskActionClient (io.druid.indexing.common.actions.TaskActionClient): 2
ISE (io.druid.java.util.common.ISE): 2
ParseException (io.druid.java.util.common.parsers.ParseException): 2