Search in sources :

Example 1 with SegmentIdWithShardSpec

use of org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec in project druid by druid-io.

the class IndexerSQLMetadataStorageCoordinator method getPendingSegmentsForIntervalWithHandle.

private Set<SegmentIdWithShardSpec> getPendingSegmentsForIntervalWithHandle(final Handle handle, final String dataSource, final Interval interval) throws IOException {
    final Set<SegmentIdWithShardSpec> identifiers = new HashSet<>();
    // Coarse range filter in SQL; the exact overlap check happens below on the deserialized identifiers.
    final ResultIterator<byte[]> dbSegments = handle
        .createQuery(StringUtils.format(
            "SELECT payload FROM %1$s WHERE dataSource = :dataSource AND start <= :end and %2$send%2$s >= :start",
            dbTables.getPendingSegmentsTable(), connector.getQuoteString()))
        .bind("dataSource", dataSource)
        .bind("start", interval.getStart().toString())
        .bind("end", interval.getEnd().toString())
        .map(ByteArrayMapper.FIRST)
        .iterator();
    while (dbSegments.hasNext()) {
        final byte[] payload = dbSegments.next();
        final SegmentIdWithShardSpec identifier = jsonMapper.readValue(payload, SegmentIdWithShardSpec.class);
        if (interval.overlaps(identifier.getInterval())) {
            identifiers.add(identifier);
        }
    }
    dbSegments.close();
    return identifiers;
}
Also used : SegmentIdWithShardSpec(org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec) HashSet(java.util.HashSet)
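
The SQL predicate is only a coarse prefilter over the stored start/end values; the authoritative check is the in-memory interval.overlaps(...) call on each deserialized identifier. A minimal joda-time sketch of that final check (class name and interval values are illustrative, not taken from Druid):

import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.Interval;

public class OverlapCheckSketch {
    public static void main(String[] args) {
        Interval query = new Interval(new DateTime("2017-01-01", DateTimeZone.UTC), new DateTime("2017-02-01", DateTimeZone.UTC));
        Interval pending = new Interval(new DateTime("2017-01-15", DateTimeZone.UTC), new DateTime("2017-03-01", DateTimeZone.UTC));
        // Joda intervals are half-open; overlaps() is true whenever the two ranges share any instant.
        System.out.println(query.overlaps(pending));    // true -> this pending segment would be returned
    }
}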

Example 2 with SegmentIdWithShardSpec

use of org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec in project druid by druid-io.

the class IndexerSQLMetadataStorageCoordinator method checkAndGetExistingSegmentId.

private CheckExistingSegmentIdResult checkAndGetExistingSegmentId(final Query<Map<String, Object>> query, final Interval interval, final String sequenceName, @Nullable final String previousSegmentId, final Pair<String, String>... queryVars) throws IOException {
    Query<Map<String, Object>> boundQuery = query;
    for (Pair<String, String> var : queryVars) {
        boundQuery = boundQuery.bind(var.lhs, var.rhs);
    }
    final List<byte[]> existingBytes = boundQuery.map(ByteArrayMapper.FIRST).list();
    if (!existingBytes.isEmpty()) {
        final SegmentIdWithShardSpec existingIdentifier = jsonMapper.readValue(
            Iterables.getOnlyElement(existingBytes),
            SegmentIdWithShardSpec.class
        );
        // Reuse requires an exact boundary match, not just an overlap.
        if (existingIdentifier.getInterval().getStartMillis() == interval.getStartMillis()
            && existingIdentifier.getInterval().getEndMillis() == interval.getEndMillis()) {
            if (previousSegmentId == null) {
                log.info("Found existing pending segment [%s] for sequence[%s] in DB", existingIdentifier, sequenceName);
            } else {
                log.info(
                    "Found existing pending segment [%s] for sequence[%s] (previous = [%s]) in DB",
                    existingIdentifier, sequenceName, previousSegmentId
                );
            }
            return new CheckExistingSegmentIdResult(true, existingIdentifier);
        } else {
            // found = true but identifier = null: the sequence exists, yet its interval does not match the request.
            if (previousSegmentId == null) {
                log.warn(
                    "Cannot use existing pending segment [%s] for sequence[%s] in DB, does not match requested interval[%s]",
                    existingIdentifier, sequenceName, interval
                );
            } else {
                log.warn(
                    "Cannot use existing pending segment [%s] for sequence[%s] (previous = [%s]) in DB, does not match requested interval[%s]",
                    existingIdentifier, sequenceName, previousSegmentId, interval
                );
            }
            return new CheckExistingSegmentIdResult(true, null);
        }
    }
    return new CheckExistingSegmentIdResult(false, null);
}
Also used : Map(java.util.Map) SegmentIdWithShardSpec(org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec)
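
Note that reuse demands exact boundary equality on start and end millis, not mere overlap; an overlapping-but-different interval yields found = true with a null identifier, which the caller treats as "do not reuse". A small joda-time sketch of that distinction (values are illustrative):

import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.Interval;

public class ExactIntervalMatchSketch {
    public static void main(String[] args) {
        Interval requested = new Interval(new DateTime("2017-01-01", DateTimeZone.UTC), new DateTime("2017-02-01", DateTimeZone.UTC));
        Interval existing = new Interval(new DateTime("2017-01-01", DateTimeZone.UTC), new DateTime("2017-03-01", DateTimeZone.UTC));
        boolean exactMatch = existing.getStartMillis() == requested.getStartMillis()
            && existing.getEndMillis() == requested.getEndMillis();
        System.out.println(existing.overlaps(requested));  // true: the intervals intersect
        System.out.println(exactMatch);                    // false: the existing identifier cannot be reused
    }
}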

Example 3 with SegmentIdWithShardSpec

use of org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec in project druid by druid-io.

the class IndexerSQLMetadataStorageCoordinator method allocatePendingSegmentWithSegmentLineageCheck.

@Nullable
private SegmentIdWithShardSpec allocatePendingSegmentWithSegmentLineageCheck(final Handle handle, final String dataSource, final String sequenceName, @Nullable final String previousSegmentId, final Interval interval, final PartialShardSpec partialShardSpec, final String maxVersion) throws IOException {
    final String previousSegmentIdNotNull = previousSegmentId == null ? "" : previousSegmentId;
    final CheckExistingSegmentIdResult result = checkAndGetExistingSegmentId(
        handle.createQuery(StringUtils.format(
            "SELECT payload FROM %s WHERE "
            + "dataSource = :dataSource AND "
            + "sequence_name = :sequence_name AND "
            + "sequence_prev_id = :sequence_prev_id",
            dbTables.getPendingSegmentsTable())),
        interval,
        sequenceName,
        previousSegmentIdNotNull,
        Pair.of("dataSource", dataSource),
        Pair.of("sequence_name", sequenceName),
        Pair.of("sequence_prev_id", previousSegmentIdNotNull)
    );
    if (result.found) {
        // The found existing segment identifier can be null if its interval doesn't match with the given interval
        return result.segmentIdentifier;
    }
    final SegmentIdWithShardSpec newIdentifier = createNewSegment(handle, dataSource, interval, partialShardSpec, maxVersion);
    if (newIdentifier == null) {
        return null;
    }
    // SELECT -> INSERT can fail due to races; callers must be prepared to retry.
    // Avoiding ON DUPLICATE KEY since it's not portable.
    // Avoiding try/catch since it may cause inadvertent transaction-splitting.
    // UNIQUE key for the row, ensuring sequences do not fork in two directions.
    // Using a single column instead of (sequence_name, sequence_prev_id) as some MySQL storage engines
    // have difficulty with large unique keys (see https://github.com/apache/druid/issues/2319)
    final String sequenceNamePrevIdSha1 = BaseEncoding.base16().encode(
        Hashing.sha1().newHasher()
            .putBytes(StringUtils.toUtf8(sequenceName))
            .putByte((byte) 0xff)
            .putBytes(StringUtils.toUtf8(previousSegmentIdNotNull))
            .hash().asBytes()
    );
    insertToMetastore(handle, newIdentifier, dataSource, interval, previousSegmentIdNotNull, sequenceName, sequenceNamePrevIdSha1);
    return newIdentifier;
}
Also used : SegmentIdWithShardSpec(org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec) Nullable(javax.annotation.Nullable)
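
The hashed sequence_name_prev_id_sha1 value stands in for a (sequence_name, sequence_prev_id) composite unique key, as the comments above explain. A standalone sketch of the same hashing scheme using plain Guava and java.nio instead of Druid's StringUtils (assumed equivalent, since StringUtils.toUtf8 is a UTF-8 encoding helper):

import com.google.common.hash.Hashing;
import com.google.common.io.BaseEncoding;
import java.nio.charset.StandardCharsets;

public class SequenceKeySketch {
    public static void main(String[] args) {
        String sequenceName = "seq";
        String previousSegmentIdNotNull = "";  // empty string when there is no previous segment
        // sequence_name and sequence_prev_id joined by a 0xff separator (a byte that can never occur
        // in UTF-8 text), hashed to a fixed-length hex string that fits a small unique index.
        String sequenceNamePrevIdSha1 = BaseEncoding.base16().encode(
            Hashing.sha1().newHasher()
                .putBytes(sequenceName.getBytes(StandardCharsets.UTF_8))
                .putByte((byte) 0xff)
                .putBytes(previousSegmentIdNotNull.getBytes(StandardCharsets.UTF_8))
                .hash()
                .asBytes());
        System.out.println(sequenceNamePrevIdSha1);  // 40 hex characters regardless of input length
    }
}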

Example 4 with SegmentIdWithShardSpec

use of org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec in project druid by druid-io.

the class IndexerSQLMetadataStorageCoordinatorTest method testAllocatePendingSegmentsForHashBasedNumberedShardSpec.

@Test
public void testAllocatePendingSegmentsForHashBasedNumberedShardSpec() throws IOException {
    final PartialShardSpec partialShardSpec = new HashBasedNumberedPartialShardSpec(null, 2, 5, null);
    final String dataSource = "ds";
    final Interval interval = Intervals.of("2017-01-01/2017-02-01");
    SegmentIdWithShardSpec id = coordinator.allocatePendingSegment(dataSource, "seq", null, interval, partialShardSpec, "version", true);
    HashBasedNumberedShardSpec shardSpec = (HashBasedNumberedShardSpec) id.getShardSpec();
    Assert.assertEquals(0, shardSpec.getPartitionNum());
    Assert.assertEquals(0, shardSpec.getNumCorePartitions());
    Assert.assertEquals(5, shardSpec.getNumBuckets());
    coordinator.announceHistoricalSegments(Collections.singleton(new DataSegment(
        id.getDataSource(), id.getInterval(), id.getVersion(), null,
        Collections.emptyList(), Collections.emptyList(), id.getShardSpec(), 0, 10L)));
    id = coordinator.allocatePendingSegment(dataSource, "seq2", null, interval, partialShardSpec, "version", true);
    shardSpec = (HashBasedNumberedShardSpec) id.getShardSpec();
    Assert.assertEquals(1, shardSpec.getPartitionNum());
    Assert.assertEquals(0, shardSpec.getNumCorePartitions());
    Assert.assertEquals(5, shardSpec.getNumBuckets());
    coordinator.announceHistoricalSegments(Collections.singleton(new DataSegment(
        id.getDataSource(), id.getInterval(), id.getVersion(), null,
        Collections.emptyList(), Collections.emptyList(), id.getShardSpec(), 0, 10L)));
    id = coordinator.allocatePendingSegment(
        dataSource, "seq3", null, interval,
        new HashBasedNumberedPartialShardSpec(null, 2, 3, null), "version", true);
    shardSpec = (HashBasedNumberedShardSpec) id.getShardSpec();
    Assert.assertEquals(2, shardSpec.getPartitionNum());
    Assert.assertEquals(0, shardSpec.getNumCorePartitions());
    Assert.assertEquals(3, shardSpec.getNumBuckets());
}
Also used : HashBasedNumberedShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedShardSpec) HashBasedNumberedPartialShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedPartialShardSpec) PartialShardSpec(org.apache.druid.timeline.partition.PartialShardSpec) NumberedPartialShardSpec(org.apache.druid.timeline.partition.NumberedPartialShardSpec) NumberedOverwritePartialShardSpec(org.apache.druid.timeline.partition.NumberedOverwritePartialShardSpec) HashBasedNumberedPartialShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedPartialShardSpec) SegmentIdWithShardSpec(org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec) DataSegment(org.apache.druid.timeline.DataSegment) Interval(org.joda.time.Interval) Test(org.junit.Test)
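
The assertions above show the allocation pattern for hash-based specs: each new pending segment on the same interval receives the next partition number, numCorePartitions stays 0 for these appended segments, and numBuckets simply echoes whatever the requesting HashBasedNumberedPartialShardSpec asked for (5, 5, then 3).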

Example 5 with SegmentIdWithShardSpec

use of org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec in project druid by druid-io.

the class IndexerSQLMetadataStorageCoordinatorTest method testAllocatePendingSegmentAfterDroppingExistingSegment.

/**
 * This test simulates an issue detected in the field, consisting of the following sequence of events:
 * - A Kafka stream segment was created on a given interval
 * - Later, after the above was published, another segment on the same interval was created by the stream
 * - Later, after the above was published, another segment on the same interval was created by the stream
 * - Later a compaction was issued for the three segments above
 * - Later, after the above was published, another segment on the same interval was created by the stream
 * - Later, the compacted segment got dropped due to a drop rule
 * - Later, after the above was dropped, another segment on the same interval was created by the stream, but this
 *   time there was an integrity violation in the pending segments table because the
 *   {@link IndexerSQLMetadataStorageCoordinator#createNewSegment(Handle, String, Interval, PartialShardSpec, String)}
 *   method returned a segment id that already existed in the pending segments table
 */
@Test
public void testAllocatePendingSegmentAfterDroppingExistingSegment() {
    String maxVersion = "version_newer_newer";
    // simulate one load using kafka streaming
    final PartialShardSpec partialShardSpec = NumberedPartialShardSpec.instance();
    final String dataSource = "ds";
    final Interval interval = Intervals.of("2017-01-01/2017-02-01");
    final SegmentIdWithShardSpec identifier = coordinator.allocatePendingSegment(dataSource, "seq", null, interval, partialShardSpec, "version", true);
    Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version", identifier.toString());
    // simulate one more load using kafka streaming (as if previous segment was published, note different sequence name)
    final SegmentIdWithShardSpec identifier1 = coordinator.allocatePendingSegment(dataSource, "seq2", identifier.toString(), interval, partialShardSpec, maxVersion, true);
    Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_1", identifier1.toString());
    // simulate one more load using kafka streaming (as if previous segment was published, note different sequence name)
    final SegmentIdWithShardSpec identifier2 = coordinator.allocatePendingSegment(dataSource, "seq3", identifier1.toString(), interval, partialShardSpec, maxVersion, true);
    Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_2", identifier2.toString());
    // now simulate that one compaction was done (batch) ingestion for same interval (like reindex of the previous three):
    DataSegment segment = new DataSegment(
        "ds", Intervals.of("2017-01-01T00Z/2017-02-01T00Z"), "version_new",
        ImmutableMap.of(), ImmutableList.of("dim1"), ImmutableList.of("m1"),
        new LinearShardSpec(0), 9, 100);
    Assert.assertTrue(insertUsedSegments(ImmutableSet.of(segment)));
    List<String> ids = retrieveUsedSegmentIds();
    Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_new", ids.get(0));
    // one more load on same interval:
    final SegmentIdWithShardSpec identifier3 = coordinator.allocatePendingSegment(dataSource, "seq4", identifier1.toString(), interval, partialShardSpec, maxVersion, true);
    Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_new_1", identifier3.toString());
    // now drop the used segment previously loaded:
    markAllSegmentsUnused(ImmutableSet.of(segment));
    // and final load, this reproduces an issue that could happen with multiple streaming appends,
    // followed by a reindex, followed by a drop, and more streaming data coming in for same interval
    final SegmentIdWithShardSpec identifier4 = coordinator.allocatePendingSegment(dataSource, "seq5", identifier1.toString(), interval, partialShardSpec, maxVersion, true);
    Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_new_2", identifier4.toString());
}
Also used : LinearShardSpec(org.apache.druid.timeline.partition.LinearShardSpec) HashBasedNumberedPartialShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedPartialShardSpec) PartialShardSpec(org.apache.druid.timeline.partition.PartialShardSpec) NumberedPartialShardSpec(org.apache.druid.timeline.partition.NumberedPartialShardSpec) NumberedOverwritePartialShardSpec(org.apache.druid.timeline.partition.NumberedOverwritePartialShardSpec) SegmentIdWithShardSpec(org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec) DataSegment(org.apache.druid.timeline.DataSegment) Interval(org.joda.time.Interval) Test(org.junit.Test)
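
The string assertions rely on the pending segment id layout: dataSource, start, end, and version joined by underscores, with the partition number appended only when it is non-zero. The real formatting lives in Druid's own segment id classes; the helper below is a hypothetical re-creation for illustration only:

import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.Interval;

public class PendingSegmentIdSketch {
    // Hypothetical helper, not Druid's SegmentId: rebuilds the id pattern seen in the assertions above.
    static String pendingSegmentId(String dataSource, Interval interval, String version, int partitionNum) {
        String base = dataSource + "_" + interval.getStart() + "_" + interval.getEnd() + "_" + version;
        return partitionNum == 0 ? base : base + "_" + partitionNum;
    }

    public static void main(String[] args) {
        Interval interval = new Interval(new DateTime("2017-01-01", DateTimeZone.UTC), new DateTime("2017-02-01", DateTimeZone.UTC));
        System.out.println(pendingSegmentId("ds", interval, "version", 0));  // ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version
        System.out.println(pendingSegmentId("ds", interval, "version", 1));  // ..._version_1
    }
}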

Aggregations

SegmentIdWithShardSpec (org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec) 40
Test (org.junit.Test) 23
Interval (org.joda.time.Interval) 17
DataSegment (org.apache.druid.timeline.DataSegment) 16
NoopTask (org.apache.druid.indexing.common.task.NoopTask) 12
Task (org.apache.druid.indexing.common.task.Task) 12
LinearShardSpec (org.apache.druid.timeline.partition.LinearShardSpec) 12
PartialShardSpec (org.apache.druid.timeline.partition.PartialShardSpec) 11
HashBasedNumberedPartialShardSpec (org.apache.druid.timeline.partition.HashBasedNumberedPartialShardSpec) 10
NumberedPartialShardSpec (org.apache.druid.timeline.partition.NumberedPartialShardSpec) 10
HashBasedNumberedShardSpec (org.apache.druid.timeline.partition.HashBasedNumberedShardSpec) 9
NumberedShardSpec (org.apache.druid.timeline.partition.NumberedShardSpec) 9
DateTime (org.joda.time.DateTime) 9
IOException (java.io.IOException) 8
HashSet (java.util.HashSet) 7
NumberedOverwritePartialShardSpec (org.apache.druid.timeline.partition.NumberedOverwritePartialShardSpec) 7
Map (java.util.Map) 6
ShardSpec (org.apache.druid.timeline.partition.ShardSpec) 6
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper) 5
Iterables (com.google.common.collect.Iterables) 5