Use of org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec in project druid by druid-io.
The class IndexerSQLMetadataStorageCoordinator, method getPendingSegmentsForIntervalWithHandle.
private Set<SegmentIdWithShardSpec> getPendingSegmentsForIntervalWithHandle(
    final Handle handle,
    final String dataSource,
    final Interval interval
) throws IOException
{
  final Set<SegmentIdWithShardSpec> identifiers = new HashSet<>();

  // The "end" column is wrapped in the connector's quote string because END is a reserved
  // word in some SQL dialects. The inclusive predicate is deliberately broad; exact overlap
  // is re-checked in memory below.
  final ResultIterator<byte[]> dbSegments =
      handle.createQuery(
          StringUtils.format(
              "SELECT payload FROM %1$s WHERE dataSource = :dataSource AND start <= :end and %2$send%2$s >= :start",
              dbTables.getPendingSegmentsTable(),
              connector.getQuoteString()
          )
      )
      .bind("dataSource", dataSource)
      .bind("start", interval.getStart().toString())
      .bind("end", interval.getEnd().toString())
      .map(ByteArrayMapper.FIRST)
      .iterator();

  while (dbSegments.hasNext()) {
    final byte[] payload = dbSegments.next();
    final SegmentIdWithShardSpec identifier = jsonMapper.readValue(payload, SegmentIdWithShardSpec.class);
    if (interval.overlaps(identifier.getInterval())) {
      identifiers.add(identifier);
    }
  }

  dbSegments.close();

  return identifiers;
}
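The inclusive SQL predicate can match rows whose intervals merely abut the requested one, which is why the method re-checks each candidate with Joda-Time's half-open overlap semantics. A minimal standalone sketch of that distinction (the interval values are made up for illustration):

import org.apache.druid.java.util.common.Intervals;
import org.joda.time.Interval;

public class OverlapCheckSketch
{
  public static void main(String[] args)
  {
    final Interval requested = Intervals.of("2017-01-01/2017-02-01");
    final Interval abutting = Intervals.of("2017-02-01/2017-03-01");    // shares only the boundary instant
    final Interval overlapping = Intervals.of("2017-01-15/2017-02-15");

    // Joda intervals are half-open [start, end): an interval that merely abuts does not
    // overlap, even though "start <= :end AND end >= :start" would still match its row.
    System.out.println(requested.overlaps(abutting));     // false
    System.out.println(requested.overlaps(overlapping));  // true
  }
}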
Use of org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec in project druid by druid-io.
The class IndexerSQLMetadataStorageCoordinator, method checkAndGetExistingSegmentId.
private CheckExistingSegmentIdResult checkAndGetExistingSegmentId(
    final Query<Map<String, Object>> query,
    final Interval interval,
    final String sequenceName,
    @Nullable final String previousSegmentId,
    final Pair<String, String>... queryVars
) throws IOException
{
  Query<Map<String, Object>> boundQuery = query;
  for (Pair<String, String> var : queryVars) {
    boundQuery = boundQuery.bind(var.lhs, var.rhs);
  }
  final List<byte[]> existingBytes = boundQuery.map(ByteArrayMapper.FIRST).list();

  if (!existingBytes.isEmpty()) {
    final SegmentIdWithShardSpec existingIdentifier = jsonMapper.readValue(
        Iterables.getOnlyElement(existingBytes),
        SegmentIdWithShardSpec.class
    );

    if (existingIdentifier.getInterval().getStartMillis() == interval.getStartMillis()
        && existingIdentifier.getInterval().getEndMillis() == interval.getEndMillis()) {
      if (previousSegmentId == null) {
        log.info("Found existing pending segment [%s] for sequence[%s] in DB", existingIdentifier, sequenceName);
      } else {
        log.info(
            "Found existing pending segment [%s] for sequence[%s] (previous = [%s]) in DB",
            existingIdentifier,
            sequenceName,
            previousSegmentId
        );
      }
      return new CheckExistingSegmentIdResult(true, existingIdentifier);
    } else {
      if (previousSegmentId == null) {
        log.warn(
            "Cannot use existing pending segment [%s] for sequence[%s] in DB, "
            + "does not match requested interval[%s]",
            existingIdentifier,
            sequenceName,
            interval
        );
      } else {
        log.warn(
            "Cannot use existing pending segment [%s] for sequence[%s] (previous = [%s]) in DB, "
            + "does not match requested interval[%s]",
            existingIdentifier,
            sequenceName,
            previousSegmentId,
            interval
        );
      }
      // A row exists for this sequence but its interval doesn't match: report found=true with a
      // null identifier so the caller does not allocate a conflicting segment.
      return new CheckExistingSegmentIdResult(true, null);
    }
  }

  return new CheckExistingSegmentIdResult(false, null);
}
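The varargs Pair binding at the top of the method is a small reusable pattern: the caller supplies the SQL and the (name, value) pairs together. A self-contained sketch against JDBI v2; the table and parameter names here are hypothetical, not Druid's real schema:

import java.util.List;
import java.util.Map;
import org.apache.druid.java.util.common.Pair;
import org.skife.jdbi.v2.Handle;
import org.skife.jdbi.v2.Query;
import org.skife.jdbi.v2.util.ByteArrayMapper;

public class BindVarsSketch
{
  @SafeVarargs
  static List<byte[]> selectPayloads(final Handle handle, final Pair<String, String>... queryVars)
  {
    // Hypothetical table and bound parameters, for illustration only.
    Query<Map<String, Object>> query = handle.createQuery(
        "SELECT payload FROM example_pending_segments WHERE dataSource = :ds AND sequence_name = :seq"
    );
    // Same fold as in checkAndGetExistingSegmentId: bind each (name, value) pair in turn.
    for (Pair<String, String> var : queryVars) {
      query = query.bind(var.lhs, var.rhs);
    }
    return query.map(ByteArrayMapper.FIRST).list();
  }
}

A caller would then pass the pairs alongside the query, e.g. selectPayloads(handle, Pair.of("ds", "wikipedia"), Pair.of("seq", "seq1")).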
Use of org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec in project druid by druid-io.
The class IndexerSQLMetadataStorageCoordinator, method allocatePendingSegmentWithSegmentLineageCheck.
@Nullable
private SegmentIdWithShardSpec allocatePendingSegmentWithSegmentLineageCheck(
    final Handle handle,
    final String dataSource,
    final String sequenceName,
    @Nullable final String previousSegmentId,
    final Interval interval,
    final PartialShardSpec partialShardSpec,
    final String maxVersion
) throws IOException
{
  final String previousSegmentIdNotNull = previousSegmentId == null ? "" : previousSegmentId;

  final CheckExistingSegmentIdResult result = checkAndGetExistingSegmentId(
      handle.createQuery(
          StringUtils.format(
              "SELECT payload FROM %s WHERE "
              + "dataSource = :dataSource AND "
              + "sequence_name = :sequence_name AND "
              + "sequence_prev_id = :sequence_prev_id",
              dbTables.getPendingSegmentsTable()
          )
      ),
      interval,
      sequenceName,
      previousSegmentIdNotNull,
      Pair.of("dataSource", dataSource),
      Pair.of("sequence_name", sequenceName),
      Pair.of("sequence_prev_id", previousSegmentIdNotNull)
  );

  if (result.found) {
    // The found existing segment identifier can be null if its interval doesn't match the given interval
    return result.segmentIdentifier;
  }

  final SegmentIdWithShardSpec newIdentifier = createNewSegment(handle, dataSource, interval, partialShardSpec, maxVersion);
  if (newIdentifier == null) {
    return null;
  }

  // SELECT -> INSERT can fail due to races; callers must be prepared to retry.
  // Avoiding ON DUPLICATE KEY since it's not portable.
  // Avoiding try/catch since it may cause inadvertent transaction-splitting.

  // UNIQUE key for the row, ensuring sequences do not fork in two directions.
  // Using a single column instead of (sequence_name, sequence_prev_id) as some MySQL storage engines
  // have difficulty with large unique keys (see https://github.com/apache/druid/issues/2319)
  final String sequenceNamePrevIdSha1 = BaseEncoding.base16().encode(
      Hashing.sha1()
             .newHasher()
             .putBytes(StringUtils.toUtf8(sequenceName))
             .putByte((byte) 0xff)
             .putBytes(StringUtils.toUtf8(previousSegmentIdNotNull))
             .hash()
             .asBytes()
  );

  insertToMetastore(handle, newIdentifier, dataSource, interval, previousSegmentIdNotNull, sequenceName, sequenceNamePrevIdSha1);

  return newIdentifier;
}
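The unique-key derivation can be reproduced with Guava alone. A minimal sketch with made-up sample values; note that the 0xff separator byte never occurs in valid UTF-8, so distinct (sequence_name, sequence_prev_id) pairs cannot collide by concatenation:

import com.google.common.hash.Hashing;
import com.google.common.io.BaseEncoding;
import java.nio.charset.StandardCharsets;

public class SequenceKeySketch
{
  public static void main(String[] args)
  {
    final String sequenceName = "seq";    // hypothetical sample value
    final String previousSegmentId = "";  // empty string stands in for a null previous id

    // Same derivation as above: sha1(sequenceName + 0xff + previousSegmentId), hex-encoded.
    final String sha1 = BaseEncoding.base16().encode(
        Hashing.sha1()
               .newHasher()
               .putBytes(sequenceName.getBytes(StandardCharsets.UTF_8))
               .putByte((byte) 0xff)
               .putBytes(previousSegmentId.getBytes(StandardCharsets.UTF_8))
               .hash()
               .asBytes()
    );
    System.out.println(sha1); // 40 hex chars, small enough for a short unique-indexed column
  }
}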
Use of org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec in project druid by druid-io.
The class IndexerSQLMetadataStorageCoordinatorTest, method testAllocatePendingSegmentsForHashBasedNumberedShardSpec.
@Test
public void testAllocatePendingSegmentsForHashBasedNumberedShardSpec() throws IOException
{
  final PartialShardSpec partialShardSpec = new HashBasedNumberedPartialShardSpec(null, 2, 5, null);
  final String dataSource = "ds";
  final Interval interval = Intervals.of("2017-01-01/2017-02-01");

  SegmentIdWithShardSpec id = coordinator.allocatePendingSegment(
      dataSource,
      "seq",
      null,
      interval,
      partialShardSpec,
      "version",
      true
  );
  HashBasedNumberedShardSpec shardSpec = (HashBasedNumberedShardSpec) id.getShardSpec();
  Assert.assertEquals(0, shardSpec.getPartitionNum());
  Assert.assertEquals(0, shardSpec.getNumCorePartitions());
  Assert.assertEquals(5, shardSpec.getNumBuckets());

  coordinator.announceHistoricalSegments(
      Collections.singleton(
          new DataSegment(
              id.getDataSource(),
              id.getInterval(),
              id.getVersion(),
              null,
              Collections.emptyList(),
              Collections.emptyList(),
              id.getShardSpec(),
              0,
              10L
          )
      )
  );

  // A second allocation on the same interval gets the next partition number.
  id = coordinator.allocatePendingSegment(dataSource, "seq2", null, interval, partialShardSpec, "version", true);
  shardSpec = (HashBasedNumberedShardSpec) id.getShardSpec();
  Assert.assertEquals(1, shardSpec.getPartitionNum());
  Assert.assertEquals(0, shardSpec.getNumCorePartitions());
  Assert.assertEquals(5, shardSpec.getNumBuckets());

  coordinator.announceHistoricalSegments(
      Collections.singleton(
          new DataSegment(
              id.getDataSource(),
              id.getInterval(),
              id.getVersion(),
              null,
              Collections.emptyList(),
              Collections.emptyList(),
              id.getShardSpec(),
              0,
              10L
          )
      )
  );

  // Partition numbers keep increasing even when the new request uses a different bucket count.
  id = coordinator.allocatePendingSegment(
      dataSource,
      "seq3",
      null,
      interval,
      new HashBasedNumberedPartialShardSpec(null, 2, 3, null),
      "version",
      true
  );
  shardSpec = (HashBasedNumberedShardSpec) id.getShardSpec();
  Assert.assertEquals(2, shardSpec.getPartitionNum());
  Assert.assertEquals(0, shardSpec.getNumCorePartitions());
  Assert.assertEquals(3, shardSpec.getNumBuckets());
}
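For readability, here is one reading of the HashBasedNumberedPartialShardSpec arguments used above. The argument roles are an assumption based on the field order in current Druid sources and may differ across versions:

import org.apache.druid.timeline.partition.HashBasedNumberedPartialShardSpec;
import org.apache.druid.timeline.partition.PartialShardSpec;

public class PartialShardSpecSketch
{
  public static void main(String[] args)
  {
    final PartialShardSpec spec = new HashBasedNumberedPartialShardSpec(
        null, // partitionDimensions: null/empty hashes the whole row (assumption)
        2,    // bucketId of this allocation
        5,    // numBuckets, echoed by HashBasedNumberedShardSpec.getNumBuckets() in the test
        null  // partitionFunction: null defers to the default hash function (assumption)
    );
    System.out.println(spec);
  }
}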
Use of org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec in project druid by druid-io.
The class IndexerSQLMetadataStorageCoordinatorTest, method testAllocatePendingSegmentAfterDroppingExistingSegment.
/**
 * This test simulates an issue detected in the field, consisting of the following sequence of events:
 * - A Kafka stream segment was created on a given interval
 * - Later, after the above was published, another segment on the same interval was created by the stream
 * - Later, after the above was published, another segment on the same interval was created by the stream
 * - Later, a compaction was issued for the three segments above
 * - Later, after the above was published, another segment on the same interval was created by the stream
 * - Later, the compacted segment was dropped due to a drop rule
 * - Later, after the above was dropped, another segment on the same interval was created by the stream, but this
 * time there was an integrity violation in the pending segments table because the
 * {@link IndexerSQLMetadataStorageCoordinator#createNewSegment(Handle, String, Interval, PartialShardSpec, String)}
 * method returned a segment id that already existed in the pending segments table
 */
@Test
public void testAllocatePendingSegmentAfterDroppingExistingSegment()
{
  String maxVersion = "version_newer_newer";

  // simulate one load using kafka streaming
  final PartialShardSpec partialShardSpec = NumberedPartialShardSpec.instance();
  final String dataSource = "ds";
  final Interval interval = Intervals.of("2017-01-01/2017-02-01");
  final SegmentIdWithShardSpec identifier = coordinator.allocatePendingSegment(
      dataSource,
      "seq",
      null,
      interval,
      partialShardSpec,
      "version",
      true
  );
  Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version", identifier.toString());

  // simulate one more load using kafka streaming (as if the previous segment was published; note the different sequence name)
  final SegmentIdWithShardSpec identifier1 = coordinator.allocatePendingSegment(
      dataSource,
      "seq2",
      identifier.toString(),
      interval,
      partialShardSpec,
      maxVersion,
      true
  );
  Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_1", identifier1.toString());

  // simulate one more load using kafka streaming (as if the previous segment was published; note the different sequence name)
  final SegmentIdWithShardSpec identifier2 = coordinator.allocatePendingSegment(
      dataSource,
      "seq3",
      identifier1.toString(),
      interval,
      partialShardSpec,
      maxVersion,
      true
  );
  Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_2", identifier2.toString());

  // now simulate a compaction (batch ingestion) over the same interval, as if reindexing the previous three:
  DataSegment segment = new DataSegment(
      "ds",
      Intervals.of("2017-01-01T00Z/2017-02-01T00Z"),
      "version_new",
      ImmutableMap.of(),
      ImmutableList.of("dim1"),
      ImmutableList.of("m1"),
      new LinearShardSpec(0),
      9,
      100
  );
  Assert.assertTrue(insertUsedSegments(ImmutableSet.of(segment)));
  List<String> ids = retrieveUsedSegmentIds();
  Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_new", ids.get(0));

  // one more load on the same interval:
  final SegmentIdWithShardSpec identifier3 = coordinator.allocatePendingSegment(
      dataSource,
      "seq4",
      identifier1.toString(),
      interval,
      partialShardSpec,
      maxVersion,
      true
  );
  Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_new_1", identifier3.toString());

  // now drop the used segment previously loaded:
  markAllSegmentsUnused(ImmutableSet.of(segment));

  // final load; this reproduces an issue that could happen with multiple streaming appends,
  // followed by a reindex, followed by a drop, and more streaming data coming in for the same interval
  final SegmentIdWithShardSpec identifier4 = coordinator.allocatePendingSegment(
      dataSource,
      "seq5",
      identifier1.toString(),
      interval,
      partialShardSpec,
      maxVersion,
      true
  );
  Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_new_2", identifier4.toString());
}
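The id strings asserted above follow SegmentId's string form: dataSource, interval start, interval end, and version, joined by underscores, with the partition number appended only when it is non-zero. A small sketch whose values mirror the test's assertions:

import org.apache.druid.java.util.common.Intervals;
import org.apache.druid.timeline.SegmentId;

public class SegmentIdSketch
{
  public static void main(String[] args)
  {
    // Partition 0 is omitted from the string form; higher partitions get a numeric suffix.
    System.out.println(SegmentId.of("ds", Intervals.of("2017-01-01/2017-02-01"), "version", 0));
    // -> ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version
    System.out.println(SegmentId.of("ds", Intervals.of("2017-01-01/2017-02-01"), "version_new", 2));
    // -> ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_new_2
  }
}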