use of org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec in project druid by druid-io.
the class IndexerSQLMetadataStorageCoordinatorTest method testAllocatePendingSegmentsForHashBasedNumberedShardSpec.
@Test
public void testAllocatePendingSegmentsForHashBasedNumberedShardSpec() throws IOException {
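// set up a hash-based partial shard spec requesting 5 buckets, then allocate the first pending segment for the interval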
final PartialShardSpec partialShardSpec = new HashBasedNumberedPartialShardSpec(null, 2, 5, null);
final String dataSource = "ds";
final Interval interval = Intervals.of("2017-01-01/2017-02-01");
SegmentIdWithShardSpec id = coordinator.allocatePendingSegment(dataSource, "seq", null, interval, partialShardSpec, "version", true);
HashBasedNumberedShardSpec shardSpec = (HashBasedNumberedShardSpec) id.getShardSpec();
Assert.assertEquals(0, shardSpec.getPartitionNum());
Assert.assertEquals(0, shardSpec.getNumCorePartitions());
Assert.assertEquals(5, shardSpec.getNumBuckets());
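// publish the first segment, then allocate a second pending segment for the same interval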
coordinator.announceHistoricalSegments(Collections.singleton(new DataSegment(
    id.getDataSource(), id.getInterval(), id.getVersion(), null,
    Collections.emptyList(), Collections.emptyList(), id.getShardSpec(), 0, 10L)));
id = coordinator.allocatePendingSegment(dataSource, "seq2", null, interval, partialShardSpec, "version", true);
shardSpec = (HashBasedNumberedShardSpec) id.getShardSpec();
Assert.assertEquals(1, shardSpec.getPartitionNum());
Assert.assertEquals(0, shardSpec.getNumCorePartitions());
Assert.assertEquals(5, shardSpec.getNumBuckets());
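// publish the second segment; the next allocation requests a different bucket count (3) but still receives the next partitionNum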
coordinator.announceHistoricalSegments(Collections.singleton(new DataSegment(
    id.getDataSource(), id.getInterval(), id.getVersion(), null,
    Collections.emptyList(), Collections.emptyList(), id.getShardSpec(), 0, 10L)));
id = coordinator.allocatePendingSegment(dataSource, "seq3", null, interval, new HashBasedNumberedPartialShardSpec(null, 2, 3, null), "version", true);
shardSpec = (HashBasedNumberedShardSpec) id.getShardSpec();
Assert.assertEquals(2, shardSpec.getPartitionNum());
Assert.assertEquals(0, shardSpec.getNumCorePartitions());
Assert.assertEquals(3, shardSpec.getNumBuckets());
}
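For reference, a minimal standalone sketch of building a SegmentIdWithShardSpec directly (the NumberedShardSpec value and the expected string are illustrative assumptions, not taken from the test above); for partition 0 the string form carries no partition suffix, mirroring the id strings asserted in the next test.
SegmentIdWithShardSpec directId = new SegmentIdWithShardSpec(
    "ds",
    Intervals.of("2017-01-01/2017-02-01"),
    "version",
    new NumberedShardSpec(0, 0)); // partitionNum 0, zero core partitions, as for appended segments
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version", directId.toString());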
use of org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec in project druid by druid-io.
the class IndexerSQLMetadataStorageCoordinatorTest method testAllocatePendingSegmentAfterDroppingExistingSegment.
/**
* This test simulates an issue detected in the field, consisting of the following sequence of events:
* - A Kafka stream segment was created on a given interval
* - Later, after the above was published, another segment on the same interval was created by the stream
* - Later, after the above was published, another segment on the same interval was created by the stream
* - Later, a compaction was issued for the three segments above
* - Later, after the above was published, another segment on the same interval was created by the stream
* - Later, the compacted segment got dropped due to a drop rule
* - Later, after the above was dropped, another segment on the same interval was created by the stream, but this
* time there was an integrity violation in the pending segments table because the
* {@link IndexerSQLMetadataStorageCoordinator#createNewSegment(Handle, String, Interval, PartialShardSpec, String)}
* method returned a segment id that already existed in the pending segments table
*/
@Test
public void testAllocatePendingSegmentAfterDroppingExistingSegment() {
String maxVersion = "version_newer_newer";
// simulate one load using Kafka streaming
final PartialShardSpec partialShardSpec = NumberedPartialShardSpec.instance();
final String dataSource = "ds";
final Interval interval = Intervals.of("2017-01-01/2017-02-01");
final SegmentIdWithShardSpec identifier = coordinator.allocatePendingSegment(dataSource, "seq", null, interval, partialShardSpec, "version", true);
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version", identifier.toString());
// simulate one more load using Kafka streaming (as if the previous segment was published; note the different sequence name)
final SegmentIdWithShardSpec identifier1 = coordinator.allocatePendingSegment(dataSource, "seq2", identifier.toString(), interval, partialShardSpec, maxVersion, true);
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_1", identifier1.toString());
// simulate one more load using Kafka streaming (as if the previous segment was published; note the different sequence name)
final SegmentIdWithShardSpec identifier2 = coordinator.allocatePendingSegment(dataSource, "seq3", identifier1.toString(), interval, partialShardSpec, maxVersion, true);
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_2", identifier2.toString());
// now simulate that a compaction (batch ingestion) was done for the same interval (like a reindex of the previous three):
DataSegment segment = new DataSegment(
    "ds", Intervals.of("2017-01-01T00Z/2017-02-01T00Z"), "version_new", ImmutableMap.of(),
    ImmutableList.of("dim1"), ImmutableList.of("m1"), new LinearShardSpec(0), 9, 100);
Assert.assertTrue(insertUsedSegments(ImmutableSet.of(segment)));
List<String> ids = retrieveUsedSegmentIds();
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_new", ids.get(0));
// one more load on the same interval:
final SegmentIdWithShardSpec identifier3 = coordinator.allocatePendingSegment(dataSource, "seq4", identifier1.toString(), interval, partialShardSpec, maxVersion, true);
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_new_1", identifier3.toString());
// now drop the used segment previously loaded:
markAllSegmentsUnused(ImmutableSet.of(segment));
// and a final load; this reproduces an issue that could happen with multiple streaming appends,
// followed by a reindex, followed by a drop, and more streaming data coming in for the same interval
final SegmentIdWithShardSpec identifier4 = coordinator.allocatePendingSegment(dataSource, "seq5", identifier1.toString(), interval, partialShardSpec, maxVersion, true);
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_new_2", identifier4.toString());
}
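The markAllSegmentsUnused helper used above is not shown in this excerpt; a minimal sketch of what such a helper could look like, assuming the test holds a SQLMetadataConnector named derbyConnector and a MetadataStorageTablesConfig named tablesConfig (both assumed, not from the source):
private void markAllSegmentsUnused(Set<DataSegment> segments)
{
  for (DataSegment segment : segments) {
    // flip the 'used' flag in the segments table so the coordinator treats the segment as dropped
    derbyConnector.retryWithHandle(
        handle -> handle.createStatement(
            StringUtils.format("UPDATE %s SET used = false WHERE id = :id", tablesConfig.getSegmentsTable())
        ).bind("id", segment.getId().toString()).execute()
    );
  }
}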
use of org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec in project druid by druid-io.
the class SinglePhaseParallelIndexTaskRunner method allocateNewSegment.
/**
* Allocate a new segment for the given timestamp locally. This method is called when dynamic partitioning is used
* and {@link org.apache.druid.indexing.common.LockGranularity} is {@code TIME_CHUNK}.
*
* The allocation algorithm is similar to the Overlord-based segment allocation. It keeps the segment allocation
* history per sequenceName. If the prevSegmentId is found in the segment allocation history, this method
* returns the next segmentId right after the prevSegmentId in the history. Since the sequenceName is unique
* per {@link SubTaskSpec} (it is the ID of the subTaskSpec), this algorithm guarantees that the same set of segmentIds
* is created in the same order for the same subTaskSpec.
*
* @see org.apache.druid.metadata.IndexerSQLMetadataStorageCoordinator#allocatePendingSegmentWithSegmentLineageCheck
*/
public SegmentIdWithShardSpec allocateNewSegment(String dataSource, DateTime timestamp, String sequenceName, @Nullable String prevSegmentId) throws IOException {
NonnullPair<Interval, String> intervalAndVersion = findIntervalAndVersion(timestamp);
MutableObject<SegmentIdWithShardSpec> segmentIdHolder = new MutableObject<>();
sequenceToSegmentIds.compute(sequenceName, (k, v) -> {
final int prevSegmentIdIndex;
final List<String> segmentIds;
if (prevSegmentId == null) {
prevSegmentIdIndex = -1;
segmentIds = v == null ? new ArrayList<>() : v;
} else {
segmentIds = v;
if (segmentIds == null) {
throw new ISE("Can't find previous segmentIds for sequence[%s]", sequenceName);
}
prevSegmentIdIndex = segmentIds.indexOf(prevSegmentId);
if (prevSegmentIdIndex == -1) {
throw new ISE("Can't find previously allocated segmentId[%s] for sequence[%s]", prevSegmentId, sequenceName);
}
}
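// reuse the next segmentId already recorded for this sequence if there is one; otherwise allocate a fresh partitionNum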
final int nextSegmentIdIndex = prevSegmentIdIndex + 1;
final SegmentIdWithShardSpec newSegmentId;
if (nextSegmentIdIndex < segmentIds.size()) {
SegmentId segmentId = SegmentId.tryParse(dataSource, segmentIds.get(nextSegmentIdIndex));
if (segmentId == null) {
throw new ISE("Illegal segmentId format [%s]", segmentIds.get(nextSegmentIdIndex));
}
newSegmentId = new SegmentIdWithShardSpec(segmentId.getDataSource(), segmentId.getInterval(), segmentId.getVersion(), new BuildingNumberedShardSpec(segmentId.getPartitionNum()));
} else {
final int partitionNum = Counters.getAndIncrementInt(partitionNumCountersPerInterval, intervalAndVersion.lhs);
newSegmentId = new SegmentIdWithShardSpec(dataSource, intervalAndVersion.lhs, intervalAndVersion.rhs, new BuildingNumberedShardSpec(partitionNum));
segmentIds.add(newSegmentId.toString());
}
segmentIdHolder.setValue(newSegmentId);
return segmentIds;
});
return segmentIdHolder.getValue();
}
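A hedged usage sketch of the replay guarantee described in the Javadoc above (the runner and timestamp variables are assumed, not part of the excerpt): calling the method again with the same sequenceName and prevSegmentId resolves to the id already recorded in the history instead of allocating a new partitionNum.
// first call for this sequence: no history yet, so a fresh partitionNum is allocated and recorded
SegmentIdWithShardSpec first = runner.allocateNewSegment("ds", timestamp, "subtask-spec-0", null);
// replaying the same (sequenceName, prevSegmentId) pair hits index 0 of the recorded history,
// so the same id comes back rather than a new allocation
SegmentIdWithShardSpec second = runner.allocateNewSegment("ds", timestamp, "subtask-spec-0", null);
Assert.assertEquals(first.toString(), second.toString());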
use of org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec in project druid by druid-io.
the class TaskLockboxTest method assertAllocatedSegments.
private void assertAllocatedSegments(LockRequestForNewSegment lockRequest, LockResult result) {
Assert.assertTrue(result.isOk());
Assert.assertNotNull(result.getTaskLock());
Assert.assertTrue(result.getTaskLock() instanceof SegmentLock);
Assert.assertNotNull(result.getNewSegmentId());
final SegmentLock segmentLock = (SegmentLock) result.getTaskLock();
final SegmentIdWithShardSpec segmentId = result.getNewSegmentId();
Assert.assertEquals(lockRequest.getType(), segmentLock.getType());
Assert.assertEquals(lockRequest.getGroupId(), segmentLock.getGroupId());
Assert.assertEquals(lockRequest.getDataSource(), segmentLock.getDataSource());
Assert.assertEquals(lockRequest.getInterval(), segmentLock.getInterval());
Assert.assertEquals(lockRequest.getPartialShardSpec().getShardSpecClass(), segmentId.getShardSpec().getClass());
Assert.assertEquals(lockRequest.getPriority(), segmentLock.getNonNullPriority());
}
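As an illustrative aside (a standalone check assumed here, not part of the test): the class comparison above works because each PartialShardSpec advertises the concrete ShardSpec class it produces; NumberedPartialShardSpec, for example, is expected to map to NumberedShardSpec.
Assert.assertEquals(NumberedShardSpec.class, NumberedPartialShardSpec.instance().getShardSpecClass());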
use of org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec in project druid by druid-io.
the class ActionBasedUsedSegmentChecker method findUsedSegments.
@Override
public Set<DataSegment> findUsedSegments(Set<SegmentIdWithShardSpec> segmentIds) throws IOException {
// Group by dataSource
final Map<String, Set<SegmentId>> idsByDataSource = new TreeMap<>();
for (SegmentIdWithShardSpec segmentId : segmentIds) {
idsByDataSource.computeIfAbsent(segmentId.getDataSource(), i -> new HashSet<>()).add(segmentId.asSegmentId());
}
final Set<DataSegment> usedSegments = new HashSet<>();
for (Map.Entry<String, Set<SegmentId>> entry : idsByDataSource.entrySet()) {
String dataSource = entry.getKey();
Set<SegmentId> segmentIdsInDataSource = entry.getValue();
final List<Interval> intervals = JodaUtils.condenseIntervals(Iterables.transform(segmentIdsInDataSource, SegmentId::getInterval));
final Collection<DataSegment> usedSegmentsForIntervals = taskActionClient.submit(new RetrieveUsedSegmentsAction(dataSource, null, intervals, Segments.ONLY_VISIBLE));
for (DataSegment segment : usedSegmentsForIntervals) {
if (segmentIdsInDataSource.contains(segment.getId())) {
usedSegments.add(segment);
}
}
}
return usedSegments;
}
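A hedged usage sketch (the TaskActionClient parameter and the two ids are assumed, and UsedSegmentChecker is taken to be the interface this class implements): ids are grouped per dataSource, one RetrieveUsedSegmentsAction is submitted per dataSource over the condensed intervals, and only exact id matches are returned.
Set<DataSegment> findPublished(TaskActionClient client, SegmentIdWithShardSpec publishedId, SegmentIdWithShardSpec pendingOnlyId) throws IOException
{
  final UsedSegmentChecker checker = new ActionBasedUsedSegmentChecker(client);
  // pendingOnlyId was never published, so it simply drops out of the returned set
  return checker.findUsedSegments(ImmutableSet.of(publishedId, pendingOnlyId));
}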