use of org.apache.druid.timeline.partition.PartialShardSpec in project druid by druid-io.
the class IndexerSQLMetadataStorageCoordinatorTest method testAllocatePendingSegmentsForHashBasedNumberedShardSpec.
@Test
public void testAllocatePendingSegmentsForHashBasedNumberedShardSpec() throws IOException {
final PartialShardSpec partialShardSpec = new HashBasedNumberedPartialShardSpec(null, 2, 5, null);
final String dataSource = "ds";
final Interval interval = Intervals.of("2017-01-01/2017-02-01");
SegmentIdWithShardSpec id = coordinator.allocatePendingSegment(dataSource, "seq", null, interval, partialShardSpec, "version", true);
HashBasedNumberedShardSpec shardSpec = (HashBasedNumberedShardSpec) id.getShardSpec();
Assert.assertEquals(0, shardSpec.getPartitionNum());
Assert.assertEquals(0, shardSpec.getNumCorePartitions());
Assert.assertEquals(5, shardSpec.getNumBuckets());
coordinator.announceHistoricalSegments(Collections.singleton(new DataSegment(id.getDataSource(), id.getInterval(), id.getVersion(), null, Collections.emptyList(), Collections.emptyList(), id.getShardSpec(), 0, 10L)));
id = coordinator.allocatePendingSegment(dataSource, "seq2", null, interval, partialShardSpec, "version", true);
shardSpec = (HashBasedNumberedShardSpec) id.getShardSpec();
Assert.assertEquals(1, shardSpec.getPartitionNum());
Assert.assertEquals(0, shardSpec.getNumCorePartitions());
Assert.assertEquals(5, shardSpec.getNumBuckets());
coordinator.announceHistoricalSegments(Collections.singleton(new DataSegment(id.getDataSource(), id.getInterval(), id.getVersion(), null, Collections.emptyList(), Collections.emptyList(), id.getShardSpec(), 0, 10L)));
id = coordinator.allocatePendingSegment(dataSource, "seq3", null, interval, new HashBasedNumberedPartialShardSpec(null, 2, 3, null), "version", true);
shardSpec = (HashBasedNumberedShardSpec) id.getShardSpec();
Assert.assertEquals(2, shardSpec.getPartitionNum());
Assert.assertEquals(0, shardSpec.getNumCorePartitions());
Assert.assertEquals(3, shardSpec.getNumBuckets());
}
use of org.apache.druid.timeline.partition.PartialShardSpec in project druid by druid-io.
the class IndexerSQLMetadataStorageCoordinatorTest method testAllocatePendingSegmentAfterDroppingExistingSegment.
/**
* This test simulates an issue detected on the field consisting of the following sequence of events:
* - A kafka stream segment was created on a given interval
* - Later, after the above was published, another segment on same interval was created by the stream
* - Later, after the above was published, another segment on same interval was created by the stream
* - Later a compaction was issued for the three segments above
* - Later, after the above was published, another segment on same interval was created by the stream
* - Later, the compacted segment got dropped due to a drop rule
* - Later, after the above was dropped, another segment on same interval was created by the stream but this
* time there was an integrity violation in the pending segments table because the
* {@link IndexerSQLMetadataStorageCoordinator#createNewSegment(Handle, String, Interval, PartialShardSpec, String)}
* method returned an segment id that already existed in the pending segments table
*/
@Test
public void testAllocatePendingSegmentAfterDroppingExistingSegment() {
String maxVersion = "version_newer_newer";
// simulate one load using kafka streaming
final PartialShardSpec partialShardSpec = NumberedPartialShardSpec.instance();
final String dataSource = "ds";
final Interval interval = Intervals.of("2017-01-01/2017-02-01");
final SegmentIdWithShardSpec identifier = coordinator.allocatePendingSegment(dataSource, "seq", null, interval, partialShardSpec, "version", true);
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version", identifier.toString());
// simulate one more load using kafka streaming (as if previous segment was published, note different sequence name)
final SegmentIdWithShardSpec identifier1 = coordinator.allocatePendingSegment(dataSource, "seq2", identifier.toString(), interval, partialShardSpec, maxVersion, true);
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_1", identifier1.toString());
// simulate one more load using kafka streaming (as if previous segment was published, note different sequence name)
final SegmentIdWithShardSpec identifier2 = coordinator.allocatePendingSegment(dataSource, "seq3", identifier1.toString(), interval, partialShardSpec, maxVersion, true);
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_2", identifier2.toString());
// now simulate that one compaction was done (batch) ingestion for same interval (like reindex of the previous three):
DataSegment segment = new DataSegment("ds", Intervals.of("2017-01-01T00Z/2017-02-01T00Z"), "version_new", ImmutableMap.of(), ImmutableList.of("dim1"), ImmutableList.of("m1"), new LinearShardSpec(0), 9, 100);
Assert.assertTrue(insertUsedSegments(ImmutableSet.of(segment)));
List<String> ids = retrieveUsedSegmentIds();
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_new", ids.get(0));
// one more load on same interval:
final SegmentIdWithShardSpec identifier3 = coordinator.allocatePendingSegment(dataSource, "seq4", identifier1.toString(), interval, partialShardSpec, maxVersion, true);
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_new_1", identifier3.toString());
// now drop the used segment previously loaded:
markAllSegmentsUnused(ImmutableSet.of(segment));
// and final load, this reproduces an issue that could happen with multiple streaming appends,
// followed by a reindex, followed by a drop, and more streaming data coming in for same interval
final SegmentIdWithShardSpec identifier4 = coordinator.allocatePendingSegment(dataSource, "seq5", identifier1.toString(), interval, partialShardSpec, maxVersion, true);
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_new_2", identifier4.toString());
}
use of org.apache.druid.timeline.partition.PartialShardSpec in project druid by druid-io.
the class IndexerSQLMetadataStorageCoordinatorTest method testDeletePendingSegment.
@Test
public void testDeletePendingSegment() throws InterruptedException {
final PartialShardSpec partialShardSpec = NumberedPartialShardSpec.instance();
final String dataSource = "ds";
final Interval interval = Intervals.of("2017-01-01/2017-02-01");
String prevSegmentId = null;
final DateTime begin = DateTimes.nowUtc();
for (int i = 0; i < 10; i++) {
final SegmentIdWithShardSpec identifier = coordinator.allocatePendingSegment(dataSource, "seq", prevSegmentId, interval, partialShardSpec, "version", false);
prevSegmentId = identifier.toString();
}
Thread.sleep(100);
final DateTime secondBegin = DateTimes.nowUtc();
for (int i = 0; i < 5; i++) {
final SegmentIdWithShardSpec identifier = coordinator.allocatePendingSegment(dataSource, "seq", prevSegmentId, interval, partialShardSpec, "version", false);
prevSegmentId = identifier.toString();
}
final int numDeleted = coordinator.deletePendingSegmentsCreatedInInterval(dataSource, new Interval(begin, secondBegin));
Assert.assertEquals(10, numDeleted);
}
use of org.apache.druid.timeline.partition.PartialShardSpec in project druid by druid-io.
the class IndexerSQLMetadataStorageCoordinatorTest method testAllocatePendingSegment.
@Test
public void testAllocatePendingSegment() {
final PartialShardSpec partialShardSpec = NumberedPartialShardSpec.instance();
final String dataSource = "ds";
final Interval interval = Intervals.of("2017-01-01/2017-02-01");
final SegmentIdWithShardSpec identifier = coordinator.allocatePendingSegment(dataSource, "seq", null, interval, partialShardSpec, "version", false);
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version", identifier.toString());
final SegmentIdWithShardSpec identifier1 = coordinator.allocatePendingSegment(dataSource, "seq", identifier.toString(), interval, partialShardSpec, identifier.getVersion(), false);
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_1", identifier1.toString());
final SegmentIdWithShardSpec identifier2 = coordinator.allocatePendingSegment(dataSource, "seq", identifier1.toString(), interval, partialShardSpec, identifier1.getVersion(), false);
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_2", identifier2.toString());
final SegmentIdWithShardSpec identifier3 = coordinator.allocatePendingSegment(dataSource, "seq", identifier1.toString(), interval, partialShardSpec, identifier1.getVersion(), false);
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_2", identifier3.toString());
Assert.assertEquals(identifier2, identifier3);
final SegmentIdWithShardSpec identifier4 = coordinator.allocatePendingSegment(dataSource, "seq1", null, interval, partialShardSpec, "version", false);
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_3", identifier4.toString());
}
use of org.apache.druid.timeline.partition.PartialShardSpec in project druid by druid-io.
the class IndexerSQLMetadataStorageCoordinator method createNewSegment.
/**
* This function creates a new segment for the given datasource/interval/etc. A critical
* aspect of the creation is to make sure that the new version & new partition number will make
* sense given the existing segments & pending segments also very important is to avoid
* clashes with existing pending & used/unused segments.
* @param handle Database handle
* @param dataSource datasource for the new segment
* @param interval interval for the new segment
* @param partialShardSpec Shard spec info minus segment id stuff
* @param existingVersion Version of segments in interval, used to compute the version of the very first segment in
* interval
* @return
* @throws IOException
*/
@Nullable
private SegmentIdWithShardSpec createNewSegment(final Handle handle, final String dataSource, final Interval interval, final PartialShardSpec partialShardSpec, final String existingVersion) throws IOException {
// Get the time chunk and associated data segments for the given interval, if any
final List<TimelineObjectHolder<String, DataSegment>> existingChunks = getTimelineForIntervalsWithHandle(handle, dataSource, ImmutableList.of(interval)).lookup(interval);
if (existingChunks.size() > 1) {
// Not possible to expand more than one chunk with a single segment.
log.warn("Cannot allocate new segment for dataSource[%s], interval[%s]: already have [%,d] chunks.", dataSource, interval, existingChunks.size());
return null;
} else {
// max partitionId of the shardSpecs which share the same partition space.
SegmentIdWithShardSpec maxId = null;
if (!existingChunks.isEmpty()) {
TimelineObjectHolder<String, DataSegment> existingHolder = Iterables.getOnlyElement(existingChunks);
// noinspection ConstantConditions
for (DataSegment segment : FluentIterable.from(existingHolder.getObject()).transform(PartitionChunk::getObject).filter(segment -> segment.getShardSpec().sharePartitionSpace(partialShardSpec))) {
// Note that this will compute the max id of existing, visible, data segments in the time chunk:
if (maxId == null || maxId.getShardSpec().getPartitionNum() < segment.getShardSpec().getPartitionNum()) {
maxId = SegmentIdWithShardSpec.fromDataSegment(segment);
}
}
}
// Get the version of the existing chunk, we might need it in some of the cases below
// to compute the new identifier's version
@Nullable final String versionOfExistingChunk;
if (!existingChunks.isEmpty()) {
// remember only one chunk possible for given interval so get the first & only one
versionOfExistingChunk = existingChunks.get(0).getVersion();
} else {
versionOfExistingChunk = null;
}
// next, we need to enrich the maxId computed before with the information of the pending segments
// it is possible that a pending segment has a higher id in which case we need that, it will work,
// and it will avoid clashes when inserting the new pending segment later in the caller of this method
final Set<SegmentIdWithShardSpec> pendings = getPendingSegmentsForIntervalWithHandle(handle, dataSource, interval);
// Make sure we add the maxId we obtained from the segments table:
if (maxId != null) {
pendings.add(maxId);
}
// Now compute the maxId with all the information: pendings + segments:
// The versionOfExistingChunks filter is ensure that we pick the max id with the version of the existing chunk
// in the case that there may be a pending segment with a higher version but no corresponding used segments
// which may generate a clash with an existing segment once the new id is generated
maxId = pendings.stream().filter(id -> id.getShardSpec().sharePartitionSpace(partialShardSpec)).filter(id -> versionOfExistingChunk == null ? true : id.getVersion().equals(versionOfExistingChunk)).max((id1, id2) -> {
final int versionCompare = id1.getVersion().compareTo(id2.getVersion());
if (versionCompare != 0) {
return versionCompare;
} else {
return Integer.compare(id1.getShardSpec().getPartitionNum(), id2.getShardSpec().getPartitionNum());
}
}).orElse(null);
// The following code attempts to compute the new version, if this
// new version is not null at the end of next block then it will be
// used as the new version in the case for initial or appended segment
final String newSegmentVersion;
if (versionOfExistingChunk != null) {
// segment version overrides, so pick that now that we know it exists
newSegmentVersion = versionOfExistingChunk;
} else if (!pendings.isEmpty() && maxId != null) {
// there is no visible segments in the time chunk, so pick the maxId of pendings, as computed above
newSegmentVersion = maxId.getVersion();
} else {
// no segments, no pendings, so this must be the very first segment created for this interval
newSegmentVersion = null;
}
if (maxId == null) {
// When appending segments, null maxId means that we are allocating the very initial
// segment for this time chunk.
// This code is executed when the Overlord coordinates segment allocation, which is either you append segments
// or you use segment lock. Since the core partitions set is not determined for appended segments, we set
// it 0. When you use segment lock, the core partitions set doesn't work with it. We simply set it 0 so that the
// OvershadowableManager handles the atomic segment update.
final int newPartitionId = partialShardSpec.useNonRootGenerationPartitionSpace() ? PartitionIds.NON_ROOT_GEN_START_PARTITION_ID : PartitionIds.ROOT_GEN_START_PARTITION_ID;
String version = newSegmentVersion == null ? existingVersion : newSegmentVersion;
return new SegmentIdWithShardSpec(dataSource, interval, version, partialShardSpec.complete(jsonMapper, newPartitionId, 0));
} else if (!maxId.getInterval().equals(interval) || maxId.getVersion().compareTo(existingVersion) > 0) {
log.warn("Cannot allocate new segment for dataSource[%s], interval[%s], existingVersion[%s]: conflicting segment[%s].", dataSource, interval, existingVersion, maxId);
return null;
} else if (maxId.getShardSpec().getNumCorePartitions() == SingleDimensionShardSpec.UNKNOWN_NUM_CORE_PARTITIONS) {
log.warn("Cannot allocate new segment because of unknown core partition size of segment[%s], shardSpec[%s]", maxId, maxId.getShardSpec());
return null;
} else {
return new SegmentIdWithShardSpec(dataSource, maxId.getInterval(), Preconditions.checkNotNull(newSegmentVersion, "newSegmentVersion"), partialShardSpec.complete(jsonMapper, maxId.getShardSpec().getPartitionNum() + 1, maxId.getShardSpec().getNumCorePartitions()));
}
}
}
Aggregations