use of org.apache.druid.timeline.DataSegment in project druid by druid-io.
the class SqlSegmentsMetadataQuery method retrieveSegments.
private CloseableIterator<DataSegment> retrieveSegments(
    final String dataSource,
    final Collection<Interval> intervals,
    final IntervalMode matchMode,
    final boolean used
) {
// Check if the intervals all support comparing as strings. If so, bake them into the SQL.
final boolean compareAsString = intervals.stream().allMatch(Intervals::canCompareEndpointsAsStrings);
final StringBuilder sb = new StringBuilder();
sb.append("SELECT payload FROM %s WHERE used = :used AND dataSource = :dataSource");
if (compareAsString && !intervals.isEmpty()) {
sb.append(" AND (");
for (int i = 0; i < intervals.size(); i++) {
sb.append(matchMode.makeSqlCondition(
    connector.getQuoteString(),
    StringUtils.format(":start%d", i),
    StringUtils.format(":end%d", i)
));
if (i == intervals.size() - 1) {
sb.append(")");
} else {
sb.append(" OR ");
}
}
}
final Query<Map<String, Object>> sql = handle
    .createQuery(StringUtils.format(sb.toString(), dbTables.getSegmentsTable()))
    .setFetchSize(connector.getStreamingFetchSize())
    .bind("used", used)
    .bind("dataSource", dataSource);
if (compareAsString) {
final Iterator<Interval> iterator = intervals.iterator();
for (int i = 0; iterator.hasNext(); i++) {
Interval interval = iterator.next();
sql.bind(StringUtils.format("start%d", i), interval.getStart().toString())
   .bind(StringUtils.format("end%d", i), interval.getEnd().toString());
}
}
final ResultIterator<DataSegment> resultIterator = sql
    .map((index, r, ctx) -> JacksonUtils.readValue(jsonMapper, r.getBytes(1), DataSegment.class))
    .iterator();
return CloseableIterators.wrap(Iterators.filter(resultIterator, dataSegment -> {
if (intervals.isEmpty()) {
return true;
} else {
// Re-check the intervals in Java: when compareAsString is false the SQL carried no interval
// condition at all, and even when it did, a stored segment interval may not compare correctly
// as a string (consider a segment interval like "20010/20011", where the five-digit year breaks
// lexicographic comparison).
for (Interval interval : intervals) {
if (matchMode.apply(interval, dataSegment.getInterval())) {
return true;
}
}
return false;
}
}), resultIterator);
}
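For orientation (an illustration, not part of the Druid source): with two string-comparable search intervals, the statement assembled above has roughly the following shape before the table name is substituted and the parameters are bound; the per-interval condition is whatever matchMode.makeSqlCondition produces.
// Illustrative sketch of sb.toString() for two intervals (placeholders, not SQL emitted verbatim):
//   SELECT payload FROM %s WHERE used = :used AND dataSource = :dataSource
//   AND (<matchMode condition on :start0/:end0> OR <matchMode condition on :start1/:end1>)
The Java-side Iterators.filter re-check then applies matchMode to every deserialized segment, which also covers the case where compareAsString is false and the SQL had no interval condition at all.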
use of org.apache.druid.timeline.DataSegment in project druid by druid-io.
the class IndexerSQLMetadataStorageCoordinator method announceHistoricalSegmentBatch.
/**
 * Attempts to insert a batch of segments into the database. Segments whose identifiers already exist are skipped;
 * this check is imperfect, so callers must be prepared to retry their entire transaction on exceptions.
 *
 * @return the set of segments actually inserted
 */
private Set<DataSegment> announceHistoricalSegmentBatch(
    final Handle handle,
    final Set<DataSegment> segments,
    final Set<DataSegment> usedSegments
) throws IOException {
final Set<DataSegment> toInsertSegments = new HashSet<>();
try {
Set<String> existedSegments = segmentExistsBatch(handle, segments);
log.info("Found these segments already exist in DB: %s", existedSegments);
for (DataSegment segment : segments) {
if (!existedSegments.contains(segment.getId().toString())) {
toInsertSegments.add(segment);
}
}
// SELECT -> INSERT can fail due to races; callers must be prepared to retry.
// Avoiding ON DUPLICATE KEY since it's not portable.
// Avoiding try/catch since it may cause inadvertent transaction-splitting.
final List<List<DataSegment>> partitionedSegments =
    Lists.partition(new ArrayList<>(toInsertSegments), MAX_NUM_SEGMENTS_TO_ANNOUNCE_AT_ONCE);
PreparedBatch preparedBatch = handle.prepareBatch(
    StringUtils.format(
        "INSERT INTO %1$s (id, dataSource, created_date, start, %2$send%2$s, partitioned, version, used, payload) "
        + "VALUES (:id, :dataSource, :created_date, :start, :end, :partitioned, :version, :used, :payload)",
        dbTables.getSegmentsTable(),
        connector.getQuoteString()
    )
);
for (List<DataSegment> partition : partitionedSegments) {
for (DataSegment segment : partition) {
preparedBatch.add()
             .bind("id", segment.getId().toString())
             .bind("dataSource", segment.getDataSource())
             .bind("created_date", DateTimes.nowUtc().toString())
             .bind("start", segment.getInterval().getStart().toString())
             .bind("end", segment.getInterval().getEnd().toString())
             .bind("partitioned", !(segment.getShardSpec() instanceof NoneShardSpec))
             .bind("version", segment.getVersion())
             .bind("used", usedSegments.contains(segment))
             .bind("payload", jsonMapper.writeValueAsBytes(segment));
}
final int[] affectedRows = preparedBatch.execute();
final boolean succeeded = Arrays.stream(affectedRows).allMatch(eachAffectedRows -> eachAffectedRows == 1);
if (succeeded) {
log.infoSegments(partition, "Published segments to DB");
} else {
final List<DataSegment> failedToPublish = IntStream.range(0, partition.size())
    .filter(i -> affectedRows[i] != 1)
    .mapToObj(partition::get)
    .collect(Collectors.toList());
throw new ISE("Failed to publish segments to DB: %s", SegmentUtils.commaSeparatedIdentifiers(failedToPublish));
}
}
} catch (Exception e) {
log.errorSegments(segments, "Exception inserting segments");
throw e;
}
return toInsertSegments;
}
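The comments and javadoc above stress that the SELECT-then-INSERT is racy and that the caller must retry the whole transaction. As a minimal caller-side sketch only (Druid's real caller wraps this in its own transaction-retry machinery; runInTransaction and maxAttempts below are hypothetical stand-ins, not Druid APIs):
private Set<DataSegment> announceWithRetry(Set<DataSegment> segments, Set<DataSegment> usedSegments) throws Exception {
  final int maxAttempts = 3;  // arbitrary for the sketch
  for (int attempt = 1; ; attempt++) {
    try {
      // runInTransaction is a hypothetical helper that opens a handle and runs the callback transactionally
      return runInTransaction(handle -> announceHistoricalSegmentBatch(handle, segments, usedSegments));
    }
    catch (Exception e) {
      if (attempt >= maxAttempts) {
        throw e;
      }
      // otherwise retry from scratch: a unique-key violation here usually means another writer
      // inserted the same segment id between our SELECT and INSERT
    }
  }
}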
use of org.apache.druid.timeline.DataSegment in project druid by druid-io.
the class IndexerSQLMetadataStorageCoordinatorTest method testAllocatePendingSegmentsForHashBasedNumberedShardSpec.
@Test
public void testAllocatePendingSegmentsForHashBasedNumberedShardSpec() throws IOException {
final PartialShardSpec partialShardSpec = new HashBasedNumberedPartialShardSpec(null, 2, 5, null);
final String dataSource = "ds";
final Interval interval = Intervals.of("2017-01-01/2017-02-01");
SegmentIdWithShardSpec id = coordinator.allocatePendingSegment(dataSource, "seq", null, interval, partialShardSpec, "version", true);
HashBasedNumberedShardSpec shardSpec = (HashBasedNumberedShardSpec) id.getShardSpec();
Assert.assertEquals(0, shardSpec.getPartitionNum());
Assert.assertEquals(0, shardSpec.getNumCorePartitions());
Assert.assertEquals(5, shardSpec.getNumBuckets());
coordinator.announceHistoricalSegments(Collections.singleton(
    new DataSegment(id.getDataSource(), id.getInterval(), id.getVersion(), null,
                    Collections.emptyList(), Collections.emptyList(), id.getShardSpec(), 0, 10L)));
id = coordinator.allocatePendingSegment(dataSource, "seq2", null, interval, partialShardSpec, "version", true);
shardSpec = (HashBasedNumberedShardSpec) id.getShardSpec();
Assert.assertEquals(1, shardSpec.getPartitionNum());
Assert.assertEquals(0, shardSpec.getNumCorePartitions());
Assert.assertEquals(5, shardSpec.getNumBuckets());
coordinator.announceHistoricalSegments(Collections.singleton(
    new DataSegment(id.getDataSource(), id.getInterval(), id.getVersion(), null,
                    Collections.emptyList(), Collections.emptyList(), id.getShardSpec(), 0, 10L)));
id = coordinator.allocatePendingSegment(
    dataSource, "seq3", null, interval, new HashBasedNumberedPartialShardSpec(null, 2, 3, null), "version", true);
shardSpec = (HashBasedNumberedShardSpec) id.getShardSpec();
Assert.assertEquals(2, shardSpec.getPartitionNum());
Assert.assertEquals(0, shardSpec.getNumCorePartitions());
Assert.assertEquals(3, shardSpec.getNumBuckets());
}
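For readers decoding the positional arguments in the calls above, here is an annotated restatement of the first allocation. The per-parameter comments are interpretive, inferred from how these tests use the values and from the assertions that follow; they are not quoted from the interface.
// Annotated restatement of the first call above (comments are hedged readings, not authoritative):
SegmentIdWithShardSpec id = coordinator.allocatePendingSegment(
    dataSource,        // "ds"
    "seq",             // sequence name; each allocation in this test uses a fresh one
    null,              // previous segment id in the sequence (none for the first request)
    interval,          // 2017-01-01/2017-02-01
    partialShardSpec,  // hash-based partial spec; the "5" shows up later as getNumBuckets()
    "version",         // version string used for the allocated segment id
    true               // boolean flag passed as true throughout these tests
);
The assertions then show that within the same interval the partition number increments across allocations (0, 1, 2), the core-partition count stays 0, and the bucket count follows each request (5, 5, 3).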
use of org.apache.druid.timeline.DataSegment in project druid by druid-io.
the class IndexerSQLMetadataStorageCoordinatorTest method testAllocatePendingSegmentAfterDroppingExistingSegment.
/**
* This test simulates an issue detected in the field, consisting of the following sequence of events:
* - A kafka stream segment was created on a given interval
* - Later, after the above was published, another segment on same interval was created by the stream
* - Later, after the above was published, another segment on same interval was created by the stream
* - Later a compaction was issued for the three segments above
* - Later, after the above was published, another segment on same interval was created by the stream
* - Later, the compacted segment got dropped due to a drop rule
* - Later, after the above was dropped, another segment on same interval was created by the stream but this
* time there was an integrity violation in the pending segments table because the
* {@link IndexerSQLMetadataStorageCoordinator#createNewSegment(Handle, String, Interval, PartialShardSpec, String)}
* method returned a segment id that already existed in the pending segments table
*/
@Test
public void testAllocatePendingSegmentAfterDroppingExistingSegment() {
String maxVersion = "version_newer_newer";
// simulate one load using kafka streaming
final PartialShardSpec partialShardSpec = NumberedPartialShardSpec.instance();
final String dataSource = "ds";
final Interval interval = Intervals.of("2017-01-01/2017-02-01");
final SegmentIdWithShardSpec identifier = coordinator.allocatePendingSegment(
    dataSource, "seq", null, interval, partialShardSpec, "version", true);
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version", identifier.toString());
// simulate one more load using kafka streaming (as if previous segment was published, note different sequence name)
final SegmentIdWithShardSpec identifier1 = coordinator.allocatePendingSegment(
    dataSource, "seq2", identifier.toString(), interval, partialShardSpec, maxVersion, true);
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_1", identifier1.toString());
// simulate one more load using kafka streaming (as if previous segment was published, note different sequence name)
final SegmentIdWithShardSpec identifier2 = coordinator.allocatePendingSegment(
    dataSource, "seq3", identifier1.toString(), interval, partialShardSpec, maxVersion, true);
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_2", identifier2.toString());
// now simulate that a compaction (batch ingestion) was done for the same interval (like a reindex of the previous three):
DataSegment segment = new DataSegment(
    "ds", Intervals.of("2017-01-01T00Z/2017-02-01T00Z"), "version_new", ImmutableMap.of(),
    ImmutableList.of("dim1"), ImmutableList.of("m1"), new LinearShardSpec(0), 9, 100);
Assert.assertTrue(insertUsedSegments(ImmutableSet.of(segment)));
List<String> ids = retrieveUsedSegmentIds();
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_new", ids.get(0));
// one more load on same interval:
final SegmentIdWithShardSpec identifier3 = coordinator.allocatePendingSegment(
    dataSource, "seq4", identifier1.toString(), interval, partialShardSpec, maxVersion, true);
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_new_1", identifier3.toString());
// now drop the used segment previously loaded:
markAllSegmentsUnused(ImmutableSet.of(segment));
// and a final load; this reproduces an issue that could happen with multiple streaming appends,
// followed by a reindex, followed by a drop, and more streaming data coming in for the same interval
final SegmentIdWithShardSpec identifier4 = coordinator.allocatePendingSegment(
    dataSource, "seq5", identifier1.toString(), interval, partialShardSpec, maxVersion, true);
Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_new_2", identifier4.toString());
}
use of org.apache.druid.timeline.DataSegment in project druid by druid-io.
the class IndexerSQLMetadataStorageCoordinatorTest method testSimpleAnnounce.
@Test
public void testSimpleAnnounce() throws IOException {
coordinator.announceHistoricalSegments(SEGMENTS);
for (DataSegment segment : SEGMENTS) {
Assert.assertArrayEquals(
    mapper.writeValueAsString(segment).getBytes(StandardCharsets.UTF_8),
    derbyConnector.lookup(
        derbyConnectorRule.metadataTablesConfigSupplier().get().getSegmentsTable(),
        "id", "payload", segment.getId().toString()));
}
Assert.assertEquals(ImmutableList.of(defaultSegment.getId().toString(), defaultSegment2.getId().toString()), retrieveUsedSegmentIds());
// Should not update dataSource metadata.
Assert.assertEquals(0, metadataUpdateCounter.get());
}