Example 26 with DataSegment

Use of org.apache.druid.timeline.DataSegment in project druid by druid-io.

Class SqlSegmentsMetadataQuery, method retrieveSegments.

private CloseableIterator<DataSegment> retrieveSegments(
    final String dataSource,
    final Collection<Interval> intervals,
    final IntervalMode matchMode,
    final boolean used
) {
    // Check if the intervals all support comparing as strings. If so, bake them into the SQL.
    final boolean compareAsString = intervals.stream().allMatch(Intervals::canCompareEndpointsAsStrings);
    final StringBuilder sb = new StringBuilder();
    sb.append("SELECT payload FROM %s WHERE used = :used AND dataSource = :dataSource");
    if (compareAsString && !intervals.isEmpty()) {
        sb.append(" AND (");
        for (int i = 0; i < intervals.size(); i++) {
            sb.append(matchMode.makeSqlCondition(connector.getQuoteString(), StringUtils.format(":start%d", i), StringUtils.format(":end%d", i)));
            if (i == intervals.size() - 1) {
                sb.append(")");
            } else {
                sb.append(" OR ");
            }
        }
    }
    final Query<Map<String, Object>> sql = handle
        .createQuery(StringUtils.format(sb.toString(), dbTables.getSegmentsTable()))
        .setFetchSize(connector.getStreamingFetchSize())
        .bind("used", used)
        .bind("dataSource", dataSource);
    if (compareAsString) {
        final Iterator<Interval> iterator = intervals.iterator();
        for (int i = 0; iterator.hasNext(); i++) {
            Interval interval = iterator.next();
            sql.bind(StringUtils.format("start%d", i), interval.getStart().toString())
               .bind(StringUtils.format("end%d", i), interval.getEnd().toString());
        }
    }
    final ResultIterator<DataSegment> resultIterator = sql
        .map((index, r, ctx) -> JacksonUtils.readValue(jsonMapper, r.getBytes(1), DataSegment.class))
        .iterator();
    return CloseableIterators.wrap(Iterators.filter(resultIterator, dataSegment -> {
        if (intervals.isEmpty()) {
            return true;
        } else {
            // Re-check the match in code: the SQL filter only applies when interval endpoints are
            // string-comparable, which breaks for five-digit years (e.g. a segment interval like
            // "20010/20011").
            for (Interval interval : intervals) {
                if (matchMode.apply(interval, dataSegment.getInterval())) {
                    return true;
                }
            }
            return false;
        }
    }), resultIterator);
}
Also used : Intervals(org.apache.druid.java.util.common.Intervals) Logger(org.apache.druid.java.util.common.logger.Logger) JacksonUtils(org.apache.druid.java.util.common.jackson.JacksonUtils) Iterator(java.util.Iterator) Collection(java.util.Collection) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) StringUtils(org.apache.druid.java.util.common.StringUtils) Query(org.skife.jdbi.v2.Query) Collectors(java.util.stream.Collectors) Iterators(com.google.common.collect.Iterators) Interval(org.joda.time.Interval) List(java.util.List) CloseableIterators(org.apache.druid.java.util.common.CloseableIterators) Handle(org.skife.jdbi.v2.Handle) ImmutableList(com.google.common.collect.ImmutableList) Map(java.util.Map) DataSegment(org.apache.druid.timeline.DataSegment) ResultIterator(org.skife.jdbi.v2.ResultIterator) PreparedBatch(org.skife.jdbi.v2.PreparedBatch) IAE(org.apache.druid.java.util.common.IAE) SegmentId(org.apache.druid.timeline.SegmentId) CloseableIterator(org.apache.druid.java.util.common.parsers.CloseableIterator) Collections(java.util.Collections)
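The string fast path above works only because fixed-width ISO-8601 UTC timestamps sort lexicographically in chronological order; that is the property the Intervals::canCompareEndpointsAsStrings guard protects. A self-contained sketch of that property, using plain java.time rather than Druid code:

import java.time.Instant;

public class StringComparableEndpoints {
    public static void main(String[] args) {
        // Equal-length ISO-8601 UTC timestamps compare the same way as instants...
        String a = "2017-01-01T00:00:00.000Z";
        String b = "2017-02-01T00:00:00.000Z";
        System.out.println(a.compareTo(b) < 0);                          // true
        System.out.println(Instant.parse(a).isBefore(Instant.parse(b))); // true

        // ...but the property breaks once a year needs five digits: as a string,
        // "20010-..." sorts before "3000-..." even though year 20010 is far later.
        String fiveDigitYear = "20010-01-01T00:00:00.000Z";
        String fourDigitYear = "3000-01-01T00:00:00.000Z";
        System.out.println(fiveDigitYear.compareTo(fourDigitYear) < 0);  // true: wrong order
    }
}

This is why the returned iterator re-applies matchMode in Java even when the SQL condition was baked in.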

Example 27 with DataSegment

Use of org.apache.druid.timeline.DataSegment in project druid by druid-io.

Class IndexerSQLMetadataStorageCoordinator, method announceHistoricalSegmentBatch.

/**
 * Attempts to insert a batch of segments into the database. Segments that already exist are skipped;
 * however, this existence check is imperfect, and callers must be prepared to retry their entire
 * transaction on exceptions.
 *
 * @return the set of segments actually inserted
 */
private Set<DataSegment> announceHistoricalSegmentBatch(
    final Handle handle,
    final Set<DataSegment> segments,
    final Set<DataSegment> usedSegments
) throws IOException {
    final Set<DataSegment> toInsertSegments = new HashSet<>();
    try {
        Set<String> existedSegments = segmentExistsBatch(handle, segments);
        log.info("Found these segments already exist in DB: %s", existedSegments);
        for (DataSegment segment : segments) {
            if (!existedSegments.contains(segment.getId().toString())) {
                toInsertSegments.add(segment);
            }
        }
        // SELECT -> INSERT can fail due to races; callers must be prepared to retry.
        // Avoiding ON DUPLICATE KEY since it's not portable.
        // Avoiding try/catch since it may cause inadvertent transaction-splitting.
        final List<List<DataSegment>> partitionedSegments =
            Lists.partition(new ArrayList<>(toInsertSegments), MAX_NUM_SEGMENTS_TO_ANNOUNCE_AT_ONCE);
        PreparedBatch preparedBatch = handle.prepareBatch(
            StringUtils.format(
                "INSERT INTO %1$s (id, dataSource, created_date, start, %2$send%2$s, partitioned, version, used, payload) "
                + "VALUES (:id, :dataSource, :created_date, :start, :end, :partitioned, :version, :used, :payload)",
                dbTables.getSegmentsTable(),
                connector.getQuoteString()
            )
        );
        for (List<DataSegment> partition : partitionedSegments) {
            for (DataSegment segment : partition) {
                preparedBatch.add()
                    .bind("id", segment.getId().toString())
                    .bind("dataSource", segment.getDataSource())
                    .bind("created_date", DateTimes.nowUtc().toString())
                    .bind("start", segment.getInterval().getStart().toString())
                    .bind("end", segment.getInterval().getEnd().toString())
                    .bind("partitioned", !(segment.getShardSpec() instanceof NoneShardSpec))
                    .bind("version", segment.getVersion())
                    .bind("used", usedSegments.contains(segment))
                    .bind("payload", jsonMapper.writeValueAsBytes(segment));
            }
            final int[] affectedRows = preparedBatch.execute();
            final boolean succeeded = Arrays.stream(affectedRows).allMatch(eachAffectedRows -> eachAffectedRows == 1);
            if (succeeded) {
                log.infoSegments(partition, "Published segments to DB");
            } else {
                final List<DataSegment> failedToPublish = IntStream.range(0, partition.size())
                    .filter(i -> affectedRows[i] != 1)
                    .mapToObj(partition::get)
                    .collect(Collectors.toList());
                throw new ISE("Failed to publish segments to DB: %s", SegmentUtils.commaSeparatedIdentifiers(failedToPublish));
            }
        }
    } catch (Exception e) {
        log.errorSegments(segments, "Exception inserting segments");
        throw e;
    }
    return toInsertSegments;
}
Also used : NoneShardSpec(org.apache.druid.timeline.partition.NoneShardSpec) DataSegment(org.apache.druid.timeline.DataSegment) JsonProcessingException(com.fasterxml.jackson.core.JsonProcessingException) IOException(java.io.IOException) CallbackFailedException(org.skife.jdbi.v2.exceptions.CallbackFailedException) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) PreparedBatch(org.skife.jdbi.v2.PreparedBatch) ISE(org.apache.druid.java.util.common.ISE) HashSet(java.util.HashSet)
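The insert loop above follows a common batching pattern: partition the collection into bounded chunks, bind one row per element, execute, and verify the affected-row counts. A minimal sketch of the partitioning step with Guava's Lists.partition, using a hypothetical batch-size constant in place of MAX_NUM_SEGMENTS_TO_ANNOUNCE_AT_ONCE:

import com.google.common.collect.Lists;
import java.util.ArrayList;
import java.util.List;

public class BatchPartitionSketch {
    // Hypothetical cap per batch; Druid uses MAX_NUM_SEGMENTS_TO_ANNOUNCE_AT_ONCE.
    private static final int MAX_PER_BATCH = 100;

    public static void main(String[] args) {
        List<String> ids = new ArrayList<>();
        for (int i = 0; i < 250; i++) {
            ids.add("segment_" + i);
        }
        // Lists.partition returns consecutive sublists, the last one possibly shorter.
        for (List<String> batch : Lists.partition(ids, MAX_PER_BATCH)) {
            // In the real code each element becomes one PreparedBatch row, and
            // execute() returns one affected-row count per row, which is then verified.
            System.out.println("would execute batch of " + batch.size());
        }
        // prints: 100, 100, 50
    }
}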

Example 28 with DataSegment

Use of org.apache.druid.timeline.DataSegment in project druid by druid-io.

Class IndexerSQLMetadataStorageCoordinatorTest, method testAllocatePendingSegmentsForHashBasedNumberedShardSpec.

@Test
public void testAllocatePendingSegmentsForHashBasedNumberedShardSpec() throws IOException {
    final PartialShardSpec partialShardSpec = new HashBasedNumberedPartialShardSpec(null, 2, 5, null);
    final String dataSource = "ds";
    final Interval interval = Intervals.of("2017-01-01/2017-02-01");
    SegmentIdWithShardSpec id = coordinator.allocatePendingSegment(dataSource, "seq", null, interval, partialShardSpec, "version", true);
    HashBasedNumberedShardSpec shardSpec = (HashBasedNumberedShardSpec) id.getShardSpec();
    Assert.assertEquals(0, shardSpec.getPartitionNum());
    Assert.assertEquals(0, shardSpec.getNumCorePartitions());
    Assert.assertEquals(5, shardSpec.getNumBuckets());
    coordinator.announceHistoricalSegments(Collections.singleton(
        new DataSegment(id.getDataSource(), id.getInterval(), id.getVersion(), null,
                        Collections.emptyList(), Collections.emptyList(), id.getShardSpec(), 0, 10L)
    ));
    id = coordinator.allocatePendingSegment(dataSource, "seq2", null, interval, partialShardSpec, "version", true);
    shardSpec = (HashBasedNumberedShardSpec) id.getShardSpec();
    Assert.assertEquals(1, shardSpec.getPartitionNum());
    Assert.assertEquals(0, shardSpec.getNumCorePartitions());
    Assert.assertEquals(5, shardSpec.getNumBuckets());
    coordinator.announceHistoricalSegments(Collections.singleton(
        new DataSegment(id.getDataSource(), id.getInterval(), id.getVersion(), null,
                        Collections.emptyList(), Collections.emptyList(), id.getShardSpec(), 0, 10L)
    ));
    id = coordinator.allocatePendingSegment(
        dataSource, "seq3", null, interval, new HashBasedNumberedPartialShardSpec(null, 2, 3, null), "version", true
    );
    shardSpec = (HashBasedNumberedShardSpec) id.getShardSpec();
    Assert.assertEquals(2, shardSpec.getPartitionNum());
    Assert.assertEquals(0, shardSpec.getNumCorePartitions());
    Assert.assertEquals(3, shardSpec.getNumBuckets());
}
Also used : HashBasedNumberedShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedShardSpec) HashBasedNumberedPartialShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedPartialShardSpec) PartialShardSpec(org.apache.druid.timeline.partition.PartialShardSpec) NumberedPartialShardSpec(org.apache.druid.timeline.partition.NumberedPartialShardSpec) NumberedOverwritePartialShardSpec(org.apache.druid.timeline.partition.NumberedOverwritePartialShardSpec) SegmentIdWithShardSpec(org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec) DataSegment(org.apache.druid.timeline.DataSegment) Interval(org.joda.time.Interval) Test(org.junit.Test)
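The assertions hinge on the difference between numBuckets (fixed by each request's partial shard spec: 5, 5, then 3) and partitionNum (a per-interval counter that only grows across allocations). A conceptual sketch of that relationship, not Druid's allocation code:

public class BucketVsPartitionSketch {
    public static void main(String[] args) {
        // numBuckets comes from the partial shard spec and may differ per request,
        // while partitionNum is a per-interval counter that only increases.
        int[] requestedBuckets = {5, 5, 3};   // mirrors the three allocations in the test
        int partitionNum = 0;
        for (int numBuckets : requestedBuckets) {
            System.out.printf("partitionNum=%d, numCorePartitions=0, numBuckets=%d%n",
                partitionNum, numBuckets);
            partitionNum++;
        }
        // prints (0, 5), (1, 5), (2, 3) -- matching the assertions above
    }
}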

Example 29 with DataSegment

Use of org.apache.druid.timeline.DataSegment in project druid by druid-io.

Class IndexerSQLMetadataStorageCoordinatorTest, method testAllocatePendingSegmentAfterDroppingExistingSegment.

/**
 * This test simulates an issue detected in the field, consisting of the following sequence of events:
 * - A Kafka stream segment was created on a given interval
 * - Later, after the above was published, another segment on the same interval was created by the stream
 * - Later, after the above was published, another segment on the same interval was created by the stream
 * - Later, a compaction was issued for the three segments above
 * - Later, after the above was published, another segment on the same interval was created by the stream
 * - Later, the compacted segment got dropped due to a drop rule
 * - Later, after the above was dropped, another segment on the same interval was created by the stream, but this
 *   time there was an integrity violation in the pending segments table, because the
 *   {@link IndexerSQLMetadataStorageCoordinator#createNewSegment(Handle, String, Interval, PartialShardSpec, String)}
 *   method returned a segment id that already existed in the pending segments table
 */
@Test
public void testAllocatePendingSegmentAfterDroppingExistingSegment() {
    String maxVersion = "version_newer_newer";
    // simulate one load using kafka streaming
    final PartialShardSpec partialShardSpec = NumberedPartialShardSpec.instance();
    final String dataSource = "ds";
    final Interval interval = Intervals.of("2017-01-01/2017-02-01");
    final SegmentIdWithShardSpec identifier = coordinator.allocatePendingSegment(
        dataSource, "seq", null, interval, partialShardSpec, "version", true
    );
    Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version", identifier.toString());
    // simulate one more load using kafka streaming (as if previous segment was published, note different sequence name)
    final SegmentIdWithShardSpec identifier1 = coordinator.allocatePendingSegment(
        dataSource, "seq2", identifier.toString(), interval, partialShardSpec, maxVersion, true
    );
    Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_1", identifier1.toString());
    // simulate one more load using kafka streaming (as if previous segment was published, note different sequence name)
    final SegmentIdWithShardSpec identifier2 = coordinator.allocatePendingSegment(
        dataSource, "seq3", identifier1.toString(), interval, partialShardSpec, maxVersion, true
    );
    Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_2", identifier2.toString());
    // now simulate that a compaction (batch ingestion) was done for the same interval, like a reindex of the previous three:
    DataSegment segment = new DataSegment(
        "ds", Intervals.of("2017-01-01T00Z/2017-02-01T00Z"), "version_new",
        ImmutableMap.of(), ImmutableList.of("dim1"), ImmutableList.of("m1"), new LinearShardSpec(0), 9, 100
    );
    Assert.assertTrue(insertUsedSegments(ImmutableSet.of(segment)));
    List<String> ids = retrieveUsedSegmentIds();
    Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_new", ids.get(0));
    // one more load on the same interval:
    final SegmentIdWithShardSpec identifier3 = coordinator.allocatePendingSegment(
        dataSource, "seq4", identifier1.toString(), interval, partialShardSpec, maxVersion, true
    );
    Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_new_1", identifier3.toString());
    // now drop the used segment previously loaded:
    markAllSegmentsUnused(ImmutableSet.of(segment));
    // and a final load; this reproduces an issue that could happen with multiple streaming appends,
    // followed by a reindex, followed by a drop, and more streaming data coming in for the same interval
    final SegmentIdWithShardSpec identifier4 = coordinator.allocatePendingSegment(
        dataSource, "seq5", identifier1.toString(), interval, partialShardSpec, maxVersion, true
    );
    Assert.assertEquals("ds_2017-01-01T00:00:00.000Z_2017-02-01T00:00:00.000Z_version_new_2", identifier4.toString());
}
Also used : LinearShardSpec(org.apache.druid.timeline.partition.LinearShardSpec) HashBasedNumberedPartialShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedPartialShardSpec) PartialShardSpec(org.apache.druid.timeline.partition.PartialShardSpec) NumberedPartialShardSpec(org.apache.druid.timeline.partition.NumberedPartialShardSpec) NumberedOverwritePartialShardSpec(org.apache.druid.timeline.partition.NumberedOverwritePartialShardSpec) SegmentIdWithShardSpec(org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec) DataSegment(org.apache.druid.timeline.DataSegment) Interval(org.joda.time.Interval) Test(org.junit.Test)
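The asserted ids follow a fixed pattern: dataSource, the two interval endpoints, the version, and a _N partition suffix for partition numbers above zero; the bug was that N could collide after a drop. A small sketch of the pattern with a hypothetical formatter (not Druid's SegmentId class):

public class SegmentIdSketch {
    // Hypothetical helper mirroring the id strings asserted in the test.
    static String segmentId(String dataSource, String start, String end, String version, int partitionNum) {
        String base = dataSource + "_" + start + "_" + end + "_" + version;
        // partitionNum 0 carries no suffix; later allocations append _1, _2, ...
        return partitionNum == 0 ? base : base + "_" + partitionNum;
    }

    public static void main(String[] args) {
        String start = "2017-01-01T00:00:00.000Z";
        String end = "2017-02-01T00:00:00.000Z";
        System.out.println(segmentId("ds", start, end, "version", 0));      // first streaming load
        System.out.println(segmentId("ds", start, end, "version_new", 1));  // after the compacted segment
        System.out.println(segmentId("ds", start, end, "version_new", 2));  // after the drop; must not collide
    }
}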

Example 30 with DataSegment

Use of org.apache.druid.timeline.DataSegment in project druid by druid-io.

Class IndexerSQLMetadataStorageCoordinatorTest, method testSimpleAnnounce.

@Test
public void testSimpleAnnounce() throws IOException {
    coordinator.announceHistoricalSegments(SEGMENTS);
    for (DataSegment segment : SEGMENTS) {
        Assert.assertArrayEquals(
            mapper.writeValueAsString(segment).getBytes(StandardCharsets.UTF_8),
            derbyConnector.lookup(
                derbyConnectorRule.metadataTablesConfigSupplier().get().getSegmentsTable(),
                "id", "payload", segment.getId().toString()
            )
        );
    }
    Assert.assertEquals(
        ImmutableList.of(defaultSegment.getId().toString(), defaultSegment2.getId().toString()),
        retrieveUsedSegmentIds()
    );
    // Should not update dataSource metadata.
    Assert.assertEquals(0, metadataUpdateCounter.get());
}
Also used : DataSegment(org.apache.druid.timeline.DataSegment) Test(org.junit.Test)
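The assertion compares the serialized segment byte-for-byte against the stored payload column. A minimal sketch of that lookup with plain JDBC, using hypothetical table and connection arguments in place of the derbyConnector.lookup helper the test relies on:

import java.nio.charset.StandardCharsets;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.Arrays;

public class PayloadLookupSketch {
    // Hypothetical: fetch the payload column for one segment id and compare it
    // to the expected JSON bytes, mirroring the test's byte-for-byte assertion.
    static boolean payloadMatches(Connection conn, String table, String segmentId, String expectedJson)
            throws Exception {
        String sql = "SELECT payload FROM " + table + " WHERE id = ?";
        try (PreparedStatement stmt = conn.prepareStatement(sql)) {
            stmt.setString(1, segmentId);
            try (ResultSet rs = stmt.executeQuery()) {
                if (!rs.next()) {
                    return false;  // segment was never announced
                }
                byte[] stored = rs.getBytes(1);
                return Arrays.equals(stored, expectedJson.getBytes(StandardCharsets.UTF_8));
            }
        }
    }
}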

Aggregations

DataSegment (org.apache.druid.timeline.DataSegment): 612 uses
Test (org.junit.Test): 386 uses
ArrayList (java.util.ArrayList): 161 uses
Interval (org.joda.time.Interval): 158 uses
File (java.io.File): 138 uses
Map (java.util.Map): 110 uses
List (java.util.List): 108 uses
ImmutableList (com.google.common.collect.ImmutableList): 77 uses
IOException (java.io.IOException): 77 uses
HashMap (java.util.HashMap): 74 uses
ImmutableMap (com.google.common.collect.ImmutableMap): 72 uses
NumberedShardSpec (org.apache.druid.timeline.partition.NumberedShardSpec): 68 uses
HashSet (java.util.HashSet): 58 uses
TaskStatus (org.apache.druid.indexer.TaskStatus): 53 uses
Collectors (java.util.stream.Collectors): 52 uses
Set (java.util.Set): 50 uses
CountDownLatch (java.util.concurrent.CountDownLatch): 50 uses
ISE (org.apache.druid.java.util.common.ISE): 50 uses
SegmentId (org.apache.druid.timeline.SegmentId): 47 uses
LinearShardSpec (org.apache.druid.timeline.partition.LinearShardSpec): 45 uses