Example 11 with ShardSpec

use of org.apache.druid.timeline.partition.ShardSpec in project hive by apache.

the class DruidStorageHandlerUtils method publishSegmentsAndCommit.

/**
 * First computes the segment timeline to accommodate the new segments (for the INSERT INTO case),
 * then moves the segments to Druid deep storage with updated metadata/versions.
 * All of this is done in a single transaction.
 *
 * @param connector                   DBI connector used to commit
 * @param metadataStorageTablesConfig Druid metadata table definitions
 * @param dataSource                  Druid datasource name
 * @param segments                    list of segments to move and commit to metadata
 * @param overwrite                   whether this is an INSERT OVERWRITE
 * @param conf                        Hadoop configuration
 * @param dataSegmentPusher           segment pusher
 * @return list of successfully published Druid segments, with versions and metadata
 * updated after the move and timeline sorting
 * @throws CallbackFailedException if the connector cannot add a segment to the DB.
 */
@SuppressWarnings("unchecked")
static List<DataSegment> publishSegmentsAndCommit(
        final SQLMetadataConnector connector,
        final MetadataStorageTablesConfig metadataStorageTablesConfig,
        final String dataSource,
        final List<DataSegment> segments,
        boolean overwrite,
        Configuration conf,
        DataSegmentPusher dataSegmentPusher) throws CallbackFailedException {
    return connector.getDBI().inTransaction((handle, transactionStatus) -> {
        // We create the timeline for the existing and new segments
        VersionedIntervalTimeline<String, DataSegment> timeline;
        if (overwrite) {
            // If we are overwriting, we disable the datasource's existing segments
            disableDataSourceWithHandle(handle, metadataStorageTablesConfig, dataSource);
            // When overwriting, we just start with empty timeline,
            // as we are overwriting segments with new versions
            timeline = new VersionedIntervalTimeline<>(Ordering.natural());
        } else {
            // Append Mode
            if (segments.isEmpty()) {
                // If there are no new segments, we can just bail out
                return Collections.emptyList();
            }
            // Otherwise, build a timeline of existing segments in metadata storage
            Interval indexedInterval = JodaUtils.umbrellaInterval(segments.stream().map(DataSegment::getInterval).collect(Collectors.toList()));
            LOG.info("Building timeline for umbrella Interval [{}]", indexedInterval);
            timeline = getTimelineForIntervalWithHandle(handle, dataSource, indexedInterval, metadataStorageTablesConfig);
        }
        final List<DataSegment> finalSegmentsToPublish = Lists.newArrayList();
        for (DataSegment segment : segments) {
            List<TimelineObjectHolder<String, DataSegment>> existingChunks = timeline.lookup(segment.getInterval());
            if (existingChunks.size() > 1) {
                // Druid shard specs do not support multiple partitions for the same interval with different granularity.
                throw new IllegalStateException(String.format(
                    "Cannot allocate new segment for dataSource[%s], interval[%s], already have [%,d] chunks. "
                        + "Not possible to append new segment.",
                    dataSource, segment.getInterval(), existingChunks.size()));
            }
            // Find the segment with the latest version and the maximum partition number
            SegmentIdWithShardSpec max = null;
            final ShardSpec newShardSpec;
            final String newVersion;
            if (!existingChunks.isEmpty()) {
                // Some chunks already exist; find the max segment id among them
                TimelineObjectHolder<String, DataSegment> existingHolder = Iterables.getOnlyElement(existingChunks);
                for (PartitionChunk<DataSegment> existing : existingHolder.getObject()) {
                    if (max == null || max.getShardSpec().getPartitionNum() < existing.getObject().getShardSpec().getPartitionNum()) {
                        max = SegmentIdWithShardSpec.fromDataSegment(existing.getObject());
                    }
                }
            }
            if (max == null) {
                // No existing shard present in the database, use the current version.
                newShardSpec = segment.getShardSpec();
                newVersion = segment.getVersion();
            } else {
                // Use the version of the existing max segment to generate the new shard spec
                newShardSpec = getNextPartitionShardSpec(max.getShardSpec());
                newVersion = max.getVersion();
            }
            DataSegment publishedSegment = publishSegmentWithShardSpec(
                segment, newShardSpec, newVersion, getPath(segment).getFileSystem(conf), dataSegmentPusher);
            finalSegmentsToPublish.add(publishedSegment);
            timeline.add(publishedSegment.getInterval(), publishedSegment.getVersion(), publishedSegment.getShardSpec().createChunk(publishedSegment));
        }
        // Publish new segments to metadata storage
        final PreparedBatch batch = handle.prepareBatch(String.format(
            "INSERT INTO %1$s (id, dataSource, created_date, start, \"end\", partitioned, version, used, payload) "
                + "VALUES (:id, :dataSource, :created_date, :start, :end, :partitioned, :version, :used, :payload)",
            metadataStorageTablesConfig.getSegmentsTable()));
        for (final DataSegment segment : finalSegmentsToPublish) {
            batch.add(new ImmutableMap.Builder<String, Object>()
                .put("id", segment.getId().toString())
                .put("dataSource", segment.getDataSource())
                .put("created_date", new DateTime().toString())
                .put("start", segment.getInterval().getStart().toString())
                .put("end", segment.getInterval().getEnd().toString())
                .put("partitioned", !(segment.getShardSpec() instanceof NoneShardSpec))
                .put("version", segment.getVersion())
                .put("used", true)
                .put("payload", JSON_MAPPER.writeValueAsBytes(segment))
                .build());
            LOG.info("Published {}", segment.getId().toString());
        }
        batch.execute();
        return finalSegmentsToPublish;
    });
}
Also used : NoneShardSpec(org.apache.druid.timeline.partition.NoneShardSpec) GenericUDFToString(org.apache.hadoop.hive.ql.udf.generic.GenericUDFToString) DataSegment(org.apache.druid.timeline.DataSegment) SegmentIdWithShardSpec(org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec) LinearShardSpec(org.apache.druid.timeline.partition.LinearShardSpec) ShardSpec(org.apache.druid.timeline.partition.ShardSpec) NumberedShardSpec(org.apache.druid.timeline.partition.NumberedShardSpec) ImmutableMap(com.google.common.collect.ImmutableMap) DateTime(org.joda.time.DateTime) TimelineObjectHolder(org.apache.druid.timeline.TimelineObjectHolder) PreparedBatch(org.skife.jdbi.v2.PreparedBatch) Interval(org.joda.time.Interval)
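
The getNextPartitionShardSpec helper is not shown above. A minimal sketch of what it plausibly looks like, based on the LinearShardSpec and NumberedShardSpec imports in this example (the actual Hive implementation may differ):

private static ShardSpec getNextPartitionShardSpec(ShardSpec shardSpec) {
    if (shardSpec instanceof LinearShardSpec) {
        // Linear shard specs only need the next partition number.
        return new LinearShardSpec(shardSpec.getPartitionNum() + 1);
    } else if (shardSpec instanceof NumberedShardSpec) {
        // Numbered shard specs also carry the number of core partitions
        // (assuming the getPartitions() accessor of this Druid version).
        return new NumberedShardSpec(shardSpec.getPartitionNum() + 1, ((NumberedShardSpec) shardSpec).getPartitions());
    } else {
        // Only linear and numbered shard specs can grow by appending partitions.
        throw new IllegalStateException("Cannot expand shard spec " + shardSpec);
    }
}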

Example 12 with ShardSpec

use of org.apache.druid.timeline.partition.ShardSpec in project druid by druid-io.

the class NewestSegmentFirstPolicyBenchmark method setup.

@Setup(Level.Trial)
public void setup() {
    compactionConfigs = new HashMap<>();
    for (int i = 0; i < numDataSources; i++) {
        final String dataSource = DATA_SOURCE_PREFIX + i;
        compactionConfigs.put(
            dataSource,
            new DataSourceCompactionConfig(dataSource, 0, inputSegmentSizeBytes, null, null, null, null, null, null, null, null, null));
    }
    List<DataSegment> segments = new ArrayList<>();
    for (int i = 0; i < numDataSources; i++) {
        final String dataSource = DATA_SOURCE_PREFIX + i;
        final int startYear = ThreadLocalRandom.current().nextInt(2000, 2040);
        DateTime date = DateTimes.of(startYear, 1, 1, 0, 0);
        for (int j = 0; j < numDayIntervalsPerDataSource; j++, date = date.plusDays(1)) {
            for (int k = 0; k < numPartitionsPerDayInterval; k++) {
                final ShardSpec shardSpec = new NumberedShardSpec(numPartitionsPerDayInterval, k);
                final DataSegment segment = new DataSegment(
                    dataSource,
                    new Interval(date, date.plusDays(1)),
                    "version",
                    null,
                    ImmutableList.of(),
                    ImmutableList.of(),
                    shardSpec,
                    0,
                    segmentSizeBytes);
                segments.add(segment);
            }
        }
    }
    dataSources = DataSourcesSnapshot.fromUsedSegments(segments, ImmutableMap.of()).getUsedSegmentsTimelinesPerDataSource();
}
Also used : ArrayList(java.util.ArrayList) DataSegment(org.apache.druid.timeline.DataSegment) DateTime(org.joda.time.DateTime) ShardSpec(org.apache.druid.timeline.partition.ShardSpec) NumberedShardSpec(org.apache.druid.timeline.partition.NumberedShardSpec) Interval(org.joda.time.Interval) Setup(org.openjdk.jmh.annotations.Setup)
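
For context, each NumberedShardSpec above identifies one partition of a day's interval. A small illustrative snippet, assuming the (partitionNum, partitions) argument order used by Druid's JSON creator (names here are not from the benchmark):

// Illustrative only: partition 2 out of 4 core partitions of some interval.
ShardSpec spec = new NumberedShardSpec(2, 4);
// getPartitionNum() is the accessor the timeline code keys on; returns 2 here.
int partitionNum = spec.getPartitionNum();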

Example 13 with ShardSpec

use of org.apache.druid.timeline.partition.ShardSpec in project druid by druid-io.

the class HadoopDruidIndexerConfig method getBucket.

/**
 ******************************************
 *   Granularity/Bucket Helper Methods
 *******************************************
 */
/**
 * Get the proper bucket for some input row.
 *
 * @param inputRow an InputRow
 * @return the Bucket that this row belongs to
 */
Optional<Bucket> getBucket(InputRow inputRow) {
    final Optional<Interval> timeBucket = schema.getDataSchema().getGranularitySpec().bucketInterval(DateTimes.utc(inputRow.getTimestampFromEpoch()));
    if (!timeBucket.isPresent()) {
        return Optional.absent();
    }
    final DateTime bucketStart = timeBucket.get().getStart();
    final ShardSpec actualSpec = shardSpecLookups.get(bucketStart.getMillis())
        .getShardSpec(rollupGran.bucketStart(inputRow.getTimestamp()).getMillis(), inputRow);
    final HadoopyShardSpec hadoopyShardSpec = hadoopShardSpecLookup.get(bucketStart.getMillis()).get(actualSpec);
    return Optional.of(new Bucket(hadoopyShardSpec.getShardNum(), bucketStart, actualSpec.getPartitionNum()));
}
Also used : DateTime(org.joda.time.DateTime) ShardSpec(org.apache.druid.timeline.partition.ShardSpec) Interval(org.joda.time.Interval)
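
The ShardSpecLookup consulted by getShardSpec maps a (timestamp, row) pair to the concrete spec for a bucket. A plausible sketch of how such a lookup map could be populated; bucketToSpecs and the local variables are hypothetical, not the actual field setup in HadoopDruidIndexerConfig:

// bucketToSpecs: hypothetical map from bucket start millis to that bucket's shard specs.
Map<Long, ShardSpecLookup> shardSpecLookups = new HashMap<>();
for (Map.Entry<Long, List<ShardSpec>> entry : bucketToSpecs.entrySet()) {
    List<ShardSpec> specs = entry.getValue();
    // getLookup() builds the routing structure from the full set of specs for the bucket.
    shardSpecLookups.put(entry.getKey(), specs.get(0).getLookup(specs));
}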

Example 14 with ShardSpec

use of org.apache.druid.timeline.partition.ShardSpec in project druid by druid-io.

the class SegmentManager method dropSegment.

public void dropSegment(final DataSegment segment) {
    final String dataSource = segment.getDataSource();
    // compute() is used to ensure that the operation for a data source is executed atomically
    dataSources.compute(dataSource, (dataSourceName, dataSourceState) -> {
        if (dataSourceState == null) {
            log.info("Told to delete a queryable for a dataSource[%s] that doesn't exist.", dataSourceName);
            return null;
        } else {
            final VersionedIntervalTimeline<String, ReferenceCountingSegment> loadedIntervals = dataSourceState.getTimeline();
            final ShardSpec shardSpec = segment.getShardSpec();
            final PartitionChunk<ReferenceCountingSegment> removed = loadedIntervals.remove(
                segment.getInterval(),
                segment.getVersion(),
                // remove() searches for a chunk equal to the one given here. Note that
                // partitionChunk.equals() checks only the partitionNum, but not the object.
                segment.getShardSpec().createChunk(ReferenceCountingSegment.wrapSegment(null, shardSpec)));
            final ReferenceCountingSegment oldQueryable = (removed == null) ? null : removed.getObject();
            if (oldQueryable != null) {
                try (final Closer closer = Closer.create()) {
                    dataSourceState.removeSegment(segment);
                    closer.register(oldQueryable);
                    log.info("Attempting to close segment %s", segment.getId());
                    final ReferenceCountingIndexedTable oldTable = dataSourceState.tablesLookup.remove(segment.getId());
                    if (oldTable != null) {
                        closer.register(oldTable);
                    }
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            } else {
                log.info("Told to delete a queryable on dataSource[%s] for interval[%s] and version[%s] that I don't have.", dataSourceName, segment.getInterval(), segment.getVersion());
            }
            // Returning null removes the entry of dataSource from the map
            return dataSourceState.isEmpty() ? null : dataSourceState;
        }
    });
    segmentLoader.cleanup(segment);
}
Also used : ReferenceCountingSegment(org.apache.druid.segment.ReferenceCountingSegment) Closer(org.apache.druid.java.util.common.io.Closer) ReferenceCountingIndexedTable(org.apache.druid.segment.join.table.ReferenceCountingIndexedTable) IOException(java.io.IOException) ShardSpec(org.apache.druid.timeline.partition.ShardSpec)
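
The chunk passed to remove() wraps a null segment; that works because PartitionChunk equality is based on the partition position, not the held object. A small illustrative check (not from the Druid sources):

// Two chunks from the same shard spec compare equal regardless of payload.
ShardSpec spec = new NumberedShardSpec(0, 1);
PartitionChunk<String> withPayload = spec.createChunk("payload");
PartitionChunk<String> withoutPayload = spec.createChunk(null);
// true: equals() compares the partition number, not the wrapped object
boolean matches = withPayload.equals(withoutPayload);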

Example 15 with ShardSpec

use of org.apache.druid.timeline.partition.ShardSpec in project druid by druid-io.

the class CachingLocalSegmentAllocator method allocate.

@Override
public SegmentIdWithShardSpec allocate(InputRow row, String sequenceName, String previousSegmentId, boolean skipSegmentLineageCheck) {
    return sequenceNameToSegmentId.computeIfAbsent(sequenceName, k -> {
        final Pair<Interval, BucketNumberedShardSpec> pair = Preconditions.checkNotNull(sequenceNameToBucket.get(sequenceName), "Missing bucket for sequence[%s]", sequenceName);
        final Interval interval = pair.lhs;
        // Determines the partitionId if this segment allocator is used by the single-threaded task.
        // In parallel ingestion, the partitionId is determined in the supervisor task.
        // See ParallelIndexSupervisorTask.groupGenericPartitionLocationsPerPartition().
        // This code... isn't pretty, but should be simple enough to understand.
        final ShardSpec shardSpec = isParallel
            ? pair.rhs
            : pair.rhs.convert(intervalToNextPartitionId.computeInt(
                interval, (i, nextPartitionId) -> nextPartitionId == null ? 0 : nextPartitionId + 1));
        final String version = versionFinder.apply(interval);
        return new SegmentIdWithShardSpec(dataSource, interval, version, shardSpec);
    });
}
Also used : BucketNumberedShardSpec(org.apache.druid.timeline.partition.BucketNumberedShardSpec) SegmentIdWithShardSpec(org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec) ShardSpec(org.apache.druid.timeline.partition.ShardSpec) Interval(org.joda.time.Interval)
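
In the non-parallel branch, computeInt hands out consecutive partition ids per interval (0, 1, 2, ...). An illustrative sketch, assuming intervalToNextPartitionId is a fastutil Object2IntMap as the computeInt call suggests (the local names are hypothetical):

Object2IntMap<Interval> nextIds = new Object2IntOpenHashMap<>();
Interval day = Intervals.of("2020-01-01/2020-01-02");
// First call stores and returns 0; each later call increments the stored value.
int first = nextIds.computeInt(day, (i, prev) -> prev == null ? 0 : prev + 1);  // 0
int second = nextIds.computeInt(day, (i, prev) -> prev == null ? 0 : prev + 1); // 1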

Aggregations

ShardSpec (org.apache.druid.timeline.partition.ShardSpec): 20
Interval (org.joda.time.Interval): 13
ArrayList (java.util.ArrayList): 8
DataSegment (org.apache.druid.timeline.DataSegment): 8
NumberedShardSpec (org.apache.druid.timeline.partition.NumberedShardSpec): 8
List (java.util.List): 7
HashBasedNumberedShardSpec (org.apache.druid.timeline.partition.HashBasedNumberedShardSpec): 6
SingleDimensionShardSpec (org.apache.druid.timeline.partition.SingleDimensionShardSpec): 6
Test (org.junit.Test): 6
ImmutableList (com.google.common.collect.ImmutableList): 5
HashMap (java.util.HashMap): 5
ImmutableMap (com.google.common.collect.ImmutableMap): 3
Map (java.util.Map): 3
TreeMap (java.util.TreeMap): 3
ISE (org.apache.druid.java.util.common.ISE): 3
BucketNumberedShardSpec (org.apache.druid.timeline.partition.BucketNumberedShardSpec): 3
DateTime (org.joda.time.DateTime): 3
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper): 2
IOException (java.io.IOException): 2
Collectors (java.util.stream.Collectors): 2