Use of org.apache.druid.timeline.partition.ShardSpec in project hive by apache.
In the class DruidStorageHandlerUtils, the method publishSegmentsAndCommit:
/**
 * First computes the segment timeline to accommodate new segments for the insert-into case.
 * Then moves segments to Druid deep storage with updated metadata/version.
 * ALL IS DONE IN ONE TRANSACTION.
 *
 * @param connector DBI connector to commit
 * @param metadataStorageTablesConfig Druid metadata tables definitions
 * @param dataSource Druid datasource name
 * @param segments List of segments to move and commit to metadata
 * @param overwrite true if this is an insert overwrite
 * @param conf Configuration
 * @param dataSegmentPusher segment pusher
 * @return List of successfully published Druid segments.
 * This list has the updated versions and metadata about the segments after the move and timeline sorting.
 * @throws CallbackFailedException in case the connector cannot add the segment to the DB.
 */
@SuppressWarnings("unchecked")
static List<DataSegment> publishSegmentsAndCommit(
    final SQLMetadataConnector connector,
    final MetadataStorageTablesConfig metadataStorageTablesConfig,
    final String dataSource,
    final List<DataSegment> segments,
    boolean overwrite,
    Configuration conf,
    DataSegmentPusher dataSegmentPusher) throws CallbackFailedException {
  return connector.getDBI().inTransaction((handle, transactionStatus) -> {
    // We create the timeline for the existing and new segments
    VersionedIntervalTimeline<String, DataSegment> timeline;
    if (overwrite) {
      // If we are overwriting, we disable the existing datasource segments
      disableDataSourceWithHandle(handle, metadataStorageTablesConfig, dataSource);
      // When overwriting, we just start with an empty timeline,
      // as we are overwriting segments with new versions
      timeline = new VersionedIntervalTimeline<>(Ordering.natural());
    } else {
      // Append mode
      if (segments.isEmpty()) {
        // If there are no new segments, we can just bail out
        return Collections.EMPTY_LIST;
      }
      // Otherwise, build a timeline of existing segments in metadata storage
      Interval indexedInterval = JodaUtils.umbrellaInterval(
          segments.stream().map(DataSegment::getInterval).collect(Collectors.toList()));
      LOG.info("Building timeline for umbrella Interval [{}]", indexedInterval);
      timeline = getTimelineForIntervalWithHandle(handle, dataSource, indexedInterval, metadataStorageTablesConfig);
    }
    final List<DataSegment> finalSegmentsToPublish = Lists.newArrayList();
    for (DataSegment segment : segments) {
      List<TimelineObjectHolder<String, DataSegment>> existingChunks = timeline.lookup(segment.getInterval());
      if (existingChunks.size() > 1) {
        // Druid shard specs do not support multiple partitions for the same interval with different granularity.
        throw new IllegalStateException(String.format(
            "Cannot allocate new segment for dataSource[%s], interval[%s], already have [%,d] chunks. "
                + "Not possible to append new segment.",
            dataSource, segment.getInterval(), existingChunks.size()));
      }
      // Find the segment with the latest version and the maximum partition number
      SegmentIdWithShardSpec max = null;
      final ShardSpec newShardSpec;
      final String newVersion;
      if (!existingChunks.isEmpty()) {
        // There is an existing chunk; find the max partition
        TimelineObjectHolder<String, DataSegment> existingHolder = Iterables.getOnlyElement(existingChunks);
        for (PartitionChunk<DataSegment> existing : existingHolder.getObject()) {
          if (max == null
              || max.getShardSpec().getPartitionNum() < existing.getObject().getShardSpec().getPartitionNum()) {
            max = SegmentIdWithShardSpec.fromDataSegment(existing.getObject());
          }
        }
      }
      if (max == null) {
        // No existing shard present in the database, use the current version.
        newShardSpec = segment.getShardSpec();
        newVersion = segment.getVersion();
      } else {
        // Use the version of the existing max segment to generate the new shard spec
        newShardSpec = getNextPartitionShardSpec(max.getShardSpec());
        newVersion = max.getVersion();
      }
      DataSegment publishedSegment = publishSegmentWithShardSpec(
          segment, newShardSpec, newVersion, getPath(segment).getFileSystem(conf), dataSegmentPusher);
      finalSegmentsToPublish.add(publishedSegment);
      timeline.add(
          publishedSegment.getInterval(),
          publishedSegment.getVersion(),
          publishedSegment.getShardSpec().createChunk(publishedSegment));
    }
    // Publish new segments to metadata storage
    final PreparedBatch batch = handle.prepareBatch(String.format(
        "INSERT INTO %1$s (id, dataSource, created_date, start, \"end\", partitioned, version, used, payload) "
            + "VALUES (:id, :dataSource, :created_date, :start, :end, :partitioned, :version, :used, :payload)",
        metadataStorageTablesConfig.getSegmentsTable()));
    for (final DataSegment segment : finalSegmentsToPublish) {
      batch.add(new ImmutableMap.Builder<String, Object>()
          .put("id", segment.getId().toString())
          .put("dataSource", segment.getDataSource())
          .put("created_date", new DateTime().toString())
          .put("start", segment.getInterval().getStart().toString())
          .put("end", segment.getInterval().getEnd().toString())
          .put("partitioned", !(segment.getShardSpec() instanceof NoneShardSpec))
          .put("version", segment.getVersion())
          .put("used", true)
          .put("payload", JSON_MAPPER.writeValueAsBytes(segment))
          .build());
      LOG.info("Published {}", segment.getId().toString());
    }
    batch.execute();
    return finalSegmentsToPublish;
  });
}
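A minimal call sketch follows; it is not taken from Hive itself. It assumes the caller already holds the collaborators that the storage handler wires up elsewhere (metadata connector, tables config, Hadoop Configuration, and a DataSegmentPusher), and the wrapper class, method, and variable names are hypothetical. Because publishSegmentsAndCommit is package-private as listed above, such a caller would have to live in the same package as DruidStorageHandlerUtils.

import java.util.List;
import org.apache.druid.metadata.MetadataStorageTablesConfig;
import org.apache.druid.metadata.SQLMetadataConnector;
import org.apache.druid.segment.loading.DataSegmentPusher;
import org.apache.druid.timeline.DataSegment;
import org.apache.hadoop.conf.Configuration;

final class PublishExample {
  // Hypothetical helper: append freshly built segments to an existing datasource.
  // Must reside in the same package as DruidStorageHandlerUtils (the method is package-private).
  static List<DataSegment> appendSegments(
      SQLMetadataConnector connector,
      MetadataStorageTablesConfig tablesConfig,
      String dataSource,
      List<DataSegment> builtSegments,
      Configuration conf,
      DataSegmentPusher pusher) {
    // overwrite = false selects append mode: each new segment receives the next
    // partition number after the current maximum found in the interval's timeline.
    return DruidStorageHandlerUtils.publishSegmentsAndCommit(
        connector, tablesConfig, dataSource, builtSegments, false, conf, pusher);
  }
}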
Use of org.apache.druid.timeline.partition.ShardSpec in project druid by druid-io.
In the class NewestSegmentFirstPolicyBenchmark, the method setup:
@Setup(Level.Trial)
public void setup() {
  compactionConfigs = new HashMap<>();
  for (int i = 0; i < numDataSources; i++) {
    final String dataSource = DATA_SOURCE_PREFIX + i;
    compactionConfigs.put(
        dataSource,
        new DataSourceCompactionConfig(
            dataSource, 0, inputSegmentSizeBytes, null, null, null, null, null, null, null, null, null));
  }
  List<DataSegment> segments = new ArrayList<>();
  for (int i = 0; i < numDataSources; i++) {
    final String dataSource = DATA_SOURCE_PREFIX + i;
    final int startYear = ThreadLocalRandom.current().nextInt(2000, 2040);
    DateTime date = DateTimes.of(startYear, 1, 1, 0, 0);
    for (int j = 0; j < numDayIntervalsPerDataSource; j++, date = date.plusDays(1)) {
      for (int k = 0; k < numPartitionsPerDayInterval; k++) {
        final ShardSpec shardSpec = new NumberedShardSpec(numPartitionsPerDayInterval, k);
        final DataSegment segment = new DataSegment(
            dataSource,
            new Interval(date, date.plusDays(1)),
            "version",
            null,
            ImmutableList.of(),
            ImmutableList.of(),
            shardSpec,
            0,
            segmentSizeBytes);
        segments.add(segment);
      }
    }
  }
  dataSources = DataSourcesSnapshot
      .fromUsedSegments(segments, ImmutableMap.of())
      .getUsedSegmentsTimelinesPerDataSource();
}
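As a small follow-up, the sketch below is a hypothetical helper (not part of the benchmark) that could sit next to setup() in the same class: it walks the timeline built above for the first datasource and counts its used segments. It assumes the fields populated by setup() and the benchmark's existing imports, plus org.apache.druid.java.util.common.Intervals; Intervals.ETERNITY covers the whole timeline regardless of the randomly chosen start year.

// Hypothetical helper for this benchmark class: count the used segments
// registered for the first datasource by setup().
private int countSegmentsOfFirstDataSource() {
  final VersionedIntervalTimeline<String, DataSegment> timeline = dataSources.get(DATA_SOURCE_PREFIX + 0);
  int segmentCount = 0;
  for (TimelineObjectHolder<String, DataSegment> holder : timeline.lookup(Intervals.ETERNITY)) {
    for (PartitionChunk<DataSegment> chunk : holder.getObject()) {
      // Each chunk corresponds to one partition (one DataSegment) of the interval.
      segmentCount++;
    }
  }
  return segmentCount;
}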
Use of org.apache.druid.timeline.partition.ShardSpec in project druid by druid-io.
In the class HadoopDruidIndexerConfig, the method getBucket:
/**
 ******************************************
 * Granularity/Bucket Helper Methods
 *******************************************
 */

/**
 * Get the proper bucket for some input row.
 *
 * @param inputRow an InputRow
 * @return the Bucket that this row belongs to
 */
Optional<Bucket> getBucket(InputRow inputRow) {
  final Optional<Interval> timeBucket = schema.getDataSchema()
      .getGranularitySpec()
      .bucketInterval(DateTimes.utc(inputRow.getTimestampFromEpoch()));
  if (!timeBucket.isPresent()) {
    return Optional.absent();
  }
  final DateTime bucketStart = timeBucket.get().getStart();
  final ShardSpec actualSpec = shardSpecLookups
      .get(bucketStart.getMillis())
      .getShardSpec(rollupGran.bucketStart(inputRow.getTimestamp()).getMillis(), inputRow);
  final HadoopyShardSpec hadoopyShardSpec = hadoopShardSpecLookup.get(bucketStart.getMillis()).get(actualSpec);
  return Optional.of(new Bucket(hadoopyShardSpec.getShardNum(), bucketStart, actualSpec.getPartitionNum()));
}
Use of org.apache.druid.timeline.partition.ShardSpec in project druid by druid-io.
In the class SegmentManager, the method dropSegment:
public void dropSegment(final DataSegment segment) {
  final String dataSource = segment.getDataSource();
  // compute() is used to ensure that the operation for a data source is executed atomically
  dataSources.compute(dataSource, (dataSourceName, dataSourceState) -> {
    if (dataSourceState == null) {
      log.info("Told to delete a queryable for a dataSource[%s] that doesn't exist.", dataSourceName);
      return null;
    } else {
      final VersionedIntervalTimeline<String, ReferenceCountingSegment> loadedIntervals =
          dataSourceState.getTimeline();
      final ShardSpec shardSpec = segment.getShardSpec();
      final PartitionChunk<ReferenceCountingSegment> removed = loadedIntervals.remove(
          segment.getInterval(),
          segment.getVersion(),
          // remove() internally searches for a partitionChunk to remove which is equal to the given
          // partitionChunk. Note that partitionChunk.equals() checks only the partitionNum, but not the object.
          segment.getShardSpec().createChunk(ReferenceCountingSegment.wrapSegment(null, shardSpec)));
      final ReferenceCountingSegment oldQueryable = (removed == null) ? null : removed.getObject();
      if (oldQueryable != null) {
        try (final Closer closer = Closer.create()) {
          dataSourceState.removeSegment(segment);
          closer.register(oldQueryable);
          log.info("Attempting to close segment %s", segment.getId());
          final ReferenceCountingIndexedTable oldTable = dataSourceState.tablesLookup.remove(segment.getId());
          if (oldTable != null) {
            closer.register(oldTable);
          }
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
      } else {
        log.info(
            "Told to delete a queryable on dataSource[%s] for interval[%s] and version[%s] that I don't have.",
            dataSourceName, segment.getInterval(), segment.getVersion());
      }
      // Returning null removes the entry of dataSource from the map
      return dataSourceState.isEmpty() ? null : dataSourceState;
    }
  });
  segmentLoader.cleanup(segment);
}
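The compute() idiom used above is worth spelling out: the remapping function runs atomically for the key, and returning null removes the entry, which is how an emptied per-datasource state disappears from the map. The standalone JDK-only sketch below illustrates this with plain collections and hypothetical names; it has no Druid dependencies.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;

final class ComputeRemovalSketch {
  public static void main(String[] args) {
    final ConcurrentHashMap<String, List<String>> state = new ConcurrentHashMap<>();
    state.put("wiki", new ArrayList<>(Arrays.asList("segment-1")));
    // The remapping function runs atomically for the key; returning null drops the entry.
    state.compute("wiki", (dataSourceName, segments) -> {
      segments.remove("segment-1");
      return segments.isEmpty() ? null : segments;
    });
    System.out.println(state.containsKey("wiki"));  // prints false: the emptied entry was removed
  }
}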
Use of org.apache.druid.timeline.partition.ShardSpec in project druid by druid-io.
In the class CachingLocalSegmentAllocator, the method allocate:
@Override
public SegmentIdWithShardSpec allocate(
    InputRow row,
    String sequenceName,
    String previousSegmentId,
    boolean skipSegmentLineageCheck) {
  return sequenceNameToSegmentId.computeIfAbsent(sequenceName, k -> {
    final Pair<Interval, BucketNumberedShardSpec> pair = Preconditions.checkNotNull(
        sequenceNameToBucket.get(sequenceName), "Missing bucket for sequence[%s]", sequenceName);
    final Interval interval = pair.lhs;
    // Determines the partitionId if this segment allocator is used by the single-threaded task.
    // In parallel ingestion, the partitionId is determined in the supervisor task.
    // See ParallelIndexSupervisorTask.groupGenericPartitionLocationsPerPartition().
    // This code... isn't pretty, but should be simple enough to understand.
    final ShardSpec shardSpec = isParallel
        ? pair.rhs
        : pair.rhs.convert(intervalToNextPartitionId.computeInt(
            interval,
            (i, nextPartitionId) -> nextPartitionId == null ? 0 : nextPartitionId + 1));
    final String version = versionFinder.apply(interval);
    return new SegmentIdWithShardSpec(dataSource, interval, version, shardSpec);
  });
}
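The intervalToNextPartitionId.computeInt(...) call above is just a per-interval counter. The standalone sketch below shows the numbering it produces, using a plain JDK HashMap in place of fastutil's Object2IntMap; the class and variable names are hypothetical, and only Joda-Time is required.

import java.util.HashMap;
import java.util.Map;
import org.joda.time.Interval;

final class PartitionCounterSketch {
  public static void main(String[] args) {
    final Map<Interval, Integer> nextPartitionId = new HashMap<>();
    final Interval day = Interval.parse("2021-06-01T00:00:00Z/2021-06-02T00:00:00Z");
    // Each call hands out the next partition id for the interval, starting at 0.
    for (int i = 0; i < 3; i++) {
      final int partitionId = nextPartitionId.compute(day, (interval, prev) -> prev == null ? 0 : prev + 1);
      System.out.println("allocated partitionId = " + partitionId);  // prints 0, 1, 2
    }
  }
}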