Use of org.apache.druid.java.util.common.NonnullPair in project druid by druid-io.
From the class ListFilteredDimensionSpecDimensionSelectorTest, method createDictionaries.
private NonnullPair<Object2IntMap<String>, Int2ObjectMap<String>> createDictionaries(List<List<String>> values)
{
  Object2IntMap<String> dictionary = new Object2IntOpenHashMap<>();
  Int2ObjectMap<String> reverseDictionary = new Int2ObjectOpenHashMap<>();
  MutableInt nextId = new MutableInt(0);
  for (List<String> multiValue : values) {
    for (String value : multiValue) {
      int dictId = dictionary.computeIntIfAbsent(value, k -> nextId.getAndIncrement());
      reverseDictionary.putIfAbsent(dictId, value);
    }
  }
  return new NonnullPair<>(dictionary, reverseDictionary);
}
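For context, NonnullPair is a small two-field holder: the examples on this page read its values through the public lhs and rhs fields. A minimal sketch of an equivalent class, assuming only what those accesses imply (not the exact Druid source):

import java.util.Objects;

// Minimal stand-in for org.apache.druid.java.util.common.NonnullPair, based on how it is used on this page:
// two public final fields, with both values required to be non-null.
public class NonnullPair<L, R>
{
  public final L lhs;
  public final R rhs;

  public NonnullPair(L lhs, R rhs)
  {
    this.lhs = Objects.requireNonNull(lhs, "lhs");
    this.rhs = Objects.requireNonNull(rhs, "rhs");
  }

  @Override
  public String toString()
  {
    return "NonnullPair{lhs=" + lhs + ", rhs=" + rhs + "}";
  }
}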
Use of org.apache.druid.java.util.common.NonnullPair in project druid by druid-io.
From the class CompactionTask, method createIngestionSchema.
/**
 * Generate {@link ParallelIndexIngestionSpec} from input segments.
 *
 * @return an empty list if input segments don't exist. Otherwise, a list of generated ingestionSpecs.
 */
@VisibleForTesting
static List<ParallelIndexIngestionSpec> createIngestionSchema(
    final TaskToolbox toolbox,
    final LockGranularity lockGranularityInUse,
    final SegmentProvider segmentProvider,
    final PartitionConfigurationManager partitionConfigurationManager,
    @Nullable final DimensionsSpec dimensionsSpec,
    @Nullable final ClientCompactionTaskTransformSpec transformSpec,
    @Nullable final AggregatorFactory[] metricsSpec,
    @Nullable final ClientCompactionTaskGranularitySpec granularitySpec,
    final CoordinatorClient coordinatorClient,
    final SegmentCacheManagerFactory segmentCacheManagerFactory,
    final RetryPolicyFactory retryPolicyFactory,
    final boolean dropExisting
) throws IOException, SegmentLoadingException
{
  NonnullPair<Map<DataSegment, File>, List<TimelineObjectHolder<String, DataSegment>>> pair =
      prepareSegments(toolbox, segmentProvider, lockGranularityInUse);
  final Map<DataSegment, File> segmentFileMap = pair.lhs;
  final List<TimelineObjectHolder<String, DataSegment>> timelineSegments = pair.rhs;
  if (timelineSegments.size() == 0) {
    return Collections.emptyList();
  }

  // find metadata for interval
  // queryableIndexAndSegments is sorted by the interval of the dataSegment
  final List<NonnullPair<QueryableIndex, DataSegment>> queryableIndexAndSegments =
      loadSegments(timelineSegments, segmentFileMap, toolbox.getIndexIO());

  final CompactionTuningConfig compactionTuningConfig = partitionConfigurationManager.computeTuningConfig();

  if (granularitySpec == null || granularitySpec.getSegmentGranularity() == null) {
    // original granularity
    final Map<Interval, List<NonnullPair<QueryableIndex, DataSegment>>> intervalToSegments =
        new TreeMap<>(Comparators.intervalsByStartThenEnd());
    queryableIndexAndSegments.forEach(
        p -> intervalToSegments.computeIfAbsent(p.rhs.getInterval(), k -> new ArrayList<>()).add(p)
    );

    // Unify overlapping intervals to ensure that overlapping segments are compacted in the same indexSpec.
    List<NonnullPair<Interval, List<NonnullPair<QueryableIndex, DataSegment>>>> intervalToSegmentsUnified = new ArrayList<>();
    Interval union = null;
    List<NonnullPair<QueryableIndex, DataSegment>> segments = new ArrayList<>();
    for (Entry<Interval, List<NonnullPair<QueryableIndex, DataSegment>>> entry : intervalToSegments.entrySet()) {
      Interval cur = entry.getKey();
      if (union == null) {
        union = cur;
        segments.addAll(entry.getValue());
      } else if (union.overlaps(cur)) {
        union = Intervals.utc(union.getStartMillis(), Math.max(union.getEndMillis(), cur.getEndMillis()));
        segments.addAll(entry.getValue());
      } else {
        intervalToSegmentsUnified.add(new NonnullPair<>(union, segments));
        union = cur;
        segments = new ArrayList<>(entry.getValue());
      }
    }
    intervalToSegmentsUnified.add(new NonnullPair<>(union, segments));

    final List<ParallelIndexIngestionSpec> specs = new ArrayList<>(intervalToSegmentsUnified.size());
    for (NonnullPair<Interval, List<NonnullPair<QueryableIndex, DataSegment>>> entry : intervalToSegmentsUnified) {
      final Interval interval = entry.lhs;
      final List<NonnullPair<QueryableIndex, DataSegment>> segmentsToCompact = entry.rhs;
      // If granularitySpec is not null, reuse it with the segmentGranularity set. Otherwise, create a new
      // granularitySpec with the segmentGranularity derived from the interval.
      Granularity segmentGranularityToUse = GranularityType.fromPeriod(interval.toPeriod()).getDefaultGranularity();
      final DataSchema dataSchema = createDataSchema(
          segmentProvider.dataSource,
          segmentsToCompact,
          dimensionsSpec,
          transformSpec,
          metricsSpec,
          granularitySpec == null
              ? new ClientCompactionTaskGranularitySpec(segmentGranularityToUse, null, null)
              : granularitySpec.withSegmentGranularity(segmentGranularityToUse)
      );
      specs.add(new ParallelIndexIngestionSpec(
          dataSchema,
          createIoConfig(toolbox, dataSchema, interval, coordinatorClient, segmentCacheManagerFactory, retryPolicyFactory, dropExisting),
          compactionTuningConfig
      ));
    }
    return specs;
  } else {
    // given segment granularity
    final DataSchema dataSchema = createDataSchema(
        segmentProvider.dataSource, queryableIndexAndSegments, dimensionsSpec, transformSpec, metricsSpec, granularitySpec
    );
    return Collections.singletonList(new ParallelIndexIngestionSpec(
        dataSchema,
        createIoConfig(toolbox, dataSchema, segmentProvider.interval, coordinatorClient, segmentCacheManagerFactory, retryPolicyFactory, dropExisting),
        compactionTuningConfig
    ));
  }
}
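The interval-unification loop above relies on intervalToSegments being a TreeMap sorted by interval start, so a single pass can merge overlapping intervals into unions. Below is a self-contained sketch of the same merge idea using plain Joda-Time intervals; mergeOverlapping and the sample dates are illustrative, not Druid code.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.joda.time.Interval;

public class IntervalUnionSketch
{
  // Merge a list of intervals that is already sorted by start time into non-overlapping unions,
  // mirroring the single-pass loop in createIngestionSchema.
  static List<Interval> mergeOverlapping(List<Interval> sortedByStart)
  {
    final List<Interval> unions = new ArrayList<>();
    Interval union = null;
    for (Interval cur : sortedByStart) {
      if (union == null) {
        union = cur;
      } else if (union.overlaps(cur)) {
        // Starts are sorted, so extending a union can only move its end forward.
        union = new Interval(union.getStartMillis(), Math.max(union.getEndMillis(), cur.getEndMillis()));
      } else {
        unions.add(union);
        union = cur;
      }
    }
    if (union != null) {
      unions.add(union);
    }
    return unions;
  }

  public static void main(String[] args)
  {
    List<Interval> sorted = Arrays.asList(
        Interval.parse("2021-01-01/2021-01-03"),
        Interval.parse("2021-01-02/2021-01-05"),
        Interval.parse("2021-01-07/2021-01-08")
    );
    // The first two intervals overlap and collapse into one union; the third stays separate.
    System.out.println(mergeOverlapping(sorted));
  }
}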
Use of org.apache.druid.java.util.common.NonnullPair in project druid by druid-io.
From the class CompactionTask, method createDimensionsSpec.
private static DimensionsSpec createDimensionsSpec(List<NonnullPair<QueryableIndex, DataSegment>> queryableIndices)
{
  final BiMap<String, Integer> uniqueDims = HashBiMap.create();
  final Map<String, DimensionSchema> dimensionSchemaMap = new HashMap<>();

  // Here, we try to retain the order of dimensions as they were specified since the order of dimensions may be
  // optimized for performance.
  // Dimensions are extracted from the most recent segments to the oldest ones because recent segments are likely
  // to be queried more frequently, and thus performance should be optimized for recent segments rather than old ones.
  // Sort timelineSegments in order of interval; see https://github.com/apache/druid/pull/9905.
  queryableIndices.sort((o1, o2) -> Comparators.intervalsByStartThenEnd().compare(o1.rhs.getInterval(), o2.rhs.getInterval()));

  int index = 0;
  for (NonnullPair<QueryableIndex, DataSegment> pair : Lists.reverse(queryableIndices)) {
    final QueryableIndex queryableIndex = pair.lhs;
    final Map<String, DimensionHandler> dimensionHandlerMap = queryableIndex.getDimensionHandlers();
    for (String dimension : queryableIndex.getAvailableDimensions()) {
      final ColumnHolder columnHolder = Preconditions.checkNotNull(
          queryableIndex.getColumnHolder(dimension),
          "Cannot find column for dimension[%s]",
          dimension
      );
      if (!uniqueDims.containsKey(dimension)) {
        final DimensionHandler dimensionHandler = Preconditions.checkNotNull(
            dimensionHandlerMap.get(dimension),
            "Cannot find dimensionHandler for dimension[%s]",
            dimension
        );
        uniqueDims.put(dimension, index++);
        dimensionSchemaMap.put(
            dimension,
            createDimensionSchema(dimension, columnHolder.getCapabilities(), dimensionHandler.getMultivalueHandling())
        );
      }
    }
  }

  final BiMap<Integer, String> orderedDims = uniqueDims.inverse();
  final List<DimensionSchema> dimensionSchemas = IntStream
      .range(0, orderedDims.size())
      .mapToObj(i -> {
        final String dimName = orderedDims.get(i);
        return Preconditions.checkNotNull(dimensionSchemaMap.get(dimName), "Cannot find dimension[%s] from dimensionSchemaMap", dimName);
      })
      .collect(Collectors.toList());
  return new DimensionsSpec(dimensionSchemas);
}
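The BiMap above assigns each dimension an index the first time it is seen while walking segments newest-first, then inverts the map to read the dimensions back in index order. A stripped-down illustration of that pattern with plain strings; the segment contents here are made up:

import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;
import com.google.common.collect.Lists;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class DimensionOrderSketch
{
  public static void main(String[] args)
  {
    // Hypothetical dimension lists per segment, ordered oldest to newest.
    List<List<String>> dimsPerSegmentOldestFirst = Arrays.asList(
        Arrays.asList("country", "city"),
        Arrays.asList("country", "device", "city")
    );

    final BiMap<String, Integer> uniqueDims = HashBiMap.create();
    int index = 0;
    // Walk newest-first so the most recent segment decides the order; the first sighting of a dimension wins.
    for (List<String> dims : Lists.reverse(dimsPerSegmentOldestFirst)) {
      for (String dim : dims) {
        if (!uniqueDims.containsKey(dim)) {
          uniqueDims.put(dim, index++);
        }
      }
    }

    // Invert to get index -> dimension, then read the dimensions back in index order.
    final BiMap<Integer, String> ordered = uniqueDims.inverse();
    List<String> orderedDims = new ArrayList<>();
    for (int i = 0; i < ordered.size(); i++) {
      orderedDims.add(ordered.get(i));
    }
    System.out.println(orderedDims); // prints [country, device, city]
  }
}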
Use of org.apache.druid.java.util.common.NonnullPair in project druid by druid-io.
From the class SinglePhaseParallelIndexTaskRunner, method findIntervalAndVersion.
private NonnullPair<Interval, String> findIntervalAndVersion(DateTime timestamp) throws IOException
{
  final GranularitySpec granularitySpec = getIngestionSchema().getDataSchema().getGranularitySpec();
  // This method is called whenever subtasks need to allocate a new segment via the supervisor task.
  // As a result, this code is never called in the Overlord. For now, using the materialized intervals
  // here is ok for performance reasons.
  final Set<Interval> materializedBucketIntervals = granularitySpec.materializedBucketIntervals();

  // List locks whenever allocating a new segment because locks might be revoked and no longer valid.
  final List<TaskLock> locks = getToolbox().getTaskActionClient().submit(new LockListAction());
  final TaskLock revokedLock = locks.stream().filter(TaskLock::isRevoked).findAny().orElse(null);
  if (revokedLock != null) {
    throw new ISE("Lock revoked: [%s]", revokedLock);
  }
  final Map<Interval, String> versions = locks
      .stream()
      .collect(Collectors.toMap(TaskLock::getInterval, TaskLock::getVersion));

  Interval interval;
  String version;
  if (!materializedBucketIntervals.isEmpty()) {
    // If the granularity spec has explicit intervals, we just need to find the version associated with the interval.
    // This is because we should have gotten all required locks up front when the task starts up.
    final Optional<Interval> maybeInterval = granularitySpec.bucketInterval(timestamp);
    if (!maybeInterval.isPresent()) {
      throw new IAE("Could not find interval for timestamp [%s]", timestamp);
    }

    interval = maybeInterval.get();
    if (!materializedBucketIntervals.contains(interval)) {
      throw new ISE("Unspecified interval[%s] in granularitySpec[%s]", interval, granularitySpec);
    }

    version = ParallelIndexSupervisorTask.findVersion(versions, interval);
    if (version == null) {
      throw new ISE("Cannot find a version for interval[%s]", interval);
    }
  } else {
    // We don't have explicit intervals. We can use the segment granularity to figure out what
    // interval we need, but we might not have already locked it.
    interval = granularitySpec.getSegmentGranularity().bucket(timestamp);
    version = ParallelIndexSupervisorTask.findVersion(versions, interval);
    if (version == null) {
      final int maxAllowedLockCount = getIngestionSchema().getTuningConfig().getMaxAllowedLockCount();
      if (maxAllowedLockCount >= 0 && locks.size() >= maxAllowedLockCount) {
        throw new MaxAllowedLocksExceededException(maxAllowedLockCount);
      }
      // We don't have a lock for this interval, so we should lock it now.
      final TaskLock lock = Preconditions.checkNotNull(
          getToolbox().getTaskActionClient().submit(new TimeChunkLockTryAcquireAction(TaskLockType.EXCLUSIVE, interval)),
          "Cannot acquire a lock for interval[%s]",
          interval
      );
      if (lock.isRevoked()) {
        throw new ISE(StringUtils.format("Lock for interval [%s] was revoked.", interval));
      }
      version = lock.getVersion();
    }
  }
  return new NonnullPair<>(interval, version);
}
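Both lookups above defer to ParallelIndexSupervisorTask.findVersion, which is expected to return the version of a lock whose interval covers the requested interval, or null when there is none. A plausible sketch of that behavior, not the exact Druid implementation:

// Plausible sketch (not the exact Druid implementation): return the version of the first lock
// whose interval contains the requested interval, or null if no such lock exists.
@Nullable
static String findVersion(Map<Interval, String> versions, Interval interval)
{
  return versions.entrySet()
                 .stream()
                 .filter(entry -> entry.getKey().contains(interval))
                 .map(Map.Entry::getValue)
                 .findFirst()
                 .orElse(null);
}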
Use of org.apache.druid.java.util.common.NonnullPair in project druid by druid-io.
From the class CompactionTask, method loadSegments.
private static List<NonnullPair<QueryableIndex, DataSegment>> loadSegments(
    List<TimelineObjectHolder<String, DataSegment>> timelineObjectHolders,
    Map<DataSegment, File> segmentFileMap,
    IndexIO indexIO
) throws IOException
{
  final List<NonnullPair<QueryableIndex, DataSegment>> segments = new ArrayList<>();
  for (TimelineObjectHolder<String, DataSegment> timelineObjectHolder : timelineObjectHolders) {
    final PartitionHolder<DataSegment> partitionHolder = timelineObjectHolder.getObject();
    for (PartitionChunk<DataSegment> chunk : partitionHolder) {
      final DataSegment segment = chunk.getObject();
      final QueryableIndex queryableIndex = indexIO.loadIndex(
          Preconditions.checkNotNull(segmentFileMap.get(segment), "File for segment %s", segment.getId())
      );
      segments.add(new NonnullPair<>(queryableIndex, segment));
    }
  }
  return segments;
}
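Callers read these pairs positionally, lhs for the in-memory index and rhs for the segment metadata. A minimal, hypothetical helper showing that consumption pattern (not part of CompactionTask):

// Hypothetical helper: inspect each loaded index alongside the segment metadata it was built from.
private static void logLoadedSegments(List<NonnullPair<QueryableIndex, DataSegment>> segments)
{
  for (NonnullPair<QueryableIndex, DataSegment> pair : segments) {
    final QueryableIndex index = pair.lhs;    // columns and rows of the loaded segment
    final DataSegment segment = pair.rhs;     // identity and interval of the segment
    System.out.println(segment.getId() + " covers " + segment.getInterval() + " with " + index.getNumRows() + " rows");
  }
}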