
Example 1 with BucketNumberedShardSpec

use of org.apache.druid.timeline.partition.BucketNumberedShardSpec in project druid by druid-io.

From the class SegmentPublisherHelper, the method annotateShardSpec:

/**
 * This method fills missing information in the shard spec if necessary when publishing segments.
 *
 * - When time chunk lock is used, the non-appending task should set the proper size of the core partitions for
 *   dynamically-partitioned segments. See {@link #annotateCorePartitionSetSizeFn}.
 * - When segment lock is used, the overwriting task should set the proper size of the atomic update group.
 *   See {@link #annotateAtomicUpdateGroupFn}.
 */
static Set<DataSegment> annotateShardSpec(Set<DataSegment> segments) {
    final Map<Interval, List<DataSegment>> intervalToSegments = new HashMap<>();
    segments.forEach(segment -> intervalToSegments.computeIfAbsent(segment.getInterval(), k -> new ArrayList<>()).add(segment));
    for (Entry<Interval, List<DataSegment>> entry : intervalToSegments.entrySet()) {
        final Interval interval = entry.getKey();
        final List<DataSegment> segmentsPerInterval = entry.getValue();
        final ShardSpec firstShardSpec = segmentsPerInterval.get(0).getShardSpec();
        final boolean anyMismatch = segmentsPerInterval.stream().anyMatch(segment -> segment.getShardSpec().getClass() != firstShardSpec.getClass());
        if (anyMismatch) {
            throw new ISE("Mismatched shardSpecs in interval[%s] for segments[%s]", interval, segmentsPerInterval);
        }
        final Function<DataSegment, DataSegment> annotateFn;
        if (firstShardSpec instanceof OverwriteShardSpec) {
            annotateFn = annotateAtomicUpdateGroupFn(segmentsPerInterval.size());
        } else if (firstShardSpec instanceof BuildingShardSpec) {
            // sanity check
            // BuildingShardSpec is used in non-appending mode. In this mode,
            // the segments in each interval should have contiguous partitionIds,
            // so that they can be queryable (see PartitionHolder.isComplete()).
            int expectedCorePartitionSetSize = segmentsPerInterval.size();
            int actualCorePartitionSetSize = Math.toIntExact(segmentsPerInterval.stream().filter(segment -> segment.getShardSpec().getPartitionNum() < expectedCorePartitionSetSize).count());
            if (expectedCorePartitionSetSize != actualCorePartitionSetSize) {
                LOG.errorSegments(segmentsPerInterval, "Cannot publish segments due to incomplete time chunk");
                throw new ISE("Cannot publish segments due to incomplete time chunk for interval[%s]. " + "Expected [%s] segments in the core partition, but only [%s] segments are found. " + "See task logs for more details about these segments.", interval, expectedCorePartitionSetSize, actualCorePartitionSetSize);
            }
            annotateFn = annotateCorePartitionSetSizeFn(expectedCorePartitionSetSize);
        } else if (firstShardSpec instanceof BucketNumberedShardSpec) {
            throw new ISE("Cannot publish segments with shardSpec[%s]", firstShardSpec);
        } else {
            annotateFn = null;
        }
        if (annotateFn != null) {
            intervalToSegments.put(interval, segmentsPerInterval.stream().map(annotateFn).collect(Collectors.toList()));
        }
    }
    return intervalToSegments.values().stream().flatMap(Collection::stream).collect(Collectors.toSet());
}
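
The two annotate functions named in the Javadoc are not shown on this page. As a rough sketch of their likely shape, assuming DataSegment#withShardSpec, BuildingShardSpec#convert, and a with-style setter for the atomic update group size on OverwriteShardSpec (treat these names as assumptions, not verified Druid source):

// Sketch only: each function rewrites a segment's shardSpec with information
// that is known only once all segments of the interval have been collected.
private static Function<DataSegment, DataSegment> annotateCorePartitionSetSizeFn(int corePartitionSetSize) {
    // Finalize the intermediate BuildingShardSpec into its queryable form.
    return segment -> segment.withShardSpec(
        ((BuildingShardSpec<?>) segment.getShardSpec()).convert(corePartitionSetSize)
    );
}

private static Function<DataSegment, DataSegment> annotateAtomicUpdateGroupFn(int atomicUpdateGroupSize) {
    // Record how many segments must be swapped in atomically when overwriting.
    return segment -> segment.withShardSpec(
        ((OverwriteShardSpec) segment.getShardSpec()).withAtomicUpdateGroupSize((short) atomicUpdateGroupSize)
    );
}
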
Also used : Logger(org.apache.druid.java.util.common.logger.Logger) ShardSpec(org.apache.druid.timeline.partition.ShardSpec) BuildingShardSpec(org.apache.druid.timeline.partition.BuildingShardSpec) Collection(java.util.Collection) OverwriteShardSpec(org.apache.druid.timeline.partition.OverwriteShardSpec) Set(java.util.Set) ISE(org.apache.druid.java.util.common.ISE) HashMap(java.util.HashMap) Function(java.util.function.Function) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) Interval(org.joda.time.Interval) List(java.util.List) Map(java.util.Map) DataSegment(org.apache.druid.timeline.DataSegment) Entry(java.util.Map.Entry) BucketNumberedShardSpec(org.apache.druid.timeline.partition.BucketNumberedShardSpec)

Example 2 with BucketNumberedShardSpec

use of org.apache.druid.timeline.partition.BucketNumberedShardSpec in project druid by druid-io.

From the class LocalIntermediaryDataManager, the method addSegment:

/**
 * Write a segment into one of configured locations. The location to write is chosen in a round-robin manner per
 * supervisorTaskId.
 */
@Override
public DataSegment addSegment(String supervisorTaskId, String subTaskId, DataSegment segment, File segmentDir) throws IOException {
    // Get or create the location iterator for supervisorTask.
    final Iterator<StorageLocation> iterator = locationIterators.computeIfAbsent(supervisorTaskId, k -> {
        final Iterator<StorageLocation> cyclicIterator = Iterators.cycle(shuffleDataLocations);
        // Random start of the iterator
        final int random = ThreadLocalRandom.current().nextInt(shuffleDataLocations.size());
        IntStream.range(0, random).forEach(i -> cyclicIterator.next());
        return cyclicIterator;
    });
    // Create a zipped segment in a temp directory.
    final File taskTempDir = taskConfig.getTaskTempDir(subTaskId);
    final Closer closer = Closer.create();
    closer.register(() -> {
        try {
            org.apache.commons.io.FileUtils.forceDelete(taskTempDir);
        } catch (IOException e) {
            LOG.warn(e, "Failed to delete directory[%s]", taskTempDir.getAbsolutePath());
        }
    });
    if (!(segment.getShardSpec() instanceof BucketNumberedShardSpec)) {
        throw new IAE("Invalid shardSpec type. Expected [%s] but got [%s]", BucketNumberedShardSpec.class.getName(), segment.getShardSpec().getClass().getName());
    }
    final BucketNumberedShardSpec<?> bucketNumberedShardSpec = (BucketNumberedShardSpec<?>) segment.getShardSpec();
    // noinspection unused
    try (final Closer resourceCloser = closer) {
        FileUtils.mkdirp(taskTempDir);
        // Temporary compressed file. Will be removed when taskTempDir is deleted.
        final File tempZippedFile = new File(taskTempDir, segment.getId().toString());
        final long unzippedSizeBytes = CompressionUtils.zip(segmentDir, tempZippedFile);
        if (unzippedSizeBytes == 0) {
            throw new IOE("Read 0 bytes from segmentDir[%s]", segmentDir.getAbsolutePath());
        }
        // Try copying the zipped segment to one of storage locations
        for (int i = 0; i < shuffleDataLocations.size(); i++) {
            final StorageLocation location = iterator.next();
            // We must use the bucket ID instead of the partition ID here.
            final String partitionFilePath = getPartitionFilePath(supervisorTaskId, subTaskId, segment.getInterval(), bucketNumberedShardSpec.getBucketId());
            final File destFile = location.reserve(partitionFilePath, segment.getId().toString(), tempZippedFile.length());
            if (destFile != null) {
                try {
                    FileUtils.mkdirp(destFile.getParentFile());
                    FileUtils.writeAtomically(destFile, out -> Files.asByteSource(tempZippedFile).copyTo(out));
                    LOG.info("Wrote intermediary segment[%s] for subtask[%s] at [%s]", segment.getId(), subTaskId, destFile);
                    return segment.withSize(unzippedSizeBytes).withBinaryVersion(SegmentUtils.getVersionFromDir(segmentDir));
                } catch (Exception e) {
                    location.release(partitionFilePath, tempZippedFile.length());
                    org.apache.commons.io.FileUtils.deleteQuietly(destFile);
                    LOG.warn(e, "Failed to write segment[%s] at [%s]. Trying again with the next location", segment.getId(), destFile);
                }
            }
        }
        throw new ISE("Can't find location to handle segment[%s]", segment);
    }
}
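
The in-line comment about the bucket ID is the crux of this method: a BucketNumberedShardSpec carries only a bucket ID, and its final partition number has not been assigned yet while subtasks are still shuffling data, so the intermediary file path must be keyed by the bucket ID. A minimal illustration (getBucketId() and convert() appear elsewhere on this page; the surrounding scaffolding is illustrative):

// Illustrative only: why the shuffle path is keyed by bucketId.
BucketNumberedShardSpec<?> spec = (BucketNumberedShardSpec<?>) segment.getShardSpec();
int bucketId = spec.getBucketId(); // stable and known while the subtask writes
// The partition number is assigned later, when the supervisor task converts
// the bucket into a BuildingShardSpec via spec.convert(partitionId) (see Example 4).
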
Also used : Closer(org.apache.druid.java.util.common.io.Closer) IOException(java.io.IOException) IAE(org.apache.druid.java.util.common.IAE) BucketNumberedShardSpec(org.apache.druid.timeline.partition.BucketNumberedShardSpec) ISE(org.apache.druid.java.util.common.ISE) StorageLocation(org.apache.druid.segment.loading.StorageLocation) File(java.io.File) IOE(org.apache.druid.java.util.common.IOE)

Example 3 with BucketNumberedShardSpec

use of org.apache.druid.timeline.partition.BucketNumberedShardSpec in project druid by druid-io.

From the class ShardSpecsTest, the method testShardSpecSelectionWithNullPartitionDimension:

@Test
public void testShardSpecSelectionWithNullPartitionDimension() {
    HashBucketShardSpec spec1 = new HashBucketShardSpec(0, 2, null, HashPartitionFunction.MURMUR3_32_ABS, jsonMapper);
    HashBucketShardSpec spec2 = new HashBucketShardSpec(1, 2, null, HashPartitionFunction.MURMUR3_32_ABS, jsonMapper);
    Map<Interval, List<BucketNumberedShardSpec<?>>> shardSpecMap = new HashMap<>();
    shardSpecMap.put(Intervals.of("2014-01-01T00:00:00.000Z/2014-01-02T00:00:00.000Z"), ImmutableList.of(spec1, spec2));
    ShardSpecs shardSpecs = new ShardSpecs(shardSpecMap, Granularities.HOUR);
    String visitorId = "visitorId";
    String clientType = "clientType";
    long timestamp1 = DateTimes.of("2014-01-01T00:00:00.000Z").getMillis();
    InputRow row1 = new MapBasedInputRow(timestamp1, Lists.newArrayList(visitorId, clientType), ImmutableMap.of(visitorId, "0", clientType, "iphone"));
    long timestamp2 = DateTimes.of("2014-01-01T00:30:20.456Z").getMillis();
    InputRow row2 = new MapBasedInputRow(timestamp2, Lists.newArrayList(visitorId, clientType), ImmutableMap.of(visitorId, "0", clientType, "iphone"));
    long timestamp3 = DateTimes.of("2014-01-01T10:10:20.456Z").getMillis();
    InputRow row3 = new MapBasedInputRow(timestamp3, Lists.newArrayList(visitorId, clientType), ImmutableMap.of(visitorId, "0", clientType, "iphone"));
    ShardSpec spec3 = shardSpecs.getShardSpec(Intervals.of("2014-01-01T00:00:00.000Z/2014-01-02T00:00:00.000Z"), row1);
    ShardSpec spec4 = shardSpecs.getShardSpec(Intervals.of("2014-01-01T00:00:00.000Z/2014-01-02T00:00:00.000Z"), row2);
    ShardSpec spec5 = shardSpecs.getShardSpec(Intervals.of("2014-01-01T00:00:00.000Z/2014-01-02T00:00:00.000Z"), row3);
    Assert.assertSame(spec3, spec4);
    Assert.assertNotSame(spec3, spec5);
}
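
The assertions follow from the HOUR granularity passed to ShardSpecs: row1 and row2 carry identical dimension values and fall in the same hour, so they hash to the same bucket, while row3 falls in a different hour. A quick way to see the truncation step, assuming Granularity#bucketStart behaves as in the Druid API (this snippet is illustrative and not part of the test):

// With HOUR granularity, the first two timestamps truncate to the same bucket start.
DateTime h1 = Granularities.HOUR.bucketStart(DateTimes.of("2014-01-01T00:00:00.000Z"));
DateTime h2 = Granularities.HOUR.bucketStart(DateTimes.of("2014-01-01T00:30:20.456Z"));
DateTime h3 = Granularities.HOUR.bucketStart(DateTimes.of("2014-01-01T10:10:20.456Z"));
// h1.equals(h2) is true while h1.equals(h3) is false, matching the assertions above.
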
Also used : HashMap(java.util.HashMap) HashBucketShardSpec(org.apache.druid.timeline.partition.HashBucketShardSpec) MapBasedInputRow(org.apache.druid.data.input.MapBasedInputRow) InputRow(org.apache.druid.data.input.InputRow) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) ShardSpec(org.apache.druid.timeline.partition.ShardSpec) BucketNumberedShardSpec(org.apache.druid.timeline.partition.BucketNumberedShardSpec) Interval(org.joda.time.Interval) Test(org.junit.Test)

Example 4 with BucketNumberedShardSpec

use of org.apache.druid.timeline.partition.BucketNumberedShardSpec in project druid by druid-io.

From the class CachingLocalSegmentAllocator, the method allocate:

@Override
public SegmentIdWithShardSpec allocate(InputRow row, String sequenceName, String previousSegmentId, boolean skipSegmentLineageCheck) {
    return sequenceNameToSegmentId.computeIfAbsent(sequenceName, k -> {
        final Pair<Interval, BucketNumberedShardSpec> pair = Preconditions.checkNotNull(sequenceNameToBucket.get(sequenceName), "Missing bucket for sequence[%s]", sequenceName);
        final Interval interval = pair.lhs;
        // Determines the partitionId if this segment allocator is used by the single-threaded task.
        // In parallel ingestion, the partitionId is determined in the supervisor task.
        // See ParallelIndexSupervisorTask.groupGenericPartitionLocationsPerPartition().
        // This code... isn't pretty, but should be simple enough to understand.
        final ShardSpec shardSpec = isParallel ? pair.rhs : pair.rhs.convert(intervalToNextPartitionId.computeInt(interval, (i, nextPartitionId) -> nextPartitionId == null ? 0 : nextPartitionId + 1));
        final String version = versionFinder.apply(interval);
        return new SegmentIdWithShardSpec(dataSource, interval, version, shardSpec);
    });
}
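
The convert call above is the bridge between the two spec families: a BucketNumberedShardSpec, known before any data is written, becomes a BuildingShardSpec once a concrete partitionId is chosen, and the BuildingShardSpec is finalized at publish time (Example 1). A minimal sketch of the chain, reusing the HashBucketShardSpec constructor from Example 3 (jsonMapper is assumed to be an available ObjectMapper):

// Sketch of the bucket -> building -> final conversion chain.
HashBucketShardSpec bucket = new HashBucketShardSpec(0, 2, null, HashPartitionFunction.MURMUR3_32_ABS, jsonMapper);
// Step 1: a single-threaded allocator picks the next free partitionId per interval.
ShardSpec building = bucket.convert(0);
// Step 2, at publish time: BuildingShardSpec.convert(corePartitionSetSize) produces
// the final spec so that PartitionHolder.isComplete() can hold (see Example 1).
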
Also used : BucketNumberedShardSpec(org.apache.druid.timeline.partition.BucketNumberedShardSpec) SegmentIdWithShardSpec(org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec) ShardSpec(org.apache.druid.timeline.partition.ShardSpec) Interval(org.joda.time.Interval)

Aggregations

BucketNumberedShardSpec (org.apache.druid.timeline.partition.BucketNumberedShardSpec): 4 uses
ShardSpec (org.apache.druid.timeline.partition.ShardSpec): 3 uses
Interval (org.joda.time.Interval): 3 uses
HashMap (java.util.HashMap): 2 uses
List (java.util.List): 2 uses
ISE (org.apache.druid.java.util.common.ISE): 2 uses
ImmutableList (com.google.common.collect.ImmutableList): 1 use
File (java.io.File): 1 use
IOException (java.io.IOException): 1 use
ArrayList (java.util.ArrayList): 1 use
Collection (java.util.Collection): 1 use
Map (java.util.Map): 1 use
Entry (java.util.Map.Entry): 1 use
Set (java.util.Set): 1 use
Function (java.util.function.Function): 1 use
Collectors (java.util.stream.Collectors): 1 use
InputRow (org.apache.druid.data.input.InputRow): 1 use
MapBasedInputRow (org.apache.druid.data.input.MapBasedInputRow): 1 use
IAE (org.apache.druid.java.util.common.IAE): 1 use
IOE (org.apache.druid.java.util.common.IOE): 1 use