use of org.apache.druid.timeline.partition.BucketNumberedShardSpec in project druid by druid-io.
the class SegmentPublisherHelper method annotateShardSpec.
/**
 * Fills in shard spec information that is only known when segments are published.
 *
 * <p>The given segments are grouped by interval and every group is processed independently:
 * <ul>
 * <li>When time chunk lock is used, the non-appending task should set the proper size of the core partitions for
 * dynamically-partitioned segments. See {@link #annotateCorePartitionSetSizeFn}.</li>
 * <li>When segment lock is used, the overwriting task should set the proper size of the atomic update group.
 * See {@link #annotateAtomicUpdateGroupFn}.</li>
 * <li>Segments with any other kind of shard spec are returned unchanged.</li>
 * </ul>
 *
 * @param segments segments about to be published
 * @return a new set holding the (possibly annotated) segments of all intervals
 * @throws ISE if the segments of one interval do not all share the same shard spec class, if the segments of an
 *             interval using {@link BuildingShardSpec} do not all have a partition number below the number of
 *             segments in that interval, or if the shard spec is a {@link BucketNumberedShardSpec}
 */
static Set<DataSegment> annotateShardSpec(Set<DataSegment> segments) {
  final Map<Interval, List<DataSegment>> intervalToSegments = new HashMap<>();
  segments.forEach(
      segment -> intervalToSegments.computeIfAbsent(segment.getInterval(), k -> new ArrayList<>()).add(segment)
  );

  for (Entry<Interval, List<DataSegment>> entry : intervalToSegments.entrySet()) {
    final Interval interval = entry.getKey();
    final List<DataSegment> segmentsPerInterval = entry.getValue();
    // Every list was created by computeIfAbsent() followed by add(), so it has at least one element.
    final ShardSpec firstShardSpec = segmentsPerInterval.get(0).getShardSpec();
    final boolean anyMismatch = segmentsPerInterval
        .stream()
        .anyMatch(segment -> segment.getShardSpec().getClass() != firstShardSpec.getClass());
    if (anyMismatch) {
      throw new ISE("Mismatched shardSpecs in interval[%s] for segments[%s]", interval, segmentsPerInterval);
    }

    final Function<DataSegment, DataSegment> annotateFn;
    if (firstShardSpec instanceof OverwriteShardSpec) {
      annotateFn = annotateAtomicUpdateGroupFn(segmentsPerInterval.size());
    } else if (firstShardSpec instanceof BuildingShardSpec) {
      // sanity check
      // BuildingShardSpec is used in non-appending mode. In this mode,
      // the segments in each interval should have contiguous partitionIds,
      // so that they can be queryable (see PartitionHolder.isComplete()).
      final int expectedCorePartitionSetSize = segmentsPerInterval.size();
      final int actualCorePartitionSetSize = Math.toIntExact(
          segmentsPerInterval
              .stream()
              .filter(segment -> segment.getShardSpec().getPartitionNum() < expectedCorePartitionSetSize)
              .count()
      );
      if (expectedCorePartitionSetSize != actualCorePartitionSetSize) {
        LOG.errorSegments(segmentsPerInterval, "Cannot publish segments due to incomplete time chunk");
        // Every placeholder must be a complete "%s" so that all three arguments are rendered in place.
        throw new ISE(
            "Cannot publish segments due to incomplete time chunk for interval[%s]. "
            + "Expected [%s] segments in the core partition, but only [%s] segments are found. "
            + "See task logs for more details about these segments.",
            interval,
            expectedCorePartitionSetSize,
            actualCorePartitionSetSize
        );
      }
      annotateFn = annotateCorePartitionSetSizeFn(expectedCorePartitionSetSize);
    } else if (firstShardSpec instanceof BucketNumberedShardSpec) {
      throw new ISE("Cannot publish segments with shardSpec[%s]", firstShardSpec);
    } else {
      // Nothing to annotate for other shard spec types.
      annotateFn = null;
    }

    if (annotateFn != null) {
      // Replace the value through the entry itself rather than calling put() on the map being iterated.
      entry.setValue(segmentsPerInterval.stream().map(annotateFn).collect(Collectors.toList()));
    }
  }

  return intervalToSegments.values().stream().flatMap(Collection::stream).collect(Collectors.toSet());
}
use of org.apache.druid.timeline.partition.BucketNumberedShardSpec in project druid by druid-io.
the class LocalIntermediaryDataManager method addSegment.
/**
 * Zips the given segment directory and stores the zipped file in one of the configured shuffle data locations.
 * Locations are tried in a round-robin manner per supervisorTaskId, starting from a random position.
 *
 * @param supervisorTaskId ID of the supervisor task; selects the per-supervisor location iterator
 * @param subTaskId        ID of the sub task; used for the temp directory and the partition file path
 * @param segment          segment to store; its shardSpec must be a {@link BucketNumberedShardSpec}
 * @param segmentDir       directory holding the unzipped segment files
 * @return the given segment updated with the unzipped size and the binary version read from segmentDir
 * @throws IOException if zero bytes were read from segmentDir, or an I/O error occurs while preparing the zip
 */
@Override
public DataSegment addSegment(String supervisorTaskId, String subTaskId, DataSegment segment, File segmentDir) throws IOException {
  // Look up the cyclic location iterator of this supervisor task, building it on first use.
  final Iterator<StorageLocation> locations = locationIterators.computeIfAbsent(supervisorTaskId, ignored -> {
    final Iterator<StorageLocation> cycle = Iterators.cycle(shuffleDataLocations);
    // Advance by a random offset so that the first location used differs between iterators.
    final int skipCount = ThreadLocalRandom.current().nextInt(shuffleDataLocations.size());
    for (int skipped = 0; skipped < skipCount; skipped++) {
      cycle.next();
    }
    return cycle;
  });

  // The zipped segment is staged in the temp directory of the sub task.
  final File stagingDir = taskConfig.getTaskTempDir(subTaskId);
  final Closer closer = Closer.create();
  // Best-effort cleanup of the staging directory; a failure is only logged.
  closer.register(() -> {
    try {
      org.apache.commons.io.FileUtils.forceDelete(stagingDir);
    } catch (IOException e) {
      LOG.warn(e, "Failed to delete directory[%s]", stagingDir.getAbsolutePath());
    }
  });

  if (!(segment.getShardSpec() instanceof BucketNumberedShardSpec)) {
    throw new IAE(
        "Invalid shardSpec type. Expected [%s] but got [%s]",
        BucketNumberedShardSpec.class.getName(),
        segment.getShardSpec().getClass().getName()
    );
  }
  final BucketNumberedShardSpec<?> bucketSpec = (BucketNumberedShardSpec<?>) segment.getShardSpec();

  // noinspection unused
  try (final Closer resourceCloser = closer) {
    FileUtils.mkdirp(stagingDir);

    // Temporary compressed file. Will be removed when the staging directory is deleted.
    final File zippedFile = new File(stagingDir, segment.getId().toString());
    final long unzippedBytes = CompressionUtils.zip(segmentDir, zippedFile);
    if (unzippedBytes == 0) {
      throw new IOE("Read 0 bytes from segmentDir[%s]", segmentDir.getAbsolutePath());
    }

    // Give every configured location at most one chance to take the zipped segment.
    for (int attempt = 0; attempt < shuffleDataLocations.size(); attempt++) {
      final StorageLocation candidate = locations.next();
      final String partitionPath = getPartitionFilePath(
          supervisorTaskId,
          subTaskId,
          segment.getInterval(),
          // we must use the bucket ID instead of partition ID
          bucketSpec.getBucketId()
      );
      final File target = candidate.reserve(partitionPath, segment.getId().toString(), zippedFile.length());
      if (target == null) {
        // Nothing was reserved at this location; move on to the next one.
        continue;
      }

      try {
        FileUtils.mkdirp(target.getParentFile());
        FileUtils.writeAtomically(target, out -> Files.asByteSource(zippedFile).copyTo(out));
        LOG.info("Wrote intermediary segment[%s] for subtask[%s] at [%s]", segment.getId(), subTaskId, target);
        return segment.withSize(unzippedBytes).withBinaryVersion(SegmentUtils.getVersionFromDir(segmentDir));
      } catch (Exception e) {
        // Give the reservation back and drop any partial file before trying the next location.
        candidate.release(partitionPath, zippedFile.length());
        org.apache.commons.io.FileUtils.deleteQuietly(target);
        LOG.warn(e, "Failed to write segment[%s] at [%s]. Trying again with the next location", segment.getId(), target);
      }
    }

    throw new ISE("Can't find location to handle segment[%s]", segment);
  }
}
use of org.apache.druid.timeline.partition.BucketNumberedShardSpec in project druid by druid-io.
the class ShardSpecsTest method testShardSpecSelectionWithNullPartitionDimension.
/**
 * Checks which shard spec {@link ShardSpecs#getShardSpec} selects for rows when the hash bucket shard specs are
 * created with a null third constructor argument (the partition dimensions, going by the test name).
 *
 * <p>All three rows carry identical dimension values and differ only in their timestamps. The test expects the
 * two rows within the same hour to be given the very same shard spec instance, and the row from a later hour to
 * be given a different instance.
 * NOTE(review): presumably the timestamp, truncated to the HOUR granularity passed to ShardSpecs, takes part in
 * bucket selection when no partition dimensions are configured; confirm against ShardSpecs.
 */
@Test
public void testShardSpecSelectionWithNullPartitionDimension() {
  final Interval day = Intervals.of("2014-01-01T00:00:00.000Z/2014-01-02T00:00:00.000Z");

  HashBucketShardSpec spec1 = new HashBucketShardSpec(0, 2, null, HashPartitionFunction.MURMUR3_32_ABS, jsonMapper);
  HashBucketShardSpec spec2 = new HashBucketShardSpec(1, 2, null, HashPartitionFunction.MURMUR3_32_ABS, jsonMapper);
  Map<Interval, List<BucketNumberedShardSpec<?>>> shardSpecMap = new HashMap<>();
  shardSpecMap.put(day, ImmutableList.of(spec1, spec2));
  ShardSpecs shardSpecs = new ShardSpecs(shardSpecMap, Granularities.HOUR);

  String visitorId = "visitorId";
  String clientType = "clientType";

  // row1 and row2 fall into the same hour; row3 is ten hours later on the same day.
  long timestamp1 = DateTimes.of("2014-01-01T00:00:00.000Z").getMillis();
  InputRow row1 = new MapBasedInputRow(timestamp1, Lists.newArrayList(visitorId, clientType), ImmutableMap.of(visitorId, "0", clientType, "iphone"));
  long timestamp2 = DateTimes.of("2014-01-01T00:30:20.456Z").getMillis();
  InputRow row2 = new MapBasedInputRow(timestamp2, Lists.newArrayList(visitorId, clientType), ImmutableMap.of(visitorId, "0", clientType, "iphone"));
  long timestamp3 = DateTimes.of("2014-01-01T10:10:20.456Z").getMillis();
  InputRow row3 = new MapBasedInputRow(timestamp3, Lists.newArrayList(visitorId, clientType), ImmutableMap.of(visitorId, "0", clientType, "iphone"));

  ShardSpec spec3 = shardSpecs.getShardSpec(day, row1);
  ShardSpec spec4 = shardSpecs.getShardSpec(day, row2);
  ShardSpec spec5 = shardSpecs.getShardSpec(day, row3);

  // Compare the instances directly so that a failure reports the shard specs involved, not just true/false.
  Assert.assertSame(spec3, spec4);
  Assert.assertNotSame(spec3, spec5);
}
use of org.apache.druid.timeline.partition.BucketNumberedShardSpec in project druid by druid-io.
the class CachingLocalSegmentAllocator method allocate.
/**
 * Returns the segment ID for the given sequence name, creating and caching it on the first request.
 *
 * @param row                     not used by this implementation
 * @param sequenceName            sequence name; must have a bucket registered in sequenceNameToBucket
 * @param previousSegmentId       not used by this implementation
 * @param skipSegmentLineageCheck not used by this implementation
 * @return the cached or newly created segment ID for the sequence name
 * @throws NullPointerException if no bucket is registered for the sequence name
 */
@Override
public SegmentIdWithShardSpec allocate(InputRow row, String sequenceName, String previousSegmentId, boolean skipSegmentLineageCheck) {
  return sequenceNameToSegmentId.computeIfAbsent(sequenceName, ignored -> {
    final Pair<Interval, BucketNumberedShardSpec> bucket = Preconditions.checkNotNull(
        sequenceNameToBucket.get(sequenceName),
        "Missing bucket for sequence[%s]",
        sequenceName
    );
    final Interval bucketInterval = bucket.lhs;

    final ShardSpec resolvedShardSpec;
    if (isParallel) {
      // In parallel ingestion, the partitionId is determined in the supervisor task.
      // See ParallelIndexSupervisorTask.groupGenericPartitionLocationsPerPartition().
      resolvedShardSpec = bucket.rhs;
    } else {
      // The single-threaded task determines the partitionId here: 0 for the first segment
      // of an interval, then one more than the previous value for every following segment.
      final int partitionId = intervalToNextPartitionId.computeInt(
          bucketInterval,
          (key, lastPartitionId) -> lastPartitionId == null ? 0 : lastPartitionId + 1
      );
      resolvedShardSpec = bucket.rhs.convert(partitionId);
    }

    return new SegmentIdWithShardSpec(dataSource, bucketInterval, versionFinder.apply(bucketInterval), resolvedShardSpec);
  });
}
Aggregations