use of io.druid.segment.realtime.appenderator.SegmentIdentifier in project druid by druid-io.
the class SegmentAllocateActionTest method testManySegmentsSameInterval.
@Test
public void testManySegmentsSameInterval() throws Exception {
  final Task task = new NoopTask(null, 0, 0, null, null, null);
  taskActionTestKit.getTaskLockbox().add(task);
  // Three allocations for the same hour bucket and sequence, each chained off the previous identifier.
  final SegmentIdentifier id1 = allocate(task, PARTY_TIME, Granularities.NONE, Granularities.HOUR, "s1", null);
  final SegmentIdentifier id2 = allocate(task, PARTY_TIME, Granularities.NONE, Granularities.HOUR, "s1", id1.getIdentifierAsString());
  final SegmentIdentifier id3 = allocate(task, PARTY_TIME, Granularities.NONE, Granularities.HOUR, "s1", id2.getIdentifierAsString());
  final TaskLock partyLock = Iterables.getOnlyElement(
      FluentIterable.from(taskActionTestKit.getTaskLockbox().findLocksForTask(task)).filter(new Predicate<TaskLock>() {
        @Override
        public boolean apply(TaskLock input) {
          return input.getInterval().contains(PARTY_TIME);
        }
      }));
  // All three identifiers share the lock's version but receive increasing partition numbers.
  assertSameIdentifier(id1, new SegmentIdentifier(DATA_SOURCE, Granularities.HOUR.bucket(PARTY_TIME), partyLock.getVersion(), new NumberedShardSpec(0, 0)));
  assertSameIdentifier(id2, new SegmentIdentifier(DATA_SOURCE, Granularities.HOUR.bucket(PARTY_TIME), partyLock.getVersion(), new NumberedShardSpec(1, 0)));
  assertSameIdentifier(id3, new SegmentIdentifier(DATA_SOURCE, Granularities.HOUR.bucket(PARTY_TIME), partyLock.getVersion(), new NumberedShardSpec(2, 0)));
}
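The allocate(...) helper these tests call is not reproduced on this page. Below is a minimal sketch of what such a helper could look like, assuming a SegmentAllocateAction constructor that takes the data source, timestamp, query and segment granularities, sequence name, and previous segment id in that order, and a test kit that can perform the action on behalf of the task; treat the exact signatures as assumptions rather than the project's actual code.

// Hypothetical sketch of the allocate(...) helper used above; the constructor argument
// order and the toolbox accessor are assumptions, not the actual test code.
private SegmentIdentifier allocate(
    final Task task,
    final DateTime timestamp,
    final Granularity queryGranularity,
    final Granularity preferredSegmentGranularity,
    final String sequenceName,
    final String previousSegmentId
) throws Exception {
  final SegmentAllocateAction action = new SegmentAllocateAction(
      DATA_SOURCE, timestamp, queryGranularity, preferredSegmentGranularity, sequenceName, previousSegmentId
  );
  return action.perform(task, taskActionTestKit.getTaskActionToolbox());
}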
use of io.druid.segment.realtime.appenderator.SegmentIdentifier in project druid by druid-io.
the class SegmentAllocateActionTest method testAddToExistingNumberedShardSpecsSameGranularity.
@Test
public void testAddToExistingNumberedShardSpecsSameGranularity() throws Exception {
  final Task task = new NoopTask(null, 0, 0, null, null, null);
  // Pre-publish two numbered partitions (0 and 1 of a core set of 2) for the hour bucket,
  // so new allocations must append after them.
  taskActionTestKit.getMetadataStorageCoordinator().announceHistoricalSegments(
      ImmutableSet.of(
          DataSegment.builder().dataSource(DATA_SOURCE).interval(Granularities.HOUR.bucket(PARTY_TIME)).version(PARTY_TIME.toString()).shardSpec(new NumberedShardSpec(0, 2)).build(),
          DataSegment.builder().dataSource(DATA_SOURCE).interval(Granularities.HOUR.bucket(PARTY_TIME)).version(PARTY_TIME.toString()).shardSpec(new NumberedShardSpec(1, 2)).build()));
  taskActionTestKit.getTaskLockbox().add(task);
  final SegmentIdentifier id1 = allocate(task, PARTY_TIME, Granularities.NONE, Granularities.HOUR, "s1", null);
  final SegmentIdentifier id2 = allocate(task, PARTY_TIME, Granularities.NONE, Granularities.HOUR, "s1", id1.getIdentifierAsString());
  // The new identifiers continue the existing numbering: partitions 2 and 3 against the core set of 2.
  assertSameIdentifier(id1, new SegmentIdentifier(DATA_SOURCE, Granularities.HOUR.bucket(PARTY_TIME), PARTY_TIME.toString(), new NumberedShardSpec(2, 2)));
  assertSameIdentifier(id2, new SegmentIdentifier(DATA_SOURCE, Granularities.HOUR.bucket(PARTY_TIME), PARTY_TIME.toString(), new NumberedShardSpec(3, 2)));
}
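assertSameIdentifier(...) is likewise a private test helper that is not shown on this page. A minimal sketch, under the assumption that comparing the identifier strings and shard specs is what the tests need; the real helper may compare additional fields.

// Hypothetical sketch of assertSameIdentifier(...), not the actual test helper.
private void assertSameIdentifier(final SegmentIdentifier expected, final SegmentIdentifier actual) {
  Assert.assertEquals(expected.getIdentifierAsString(), actual.getIdentifierAsString());
  Assert.assertEquals(expected.getShardSpec().getPartitionNum(), actual.getShardSpec().getPartitionNum());
}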
use of io.druid.segment.realtime.appenderator.SegmentIdentifier in project druid by druid-io.
the class IndexTask method generateAndPublishSegments.
private boolean generateAndPublishSegments(
    final TaskToolbox toolbox,
    final DataSchema dataSchema,
    final Map<Interval, List<ShardSpec>> shardSpecs,
    final String version,
    final FirehoseFactory firehoseFactory
) throws IOException, InterruptedException {
  final GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
  final FireDepartment fireDepartmentForMetrics = new FireDepartment(dataSchema, new RealtimeIOConfig(null, null, null), null);
  final FireDepartmentMetrics fireDepartmentMetrics = fireDepartmentForMetrics.getMetrics();
  final Map<String, ShardSpec> sequenceNameToShardSpecMap = Maps.newHashMap();
  if (toolbox.getMonitorScheduler() != null) {
    toolbox.getMonitorScheduler().addMonitor(
        new RealtimeMetricsMonitor(
            ImmutableList.of(fireDepartmentForMetrics),
            ImmutableMap.of(DruidMetrics.TASK_ID, new String[] { getId() })
        )
    );
  }
  // Appending tasks allocate segments through the task action client; otherwise identifiers
  // are built locally from the precomputed shardSpecs and the task's version.
  final SegmentAllocator segmentAllocator;
  if (ingestionSchema.getIOConfig().isAppendToExisting()) {
    segmentAllocator = new ActionBasedSegmentAllocator(toolbox.getTaskActionClient(), dataSchema);
  } else {
    segmentAllocator = new SegmentAllocator() {
      @Override
      public SegmentIdentifier allocate(DateTime timestamp, String sequenceName, String previousSegmentId) throws IOException {
        Optional<Interval> interval = granularitySpec.bucketInterval(timestamp);
        if (!interval.isPresent()) {
          throw new ISE("Could not find interval for timestamp [%s]", timestamp);
        }
        ShardSpec shardSpec = sequenceNameToShardSpecMap.get(sequenceName);
        if (shardSpec == null) {
          throw new ISE("Could not find ShardSpec for sequenceName [%s]", sequenceName);
        }
        return new SegmentIdentifier(getDataSource(), interval.get(), version, shardSpec);
      }
    };
  }
  try (
      final Appenderator appenderator = newAppenderator(fireDepartmentMetrics, toolbox, dataSchema);
      final FiniteAppenderatorDriver driver = newDriver(appenderator, toolbox, segmentAllocator, fireDepartmentMetrics);
      final Firehose firehose = firehoseFactory.connect(dataSchema.getParser())
  ) {
    final Supplier<Committer> committerSupplier = Committers.supplierFromFirehose(firehose);
    final Map<Interval, ShardSpecLookup> shardSpecLookups = Maps.newHashMap();
    if (driver.startJob() != null) {
      driver.clear();
    }
    try {
      while (firehose.hasMore()) {
        try {
          final InputRow inputRow = firehose.nextRow();
          final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
          if (!optInterval.isPresent()) {
            fireDepartmentMetrics.incrementThrownAway();
            continue;
          }
          final Interval interval = optInterval.get();
          if (!shardSpecLookups.containsKey(interval)) {
            final List<ShardSpec> intervalShardSpecs = shardSpecs.get(interval);
            if (intervalShardSpecs == null || intervalShardSpecs.isEmpty()) {
              throw new ISE("Failed to get shardSpec for interval[%s]", interval);
            }
            shardSpecLookups.put(interval, intervalShardSpecs.get(0).getLookup(intervalShardSpecs));
          }
          final ShardSpec shardSpec = shardSpecLookups.get(interval).getShardSpec(inputRow.getTimestampFromEpoch(), inputRow);
          // Each (interval, version, partition) combination gets its own sequence name, so the
          // driver keeps rows for different partitions in separate segments.
          final String sequenceName = String.format("index_%s_%s_%d", interval, version, shardSpec.getPartitionNum());
          if (!sequenceNameToShardSpecMap.containsKey(sequenceName)) {
            final ShardSpec shardSpecForPublishing =
                ingestionSchema.getTuningConfig().isForceExtendableShardSpecs() || ingestionSchema.getIOConfig().isAppendToExisting()
                ? new NumberedShardSpec(shardSpec.getPartitionNum(), shardSpecs.get(interval).size())
                : shardSpec;
            sequenceNameToShardSpecMap.put(sequenceName, shardSpecForPublishing);
          }
          final SegmentIdentifier identifier = driver.add(inputRow, sequenceName, committerSupplier);
          if (identifier == null) {
            throw new ISE("Could not allocate segment for row with timestamp[%s]", inputRow.getTimestamp());
          }
          fireDepartmentMetrics.incrementProcessed();
        } catch (ParseException e) {
          if (ingestionSchema.getTuningConfig().isReportParseExceptions()) {
            throw e;
          } else {
            fireDepartmentMetrics.incrementUnparseable();
          }
        }
      }
    } finally {
      driver.persist(committerSupplier.get());
    }
    // Publish the pushed segments transactionally through the task action client.
    final TransactionalSegmentPublisher publisher = new TransactionalSegmentPublisher() {
      @Override
      public boolean publishSegments(Set<DataSegment> segments, Object commitMetadata) throws IOException {
        final SegmentTransactionalInsertAction action = new SegmentTransactionalInsertAction(segments, null, null);
        return toolbox.getTaskActionClient().submit(action).isSuccess();
      }
    };
    final SegmentsAndMetadata published = driver.finish(publisher, committerSupplier.get());
    if (published == null) {
      log.error("Failed to publish segments, aborting!");
      return false;
    } else {
      log.info(
          "Published segments[%s]",
          Joiner.on(", ").join(
              Iterables.transform(published.getSegments(), new Function<DataSegment, String>() {
                @Override
                public String apply(DataSegment input) {
                  return input.getIdentifier();
                }
              })
          )
      );
      return true;
    }
  }
}
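The anonymous SegmentAllocator above builds identifiers directly from the bucketed interval, the task's version, and a pre-assigned shard spec. As a standalone illustration, a hedged example of assembling a SegmentIdentifier the same way; the data source, interval, and version are made up and not taken from the task.

// Hypothetical standalone example (made-up data source, interval, and version) showing
// how a SegmentIdentifier is assembled the way the non-append allocator above does.
final Interval hourBucket = new Interval(new DateTime("2017-01-01T00:00:00Z"), new DateTime("2017-01-01T01:00:00Z"));
final SegmentIdentifier identifier = new SegmentIdentifier(
    "wikipedia",                       // data source
    hourBucket,                        // bucketed interval containing the row's timestamp
    "2017-01-01T00:00:00.000Z",        // version string, typically the task lock version
    new NumberedShardSpec(0, 0)        // partition number and core partition count
);
// The identifier string is what callers pass around as previousSegmentId and use in logs.
final String idString = identifier.getIdentifierAsString();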
use of io.druid.segment.realtime.appenderator.SegmentIdentifier in project druid by druid-io.
the class ActionBasedUsedSegmentChecker method findUsedSegments.
@Override
public Set<DataSegment> findUsedSegments(Set<SegmentIdentifier> identifiers) throws IOException {
  // Group the identifiers by dataSource, since used-segment lookups are per dataSource.
  final Map<String, Set<SegmentIdentifier>> identifiersByDataSource = Maps.newTreeMap();
  for (SegmentIdentifier identifier : identifiers) {
    if (!identifiersByDataSource.containsKey(identifier.getDataSource())) {
      identifiersByDataSource.put(identifier.getDataSource(), Sets.<SegmentIdentifier>newHashSet());
    }
    identifiersByDataSource.get(identifier.getDataSource()).add(identifier);
  }
  final Set<DataSegment> retVal = Sets.newHashSet();
  for (Map.Entry<String, Set<SegmentIdentifier>> entry : identifiersByDataSource.entrySet()) {
    // Condense the identifiers' intervals so a single SegmentListUsedAction covers all of them.
    final List<Interval> intervals = JodaUtils.condenseIntervals(
        Iterables.transform(entry.getValue(), new Function<SegmentIdentifier, Interval>() {
          @Override
          public Interval apply(SegmentIdentifier input) {
            return input.getInterval();
          }
        }));
    final List<DataSegment> usedSegmentsForIntervals = taskActionClient.submit(new SegmentListUsedAction(entry.getKey(), null, intervals));
    // Keep only the used segments whose identifiers were actually requested.
    for (DataSegment segment : usedSegmentsForIntervals) {
      if (identifiers.contains(SegmentIdentifier.fromDataSegment(segment))) {
        retVal.add(segment);
      }
    }
  }
  return retVal;
}
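A caller-side sketch of how this checker might be used, for example to find out which candidate identifiers already have published, used segments. The names candidateIdentifiers and taskActionClient are placeholders for the caller's own objects, not code from the project.

// Hypothetical usage of ActionBasedUsedSegmentChecker; candidateIdentifiers and
// taskActionClient are illustrative placeholders.
final UsedSegmentChecker checker = new ActionBasedUsedSegmentChecker(taskActionClient);
final Set<DataSegment> alreadyUsed = checker.findUsedSegments(candidateIdentifiers);
for (DataSegment segment : alreadyUsed) {
  // These segments are already published and used; the caller can skip re-publishing them.
  System.out.println("Already used: " + segment.getIdentifier());
}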
use of io.druid.segment.realtime.appenderator.SegmentIdentifier in project hive by apache.
the class DruidRecordWriter method getSegmentIdentifierAndMaybePush.
/**
 * Computes the segment identifier for the given truncated timestamp and, if needed,
 * pushes the currently open segment. A push happens when the open segment has reached
 * the maximum partition size or when the event belongs to the next interval.
 * Note that this function assumes timestamps are pseudo-sorted (the sorting is done by
 * the previous stage): it closes the open segment and moves on to the next segment
 * granularity bucket as soon as an event from the next interval appears.
 *
 * @return the segment identifier for the truncated timestamp; the current open segment
 *         may be pushed as a side effect.
 */
private SegmentIdentifier getSegmentIdentifierAndMaybePush(long truncatedTime) {
  final Granularity segmentGranularity = dataSchema.getGranularitySpec().getSegmentGranularity();
  final Interval interval = new Interval(new DateTime(truncatedTime), segmentGranularity.increment(new DateTime(truncatedTime)));
  SegmentIdentifier retVal;
  if (currentOpenSegment == null) {
    // No open segment yet: open partition 0 of this interval.
    retVal = new SegmentIdentifier(dataSchema.getDataSource(), interval, tuningConfig.getVersioningPolicy().getVersion(interval), new LinearShardSpec(0));
    currentOpenSegment = retVal;
    return retVal;
  } else if (currentOpenSegment.getInterval().equals(interval)) {
    retVal = currentOpenSegment;
    int rowCount = appenderator.getRowCount(retVal);
    if (rowCount < maxPartitionSize) {
      // The open segment still has room; keep appending to it.
      return retVal;
    } else {
      // The open segment is full: push it and open the next partition of the same interval.
      retVal = new SegmentIdentifier(dataSchema.getDataSource(), interval, tuningConfig.getVersioningPolicy().getVersion(interval), new LinearShardSpec(currentOpenSegment.getShardSpec().getPartitionNum() + 1));
      pushSegments(Lists.newArrayList(currentOpenSegment));
      LOG.info("Creating new partition for segment {}, partition num {}", retVal.getIdentifierAsString(), retVal.getShardSpec().getPartitionNum());
      currentOpenSegment = retVal;
      return retVal;
    }
  } else {
    // The event belongs to a new interval: push the open segment and start partition 0 of the new interval.
    retVal = new SegmentIdentifier(dataSchema.getDataSource(), interval, tuningConfig.getVersioningPolicy().getVersion(interval), new LinearShardSpec(0));
    pushSegments(Lists.newArrayList(currentOpenSegment));
    LOG.info("Creating segment {}", retVal.getIdentifierAsString());
    currentOpenSegment = retVal;
    return retVal;
  }
}
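For context, a hedged sketch of how a per-row write path might call this method: truncate the row's timestamp to the segment granularity, resolve the target segment, and hand the row to the appenderator. The row and committer plumbing here is illustrative and not the actual DruidRecordWriter.write(...) implementation; the bucketStart(...) call is an assumption about how the truncated time is computed.

// Hypothetical per-row usage; inputRow and committerSupplier are placeholders.
final Granularity segmentGranularity = dataSchema.getGranularitySpec().getSegmentGranularity();
final long truncatedTime = segmentGranularity.bucketStart(new DateTime(inputRow.getTimestampFromEpoch())).getMillis();
final SegmentIdentifier target = getSegmentIdentifierAndMaybePush(truncatedTime);
appenderator.add(target, inputRow, committerSupplier);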