use of io.druid.data.input.InputRow in project druid by druid-io.
the class IndexMergerTest method persistAndLoad.
private QueryableIndex persistAndLoad(List<DimensionSchema> schema, InputRow... rows) throws IOException {
IncrementalIndex toPersist = IncrementalIndexTest.createIndex(null, new DimensionsSpec(schema, null, null));
for (InputRow row : rows) {
toPersist.add(row);
}
final File tempDir = temporaryFolder.newFolder();
return closer.closeLater(INDEX_IO.loadIndex(INDEX_MERGER.persist(toPersist, tempDir, indexSpec)));
}
use of io.druid.data.input.InputRow in project druid by druid-io.
the class IndexMergerV9CompatibilityTest method setUp.
@Before
public void setUp() throws IOException {
toPersist = new OnheapIncrementalIndex(JodaUtils.MIN_INSTANT, Granularities.NONE, DEFAULT_AGG_FACTORIES, 1000000);
toPersist.getMetadata().put("key", "value");
for (InputRow event : events) {
toPersist.add(event);
}
tmpDir = Files.createTempDir();
persistTmpDir = new File(tmpDir, "persistDir");
INDEX_MERGER.persist(toPersist, persistTmpDir, INDEX_SPEC);
}
use of io.druid.data.input.InputRow in project druid by druid-io.
the class IndexTask method generateAndPublishSegments.
private boolean generateAndPublishSegments(final TaskToolbox toolbox, final DataSchema dataSchema, final Map<Interval, List<ShardSpec>> shardSpecs, final String version, final FirehoseFactory firehoseFactory) throws IOException, InterruptedException {
final GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
final FireDepartment fireDepartmentForMetrics = new FireDepartment(dataSchema, new RealtimeIOConfig(null, null, null), null);
final FireDepartmentMetrics fireDepartmentMetrics = fireDepartmentForMetrics.getMetrics();
final Map<String, ShardSpec> sequenceNameToShardSpecMap = Maps.newHashMap();
if (toolbox.getMonitorScheduler() != null) {
toolbox.getMonitorScheduler().addMonitor(new RealtimeMetricsMonitor(ImmutableList.of(fireDepartmentForMetrics), ImmutableMap.of(DruidMetrics.TASK_ID, new String[] { getId() })));
}
final SegmentAllocator segmentAllocator;
if (ingestionSchema.getIOConfig().isAppendToExisting()) {
segmentAllocator = new ActionBasedSegmentAllocator(toolbox.getTaskActionClient(), dataSchema);
} else {
segmentAllocator = new SegmentAllocator() {
@Override
public SegmentIdentifier allocate(DateTime timestamp, String sequenceName, String previousSegmentId) throws IOException {
Optional<Interval> interval = granularitySpec.bucketInterval(timestamp);
if (!interval.isPresent()) {
throw new ISE("Could not find interval for timestamp [%s]", timestamp);
}
ShardSpec shardSpec = sequenceNameToShardSpecMap.get(sequenceName);
if (shardSpec == null) {
throw new ISE("Could not find ShardSpec for sequenceName [%s]", sequenceName);
}
return new SegmentIdentifier(getDataSource(), interval.get(), version, shardSpec);
}
};
}
try (final Appenderator appenderator = newAppenderator(fireDepartmentMetrics, toolbox, dataSchema);
final FiniteAppenderatorDriver driver = newDriver(appenderator, toolbox, segmentAllocator, fireDepartmentMetrics);
final Firehose firehose = firehoseFactory.connect(dataSchema.getParser())) {
final Supplier<Committer> committerSupplier = Committers.supplierFromFirehose(firehose);
final Map<Interval, ShardSpecLookup> shardSpecLookups = Maps.newHashMap();
if (driver.startJob() != null) {
driver.clear();
}
try {
while (firehose.hasMore()) {
try {
final InputRow inputRow = firehose.nextRow();
final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
if (!optInterval.isPresent()) {
fireDepartmentMetrics.incrementThrownAway();
continue;
}
final Interval interval = optInterval.get();
if (!shardSpecLookups.containsKey(interval)) {
final List<ShardSpec> intervalShardSpecs = shardSpecs.get(interval);
if (intervalShardSpecs == null || intervalShardSpecs.isEmpty()) {
throw new ISE("Failed to get shardSpec for interval[%s]", interval);
}
shardSpecLookups.put(interval, intervalShardSpecs.get(0).getLookup(intervalShardSpecs));
}
final ShardSpec shardSpec = shardSpecLookups.get(interval).getShardSpec(inputRow.getTimestampFromEpoch(), inputRow);
final String sequenceName = String.format("index_%s_%s_%d", interval, version, shardSpec.getPartitionNum());
if (!sequenceNameToShardSpecMap.containsKey(sequenceName)) {
final ShardSpec shardSpecForPublishing = ingestionSchema.getTuningConfig().isForceExtendableShardSpecs() || ingestionSchema.getIOConfig().isAppendToExisting() ? new NumberedShardSpec(shardSpec.getPartitionNum(), shardSpecs.get(interval).size()) : shardSpec;
sequenceNameToShardSpecMap.put(sequenceName, shardSpecForPublishing);
}
final SegmentIdentifier identifier = driver.add(inputRow, sequenceName, committerSupplier);
if (identifier == null) {
throw new ISE("Could not allocate segment for row with timestamp[%s]", inputRow.getTimestamp());
}
fireDepartmentMetrics.incrementProcessed();
} catch (ParseException e) {
if (ingestionSchema.getTuningConfig().isReportParseExceptions()) {
throw e;
} else {
fireDepartmentMetrics.incrementUnparseable();
}
}
}
} finally {
driver.persist(committerSupplier.get());
}
final TransactionalSegmentPublisher publisher = new TransactionalSegmentPublisher() {
@Override
public boolean publishSegments(Set<DataSegment> segments, Object commitMetadata) throws IOException {
final SegmentTransactionalInsertAction action = new SegmentTransactionalInsertAction(segments, null, null);
return toolbox.getTaskActionClient().submit(action).isSuccess();
}
};
final SegmentsAndMetadata published = driver.finish(publisher, committerSupplier.get());
if (published == null) {
log.error("Failed to publish segments, aborting!");
return false;
} else {
log.info("Published segments[%s]", Joiner.on(", ").join(Iterables.transform(published.getSegments(), new Function<DataSegment, String>() {
@Override
public String apply(DataSegment input) {
return input.getIdentifier();
}
})));
return true;
}
}
}
use of io.druid.data.input.InputRow in project druid by druid-io.
the class IndexTask method determineShardSpecs.
/**
* Determines the number of shards for each interval using a hash of queryGranularity timestamp + all dimensions (i.e
* hash-based partitioning). In the future we may want to also support single-dimension partitioning.
*/
private Map<Interval, List<ShardSpec>> determineShardSpecs(final TaskToolbox toolbox, final FirehoseFactory firehoseFactory) throws IOException {
final ObjectMapper jsonMapper = toolbox.getObjectMapper();
final GranularitySpec granularitySpec = ingestionSchema.getDataSchema().getGranularitySpec();
final Granularity queryGranularity = granularitySpec.getQueryGranularity();
final boolean determineNumPartitions = ingestionSchema.getTuningConfig().getNumShards() == null;
final boolean determineIntervals = !ingestionSchema.getDataSchema().getGranularitySpec().bucketIntervals().isPresent();
final Map<Interval, List<ShardSpec>> shardSpecs = Maps.newHashMap();
// if we were given number of shards per interval and the intervals, we don't need to scan the data
if (!determineNumPartitions && !determineIntervals) {
log.info("numShards and intervals provided, skipping determine partition scan");
final SortedSet<Interval> intervals = ingestionSchema.getDataSchema().getGranularitySpec().bucketIntervals().get();
final int numShards = ingestionSchema.getTuningConfig().getNumShards();
for (Interval interval : intervals) {
final List<ShardSpec> intervalShardSpecs = Lists.newArrayListWithCapacity(numShards);
if (numShards > 1) {
for (int i = 0; i < numShards; i++) {
intervalShardSpecs.add(new HashBasedNumberedShardSpec(i, numShards, null, jsonMapper));
}
} else {
intervalShardSpecs.add(NoneShardSpec.instance());
}
shardSpecs.put(interval, intervalShardSpecs);
}
return shardSpecs;
}
// determine intervals containing data and prime HLL collectors
final Map<Interval, Optional<HyperLogLogCollector>> hllCollectors = Maps.newHashMap();
int thrownAway = 0;
log.info("Determining intervals and shardSpecs");
long determineShardSpecsStartMillis = System.currentTimeMillis();
try (final Firehose firehose = firehoseFactory.connect(ingestionSchema.getDataSchema().getParser())) {
while (firehose.hasMore()) {
final InputRow inputRow = firehose.nextRow();
final Interval interval;
if (determineIntervals) {
interval = granularitySpec.getSegmentGranularity().bucket(inputRow.getTimestamp());
} else {
final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
if (!optInterval.isPresent()) {
thrownAway++;
continue;
}
interval = optInterval.get();
}
if (!determineNumPartitions) {
// for the interval and don't instantiate a HLL collector
if (!hllCollectors.containsKey(interval)) {
hllCollectors.put(interval, Optional.<HyperLogLogCollector>absent());
}
continue;
}
if (!hllCollectors.containsKey(interval)) {
hllCollectors.put(interval, Optional.of(HyperLogLogCollector.makeLatestCollector()));
}
List<Object> groupKey = Rows.toGroupKey(queryGranularity.bucketStart(inputRow.getTimestamp()).getMillis(), inputRow);
hllCollectors.get(interval).get().add(hashFunction.hashBytes(jsonMapper.writeValueAsBytes(groupKey)).asBytes());
}
}
if (thrownAway > 0) {
log.warn("Unable to to find a matching interval for [%,d] events", thrownAway);
}
final ImmutableSortedMap<Interval, Optional<HyperLogLogCollector>> sortedMap = ImmutableSortedMap.copyOf(hllCollectors, Comparators.intervalsByStartThenEnd());
for (final Map.Entry<Interval, Optional<HyperLogLogCollector>> entry : sortedMap.entrySet()) {
final Interval interval = entry.getKey();
final Optional<HyperLogLogCollector> collector = entry.getValue();
final int numShards;
if (determineNumPartitions) {
final long numRows = new Double(collector.get().estimateCardinality()).longValue();
numShards = (int) Math.ceil((double) numRows / ingestionSchema.getTuningConfig().getTargetPartitionSize());
log.info("Estimated [%,d] rows of data for interval [%s], creating [%,d] shards", numRows, interval, numShards);
} else {
numShards = ingestionSchema.getTuningConfig().getNumShards();
log.info("Creating [%,d] shards for interval [%s]", numShards, interval);
}
final List<ShardSpec> intervalShardSpecs = Lists.newArrayListWithCapacity(numShards);
if (numShards > 1) {
for (int i = 0; i < numShards; i++) {
intervalShardSpecs.add(new HashBasedNumberedShardSpec(i, numShards, null, jsonMapper));
}
} else {
intervalShardSpecs.add(NoneShardSpec.instance());
}
shardSpecs.put(interval, intervalShardSpecs);
}
log.info("Found intervals and shardSpecs in %,dms", System.currentTimeMillis() - determineShardSpecsStartMillis);
return shardSpecs;
}
use of io.druid.data.input.InputRow in project druid by druid-io.
the class YeOldePlumberSchool method findPlumber.
@Override
public Plumber findPlumber(final DataSchema schema, final RealtimeTuningConfig config, final FireDepartmentMetrics metrics) {
// There can be only one.
final Sink theSink = new Sink(interval, schema, config.getShardSpec(), version, config.getMaxRowsInMemory(), config.isReportParseExceptions());
// Temporary directory to hold spilled segments.
final File persistDir = new File(tmpSegmentDir, theSink.getSegment().getIdentifier());
// Set of spilled segments. Will be merged at the end.
final Set<File> spilled = Sets.newHashSet();
// IndexMerger implementation.
final IndexMerger theIndexMerger = config.getBuildV9Directly() ? indexMergerV9 : indexMerger;
return new Plumber() {
@Override
public Object startJob() {
return null;
}
@Override
public int add(InputRow row, Supplier<Committer> committerSupplier) throws IndexSizeExceededException {
Sink sink = getSink(row.getTimestampFromEpoch());
if (sink == null) {
return -1;
}
final int numRows = sink.add(row);
if (!sink.canAppendRow()) {
persist(committerSupplier.get());
}
return numRows;
}
private Sink getSink(long timestamp) {
if (theSink.getInterval().contains(timestamp)) {
return theSink;
} else {
return null;
}
}
@Override
public <T> QueryRunner<T> getQueryRunner(Query<T> query) {
throw new UnsupportedOperationException("Don't query me, bro.");
}
@Override
public void persist(Committer committer) {
spillIfSwappable();
committer.run();
}
@Override
public void finishJob() {
// The segment we will upload
File fileToUpload = null;
try {
// User should have persisted everything by now.
Preconditions.checkState(!theSink.swappable(), "All data must be persisted before fininshing the job!");
if (spilled.size() == 0) {
throw new IllegalStateException("Nothing indexed?");
} else if (spilled.size() == 1) {
fileToUpload = Iterables.getOnlyElement(spilled);
} else {
List<QueryableIndex> indexes = Lists.newArrayList();
for (final File oneSpill : spilled) {
indexes.add(indexIO.loadIndex(oneSpill));
}
fileToUpload = new File(tmpSegmentDir, "merged");
theIndexMerger.mergeQueryableIndex(indexes, schema.getGranularitySpec().isRollup(), schema.getAggregators(), fileToUpload, config.getIndexSpec());
}
// Map merged segment so we can extract dimensions
final QueryableIndex mappedSegment = indexIO.loadIndex(fileToUpload);
final DataSegment segmentToUpload = theSink.getSegment().withDimensions(ImmutableList.copyOf(mappedSegment.getAvailableDimensions())).withBinaryVersion(SegmentUtils.getVersionFromDir(fileToUpload));
dataSegmentPusher.push(fileToUpload, segmentToUpload);
log.info("Uploaded segment[%s]", segmentToUpload.getIdentifier());
} catch (Exception e) {
log.warn(e, "Failed to merge and upload");
throw Throwables.propagate(e);
} finally {
try {
if (fileToUpload != null) {
log.info("Deleting Index File[%s]", fileToUpload);
FileUtils.deleteDirectory(fileToUpload);
}
} catch (IOException e) {
log.warn(e, "Error deleting directory[%s]", fileToUpload);
}
}
}
private void spillIfSwappable() {
if (theSink.swappable()) {
final FireHydrant indexToPersist = theSink.swap();
final int rowsToPersist = indexToPersist.getIndex().size();
final File dirToPersist = getSpillDir(indexToPersist.getCount());
log.info("Spilling index[%d] with rows[%d] to: %s", indexToPersist.getCount(), rowsToPersist, dirToPersist);
try {
theIndexMerger.persist(indexToPersist.getIndex(), dirToPersist, config.getIndexSpec());
indexToPersist.swapSegment(null);
metrics.incrementRowOutputCount(rowsToPersist);
spilled.add(dirToPersist);
} catch (Exception e) {
log.warn(e, "Failed to spill index[%d]", indexToPersist.getCount());
throw Throwables.propagate(e);
}
}
}
private File getSpillDir(final int n) {
return new File(persistDir, String.format("spill%d", n));
}
};
}
Aggregations