Use of io.druid.timeline.partition.NumberedShardSpec in project druid by druid-io.
From the class SegmentAllocateActionTest, method testAddToExistingNumberedShardSpecsFinerPreferredGranularity.
@Test
public void testAddToExistingNumberedShardSpecsFinerPreferredGranularity() throws Exception {
final Task task = new NoopTask(null, 0, 0, null, null, null);
taskActionTestKit.getMetadataStorageCoordinator().announceHistoricalSegments(
    ImmutableSet.of(
        DataSegment.builder()
            .dataSource(DATA_SOURCE)
            .interval(Granularities.HOUR.bucket(PARTY_TIME))
            .version(PARTY_TIME.toString())
            .shardSpec(new NumberedShardSpec(0, 2))
            .build(),
        DataSegment.builder()
            .dataSource(DATA_SOURCE)
            .interval(Granularities.HOUR.bucket(PARTY_TIME))
            .version(PARTY_TIME.toString())
            .shardSpec(new NumberedShardSpec(1, 2))
            .build()));
taskActionTestKit.getTaskLockbox().add(task);
final SegmentIdentifier id1 = allocate(task, PARTY_TIME, Granularities.NONE, Granularities.MINUTE, "s1", null);
assertSameIdentifier(
    id1,
    new SegmentIdentifier(
        DATA_SOURCE,
        Granularities.HOUR.bucket(PARTY_TIME),
        PARTY_TIME.toString(),
        new NumberedShardSpec(2, 2)));
}
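For context: a NumberedShardSpec carries a partition number and the count of core partitions originally announced for the interval, and allocating past that count keeps the count while bumping the partition number, which is why the test above expects (2, 2) after two existing segments declared with two partitions. A minimal sketch of that bump, using only the constructor and getters these examples already rely on (the helper name is hypothetical):

// Hypothetical helper: next numbered spec after the highest existing partition,
// keeping the original core-partition count (2 in the test above).
static NumberedShardSpec nextNumbered(NumberedShardSpec max) {
    return new NumberedShardSpec(max.getPartitionNum() + 1, max.getPartitions());
}

// nextNumbered(new NumberedShardSpec(1, 2)) yields partitionNum 2, partitions 2.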
Use of io.druid.timeline.partition.NumberedShardSpec in project druid by druid-io.
From the class HadoopDruidIndexerConfigTest, method shouldMakeDefaultSegmentOutputPathIfNotHDFS.
@Test
public void shouldMakeDefaultSegmentOutputPathIfNotHDFS() {
final HadoopIngestionSpec schema;
try {
schema = jsonReadWriteRead(
    "{\n"
    + " \"dataSchema\": {\n"
    + " \"dataSource\": \"the:data:source\",\n"
    + " \"metricsSpec\": [],\n"
    + " \"granularitySpec\": {\n"
    + " \"type\": \"uniform\",\n"
    + " \"segmentGranularity\": \"hour\",\n"
    + " \"intervals\": [\"2012-07-10/P1D\"]\n"
    + " }\n"
    + " },\n"
    + " \"ioConfig\": {\n"
    + " \"type\": \"hadoop\",\n"
    + " \"segmentOutputPath\": \"/tmp/dru:id/data:test\"\n"
    + " }\n"
    + "}",
    HadoopIngestionSpec.class);
} catch (Exception e) {
throw Throwables.propagate(e);
}
HadoopDruidIndexerConfig cfg = new HadoopDruidIndexerConfig(schema.withTuningConfig(schema.getTuningConfig().withVersion("some:brand:new:version")));
Bucket bucket = new Bucket(4711, new DateTime(2012, 07, 10, 5, 30), 4712);
Path path = JobHelper.makeFileNamePath(
    new Path(cfg.getSchema().getIOConfig().getSegmentOutputPath()),
    new LocalFileSystem(),
    new DataSegment(
        cfg.getSchema().getDataSchema().getDataSource(),
        cfg.getSchema().getDataSchema().getGranularitySpec().bucketInterval(bucket.time).get(),
        cfg.getSchema().getTuningConfig().getVersion(),
        null,
        null,
        null,
        new NumberedShardSpec(bucket.partitionNum, 5000),
        -1,
        -1),
    JobHelper.INDEX_ZIP);
Assert.assertEquals(
    "file:/tmp/dru:id/data:test/the:data:source/2012-07-10T05:00:00.000Z_2012-07-10T06:00:00.000Z/some:brand:new:"
    + "version/4712/index.zip",
    path.toString());
path = JobHelper.makeFileNamePath(
    new Path(cfg.getSchema().getIOConfig().getSegmentOutputPath()),
    new LocalFileSystem(),
    new DataSegment(
        cfg.getSchema().getDataSchema().getDataSource(),
        cfg.getSchema().getDataSchema().getGranularitySpec().bucketInterval(bucket.time).get(),
        cfg.getSchema().getTuningConfig().getVersion(),
        null,
        null,
        null,
        new NumberedShardSpec(bucket.partitionNum, 5000),
        -1,
        -1),
    JobHelper.DESCRIPTOR_JSON);
Assert.assertEquals(
    "file:/tmp/dru:id/data:test/the:data:source/2012-07-10T05:00:00.000Z_2012-07-10T06:00:00.000Z/some:brand:new:"
    + "version/4712/descriptor.json",
    path.toString());
path = JobHelper.makeTmpPath(
    new Path(cfg.getSchema().getIOConfig().getSegmentOutputPath()),
    new LocalFileSystem(),
    new DataSegment(
        cfg.getSchema().getDataSchema().getDataSource(),
        cfg.getSchema().getDataSchema().getGranularitySpec().bucketInterval(bucket.time).get(),
        cfg.getSchema().getTuningConfig().getVersion(),
        null,
        null,
        null,
        new NumberedShardSpec(bucket.partitionNum, 5000),
        -1,
        -1),
    new TaskAttemptID("abc", 123, TaskType.REDUCE, 1, 0));
Assert.assertEquals(
    "file:/tmp/dru:id/data:test/the:data:source/2012-07-10T05:00:00.000Z_2012-07-10T06:00:00.000Z/some:brand:new:"
    + "version/4712/4712_index.zip.0",
    path.toString());
}
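The 05:00 to 06:00 interval in the asserted paths comes from bucketing the Bucket's 05:30 timestamp at the hourly segmentGranularity declared in the spec. A minimal Joda-Time sketch of that bucketing (assuming UTC, as the asserted Z timestamps imply); it mirrors what granularitySpec.bucketInterval(bucket.time) resolves to here, not the production code path:

// Floor 2012-07-10T05:30Z to the hour, then span one hour (org.joda.time types).
DateTime bucketTime = new DateTime(2012, 7, 10, 5, 30, DateTimeZone.UTC);
DateTime hourStart = bucketTime.hourOfDay().roundFloorCopy();
Interval hourBucket = new Interval(hourStart, hourStart.plusHours(1));
// hourBucket -> 2012-07-10T05:00:00.000Z/2012-07-10T06:00:00.000Z, as in the asserted paths.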
Use of io.druid.timeline.partition.NumberedShardSpec in project druid by druid-io.
From the class HadoopDruidIndexerConfigTest, method shouldMakeHDFSCompliantSegmentOutputPath.
@Test
public void shouldMakeHDFSCompliantSegmentOutputPath() {
HadoopIngestionSpec schema;
try {
schema = jsonReadWriteRead(
    "{\n"
    + " \"dataSchema\": {\n"
    + " \"dataSource\": \"source\",\n"
    + " \"metricsSpec\": [],\n"
    + " \"granularitySpec\": {\n"
    + " \"type\": \"uniform\",\n"
    + " \"segmentGranularity\": \"hour\",\n"
    + " \"intervals\": [\"2012-07-10/P1D\"]\n"
    + " }\n"
    + " },\n"
    + " \"ioConfig\": {\n"
    + " \"type\": \"hadoop\",\n"
    + " \"segmentOutputPath\": \"hdfs://server:9100/tmp/druid/datatest\"\n"
    + " }\n"
    + "}",
    HadoopIngestionSpec.class);
} catch (Exception e) {
throw Throwables.propagate(e);
}
HadoopDruidIndexerConfig cfg = new HadoopDruidIndexerConfig(schema.withTuningConfig(schema.getTuningConfig().withVersion("some:brand:new:version")));
Bucket bucket = new Bucket(4711, new DateTime(2012, 07, 10, 5, 30), 4712);
Path path = JobHelper.makeFileNamePath(
    new Path(cfg.getSchema().getIOConfig().getSegmentOutputPath()),
    new DistributedFileSystem(),
    new DataSegment(
        cfg.getSchema().getDataSchema().getDataSource(),
        cfg.getSchema().getDataSchema().getGranularitySpec().bucketInterval(bucket.time).get(),
        cfg.getSchema().getTuningConfig().getVersion(),
        null,
        null,
        null,
        new NumberedShardSpec(bucket.partitionNum, 5000),
        -1,
        -1),
    JobHelper.INDEX_ZIP);
Assert.assertEquals(
    "hdfs://server:9100/tmp/druid/datatest/source/20120710T050000.000Z_20120710T060000.000Z/some_brand_new_version"
    + "/4712_index.zip",
    path.toString());
path = JobHelper.makeFileNamePath(
    new Path(cfg.getSchema().getIOConfig().getSegmentOutputPath()),
    new DistributedFileSystem(),
    new DataSegment(
        cfg.getSchema().getDataSchema().getDataSource(),
        cfg.getSchema().getDataSchema().getGranularitySpec().bucketInterval(bucket.time).get(),
        cfg.getSchema().getTuningConfig().getVersion(),
        null,
        null,
        null,
        new NumberedShardSpec(bucket.partitionNum, 5000),
        -1,
        -1),
    JobHelper.DESCRIPTOR_JSON);
Assert.assertEquals(
    "hdfs://server:9100/tmp/druid/datatest/source/20120710T050000.000Z_20120710T060000.000Z/some_brand_new_version"
    + "/4712_descriptor.json",
    path.toString());
path = JobHelper.makeTmpPath(
    new Path(cfg.getSchema().getIOConfig().getSegmentOutputPath()),
    new DistributedFileSystem(),
    new DataSegment(
        cfg.getSchema().getDataSchema().getDataSource(),
        cfg.getSchema().getDataSchema().getGranularitySpec().bucketInterval(bucket.time).get(),
        cfg.getSchema().getTuningConfig().getVersion(),
        null,
        null,
        null,
        new NumberedShardSpec(bucket.partitionNum, 5000),
        -1,
        -1),
    new TaskAttemptID("abc", 123, TaskType.REDUCE, 1, 0));
Assert.assertEquals(
    "hdfs://server:9100/tmp/druid/datatest/source/20120710T050000.000Z_20120710T060000.000Z/some_brand_new_version"
    + "/4712_index.zip.0",
    path.toString());
}
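Read side by side, the two tests pin down the layout differences: the non-HDFS path keeps colons and the full ISO-8601 interval and places index.zip inside a per-partition directory, while the HDFS-compliant path compacts the timestamps, replaces colons with underscores, and folds the partition number into the file name. A minimal sketch of the character substitution those HDFS assertions imply (the helper is hypothetical, not the actual JobHelper logic):

// Hypothetical illustration of the substitution visible in the HDFS assertions above:
// ':' is not allowed in HDFS path components, so it is replaced with '_'.
static String hdfsSafe(String pathComponent) {
    return pathComponent.replace(':', '_');
}

// hdfsSafe("some:brand:new:version") -> "some_brand_new_version"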
Use of io.druid.timeline.partition.NumberedShardSpec in project druid by druid-io.
From the class IndexGeneratorJobTest, method verifyJob.
private void verifyJob(IndexGeneratorJob job) throws IOException {
JobHelper.runJobs(ImmutableList.<Jobby>of(job), config);
int segmentNum = 0;
for (DateTime currTime = interval.getStart(); currTime.isBefore(interval.getEnd()); currTime = currTime.plusDays(1)) {
Object[][] shardInfo = shardInfoForEachSegment[segmentNum++];
File segmentOutputFolder = new File(
    String.format(
        "%s/%s/%s_%s/%s",
        config.getSchema().getIOConfig().getSegmentOutputPath(),
        config.getSchema().getDataSchema().getDataSource(),
        currTime.toString(),
        currTime.plusDays(1).toString(),
        config.getSchema().getTuningConfig().getVersion()));
Assert.assertTrue(segmentOutputFolder.exists());
Assert.assertEquals(shardInfo.length, segmentOutputFolder.list().length);
for (int partitionNum = 0; partitionNum < shardInfo.length; ++partitionNum) {
File individualSegmentFolder = new File(segmentOutputFolder, Integer.toString(partitionNum));
Assert.assertTrue(individualSegmentFolder.exists());
File descriptor = new File(individualSegmentFolder, "descriptor.json");
File indexZip = new File(individualSegmentFolder, "index.zip");
Assert.assertTrue(descriptor.exists());
Assert.assertTrue(indexZip.exists());
DataSegment dataSegment = mapper.readValue(descriptor, DataSegment.class);
Assert.assertEquals(config.getSchema().getTuningConfig().getVersion(), dataSegment.getVersion());
Assert.assertEquals(new Interval(currTime, currTime.plusDays(1)), dataSegment.getInterval());
Assert.assertEquals("local", dataSegment.getLoadSpec().get("type"));
Assert.assertEquals(indexZip.getCanonicalPath(), dataSegment.getLoadSpec().get("path"));
Assert.assertEquals(Integer.valueOf(9), dataSegment.getBinaryVersion());
if (datasourceName.equals("website")) {
Assert.assertEquals("website", dataSegment.getDataSource());
Assert.assertEquals("host", dataSegment.getDimensions().get(0));
Assert.assertEquals("visited_num", dataSegment.getMetrics().get(0));
Assert.assertEquals("unique_hosts", dataSegment.getMetrics().get(1));
} else if (datasourceName.equals("inherit_dims")) {
Assert.assertEquals("inherit_dims", dataSegment.getDataSource());
Assert.assertEquals(ImmutableList.of("X", "Y", "M", "Q", "B", "F"), dataSegment.getDimensions());
Assert.assertEquals("count", dataSegment.getMetrics().get(0));
} else if (datasourceName.equals("inherit_dims2")) {
Assert.assertEquals("inherit_dims2", dataSegment.getDataSource());
Assert.assertEquals(ImmutableList.of("B", "F", "M", "Q", "X", "Y"), dataSegment.getDimensions());
Assert.assertEquals("count", dataSegment.getMetrics().get(0));
} else {
Assert.fail("Test did not specify supported datasource name");
}
if (forceExtendableShardSpecs) {
NumberedShardSpec spec = (NumberedShardSpec) dataSegment.getShardSpec();
Assert.assertEquals(partitionNum, spec.getPartitionNum());
Assert.assertEquals(shardInfo.length, spec.getPartitions());
} else if (partitionType.equals("hashed")) {
Integer[] hashShardInfo = (Integer[]) shardInfo[partitionNum];
HashBasedNumberedShardSpec spec = (HashBasedNumberedShardSpec) dataSegment.getShardSpec();
Assert.assertEquals((int) hashShardInfo[0], spec.getPartitionNum());
Assert.assertEquals((int) hashShardInfo[1], spec.getPartitions());
} else if (partitionType.equals("single")) {
String[] singleDimensionShardInfo = (String[]) shardInfo[partitionNum];
SingleDimensionShardSpec spec = (SingleDimensionShardSpec) dataSegment.getShardSpec();
Assert.assertEquals(singleDimensionShardInfo[0], spec.getStart());
Assert.assertEquals(singleDimensionShardInfo[1], spec.getEnd());
} else {
throw new RuntimeException(String.format("Invalid partition type:[%s]", partitionType));
}
}
}
}
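A condensed sketch of the descriptor round-trip this verification relies on: each per-partition directory holds a descriptor.json that deserializes back into a DataSegment whose shard spec can be inspected. This assumes an ObjectMapper configured the same way as the test's mapper, and reuses the variable names from the loop above for illustration:

// Read one partition's descriptor back and inspect its numbered shard spec.
DataSegment segment = mapper.readValue(new File(individualSegmentFolder, "descriptor.json"), DataSegment.class);
if (segment.getShardSpec() instanceof NumberedShardSpec) {
    NumberedShardSpec spec = (NumberedShardSpec) segment.getShardSpec();
    // partitionNum matches the directory name; partitions matches the shard count for the interval.
    Assert.assertEquals(partitionNum, spec.getPartitionNum());
}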
Use of io.druid.timeline.partition.NumberedShardSpec in project druid by druid-io.
From the class IndexerSQLMetadataStorageCoordinator, method allocatePendingSegment.
@Override
public SegmentIdentifier allocatePendingSegment(
    final String dataSource,
    final String sequenceName,
    final String previousSegmentId,
    final Interval interval,
    final String maxVersion
) throws IOException {
Preconditions.checkNotNull(dataSource, "dataSource");
Preconditions.checkNotNull(sequenceName, "sequenceName");
Preconditions.checkNotNull(interval, "interval");
Preconditions.checkNotNull(maxVersion, "maxVersion");
final String previousSegmentIdNotNull = previousSegmentId == null ? "" : previousSegmentId;
return connector.retryTransaction(new TransactionCallback<SegmentIdentifier>() {
@Override
public SegmentIdentifier inTransaction(Handle handle, TransactionStatus transactionStatus) throws Exception {
final List<byte[]> existingBytes = handle.createQuery(
    String.format(
        "SELECT payload FROM %s WHERE "
        + "dataSource = :dataSource AND "
        + "sequence_name = :sequence_name AND "
        + "sequence_prev_id = :sequence_prev_id",
        dbTables.getPendingSegmentsTable()))
    .bind("dataSource", dataSource)
    .bind("sequence_name", sequenceName)
    .bind("sequence_prev_id", previousSegmentIdNotNull)
    .map(ByteArrayMapper.FIRST)
    .list();
if (!existingBytes.isEmpty()) {
final SegmentIdentifier existingIdentifier = jsonMapper.readValue(
    Iterables.getOnlyElement(existingBytes),
    SegmentIdentifier.class);
if (existingIdentifier.getInterval().getStartMillis() == interval.getStartMillis()
    && existingIdentifier.getInterval().getEndMillis() == interval.getEndMillis()) {
log.info(
    "Found existing pending segment [%s] for sequence[%s] (previous = [%s]) in DB",
    existingIdentifier.getIdentifierAsString(),
    sequenceName,
    previousSegmentIdNotNull);
return existingIdentifier;
} else {
log.warn(
    "Cannot use existing pending segment [%s] for sequence[%s] (previous = [%s]) in DB, "
    + "does not match requested interval[%s]",
    existingIdentifier.getIdentifierAsString(),
    sequenceName,
    previousSegmentIdNotNull,
    interval);
return null;
}
}
// Make up a pending segment based on existing segments and pending segments in the DB. This works
// assuming that all tasks inserting segments at a particular point in time are going through the
// allocatePendingSegment flow. This should be assured through some other mechanism (like task locks).
final SegmentIdentifier newIdentifier;
final List<TimelineObjectHolder<String, DataSegment>> existingChunks = getTimelineForIntervalsWithHandle(handle, dataSource, ImmutableList.of(interval)).lookup(interval);
if (existingChunks.size() > 1) {
// Not possible to expand more than one chunk with a single segment.
log.warn("Cannot allocate new segment for dataSource[%s], interval[%s], maxVersion[%s]: already have [%,d] chunks.", dataSource, interval, maxVersion, existingChunks.size());
return null;
} else {
SegmentIdentifier max = null;
if (!existingChunks.isEmpty()) {
TimelineObjectHolder<String, DataSegment> existingHolder = Iterables.getOnlyElement(existingChunks);
for (PartitionChunk<DataSegment> existing : existingHolder.getObject()) {
if (max == null || max.getShardSpec().getPartitionNum() < existing.getObject().getShardSpec().getPartitionNum()) {
max = SegmentIdentifier.fromDataSegment(existing.getObject());
}
}
}
final List<SegmentIdentifier> pendings = getPendingSegmentsForIntervalWithHandle(handle, dataSource, interval);
for (SegmentIdentifier pending : pendings) {
if (max == null
    || pending.getVersion().compareTo(max.getVersion()) > 0
    || (pending.getVersion().equals(max.getVersion())
        && pending.getShardSpec().getPartitionNum() > max.getShardSpec().getPartitionNum())) {
max = pending;
}
}
if (max == null) {
newIdentifier = new SegmentIdentifier(dataSource, interval, maxVersion, new NumberedShardSpec(0, 0));
} else if (!max.getInterval().equals(interval) || max.getVersion().compareTo(maxVersion) > 0) {
log.warn(
    "Cannot allocate new segment for dataSource[%s], interval[%s], maxVersion[%s]: conflicting segment[%s].",
    dataSource,
    interval,
    maxVersion,
    max.getIdentifierAsString());
return null;
} else if (max.getShardSpec() instanceof LinearShardSpec) {
newIdentifier = new SegmentIdentifier(
    dataSource,
    max.getInterval(),
    max.getVersion(),
    new LinearShardSpec(max.getShardSpec().getPartitionNum() + 1));
} else if (max.getShardSpec() instanceof NumberedShardSpec) {
newIdentifier = new SegmentIdentifier(
    dataSource,
    max.getInterval(),
    max.getVersion(),
    new NumberedShardSpec(max.getShardSpec().getPartitionNum() + 1, ((NumberedShardSpec) max.getShardSpec()).getPartitions()));
} else {
log.warn(
    "Cannot allocate new segment for dataSource[%s], interval[%s], maxVersion[%s]: ShardSpec class[%s] used by [%s].",
    dataSource,
    interval,
    maxVersion,
    max.getShardSpec().getClass(),
    max.getIdentifierAsString());
return null;
}
}
// SELECT -> INSERT can fail due to races; callers must be prepared to retry.
// Avoiding ON DUPLICATE KEY since it's not portable.
// Avoiding try/catch since it may cause inadvertent transaction-splitting.
// UNIQUE key for the row, ensuring sequences do not fork in two directions.
// Using a single column instead of (sequence_name, sequence_prev_id) as some MySQL storage engines
// have difficulty with large unique keys (see https://github.com/druid-io/druid/issues/2319)
final String sequenceNamePrevIdSha1 = BaseEncoding.base16().encode(
    Hashing.sha1()
        .newHasher()
        .putBytes(StringUtils.toUtf8(sequenceName))
        .putByte((byte) 0xff)
        .putBytes(StringUtils.toUtf8(previousSegmentIdNotNull))
        .hash()
        .asBytes());
handle.createStatement(
    String.format(
        "INSERT INTO %1$s (id, dataSource, created_date, start, %2$send%2$s, sequence_name, sequence_prev_id, sequence_name_prev_id_sha1, payload) "
        + "VALUES (:id, :dataSource, :created_date, :start, :end, :sequence_name, :sequence_prev_id, :sequence_name_prev_id_sha1, :payload)",
        dbTables.getPendingSegmentsTable(),
        connector.getQuoteString()))
    .bind("id", newIdentifier.getIdentifierAsString())
    .bind("dataSource", dataSource)
    .bind("created_date", new DateTime().toString())
    .bind("start", interval.getStart().toString())
    .bind("end", interval.getEnd().toString())
    .bind("sequence_name", sequenceName)
    .bind("sequence_prev_id", previousSegmentIdNotNull)
    .bind("sequence_name_prev_id_sha1", sequenceNamePrevIdSha1)
    .bind("payload", jsonMapper.writeValueAsBytes(newIdentifier))
    .execute();
log.info("Allocated pending segment [%s] for sequence[%s] (previous = [%s]) in DB", newIdentifier.getIdentifierAsString(), sequenceName, previousSegmentIdNotNull);
return newIdentifier;
}
}, ALLOCATE_SEGMENT_QUIET_TRIES, SQLMetadataConnector.DEFAULT_MAX_TRIES);
}
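As the comments inside the transaction note, the SELECT-then-INSERT is racy by design and the method can also return null when no segment can be allocated, so callers are expected to retry. A hypothetical caller-side sketch in that spirit (coordinator, maxAttempts, and the surrounding variables are assumptions, not part of the source):

// Hypothetical caller: retry allocation a bounded number of times, treating null as "try again later".
SegmentIdentifier identifier = null;
for (int attempt = 0; attempt < maxAttempts && identifier == null; attempt++) {
    identifier = coordinator.allocatePendingSegment(dataSource, sequenceName, previousSegmentId, interval, maxVersion);
}
if (identifier == null) {
    throw new IllegalStateException("Could not allocate a pending segment for interval " + interval);
}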