Search in sources :

Example 21 with NumberedShardSpec

use of io.druid.timeline.partition.NumberedShardSpec in project druid by druid-io.

the class SegmentAllocateActionTest method testAddToExistingNumberedShardSpecsFinerPreferredGranularity.

@Test
public void testAddToExistingNumberedShardSpecsFinerPreferredGranularity() throws Exception {
    final Task task = new NoopTask(null, 0, 0, null, null, null);
    taskActionTestKit.getMetadataStorageCoordinator().announceHistoricalSegments(ImmutableSet.of(DataSegment.builder().dataSource(DATA_SOURCE).interval(Granularities.HOUR.bucket(PARTY_TIME)).version(PARTY_TIME.toString()).shardSpec(new NumberedShardSpec(0, 2)).build(), DataSegment.builder().dataSource(DATA_SOURCE).interval(Granularities.HOUR.bucket(PARTY_TIME)).version(PARTY_TIME.toString()).shardSpec(new NumberedShardSpec(1, 2)).build()));
    taskActionTestKit.getTaskLockbox().add(task);
    final SegmentIdentifier id1 = allocate(task, PARTY_TIME, Granularities.NONE, Granularities.MINUTE, "s1", null);
    assertSameIdentifier(id1, new SegmentIdentifier(DATA_SOURCE, Granularities.HOUR.bucket(PARTY_TIME), PARTY_TIME.toString(), new NumberedShardSpec(2, 2)));
}
Also used : Task(io.druid.indexing.common.task.Task) NoopTask(io.druid.indexing.common.task.NoopTask) SegmentIdentifier(io.druid.segment.realtime.appenderator.SegmentIdentifier) NoopTask(io.druid.indexing.common.task.NoopTask) NumberedShardSpec(io.druid.timeline.partition.NumberedShardSpec) Test(org.junit.Test)

Example 22 with NumberedShardSpec

use of io.druid.timeline.partition.NumberedShardSpec in project druid by druid-io.

the class HadoopDruidIndexerConfigTest method shouldMakeDefaultSegmentOutputPathIfNotHDFS.

@Test
public void shouldMakeDefaultSegmentOutputPathIfNotHDFS() {
    final HadoopIngestionSpec schema;
    try {
        schema = jsonReadWriteRead("{\n" + "    \"dataSchema\": {\n" + "        \"dataSource\": \"the:data:source\",\n" + "        \"metricsSpec\": [],\n" + "        \"granularitySpec\": {\n" + "            \"type\": \"uniform\",\n" + "            \"segmentGranularity\": \"hour\",\n" + "            \"intervals\": [\"2012-07-10/P1D\"]\n" + "        }\n" + "    },\n" + "    \"ioConfig\": {\n" + "        \"type\": \"hadoop\",\n" + "        \"segmentOutputPath\": \"/tmp/dru:id/data:test\"\n" + "    }\n" + "}", HadoopIngestionSpec.class);
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
    HadoopDruidIndexerConfig cfg = new HadoopDruidIndexerConfig(schema.withTuningConfig(schema.getTuningConfig().withVersion("some:brand:new:version")));
    Bucket bucket = new Bucket(4711, new DateTime(2012, 07, 10, 5, 30), 4712);
    Path path = JobHelper.makeFileNamePath(new Path(cfg.getSchema().getIOConfig().getSegmentOutputPath()), new LocalFileSystem(), new DataSegment(cfg.getSchema().getDataSchema().getDataSource(), cfg.getSchema().getDataSchema().getGranularitySpec().bucketInterval(bucket.time).get(), cfg.getSchema().getTuningConfig().getVersion(), null, null, null, new NumberedShardSpec(bucket.partitionNum, 5000), -1, -1), JobHelper.INDEX_ZIP);
    Assert.assertEquals("file:/tmp/dru:id/data:test/the:data:source/2012-07-10T05:00:00.000Z_2012-07-10T06:00:00.000Z/some:brand:new:" + "version/4712/index.zip", path.toString());
    path = JobHelper.makeFileNamePath(new Path(cfg.getSchema().getIOConfig().getSegmentOutputPath()), new LocalFileSystem(), new DataSegment(cfg.getSchema().getDataSchema().getDataSource(), cfg.getSchema().getDataSchema().getGranularitySpec().bucketInterval(bucket.time).get(), cfg.getSchema().getTuningConfig().getVersion(), null, null, null, new NumberedShardSpec(bucket.partitionNum, 5000), -1, -1), JobHelper.DESCRIPTOR_JSON);
    Assert.assertEquals("file:/tmp/dru:id/data:test/the:data:source/2012-07-10T05:00:00.000Z_2012-07-10T06:00:00.000Z/some:brand:new:" + "version/4712/descriptor.json", path.toString());
    path = JobHelper.makeTmpPath(new Path(cfg.getSchema().getIOConfig().getSegmentOutputPath()), new LocalFileSystem(), new DataSegment(cfg.getSchema().getDataSchema().getDataSource(), cfg.getSchema().getDataSchema().getGranularitySpec().bucketInterval(bucket.time).get(), cfg.getSchema().getTuningConfig().getVersion(), null, null, null, new NumberedShardSpec(bucket.partitionNum, 5000), -1, -1), new TaskAttemptID("abc", 123, TaskType.REDUCE, 1, 0));
    Assert.assertEquals("file:/tmp/dru:id/data:test/the:data:source/2012-07-10T05:00:00.000Z_2012-07-10T06:00:00.000Z/some:brand:new:" + "version/4712/4712_index.zip.0", path.toString());
}
Also used : Path(org.apache.hadoop.fs.Path) LocalFileSystem(org.apache.hadoop.fs.LocalFileSystem) TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) DataSegment(io.druid.timeline.DataSegment) DateTime(org.joda.time.DateTime) NumberedShardSpec(io.druid.timeline.partition.NumberedShardSpec) HashBasedNumberedShardSpec(io.druid.timeline.partition.HashBasedNumberedShardSpec) Test(org.junit.Test)

Example 23 with NumberedShardSpec

use of io.druid.timeline.partition.NumberedShardSpec in project druid by druid-io.

the class HadoopDruidIndexerConfigTest method shouldMakeHDFSCompliantSegmentOutputPath.

@Test
public void shouldMakeHDFSCompliantSegmentOutputPath() {
    HadoopIngestionSpec schema;
    try {
        schema = jsonReadWriteRead("{\n" + "    \"dataSchema\": {\n" + "        \"dataSource\": \"source\",\n" + "        \"metricsSpec\": [],\n" + "        \"granularitySpec\": {\n" + "            \"type\": \"uniform\",\n" + "            \"segmentGranularity\": \"hour\",\n" + "            \"intervals\": [\"2012-07-10/P1D\"]\n" + "        }\n" + "    },\n" + "    \"ioConfig\": {\n" + "        \"type\": \"hadoop\",\n" + "        \"segmentOutputPath\": \"hdfs://server:9100/tmp/druid/datatest\"\n" + "    }\n" + "}", HadoopIngestionSpec.class);
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
    HadoopDruidIndexerConfig cfg = new HadoopDruidIndexerConfig(schema.withTuningConfig(schema.getTuningConfig().withVersion("some:brand:new:version")));
    Bucket bucket = new Bucket(4711, new DateTime(2012, 07, 10, 5, 30), 4712);
    Path path = JobHelper.makeFileNamePath(new Path(cfg.getSchema().getIOConfig().getSegmentOutputPath()), new DistributedFileSystem(), new DataSegment(cfg.getSchema().getDataSchema().getDataSource(), cfg.getSchema().getDataSchema().getGranularitySpec().bucketInterval(bucket.time).get(), cfg.getSchema().getTuningConfig().getVersion(), null, null, null, new NumberedShardSpec(bucket.partitionNum, 5000), -1, -1), JobHelper.INDEX_ZIP);
    Assert.assertEquals("hdfs://server:9100/tmp/druid/datatest/source/20120710T050000.000Z_20120710T060000.000Z/some_brand_new_version" + "/4712_index.zip", path.toString());
    path = JobHelper.makeFileNamePath(new Path(cfg.getSchema().getIOConfig().getSegmentOutputPath()), new DistributedFileSystem(), new DataSegment(cfg.getSchema().getDataSchema().getDataSource(), cfg.getSchema().getDataSchema().getGranularitySpec().bucketInterval(bucket.time).get(), cfg.getSchema().getTuningConfig().getVersion(), null, null, null, new NumberedShardSpec(bucket.partitionNum, 5000), -1, -1), JobHelper.DESCRIPTOR_JSON);
    Assert.assertEquals("hdfs://server:9100/tmp/druid/datatest/source/20120710T050000.000Z_20120710T060000.000Z/some_brand_new_version" + "/4712_descriptor.json", path.toString());
    path = JobHelper.makeTmpPath(new Path(cfg.getSchema().getIOConfig().getSegmentOutputPath()), new DistributedFileSystem(), new DataSegment(cfg.getSchema().getDataSchema().getDataSource(), cfg.getSchema().getDataSchema().getGranularitySpec().bucketInterval(bucket.time).get(), cfg.getSchema().getTuningConfig().getVersion(), null, null, null, new NumberedShardSpec(bucket.partitionNum, 5000), -1, -1), new TaskAttemptID("abc", 123, TaskType.REDUCE, 1, 0));
    Assert.assertEquals("hdfs://server:9100/tmp/druid/datatest/source/20120710T050000.000Z_20120710T060000.000Z/some_brand_new_version" + "/4712_index.zip.0", path.toString());
}
Also used : Path(org.apache.hadoop.fs.Path) TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) DistributedFileSystem(org.apache.hadoop.hdfs.DistributedFileSystem) DataSegment(io.druid.timeline.DataSegment) DateTime(org.joda.time.DateTime) NumberedShardSpec(io.druid.timeline.partition.NumberedShardSpec) HashBasedNumberedShardSpec(io.druid.timeline.partition.HashBasedNumberedShardSpec) Test(org.junit.Test)

Example 24 with NumberedShardSpec

use of io.druid.timeline.partition.NumberedShardSpec in project druid by druid-io.

the class IndexGeneratorJobTest method verifyJob.

private void verifyJob(IndexGeneratorJob job) throws IOException {
    JobHelper.runJobs(ImmutableList.<Jobby>of(job), config);
    int segmentNum = 0;
    for (DateTime currTime = interval.getStart(); currTime.isBefore(interval.getEnd()); currTime = currTime.plusDays(1)) {
        Object[][] shardInfo = shardInfoForEachSegment[segmentNum++];
        File segmentOutputFolder = new File(String.format("%s/%s/%s_%s/%s", config.getSchema().getIOConfig().getSegmentOutputPath(), config.getSchema().getDataSchema().getDataSource(), currTime.toString(), currTime.plusDays(1).toString(), config.getSchema().getTuningConfig().getVersion()));
        Assert.assertTrue(segmentOutputFolder.exists());
        Assert.assertEquals(shardInfo.length, segmentOutputFolder.list().length);
        for (int partitionNum = 0; partitionNum < shardInfo.length; ++partitionNum) {
            File individualSegmentFolder = new File(segmentOutputFolder, Integer.toString(partitionNum));
            Assert.assertTrue(individualSegmentFolder.exists());
            File descriptor = new File(individualSegmentFolder, "descriptor.json");
            File indexZip = new File(individualSegmentFolder, "index.zip");
            Assert.assertTrue(descriptor.exists());
            Assert.assertTrue(indexZip.exists());
            DataSegment dataSegment = mapper.readValue(descriptor, DataSegment.class);
            Assert.assertEquals(config.getSchema().getTuningConfig().getVersion(), dataSegment.getVersion());
            Assert.assertEquals(new Interval(currTime, currTime.plusDays(1)), dataSegment.getInterval());
            Assert.assertEquals("local", dataSegment.getLoadSpec().get("type"));
            Assert.assertEquals(indexZip.getCanonicalPath(), dataSegment.getLoadSpec().get("path"));
            Assert.assertEquals(Integer.valueOf(9), dataSegment.getBinaryVersion());
            if (datasourceName.equals("website")) {
                Assert.assertEquals("website", dataSegment.getDataSource());
                Assert.assertEquals("host", dataSegment.getDimensions().get(0));
                Assert.assertEquals("visited_num", dataSegment.getMetrics().get(0));
                Assert.assertEquals("unique_hosts", dataSegment.getMetrics().get(1));
            } else if (datasourceName.equals("inherit_dims")) {
                Assert.assertEquals("inherit_dims", dataSegment.getDataSource());
                Assert.assertEquals(ImmutableList.of("X", "Y", "M", "Q", "B", "F"), dataSegment.getDimensions());
                Assert.assertEquals("count", dataSegment.getMetrics().get(0));
            } else if (datasourceName.equals("inherit_dims2")) {
                Assert.assertEquals("inherit_dims2", dataSegment.getDataSource());
                Assert.assertEquals(ImmutableList.of("B", "F", "M", "Q", "X", "Y"), dataSegment.getDimensions());
                Assert.assertEquals("count", dataSegment.getMetrics().get(0));
            } else {
                Assert.fail("Test did not specify supported datasource name");
            }
            if (forceExtendableShardSpecs) {
                NumberedShardSpec spec = (NumberedShardSpec) dataSegment.getShardSpec();
                Assert.assertEquals(partitionNum, spec.getPartitionNum());
                Assert.assertEquals(shardInfo.length, spec.getPartitions());
            } else if (partitionType.equals("hashed")) {
                Integer[] hashShardInfo = (Integer[]) shardInfo[partitionNum];
                HashBasedNumberedShardSpec spec = (HashBasedNumberedShardSpec) dataSegment.getShardSpec();
                Assert.assertEquals((int) hashShardInfo[0], spec.getPartitionNum());
                Assert.assertEquals((int) hashShardInfo[1], spec.getPartitions());
            } else if (partitionType.equals("single")) {
                String[] singleDimensionShardInfo = (String[]) shardInfo[partitionNum];
                SingleDimensionShardSpec spec = (SingleDimensionShardSpec) dataSegment.getShardSpec();
                Assert.assertEquals(singleDimensionShardInfo[0], spec.getStart());
                Assert.assertEquals(singleDimensionShardInfo[1], spec.getEnd());
            } else {
                throw new RuntimeException(String.format("Invalid partition type:[%s]", partitionType));
            }
        }
    }
}
Also used : HashBasedNumberedShardSpec(io.druid.timeline.partition.HashBasedNumberedShardSpec) DataSegment(io.druid.timeline.DataSegment) DateTime(org.joda.time.DateTime) SequenceFile(org.apache.hadoop.io.SequenceFile) File(java.io.File) SingleDimensionShardSpec(io.druid.timeline.partition.SingleDimensionShardSpec) NumberedShardSpec(io.druid.timeline.partition.NumberedShardSpec) HashBasedNumberedShardSpec(io.druid.timeline.partition.HashBasedNumberedShardSpec) Interval(org.joda.time.Interval)

Example 25 with NumberedShardSpec

use of io.druid.timeline.partition.NumberedShardSpec in project druid by druid-io.

the class IndexerSQLMetadataStorageCoordinator method allocatePendingSegment.

@Override
public SegmentIdentifier allocatePendingSegment(final String dataSource, final String sequenceName, final String previousSegmentId, final Interval interval, final String maxVersion) throws IOException {
    Preconditions.checkNotNull(dataSource, "dataSource");
    Preconditions.checkNotNull(sequenceName, "sequenceName");
    Preconditions.checkNotNull(interval, "interval");
    Preconditions.checkNotNull(maxVersion, "maxVersion");
    final String previousSegmentIdNotNull = previousSegmentId == null ? "" : previousSegmentId;
    return connector.retryTransaction(new TransactionCallback<SegmentIdentifier>() {

        @Override
        public SegmentIdentifier inTransaction(Handle handle, TransactionStatus transactionStatus) throws Exception {
            final List<byte[]> existingBytes = handle.createQuery(String.format("SELECT payload FROM %s WHERE " + "dataSource = :dataSource AND " + "sequence_name = :sequence_name AND " + "sequence_prev_id = :sequence_prev_id", dbTables.getPendingSegmentsTable())).bind("dataSource", dataSource).bind("sequence_name", sequenceName).bind("sequence_prev_id", previousSegmentIdNotNull).map(ByteArrayMapper.FIRST).list();
            if (!existingBytes.isEmpty()) {
                final SegmentIdentifier existingIdentifier = jsonMapper.readValue(Iterables.getOnlyElement(existingBytes), SegmentIdentifier.class);
                if (existingIdentifier.getInterval().getStartMillis() == interval.getStartMillis() && existingIdentifier.getInterval().getEndMillis() == interval.getEndMillis()) {
                    log.info("Found existing pending segment [%s] for sequence[%s] (previous = [%s]) in DB", existingIdentifier.getIdentifierAsString(), sequenceName, previousSegmentIdNotNull);
                    return existingIdentifier;
                } else {
                    log.warn("Cannot use existing pending segment [%s] for sequence[%s] (previous = [%s]) in DB, " + "does not match requested interval[%s]", existingIdentifier.getIdentifierAsString(), sequenceName, previousSegmentIdNotNull, interval);
                    return null;
                }
            }
            // Make up a pending segment based on existing segments and pending segments in the DB. This works
            // assuming that all tasks inserting segments at a particular point in time are going through the
            // allocatePendingSegment flow. This should be assured through some other mechanism (like task locks).
            final SegmentIdentifier newIdentifier;
            final List<TimelineObjectHolder<String, DataSegment>> existingChunks = getTimelineForIntervalsWithHandle(handle, dataSource, ImmutableList.of(interval)).lookup(interval);
            if (existingChunks.size() > 1) {
                // Not possible to expand more than one chunk with a single segment.
                log.warn("Cannot allocate new segment for dataSource[%s], interval[%s], maxVersion[%s]: already have [%,d] chunks.", dataSource, interval, maxVersion, existingChunks.size());
                return null;
            } else {
                SegmentIdentifier max = null;
                if (!existingChunks.isEmpty()) {
                    TimelineObjectHolder<String, DataSegment> existingHolder = Iterables.getOnlyElement(existingChunks);
                    for (PartitionChunk<DataSegment> existing : existingHolder.getObject()) {
                        if (max == null || max.getShardSpec().getPartitionNum() < existing.getObject().getShardSpec().getPartitionNum()) {
                            max = SegmentIdentifier.fromDataSegment(existing.getObject());
                        }
                    }
                }
                final List<SegmentIdentifier> pendings = getPendingSegmentsForIntervalWithHandle(handle, dataSource, interval);
                for (SegmentIdentifier pending : pendings) {
                    if (max == null || pending.getVersion().compareTo(max.getVersion()) > 0 || (pending.getVersion().equals(max.getVersion()) && pending.getShardSpec().getPartitionNum() > max.getShardSpec().getPartitionNum())) {
                        max = pending;
                    }
                }
                if (max == null) {
                    newIdentifier = new SegmentIdentifier(dataSource, interval, maxVersion, new NumberedShardSpec(0, 0));
                } else if (!max.getInterval().equals(interval) || max.getVersion().compareTo(maxVersion) > 0) {
                    log.warn("Cannot allocate new segment for dataSource[%s], interval[%s], maxVersion[%s]: conflicting segment[%s].", dataSource, interval, maxVersion, max.getIdentifierAsString());
                    return null;
                } else if (max.getShardSpec() instanceof LinearShardSpec) {
                    newIdentifier = new SegmentIdentifier(dataSource, max.getInterval(), max.getVersion(), new LinearShardSpec(max.getShardSpec().getPartitionNum() + 1));
                } else if (max.getShardSpec() instanceof NumberedShardSpec) {
                    newIdentifier = new SegmentIdentifier(dataSource, max.getInterval(), max.getVersion(), new NumberedShardSpec(max.getShardSpec().getPartitionNum() + 1, ((NumberedShardSpec) max.getShardSpec()).getPartitions()));
                } else {
                    log.warn("Cannot allocate new segment for dataSource[%s], interval[%s], maxVersion[%s]: ShardSpec class[%s] used by [%s].", dataSource, interval, maxVersion, max.getShardSpec().getClass(), max.getIdentifierAsString());
                    return null;
                }
            }
            // SELECT -> INSERT can fail due to races; callers must be prepared to retry.
            // Avoiding ON DUPLICATE KEY since it's not portable.
            // Avoiding try/catch since it may cause inadvertent transaction-splitting.
            // UNIQUE key for the row, ensuring sequences do not fork in two directions.
            // Using a single column instead of (sequence_name, sequence_prev_id) as some MySQL storage engines
            // have difficulty with large unique keys (see https://github.com/druid-io/druid/issues/2319)
            final String sequenceNamePrevIdSha1 = BaseEncoding.base16().encode(Hashing.sha1().newHasher().putBytes(StringUtils.toUtf8(sequenceName)).putByte((byte) 0xff).putBytes(StringUtils.toUtf8(previousSegmentIdNotNull)).hash().asBytes());
            handle.createStatement(String.format("INSERT INTO %1$s (id, dataSource, created_date, start, %2$send%2$s, sequence_name, sequence_prev_id, sequence_name_prev_id_sha1, payload) " + "VALUES (:id, :dataSource, :created_date, :start, :end, :sequence_name, :sequence_prev_id, :sequence_name_prev_id_sha1, :payload)", dbTables.getPendingSegmentsTable(), connector.getQuoteString())).bind("id", newIdentifier.getIdentifierAsString()).bind("dataSource", dataSource).bind("created_date", new DateTime().toString()).bind("start", interval.getStart().toString()).bind("end", interval.getEnd().toString()).bind("sequence_name", sequenceName).bind("sequence_prev_id", previousSegmentIdNotNull).bind("sequence_name_prev_id_sha1", sequenceNamePrevIdSha1).bind("payload", jsonMapper.writeValueAsBytes(newIdentifier)).execute();
            log.info("Allocated pending segment [%s] for sequence[%s] (previous = [%s]) in DB", newIdentifier.getIdentifierAsString(), sequenceName, previousSegmentIdNotNull);
            return newIdentifier;
        }
    }, ALLOCATE_SEGMENT_QUIET_TRIES, SQLMetadataConnector.DEFAULT_MAX_TRIES);
}
Also used : SegmentIdentifier(io.druid.segment.realtime.appenderator.SegmentIdentifier) LinearShardSpec(io.druid.timeline.partition.LinearShardSpec) TransactionStatus(org.skife.jdbi.v2.TransactionStatus) DataSegment(io.druid.timeline.DataSegment) SQLException(java.sql.SQLException) IOException(java.io.IOException) CallbackFailedException(org.skife.jdbi.v2.exceptions.CallbackFailedException) DateTime(org.joda.time.DateTime) Handle(org.skife.jdbi.v2.Handle) TimelineObjectHolder(io.druid.timeline.TimelineObjectHolder) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) PartitionChunk(io.druid.timeline.partition.PartitionChunk) NumberedShardSpec(io.druid.timeline.partition.NumberedShardSpec)

Aggregations

NumberedShardSpec (io.druid.timeline.partition.NumberedShardSpec)25 Test (org.junit.Test)17 DataSegment (io.druid.timeline.DataSegment)11 SegmentIdentifier (io.druid.segment.realtime.appenderator.SegmentIdentifier)10 Interval (org.joda.time.Interval)8 NoopTask (io.druid.indexing.common.task.NoopTask)7 Task (io.druid.indexing.common.task.Task)7 HashBasedNumberedShardSpec (io.druid.timeline.partition.HashBasedNumberedShardSpec)6 DateTime (org.joda.time.DateTime)6 ShardSpec (io.druid.timeline.partition.ShardSpec)5 TaskLock (io.druid.indexing.common.TaskLock)4 Predicate (com.google.common.base.Predicate)3 File (java.io.File)3 Path (org.apache.hadoop.fs.Path)3 ImmutableSegmentLoadInfo (io.druid.client.ImmutableSegmentLoadInfo)2 CoordinatorClient (io.druid.client.coordinator.CoordinatorClient)2 SegmentTransactionalInsertAction (io.druid.indexing.common.actions.SegmentTransactionalInsertAction)2 ISE (io.druid.java.util.common.ISE)2 SegmentDescriptor (io.druid.query.SegmentDescriptor)2 NoneShardSpec (io.druid.timeline.partition.NoneShardSpec)2