Example 16 with LinearShardSpec

Use of io.druid.timeline.partition.LinearShardSpec in project druid by druid-io.

From the class CalciteTests, method createMockWalker:

public static SpecificSegmentsQuerySegmentWalker createMockWalker(final File tmpDir) {
    final QueryableIndex index1 = IndexBuilder.create()
        .tmpDir(new File(tmpDir, "1"))
        .indexMerger(TestHelper.getTestIndexMergerV9())
        .schema(INDEX_SCHEMA)
        .rows(ROWS1)
        .buildMMappedIndex();
    final QueryableIndex index2 = IndexBuilder.create()
        .tmpDir(new File(tmpDir, "2"))
        .indexMerger(TestHelper.getTestIndexMergerV9())
        .schema(INDEX_SCHEMA)
        .rows(ROWS2)
        .buildMMappedIndex();
    return new SpecificSegmentsQuerySegmentWalker(queryRunnerFactoryConglomerate())
        .add(
            DataSegment.builder()
                .dataSource(DATASOURCE1)
                .interval(index1.getDataInterval())
                .version("1")
                .shardSpec(new LinearShardSpec(0))
                .build(),
            index1
        )
        .add(
            DataSegment.builder()
                .dataSource(DATASOURCE2)
                .interval(index2.getDataInterval())
                .version("1")
                .shardSpec(new LinearShardSpec(0))
                .build(),
            index2
        );
}
Also used : QueryableIndex(io.druid.segment.QueryableIndex) LinearShardSpec(io.druid.timeline.partition.LinearShardSpec) File(java.io.File)
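A brief aside on the shard spec used here (a standalone sketch, not part of CalciteTests): LinearShardSpec carries nothing but a partition number, and that number is all DataSegment needs to place each single-segment test datasource at partition 0 of its interval.

// Sketch: the only state a LinearShardSpec holds is its partition number.
LinearShardSpec spec = new LinearShardSpec(0);
assert spec.getPartitionNum() == 0;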

Example 17 with LinearShardSpec

Use of io.druid.timeline.partition.LinearShardSpec in project druid by druid-io.

From the class IndexerSQLMetadataStorageCoordinator, method allocatePendingSegment:

@Override
public SegmentIdentifier allocatePendingSegment(final String dataSource, final String sequenceName, final String previousSegmentId, final Interval interval, final String maxVersion) throws IOException {
    Preconditions.checkNotNull(dataSource, "dataSource");
    Preconditions.checkNotNull(sequenceName, "sequenceName");
    Preconditions.checkNotNull(interval, "interval");
    Preconditions.checkNotNull(maxVersion, "maxVersion");
    final String previousSegmentIdNotNull = previousSegmentId == null ? "" : previousSegmentId;
    return connector.retryTransaction(new TransactionCallback<SegmentIdentifier>() {

        @Override
        public SegmentIdentifier inTransaction(Handle handle, TransactionStatus transactionStatus) throws Exception {
            final List<byte[]> existingBytes = handle
                .createQuery(
                    String.format(
                        "SELECT payload FROM %s WHERE "
                            + "dataSource = :dataSource AND "
                            + "sequence_name = :sequence_name AND "
                            + "sequence_prev_id = :sequence_prev_id",
                        dbTables.getPendingSegmentsTable()
                    )
                )
                .bind("dataSource", dataSource)
                .bind("sequence_name", sequenceName)
                .bind("sequence_prev_id", previousSegmentIdNotNull)
                .map(ByteArrayMapper.FIRST)
                .list();
            if (!existingBytes.isEmpty()) {
                final SegmentIdentifier existingIdentifier = jsonMapper.readValue(Iterables.getOnlyElement(existingBytes), SegmentIdentifier.class);
                if (existingIdentifier.getInterval().getStartMillis() == interval.getStartMillis() && existingIdentifier.getInterval().getEndMillis() == interval.getEndMillis()) {
                    log.info("Found existing pending segment [%s] for sequence[%s] (previous = [%s]) in DB", existingIdentifier.getIdentifierAsString(), sequenceName, previousSegmentIdNotNull);
                    return existingIdentifier;
                } else {
                    log.warn("Cannot use existing pending segment [%s] for sequence[%s] (previous = [%s]) in DB, " + "does not match requested interval[%s]", existingIdentifier.getIdentifierAsString(), sequenceName, previousSegmentIdNotNull, interval);
                    return null;
                }
            }
            // Make up a pending segment based on existing segments and pending segments in the DB. This works
            // assuming that all tasks inserting segments at a particular point in time are going through the
            // allocatePendingSegment flow. This should be assured through some other mechanism (like task locks).
            final SegmentIdentifier newIdentifier;
            final List<TimelineObjectHolder<String, DataSegment>> existingChunks = getTimelineForIntervalsWithHandle(handle, dataSource, ImmutableList.of(interval)).lookup(interval);
            if (existingChunks.size() > 1) {
                // Not possible to expand more than one chunk with a single segment.
                log.warn("Cannot allocate new segment for dataSource[%s], interval[%s], maxVersion[%s]: already have [%,d] chunks.", dataSource, interval, maxVersion, existingChunks.size());
                return null;
            } else {
                SegmentIdentifier max = null;
                if (!existingChunks.isEmpty()) {
                    TimelineObjectHolder<String, DataSegment> existingHolder = Iterables.getOnlyElement(existingChunks);
                    for (PartitionChunk<DataSegment> existing : existingHolder.getObject()) {
                        if (max == null || max.getShardSpec().getPartitionNum() < existing.getObject().getShardSpec().getPartitionNum()) {
                            max = SegmentIdentifier.fromDataSegment(existing.getObject());
                        }
                    }
                }
                final List<SegmentIdentifier> pendings = getPendingSegmentsForIntervalWithHandle(handle, dataSource, interval);
                for (SegmentIdentifier pending : pendings) {
                    if (max == null || pending.getVersion().compareTo(max.getVersion()) > 0 || (pending.getVersion().equals(max.getVersion()) && pending.getShardSpec().getPartitionNum() > max.getShardSpec().getPartitionNum())) {
                        max = pending;
                    }
                }
                if (max == null) {
                    newIdentifier = new SegmentIdentifier(dataSource, interval, maxVersion, new NumberedShardSpec(0, 0));
                } else if (!max.getInterval().equals(interval) || max.getVersion().compareTo(maxVersion) > 0) {
                    log.warn("Cannot allocate new segment for dataSource[%s], interval[%s], maxVersion[%s]: conflicting segment[%s].", dataSource, interval, maxVersion, max.getIdentifierAsString());
                    return null;
                } else if (max.getShardSpec() instanceof LinearShardSpec) {
                    newIdentifier = new SegmentIdentifier(dataSource, max.getInterval(), max.getVersion(), new LinearShardSpec(max.getShardSpec().getPartitionNum() + 1));
                } else if (max.getShardSpec() instanceof NumberedShardSpec) {
                    newIdentifier = new SegmentIdentifier(dataSource, max.getInterval(), max.getVersion(), new NumberedShardSpec(max.getShardSpec().getPartitionNum() + 1, ((NumberedShardSpec) max.getShardSpec()).getPartitions()));
                } else {
                    log.warn("Cannot allocate new segment for dataSource[%s], interval[%s], maxVersion[%s]: ShardSpec class[%s] used by [%s].", dataSource, interval, maxVersion, max.getShardSpec().getClass(), max.getIdentifierAsString());
                    return null;
                }
            }
            // SELECT -> INSERT can fail due to races; callers must be prepared to retry.
            // Avoiding ON DUPLICATE KEY since it's not portable.
            // Avoiding try/catch since it may cause inadvertent transaction-splitting.
            // UNIQUE key for the row, ensuring sequences do not fork in two directions.
            // Using a single column instead of (sequence_name, sequence_prev_id) as some MySQL storage engines
            // have difficulty with large unique keys (see https://github.com/druid-io/druid/issues/2319)
            final String sequenceNamePrevIdSha1 = BaseEncoding.base16().encode(
                Hashing.sha1()
                    .newHasher()
                    .putBytes(StringUtils.toUtf8(sequenceName))
                    .putByte((byte) 0xff)
                    .putBytes(StringUtils.toUtf8(previousSegmentIdNotNull))
                    .hash()
                    .asBytes()
            );
            handle.createStatement(
                String.format(
                    "INSERT INTO %1$s (id, dataSource, created_date, start, %2$send%2$s, sequence_name, sequence_prev_id, sequence_name_prev_id_sha1, payload) "
                        + "VALUES (:id, :dataSource, :created_date, :start, :end, :sequence_name, :sequence_prev_id, :sequence_name_prev_id_sha1, :payload)",
                    dbTables.getPendingSegmentsTable(),
                    connector.getQuoteString()
                )
            )
                .bind("id", newIdentifier.getIdentifierAsString())
                .bind("dataSource", dataSource)
                .bind("created_date", new DateTime().toString())
                .bind("start", interval.getStart().toString())
                .bind("end", interval.getEnd().toString())
                .bind("sequence_name", sequenceName)
                .bind("sequence_prev_id", previousSegmentIdNotNull)
                .bind("sequence_name_prev_id_sha1", sequenceNamePrevIdSha1)
                .bind("payload", jsonMapper.writeValueAsBytes(newIdentifier))
                .execute();
            log.info("Allocated pending segment [%s] for sequence[%s] (previous = [%s]) in DB", newIdentifier.getIdentifierAsString(), sequenceName, previousSegmentIdNotNull);
            return newIdentifier;
        }
    }, ALLOCATE_SEGMENT_QUIET_TRIES, SQLMetadataConnector.DEFAULT_MAX_TRIES);
}
Also used : SegmentIdentifier(io.druid.segment.realtime.appenderator.SegmentIdentifier) LinearShardSpec(io.druid.timeline.partition.LinearShardSpec) TransactionStatus(org.skife.jdbi.v2.TransactionStatus) DataSegment(io.druid.timeline.DataSegment) SQLException(java.sql.SQLException) IOException(java.io.IOException) CallbackFailedException(org.skife.jdbi.v2.exceptions.CallbackFailedException) DateTime(org.joda.time.DateTime) Handle(org.skife.jdbi.v2.Handle) TimelineObjectHolder(io.druid.timeline.TimelineObjectHolder) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) PartitionChunk(io.druid.timeline.partition.PartitionChunk) NumberedShardSpec(io.druid.timeline.partition.NumberedShardSpec)
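The branch that touches LinearShardSpec is the partition bump near the end of the transaction. Condensed into a hypothetical standalone helper (nextShardSpec is not a real method of IndexerSQLMetadataStorageCoordinator; ShardSpec here is io.druid.timeline.partition.ShardSpec), the rule is: take the shard spec of the highest existing partition and advance its partition number by one, preserving the spec type.

// Hypothetical helper mirroring the allocation logic above.
static ShardSpec nextShardSpec(ShardSpec maxSpec) {
    if (maxSpec instanceof LinearShardSpec) {
        // Linear specs only need the next partition number.
        return new LinearShardSpec(maxSpec.getPartitionNum() + 1);
    } else if (maxSpec instanceof NumberedShardSpec) {
        // Numbered specs also carry the declared partition count, which is kept as-is.
        return new NumberedShardSpec(maxSpec.getPartitionNum() + 1, ((NumberedShardSpec) maxSpec).getPartitions());
    } else {
        // Any other ShardSpec type cannot be extended; the coordinator logs a warning and gives up.
        return null;
    }
}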

Example 18 with LinearShardSpec

Use of io.druid.timeline.partition.LinearShardSpec in project hive by apache.

From the class DruidRecordWriter, method getSegmentIdentifierAndMaybePush:

/**
 * Computes the segment identifier and may push the currently open segment.
 * The push occurs if the max size is reached or the event belongs to the next interval.
 * Note that this function assumes that timestamps are pseudo-sorted (the sorting is done
 * by the previous stage), so it closes the open segment and moves on to the next segment
 * granularity as soon as an event from the next interval appears.
 *
 * @return the SegmentIdentifier for the truncatedTime, possibly after pushing the current open segment.
 */
private SegmentIdentifier getSegmentIdentifierAndMaybePush(long truncatedTime) {
    final Interval interval = new Interval(new DateTime(truncatedTime), segmentGranularity.increment(new DateTime(truncatedTime)));
    SegmentIdentifier retVal;
    if (currentOpenSegment == null) {
        currentOpenSegment = new SegmentIdentifier(dataSchema.getDataSource(), interval, tuningConfig.getVersioningPolicy().getVersion(interval), new LinearShardSpec(0));
        return currentOpenSegment;
    } else if (currentOpenSegment.getInterval().equals(interval)) {
        retVal = currentOpenSegment;
        int rowCount = appenderator.getRowCount(retVal);
        if (rowCount < maxPartitionSize) {
            return retVal;
        } else {
            retVal = new SegmentIdentifier(dataSchema.getDataSource(), interval, tuningConfig.getVersioningPolicy().getVersion(interval), new LinearShardSpec(currentOpenSegment.getShardSpec().getPartitionNum() + 1));
            pushSegments(Lists.newArrayList(currentOpenSegment));
            LOG.info("Creating new partition for segment {}, partition num {}", retVal.getIdentifierAsString(), retVal.getShardSpec().getPartitionNum());
            currentOpenSegment = retVal;
            return retVal;
        }
    } else {
        retVal = new SegmentIdentifier(dataSchema.getDataSource(), interval, tuningConfig.getVersioningPolicy().getVersion(interval), new LinearShardSpec(0));
        pushSegments(Lists.newArrayList(currentOpenSegment));
        LOG.info("Creating segment {}", retVal.getIdentifierAsString());
        currentOpenSegment = retVal;
        return retVal;
    }
}
Also used : SegmentIdentifier(io.druid.segment.realtime.appenderator.SegmentIdentifier) LinearShardSpec(io.druid.timeline.partition.LinearShardSpec) DateTime(org.joda.time.DateTime) Interval(org.joda.time.Interval)
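To make the Javadoc concrete, here is a hedged sketch of the two identifiers produced when a segment in the same interval fills up (the dataSource "wikipedia", version "v1" and the interval are placeholder values, not taken from the writer above):

// Same dataSource, interval and version; only the LinearShardSpec partition number advances.
SegmentIdentifier first = new SegmentIdentifier("wikipedia", interval, "v1", new LinearShardSpec(0));
SegmentIdentifier next = new SegmentIdentifier(
    "wikipedia",
    interval,
    "v1",
    new LinearShardSpec(first.getShardSpec().getPartitionNum() + 1)
);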

Example 19 with LinearShardSpec

Use of io.druid.timeline.partition.LinearShardSpec in project hive by apache.

From the class DruidRecordWriter, method write:

@Override
public void write(Writable w) throws IOException {
    DruidWritable record = (DruidWritable) w;
    final long timestamp = (long) record.getValue().get(DruidStorageHandlerUtils.DEFAULT_TIMESTAMP_COLUMN);
    final long truncatedTime = (long) record.getValue().get(Constants.DRUID_TIMESTAMP_GRANULARITY_COL_NAME);
    final int partitionNumber = Math.toIntExact((long) record.getValue().getOrDefault(Constants.DRUID_SHARD_KEY_COL_NAME, -1l));
    final InputRow inputRow = new MapBasedInputRow(timestamp, dataSchema.getParser().getParseSpec().getDimensionsSpec().getDimensionNames(), record.getValue());
    try {
        if (partitionNumber != -1 && maxPartitionSize == -1) {
            final Interval interval = new Interval(new DateTime(truncatedTime), segmentGranularity.increment(new DateTime(truncatedTime)));
            if (currentOpenSegment != null) {
                if (currentOpenSegment.getShardSpec().getPartitionNum() != partitionNumber || !currentOpenSegment.getInterval().equals(interval)) {
                    pushSegments(ImmutableList.of(currentOpenSegment));
                    currentOpenSegment = new SegmentIdentifier(dataSchema.getDataSource(), interval, tuningConfig.getVersioningPolicy().getVersion(interval), new LinearShardSpec(partitionNumber));
                }
            } else if (currentOpenSegment == null) {
                currentOpenSegment = new SegmentIdentifier(dataSchema.getDataSource(), interval, tuningConfig.getVersioningPolicy().getVersion(interval), new LinearShardSpec(partitionNumber));
            }
            appenderator.add(currentOpenSegment, inputRow, committerSupplier);
        } else if (partitionNumber == -1 && maxPartitionSize != -1) {
            appenderator.add(getSegmentIdentifierAndMaybePush(truncatedTime), inputRow, committerSupplier);
        } else {
            throw new IllegalArgumentException(String.format("partitionNumber and  maxPartitionSize should be mutually exclusive got partitionNum [%s] and maxPartitionSize [%s]", partitionNumber, maxPartitionSize));
        }
    } catch (SegmentNotWritableException e) {
        throw new IOException(e);
    }
}
Also used : DruidWritable(org.apache.hadoop.hive.druid.serde.DruidWritable) SegmentIdentifier(io.druid.segment.realtime.appenderator.SegmentIdentifier) SegmentNotWritableException(io.druid.segment.realtime.appenderator.SegmentNotWritableException) LinearShardSpec(io.druid.timeline.partition.LinearShardSpec) MapBasedInputRow(io.druid.data.input.MapBasedInputRow) InputRow(io.druid.data.input.InputRow) MapBasedInputRow(io.druid.data.input.MapBasedInputRow) IOException(java.io.IOException) DateTime(org.joda.time.DateTime) Interval(org.joda.time.Interval)
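The branch conditions encode the contract that exactly one of the two partitioning modes is active. Restated as a hypothetical standalone check (validatePartitioning is not a method of DruidRecordWriter; it only isolates the rule used above):

static void validatePartitioning(int partitionNumber, int maxPartitionSize) {
    // Either the previous stage pre-assigned a partition number and the size limit is disabled ...
    boolean explicitPartition = partitionNumber != -1 && maxPartitionSize == -1;
    // ... or the writer rolls segments over itself once maxPartitionSize rows are reached.
    boolean sizeBasedRollover = partitionNumber == -1 && maxPartitionSize != -1;
    if (!explicitPartition && !sizeBasedRollover) {
        throw new IllegalArgumentException("partitionNumber and maxPartitionSize should be mutually exclusive");
    }
}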

Example 20 with LinearShardSpec

Use of io.druid.timeline.partition.LinearShardSpec in project hive by apache.

From the class TestDruidStorageHandler, method testCommitInsertIntoTable:

@Test
public void testCommitInsertIntoTable() throws MetaException, IOException {
    DerbyConnectorTestUtility connector = derbyConnectorRule.getConnector();
    MetadataStorageTablesConfig metadataStorageTablesConfig = derbyConnectorRule.metadataTablesConfigSupplier().get();
    druidStorageHandler.preCreateTable(tableMock);
    LocalFileSystem localFileSystem = FileSystem.getLocal(config);
    Path taskDirPath = new Path(tableWorkingPath, druidStorageHandler.makeStagingName());
    List<DataSegment> existingSegments = Arrays.asList(createSegment(new Path(taskDirPath, DruidStorageHandlerUtils.INDEX_ZIP).toString(), new Interval(100, 150, DateTimeZone.UTC), "v0", new LinearShardSpec(1)));
    HdfsDataSegmentPusherConfig pusherConfig = new HdfsDataSegmentPusherConfig();
    pusherConfig.setStorageDirectory(config.get(String.valueOf(HiveConf.ConfVars.DRUID_SEGMENT_DIRECTORY)));
    DataSegmentPusher dataSegmentPusher = new HdfsDataSegmentPusher(pusherConfig, config, DruidStorageHandlerUtils.JSON_MAPPER);
    DruidStorageHandlerUtils.publishSegmentsAndCommit(connector, metadataStorageTablesConfig, DATA_SOURCE_NAME, existingSegments, true, config, dataSegmentPusher);
    DataSegment dataSegment = createSegment(new Path(taskDirPath, DruidStorageHandlerUtils.INDEX_ZIP).toString(), new Interval(100, 150, DateTimeZone.UTC), "v1", new LinearShardSpec(0));
    Path descriptorPath = DruidStorageHandlerUtils.makeSegmentDescriptorOutputPath(dataSegment, new Path(taskDirPath, DruidStorageHandler.SEGMENTS_DESCRIPTOR_DIR_NAME));
    DruidStorageHandlerUtils.writeSegmentDescriptor(localFileSystem, dataSegment, descriptorPath);
    druidStorageHandler.commitInsertTable(tableMock, false);
    Assert.assertArrayEquals(Lists.newArrayList(DATA_SOURCE_NAME).toArray(), Lists.newArrayList(DruidStorageHandlerUtils.getAllDataSourceNames(connector, metadataStorageTablesConfig)).toArray());
    final List<DataSegment> dataSegmentList = getUsedSegmentsList(connector, metadataStorageTablesConfig);
    Assert.assertEquals(2, dataSegmentList.size());
    DataSegment persistedSegment = dataSegmentList.get(1);
    // Insert into appends to old version
    Assert.assertEquals("v0", persistedSegment.getVersion());
    Assert.assertTrue(persistedSegment.getShardSpec() instanceof LinearShardSpec);
    Assert.assertEquals(2, persistedSegment.getShardSpec().getPartitionNum());
    Path expectedFinalHadoopPath = new Path(dataSegmentPusher.getPathForHadoop(), dataSegmentPusher.makeIndexPathName(persistedSegment, DruidStorageHandlerUtils.INDEX_ZIP));
    Assert.assertEquals(ImmutableMap.of("type", "hdfs", "path", expectedFinalHadoopPath.toString()), persistedSegment.getLoadSpec());
    Assert.assertEquals("dummySegmentData", FileUtils.readFileToString(new File(expectedFinalHadoopPath.toUri())));
}
Also used : Path(org.apache.hadoop.fs.Path) MetadataStorageTablesConfig(io.druid.metadata.MetadataStorageTablesConfig) HdfsDataSegmentPusher(io.druid.storage.hdfs.HdfsDataSegmentPusher) DataSegmentPusher(io.druid.segment.loading.DataSegmentPusher) LocalFileSystem(org.apache.hadoop.fs.LocalFileSystem) LinearShardSpec(io.druid.timeline.partition.LinearShardSpec) HdfsDataSegmentPusherConfig(io.druid.storage.hdfs.HdfsDataSegmentPusherConfig) DataSegment(io.druid.timeline.DataSegment) HdfsDataSegmentPusher(io.druid.storage.hdfs.HdfsDataSegmentPusher) File(java.io.File) Interval(org.joda.time.Interval) Test(org.junit.Test)
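A note on the expected version and partition number (a reading of the assertions, assuming the append semantics exercised by commitInsertTable with overwrite = false): the pre-existing segment was published as version "v0" with LinearShardSpec(1), so an INSERT INTO append stays in that "v0" version and takes the next linear partition number, rather than keeping the "v1"/partition 0 identity of the staged descriptor.

// Sketch of the expected bump: appending after LinearShardSpec(1) yields partition 2.
int expectedPartition = new LinearShardSpec(1).getPartitionNum() + 1; // 2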

Aggregations

LinearShardSpec (io.druid.timeline.partition.LinearShardSpec): 21
Interval (org.joda.time.Interval): 17
DataSegment (io.druid.timeline.DataSegment): 12
Test (org.junit.Test): 12
DataSegmentPusher (io.druid.segment.loading.DataSegmentPusher): 8
File (java.io.File): 8
MetadataStorageTablesConfig (io.druid.metadata.MetadataStorageTablesConfig): 7
HdfsDataSegmentPusher (io.druid.storage.hdfs.HdfsDataSegmentPusher): 7
HdfsDataSegmentPusherConfig (io.druid.storage.hdfs.HdfsDataSegmentPusherConfig): 7
LocalFileSystem (org.apache.hadoop.fs.LocalFileSystem): 7
Path (org.apache.hadoop.fs.Path): 7
CountAggregatorFactory (io.druid.query.aggregation.CountAggregatorFactory): 5
SegmentIdentifier (io.druid.segment.realtime.appenderator.SegmentIdentifier): 5
AggregatorFactory (io.druid.query.aggregation.AggregatorFactory): 4
IOException (java.io.IOException): 4
LongSumAggregatorFactory (io.druid.query.aggregation.LongSumAggregatorFactory): 3
QueryableIndex (io.druid.segment.QueryableIndex): 3
DateTime (org.joda.time.DateTime): 3
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper): 2
InputRow (io.druid.data.input.InputRow): 2