Example 71 with DataSchema

Use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.

From the class InputSourceSampler, the method sample:

// inputFormat can be null only if inputSource.needsFormat() = false or parser is specified.
public SamplerResponse sample(
    final InputSource inputSource,
    @Nullable final InputFormat inputFormat,
    @Nullable final DataSchema dataSchema,
    @Nullable final SamplerConfig samplerConfig
) {
    Preconditions.checkNotNull(inputSource, "inputSource required");
    if (inputSource.needsFormat()) {
        Preconditions.checkNotNull(inputFormat, "inputFormat required");
    }
    final DataSchema nonNullDataSchema = dataSchema == null ? DEFAULT_DATA_SCHEMA : dataSchema;
    final SamplerConfig nonNullSamplerConfig = samplerConfig == null ? SamplerConfig.empty() : samplerConfig;
    final Closer closer = Closer.create();
    final File tempDir = FileUtils.createTempDir();
    closer.register(() -> FileUtils.deleteDirectory(tempDir));
    try {
        final InputSourceReader reader = buildReader(nonNullSamplerConfig, nonNullDataSchema, inputSource, inputFormat, tempDir);
        try (final CloseableIterator<InputRowListPlusRawValues> iterator = reader.sample();
            final IncrementalIndex index = buildIncrementalIndex(nonNullSamplerConfig, nonNullDataSchema);
            final Closer closer1 = closer) {
            List<SamplerResponseRow> responseRows = new ArrayList<>(nonNullSamplerConfig.getNumRows());
            int numRowsIndexed = 0;
            while (responseRows.size() < nonNullSamplerConfig.getNumRows() && iterator.hasNext()) {
                final InputRowListPlusRawValues inputRowListPlusRawValues = iterator.next();
                final List<Map<String, Object>> rawColumnsList = inputRowListPlusRawValues.getRawValuesList();
                final ParseException parseException = inputRowListPlusRawValues.getParseException();
                if (parseException != null) {
                    if (rawColumnsList != null) {
                        // add all rows to response
                        responseRows.addAll(
                            rawColumnsList.stream()
                                          .map(rawColumns -> new SamplerResponseRow(rawColumns, null, true, parseException.getMessage()))
                                          .collect(Collectors.toList())
                        );
                    } else {
                        // no data parsed, add one response row
                        responseRows.add(new SamplerResponseRow(null, null, true, parseException.getMessage()));
                    }
                    continue;
                }
                List<InputRow> inputRows = inputRowListPlusRawValues.getInputRows();
                if (inputRows == null) {
                    continue;
                }
                for (int i = 0; i < inputRows.size(); i++) {
                    // InputRowListPlusRawValues guarantees the size of rawColumnsList and inputRows are the same
                    Map<String, Object> rawColumns = rawColumnsList == null ? null : rawColumnsList.get(i);
                    InputRow row = inputRows.get(i);
                    // keep the index of the row to be added to responseRows for further use
                    final int rowIndex = responseRows.size();
                    IncrementalIndexAddResult addResult = index.add(new SamplerInputRow(row, rowIndex), true);
                    if (addResult.hasParseException()) {
                        responseRows.add(new SamplerResponseRow(rawColumns, null, true, addResult.getParseException().getMessage()));
                    } else {
                        // store the raw value; will be merged with the data from the IncrementalIndex later
                        responseRows.add(new SamplerResponseRow(rawColumns, null, null, null));
                        numRowsIndexed++;
                    }
                }
            }
            final List<String> columnNames = index.getColumnNames();
            columnNames.remove(SamplerInputRow.SAMPLER_ORDERING_COLUMN);
            for (Row row : index) {
                Map<String, Object> parsed = new LinkedHashMap<>();
                parsed.put(ColumnHolder.TIME_COLUMN_NAME, row.getTimestampFromEpoch());
                columnNames.forEach(k -> parsed.put(k, row.getRaw(k)));
                Number sortKey = row.getMetric(SamplerInputRow.SAMPLER_ORDERING_COLUMN);
                if (sortKey != null) {
                    responseRows.set(sortKey.intValue(), responseRows.get(sortKey.intValue()).withParsed(parsed));
                }
            }
            // make sure size of responseRows meets the input
            if (responseRows.size() > nonNullSamplerConfig.getNumRows()) {
                responseRows = responseRows.subList(0, nonNullSamplerConfig.getNumRows());
            }
            int numRowsRead = responseRows.size();
            return new SamplerResponse(
                numRowsRead,
                numRowsIndexed,
                responseRows.stream()
                            .filter(Objects::nonNull)
                            .filter(x -> x.getParsed() != null || x.isUnparseable() != null)
                            .collect(Collectors.toList())
            );
        }
    } catch (Exception e) {
        throw new SamplerException(e, "Failed to sample data: %s", e.getMessage());
    }
}
Also used : ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) File(java.io.File) Closer(org.apache.druid.java.util.common.io.Closer) InputRowListPlusRawValues(org.apache.druid.data.input.InputRowListPlusRawValues) InputRow(org.apache.druid.data.input.InputRow) Row(org.apache.druid.data.input.Row) InputSourceReader(org.apache.druid.data.input.InputSourceReader) TimedShutoffInputSourceReader(org.apache.druid.data.input.impl.TimedShutoffInputSourceReader) IncrementalIndex(org.apache.druid.segment.incremental.IncrementalIndex) OnheapIncrementalIndex(org.apache.druid.segment.incremental.OnheapIncrementalIndex) IncrementalIndexAddResult(org.apache.druid.segment.incremental.IncrementalIndexAddResult) SamplerResponse(org.apache.druid.client.indexing.SamplerResponse) SamplerResponseRow(org.apache.druid.client.indexing.SamplerResponse.SamplerResponseRow) ParseException(org.apache.druid.java.util.common.parsers.ParseException) DataSchema(org.apache.druid.segment.indexing.DataSchema)
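For orientation, here is a minimal calling sketch for the sample() method above. It is not taken from the Druid sources: the InputSourceSampler constructor and SamplerConfig defaults have changed across Druid versions, so the wiring below is an assumption; only the sample(...) signature comes from the example itself.

// Hypothetical driver code, assuming a no-arg InputSourceSampler constructor.
InputSource inputSource = new InlineInputSource("{\"t\":\"2013-08-31T01:02:33Z\",\"dim1\":\"x\"}\n");
// JsonInputFormat(flattenSpec, featureSpec, keepNullColumns), as in this Druid version
InputFormat inputFormat = new JsonInputFormat(null, null, null);
InputSourceSampler sampler = new InputSourceSampler();
// Passing null for dataSchema and samplerConfig exercises the DEFAULT_DATA_SCHEMA
// and SamplerConfig.empty() fallbacks shown at the top of sample().
SamplerResponse response = sampler.sample(inputSource, inputFormat, null, null);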

Example 72 with DataSchema

Use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.

From the class SeekableStreamSupervisorSpecTest, the method getDataSchema:

private static DataSchema getDataSchema() {
    List<DimensionSchema> dimensions = new ArrayList<>();
    dimensions.add(StringDimensionSchema.create("dim1"));
    dimensions.add(StringDimensionSchema.create("dim2"));
    return new DataSchema(
        DATASOURCE,
        new TimestampSpec("timestamp", "iso", null),
        new DimensionsSpec(dimensions),
        new AggregatorFactory[] { new CountAggregatorFactory("rows") },
        new UniformGranularitySpec(Granularities.HOUR, Granularities.NONE, ImmutableList.of()),
        null
    );
}
Also used : DataSchema(org.apache.druid.segment.indexing.DataSchema) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) CountAggregatorFactory(org.apache.druid.query.aggregation.CountAggregatorFactory) ArrayList(java.util.ArrayList) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) StringDimensionSchema(org.apache.druid.data.input.impl.StringDimensionSchema) DimensionSchema(org.apache.druid.data.input.impl.DimensionSchema)
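As a quick illustration of what this construction amounts to, the schema can be rendered as JSON, where the positional constructor arguments reappear as the named fields of an ingestion spec's "dataSchema" block. This sketch is not part of the test and assumes DataSchema's Jackson serialization in this Druid version.

// Illustrative only: serialize the schema built by getDataSchema().
ObjectMapper mapper = new DefaultObjectMapper();
String json = mapper.writeValueAsString(getDataSchema());
// The output contains "dataSource", "timestampSpec", "dimensionsSpec",
// "metricsSpec" and "granularitySpec" keys mirroring the arguments above.
System.out.println(json);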

Example 73 with DataSchema

Use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.

From the class SeekableStreamSupervisorStateTest, the method getDataSchema:

private static DataSchema getDataSchema() {
    List<DimensionSchema> dimensions = new ArrayList<>();
    dimensions.add(StringDimensionSchema.create("dim1"));
    dimensions.add(StringDimensionSchema.create("dim2"));
    return new DataSchema(
        DATASOURCE,
        new TimestampSpec("timestamp", "iso", null),
        new DimensionsSpec(dimensions),
        new AggregatorFactory[] { new CountAggregatorFactory("rows") },
        new UniformGranularitySpec(Granularities.HOUR, Granularities.NONE, ImmutableList.of()),
        null
    );
}
Also used : DataSchema(org.apache.druid.segment.indexing.DataSchema) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) CountAggregatorFactory(org.apache.druid.query.aggregation.CountAggregatorFactory) ArrayList(java.util.ArrayList) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) StringDimensionSchema(org.apache.druid.data.input.impl.StringDimensionSchema) DimensionSchema(org.apache.druid.data.input.impl.DimensionSchema)

Example 74 with DataSchema

Use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.

From the class TaskAnnouncementTest, the method testBackwardsCompatibleSerde:

@Test
public void testBackwardsCompatibleSerde() throws Exception {
    final Task task = new RealtimeIndexTask(
        "theid",
        new TaskResource("rofl", 2),
        new FireDepartment(
            new DataSchema("foo", null, new AggregatorFactory[0], null, null, new DefaultObjectMapper()),
            new RealtimeIOConfig(
                new LocalFirehoseFactory(new File("lol"), "rofl", null),
                (schema, config, metrics) -> null
            ),
            null
        ),
        null
    );
    final TaskStatus status = TaskStatus.running(task.getId());
    final TaskAnnouncement announcement = TaskAnnouncement.create(task, status, TaskLocation.unknown());
    final String statusJson = jsonMapper.writeValueAsString(status);
    final String announcementJson = jsonMapper.writeValueAsString(announcement);
    final TaskStatus statusFromStatus = jsonMapper.readValue(statusJson, TaskStatus.class);
    final TaskStatus statusFromAnnouncement = jsonMapper.readValue(announcementJson, TaskStatus.class);
    final TaskAnnouncement announcementFromStatus = jsonMapper.readValue(statusJson, TaskAnnouncement.class);
    final TaskAnnouncement announcementFromAnnouncement = jsonMapper.readValue(announcementJson, TaskAnnouncement.class);
    Assert.assertEquals("theid", statusFromStatus.getId());
    Assert.assertEquals("theid", statusFromAnnouncement.getId());
    Assert.assertEquals("theid", announcementFromStatus.getTaskStatus().getId());
    Assert.assertEquals("theid", announcementFromAnnouncement.getTaskStatus().getId());
    Assert.assertEquals("theid", announcementFromStatus.getTaskResource().getAvailabilityGroup());
    Assert.assertEquals("rofl", announcementFromAnnouncement.getTaskResource().getAvailabilityGroup());
    Assert.assertEquals(1, announcementFromStatus.getTaskResource().getRequiredCapacity());
    Assert.assertEquals(2, announcementFromAnnouncement.getTaskResource().getRequiredCapacity());
}
Also used : DataSchema(org.apache.druid.segment.indexing.DataSchema) FireDepartment(org.apache.druid.segment.realtime.FireDepartment) Task(org.apache.druid.indexing.common.task.Task) RealtimeIndexTask(org.apache.druid.indexing.common.task.RealtimeIndexTask) TaskResource(org.apache.druid.indexing.common.task.TaskResource) RealtimeIOConfig(org.apache.druid.segment.indexing.RealtimeIOConfig) DefaultObjectMapper(org.apache.druid.jackson.DefaultObjectMapper) LocalFirehoseFactory(org.apache.druid.segment.realtime.firehose.LocalFirehoseFactory) TaskStatus(org.apache.druid.indexer.TaskStatus) File(java.io.File) Test(org.junit.Test)

Example 75 with DataSchema

Use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.

From the class SeekableStreamIndexTaskRunnerAuthTest, the method setUp:

@Before
public void setUp() {
    // Create an AuthorizerMapper that only allows access to a Datasource resource
    AuthorizerMapper authorizerMapper = new AuthorizerMapper(null) {

        @Override
        public Authorizer getAuthorizer(String name) {
            return (authenticationResult, resource, action) -> {
                final String username = authenticationResult.getIdentity();
                // Allow access only when a Datasource Read user requests Read access,
                // or a Datasource Write user requests Write access
                if (resource.getType().equals(ResourceType.DATASOURCE)) {
                    return new Access((action == Action.READ && username.equals(Users.DATASOURCE_READ)) || (action == Action.WRITE && username.equals(Users.DATASOURCE_WRITE)));
                }
                // Do not allow access to any other resource
                return new Access(false);
            };
        }
    };
    DataSchema dataSchema = new DataSchema(
        "datasource",
        new TimestampSpec(null, null, null),
        new DimensionsSpec(Collections.emptyList()),
        new AggregatorFactory[] {},
        new ArbitraryGranularitySpec(new AllGranularity(), Collections.emptyList()),
        TransformSpec.NONE,
        null,
        null
    );
    SeekableStreamIndexTaskTuningConfig tuningConfig = mock(SeekableStreamIndexTaskTuningConfig.class);
    SeekableStreamIndexTaskIOConfig<String, String> ioConfig = new TestSeekableStreamIndexTaskIOConfig();
    // Initialize the task and task runner
    SeekableStreamIndexTask<String, String, ByteEntity> indexTask = new TestSeekableStreamIndexTask("id", dataSchema, tuningConfig, ioConfig);
    taskRunner = new TestSeekableStreamIndexTaskRunner(indexTask, authorizerMapper);
}
Also used : TaskToolbox(org.apache.druid.indexing.common.TaskToolbox) LockGranularity(org.apache.druid.indexing.common.LockGranularity) StreamPartition(org.apache.druid.indexing.seekablestream.common.StreamPartition) RecordSupplier(org.apache.druid.indexing.seekablestream.common.RecordSupplier) OrderedPartitionableRecord(org.apache.druid.indexing.seekablestream.common.OrderedPartitionableRecord) OrderedSequenceNumber(org.apache.druid.indexing.seekablestream.common.OrderedSequenceNumber) AuthorizerMapper(org.apache.druid.server.security.AuthorizerMapper) Authorizer(org.apache.druid.server.security.Authorizer) AuthenticationResult(org.apache.druid.server.security.AuthenticationResult) AuthConfig(org.apache.druid.server.security.AuthConfig) Access(org.apache.druid.server.security.Access) Action(org.apache.druid.server.security.Action) ResourceType(org.apache.druid.server.security.ResourceType) ForbiddenException(org.apache.druid.server.security.ForbiddenException) DataSchema(org.apache.druid.segment.indexing.DataSchema) ArbitraryGranularitySpec(org.apache.druid.segment.indexing.granularity.ArbitraryGranularitySpec) AllGranularity(org.apache.druid.java.util.common.granularity.AllGranularity) DateTimes(org.apache.druid.java.util.common.DateTimes) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat) ByteEntity(org.apache.druid.data.input.impl.ByteEntity) AggregatorFactory(org.apache.druid.query.aggregation.AggregatorFactory) TransformSpec(org.apache.druid.segment.transform.TransformSpec) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) TypeReference(com.fasterxml.jackson.core.type.TypeReference) EasyMock(org.easymock.EasyMock) EasyMock.mock(org.easymock.EasyMock.mock) EasyMock.replay(org.easymock.EasyMock.replay) Before(org.junit.Before) Test(org.junit.Test) Rule(org.junit.Rule) ExpectedException(org.junit.rules.ExpectedException) HttpServletRequest(javax.servlet.http.HttpServletRequest) Nonnull(javax.annotation.Nonnull) Nullable(javax.annotation.Nullable) Map(java.util.Map) TreeMap(java.util.TreeMap) Set(java.util.Set) List(java.util.List) Collections(java.util.Collections) Consumer(java.util.function.Consumer)
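To make the mapper's behavior concrete, here is a hedged sketch of how the anonymous Authorizer above resolves requests. It is illustrative only; the AuthenticationResult and Resource constructors are assumed from the same Druid version and are not part of the original test.

// Hypothetical usage of the authorizerMapper built in setUp().
Authorizer authorizer = authorizerMapper.getAuthorizer("ignored");
AuthenticationResult readUser = new AuthenticationResult(Users.DATASOURCE_READ, "authorizerName", null, null);
Resource datasource = new Resource("datasource", ResourceType.DATASOURCE);
// Read access by the read user on a DATASOURCE resource is allowed...
Access allowed = authorizer.authorize(readUser, datasource, Action.READ);
// ...while Write by the same user, or any access to other resource types, is denied.
Access denied = authorizer.authorize(readUser, datasource, Action.WRITE);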

Aggregations

DataSchema (org.apache.druid.segment.indexing.DataSchema): 80
UniformGranularitySpec (org.apache.druid.segment.indexing.granularity.UniformGranularitySpec): 49
TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec): 45
Test (org.junit.Test): 44
DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec): 32
AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory): 25
LongSumAggregatorFactory (org.apache.druid.query.aggregation.LongSumAggregatorFactory): 22
GranularitySpec (org.apache.druid.segment.indexing.granularity.GranularitySpec): 19
InputSource (org.apache.druid.data.input.InputSource): 17
InitializedNullHandlingTest (org.apache.druid.testing.InitializedNullHandlingTest): 17
File (java.io.File): 16
Map (java.util.Map): 15
InputFormat (org.apache.druid.data.input.InputFormat): 15
CountAggregatorFactory (org.apache.druid.query.aggregation.CountAggregatorFactory): 15
SamplerResponse (org.apache.druid.client.indexing.SamplerResponse): 14
SamplerResponseRow (org.apache.druid.client.indexing.SamplerResponse.SamplerResponseRow): 13
CsvInputFormat (org.apache.druid.data.input.impl.CsvInputFormat): 13
Interval (org.joda.time.Interval): 13
ArrayList (java.util.ArrayList): 12
JsonInputFormat (org.apache.druid.data.input.impl.JsonInputFormat): 12