use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
the class InputSourceSampler method sample.
public SamplerResponse sample(
    final InputSource inputSource,
    // inputFormat can be null only if inputSource.needsFormat() = false or parser is specified.
    @Nullable final InputFormat inputFormat,
    @Nullable final DataSchema dataSchema,
    @Nullable final SamplerConfig samplerConfig
) {
Preconditions.checkNotNull(inputSource, "inputSource required");
if (inputSource.needsFormat()) {
Preconditions.checkNotNull(inputFormat, "inputFormat required");
}
final DataSchema nonNullDataSchema = dataSchema == null ? DEFAULT_DATA_SCHEMA : dataSchema;
final SamplerConfig nonNullSamplerConfig = samplerConfig == null ? SamplerConfig.empty() : samplerConfig;
final Closer closer = Closer.create();
final File tempDir = FileUtils.createTempDir();
closer.register(() -> FileUtils.deleteDirectory(tempDir));
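// Deleting the temp directory is registered on the closer, which is closed (as closer1) in the try-with-resources block below.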
try {
final InputSourceReader reader = buildReader(nonNullSamplerConfig, nonNullDataSchema, inputSource, inputFormat, tempDir);
try (final CloseableIterator<InputRowListPlusRawValues> iterator = reader.sample();
final IncrementalIndex index = buildIncrementalIndex(nonNullSamplerConfig, nonNullDataSchema);
final Closer closer1 = closer) {
List<SamplerResponseRow> responseRows = new ArrayList<>(nonNullSamplerConfig.getNumRows());
int numRowsIndexed = 0;
while (responseRows.size() < nonNullSamplerConfig.getNumRows() && iterator.hasNext()) {
final InputRowListPlusRawValues inputRowListPlusRawValues = iterator.next();
final List<Map<String, Object>> rawColumnsList = inputRowListPlusRawValues.getRawValuesList();
final ParseException parseException = inputRowListPlusRawValues.getParseException();
if (parseException != null) {
if (rawColumnsList != null) {
// add all rows to response
responseRows.addAll(
    rawColumnsList.stream()
                  .map(rawColumns -> new SamplerResponseRow(rawColumns, null, true, parseException.getMessage()))
                  .collect(Collectors.toList())
);
} else {
// no data parsed, add one response row
responseRows.add(new SamplerResponseRow(null, null, true, parseException.getMessage()));
}
continue;
}
List<InputRow> inputRows = inputRowListPlusRawValues.getInputRows();
if (inputRows == null) {
continue;
}
for (int i = 0; i < inputRows.size(); i++) {
// InputRowListPlusRawValues guarantees that rawColumnsList and inputRows have the same size
Map<String, Object> rawColumns = rawColumnsList == null ? null : rawColumnsList.get(i);
InputRow row = inputRows.get(i);
// keep the index of the row to be added to responseRows for further use
final int rowIndex = responseRows.size();
IncrementalIndexAddResult addResult = index.add(new SamplerInputRow(row, rowIndex), true);
if (addResult.hasParseException()) {
responseRows.add(new SamplerResponseRow(rawColumns, null, true, addResult.getParseException().getMessage()));
} else {
// store the raw value; will be merged with the data from the IncrementalIndex later
responseRows.add(new SamplerResponseRow(rawColumns, null, null, null));
numRowsIndexed++;
}
}
}
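// Second pass: walk the rows that were indexed into the IncrementalIndex and attach their parsed
// values back onto the matching response rows, using the sampler ordering column (which stores
// each row's original index in responseRows) as the lookup key.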
final List<String> columnNames = index.getColumnNames();
columnNames.remove(SamplerInputRow.SAMPLER_ORDERING_COLUMN);
for (Row row : index) {
Map<String, Object> parsed = new LinkedHashMap<>();
parsed.put(ColumnHolder.TIME_COLUMN_NAME, row.getTimestampFromEpoch());
columnNames.forEach(k -> parsed.put(k, row.getRaw(k)));
Number sortKey = row.getMetric(SamplerInputRow.SAMPLER_ORDERING_COLUMN);
if (sortKey != null) {
responseRows.set(sortKey.intValue(), responseRows.get(sortKey.intValue()).withParsed(parsed));
}
}
// make sure the number of response rows does not exceed the requested sample size
if (responseRows.size() > nonNullSamplerConfig.getNumRows()) {
responseRows = responseRows.subList(0, nonNullSamplerConfig.getNumRows());
}
int numRowsRead = responseRows.size();
return new SamplerResponse(
    numRowsRead,
    numRowsIndexed,
    responseRows.stream()
                .filter(Objects::nonNull)
                .filter(x -> x.getParsed() != null || x.isUnparseable() != null)
                .collect(Collectors.toList())
);
}
} catch (Exception e) {
throw new SamplerException(e, "Failed to sample data: %s", e.getMessage());
}
}
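For context, a minimal sketch of calling this sampler might look like the following. It is illustrative only: the inline JSON record is made up, and the no-argument InputSourceSampler constructor and the three-argument JsonInputFormat constructor are assumptions about the Druid version at hand. Passing null for dataSchema and samplerConfig falls back to DEFAULT_DATA_SCHEMA and SamplerConfig.empty(), as shown above.
InputSource inputSource = new InlineInputSource("{\"timestamp\": \"2021-01-01T00:00:00Z\", \"dim1\": \"a\"}");
InputFormat inputFormat = new JsonInputFormat(null, null, null); // assumed constructor arity for this version
InputSourceSampler sampler = new InputSourceSampler();           // assumed no-arg constructor
SamplerResponse response = sampler.sample(inputSource, inputFormat, null, null);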
use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
the class SeekableStreamSupervisorSpecTest method getDataSchema.
private static DataSchema getDataSchema() {
List<DimensionSchema> dimensions = new ArrayList<>();
dimensions.add(StringDimensionSchema.create("dim1"));
dimensions.add(StringDimensionSchema.create("dim2"));
return new DataSchema(
    DATASOURCE,
    new TimestampSpec("timestamp", "iso", null),
    new DimensionsSpec(dimensions),
    new AggregatorFactory[] { new CountAggregatorFactory("rows") },
    new UniformGranularitySpec(Granularities.HOUR, Granularities.NONE, ImmutableList.of()),
    null
);
}
use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
the class SeekableStreamSupervisorStateTest method getDataSchema.
private static DataSchema getDataSchema() {
List<DimensionSchema> dimensions = new ArrayList<>();
dimensions.add(StringDimensionSchema.create("dim1"));
dimensions.add(StringDimensionSchema.create("dim2"));
return new DataSchema(
    DATASOURCE,
    new TimestampSpec("timestamp", "iso", null),
    new DimensionsSpec(dimensions),
    new AggregatorFactory[] { new CountAggregatorFactory("rows") },
    new UniformGranularitySpec(Granularities.HOUR, Granularities.NONE, ImmutableList.of()),
    null
);
}
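Both getDataSchema helpers above use the same six-argument DataSchema constructor: datasource name, timestamp spec, dimensions spec, aggregator factories, granularity spec, and transform spec (null here). A quick sanity check of the resulting schema could look like this sketch, assuming standard JUnit assertions and the DataSchema getters available in this Druid version:
DataSchema schema = getDataSchema();
Assert.assertEquals(DATASOURCE, schema.getDataSource());
Assert.assertEquals("timestamp", schema.getTimestampSpec().getTimestampColumn());
Assert.assertEquals(ImmutableList.of("dim1", "dim2"), schema.getDimensionsSpec().getDimensionNames());
Assert.assertEquals(1, schema.getAggregators().length);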
use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
the class TaskAnnouncementTest method testBackwardsCompatibleSerde.
@Test
public void testBackwardsCompatibleSerde() throws Exception {
final Task task = new RealtimeIndexTask(
    "theid",
    new TaskResource("rofl", 2),
    new FireDepartment(
        new DataSchema("foo", null, new AggregatorFactory[0], null, null, new DefaultObjectMapper()),
        new RealtimeIOConfig(
            new LocalFirehoseFactory(new File("lol"), "rofl", null),
            (schema, config, metrics) -> null
        ),
        null
    ),
    null
);
final TaskStatus status = TaskStatus.running(task.getId());
final TaskAnnouncement announcement = TaskAnnouncement.create(task, status, TaskLocation.unknown());
final String statusJson = jsonMapper.writeValueAsString(status);
final String announcementJson = jsonMapper.writeValueAsString(announcement);
final TaskStatus statusFromStatus = jsonMapper.readValue(statusJson, TaskStatus.class);
final TaskStatus statusFromAnnouncement = jsonMapper.readValue(announcementJson, TaskStatus.class);
final TaskAnnouncement announcementFromStatus = jsonMapper.readValue(statusJson, TaskAnnouncement.class);
final TaskAnnouncement announcementFromAnnouncement = jsonMapper.readValue(announcementJson, TaskAnnouncement.class);
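// A TaskAnnouncement deserialized from a bare TaskStatus carries no TaskResource information,
// so it falls back to defaults: availability group = task id and required capacity = 1.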
Assert.assertEquals("theid", statusFromStatus.getId());
Assert.assertEquals("theid", statusFromAnnouncement.getId());
Assert.assertEquals("theid", announcementFromStatus.getTaskStatus().getId());
Assert.assertEquals("theid", announcementFromAnnouncement.getTaskStatus().getId());
Assert.assertEquals("theid", announcementFromStatus.getTaskResource().getAvailabilityGroup());
Assert.assertEquals("rofl", announcementFromAnnouncement.getTaskResource().getAvailabilityGroup());
Assert.assertEquals(1, announcementFromStatus.getTaskResource().getRequiredCapacity());
Assert.assertEquals(2, announcementFromAnnouncement.getTaskResource().getRequiredCapacity());
}
use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
the class SeekableStreamIndexTaskRunnerAuthTest method setUp.
@Before
public void setUp() {
// Create an AuthorizerMapper that only allows access to a Datasource resource
AuthorizerMapper authorizerMapper = new AuthorizerMapper(null) {
@Override
public Authorizer getAuthorizer(String name) {
return (authenticationResult, resource, action) -> {
final String username = authenticationResult.getIdentity();
// Allow access to a Datasource resource only when:
// - Datasource Read User requests Read access
// - or, Datasource Write User requests Write access
if (resource.getType().equals(ResourceType.DATASOURCE)) {
return new Access((action == Action.READ && username.equals(Users.DATASOURCE_READ)) || (action == Action.WRITE && username.equals(Users.DATASOURCE_WRITE)));
}
// Do not allow access to any other resource
return new Access(false);
};
}
};
DataSchema dataSchema = new DataSchema(
    "datasource",
    new TimestampSpec(null, null, null),
    new DimensionsSpec(Collections.emptyList()),
    new AggregatorFactory[] {},
    new ArbitraryGranularitySpec(new AllGranularity(), Collections.emptyList()),
    TransformSpec.NONE,
    null,
    null
);
SeekableStreamIndexTaskTuningConfig tuningConfig = mock(SeekableStreamIndexTaskTuningConfig.class);
SeekableStreamIndexTaskIOConfig<String, String> ioConfig = new TestSeekableStreamIndexTaskIOConfig();
// Initialize the task and the task runner
SeekableStreamIndexTask<String, String, ByteEntity> indexTask = new TestSeekableStreamIndexTask("id", dataSchema, tuningConfig, ioConfig);
taskRunner = new TestSeekableStreamIndexTaskRunner(indexTask, authorizerMapper);
}
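To make the authorization rules concrete, here is a rough sketch of the decisions this authorizer would return (illustrative only: Users.DATASOURCE_READ is the test's own constant, and the AuthenticationResult and Resource constructors are assumed from Druid's server security APIs of this era):
Authorizer authorizer = authorizerMapper.getAuthorizer("any");
AuthenticationResult readUser = new AuthenticationResult(Users.DATASOURCE_READ, "allowAll", null, null);
Resource datasource = new Resource("datasource", ResourceType.DATASOURCE);
authorizer.authorize(readUser, datasource, Action.READ).isAllowed();   // true: read user, read action
authorizer.authorize(readUser, datasource, Action.WRITE).isAllowed();  // false: read user, write action
authorizer.authorize(readUser, new Resource("anything", ResourceType.CONFIG), Action.READ).isAllowed(); // false: not a datasource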