
Example 16 with SamplerResponse

Use of org.apache.druid.client.indexing.SamplerResponse in project druid by druid-io.

From the class InputSourceSamplerTest, the method testWithTransformsDimensionsSpec.

@Test
public void testWithTransformsDimensionsSpec() throws IOException {
    final TimestampSpec timestampSpec = new TimestampSpec("t", null, null);
    final DimensionsSpec dimensionsSpec = new DimensionsSpec(ImmutableList.of(StringDimensionSchema.create("dim1PlusBar")));
    final TransformSpec transformSpec = new TransformSpec(null, ImmutableList.of(new ExpressionTransform("dim1PlusBar", "concat(dim1 + 'bar')", TestExprMacroTable.INSTANCE)));
    final AggregatorFactory[] aggregatorFactories = { new LongSumAggregatorFactory("met1", "met1") };
    final GranularitySpec granularitySpec = new UniformGranularitySpec(Granularities.DAY, Granularities.HOUR, true, null);
    final DataSchema dataSchema = createDataSchema(timestampSpec, dimensionsSpec, aggregatorFactories, granularitySpec, transformSpec);
    final InputSource inputSource = createInputSource(getTestRows(), dataSchema);
    final InputFormat inputFormat = createInputFormat();
    SamplerResponse response = inputSourceSampler.sample(inputSource, inputFormat, dataSchema, null);
    Assert.assertEquals(6, response.getNumRowsRead());
    Assert.assertEquals(5, response.getNumRowsIndexed());
    Assert.assertEquals(3, response.getData().size());
    List<SamplerResponseRow> data = response.getData();
    assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(0), new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>().put("__time", 1555934400000L).put("dim1PlusBar", "foobar").put("met1", 11L).build(), null, null), data.get(0));
    assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(3), new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>().put("__time", 1555934400000L).put("dim1PlusBar", "foo2bar").put("met1", 4L).build(), null, null), data.get(1));
    assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(5), null, true, getUnparseableTimestampString()), data.get(2));
}
Also used: RecordSupplierInputSource(org.apache.druid.indexing.seekablestream.RecordSupplierInputSource) InlineInputSource(org.apache.druid.data.input.impl.InlineInputSource) InputSource(org.apache.druid.data.input.InputSource) SamplerResponse(org.apache.druid.client.indexing.SamplerResponse) LongSumAggregatorFactory(org.apache.druid.query.aggregation.LongSumAggregatorFactory) AggregatorFactory(org.apache.druid.query.aggregation.AggregatorFactory) TransformSpec(org.apache.druid.segment.transform.TransformSpec) DataSchema(org.apache.druid.segment.indexing.DataSchema) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec) JsonInputFormat(org.apache.druid.data.input.impl.JsonInputFormat) InputFormat(org.apache.druid.data.input.InputFormat) CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) SamplerResponseRow(org.apache.druid.client.indexing.SamplerResponse.SamplerResponseRow) ExpressionTransform(org.apache.druid.segment.transform.ExpressionTransform) InitializedNullHandlingTest(org.apache.druid.testing.InitializedNullHandlingTest) Test(org.junit.Test)
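
The interesting piece of this test is the ExpressionTransform that derives dim1PlusBar from dim1 before rollup. Below is a minimal standalone sketch (not part of the test) applying the same transform directly to a hand-built row; MapBasedInputRow is an additional Druid class assumed here, and the row contents are invented for illustration.

TransformSpec transformSpec = new TransformSpec(
    null,  // no filter
    ImmutableList.of(new ExpressionTransform("dim1PlusBar", "concat(dim1 + 'bar')", TestExprMacroTable.INSTANCE))
);
// hypothetical input row, built by hand for illustration
InputRow raw = new MapBasedInputRow(
    1555934400000L,  // 2019-04-22T00:00:00Z, the same epoch value asserted above
    ImmutableList.of("dim1"),
    ImmutableMap.of("t", "2019-04-22T12:00", "dim1", "foo", "met1", 1)
);
InputRow transformed = transformSpec.toTransformer().transform(raw);
// transformed.getDimension("dim1PlusBar") should yield ["foobar"]

Inside the sampler the transform is applied as part of reading (the TransformedInputRow wrapper in the next example's error message comes from it), which is why the parsed rows expose dim1PlusBar rather than dim1.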

Example 17 with SamplerResponse

Use of org.apache.druid.client.indexing.SamplerResponse in project druid by druid-io.

From the class InputSourceSamplerTest, the method testIndexParseException.

@Test
public void testIndexParseException() throws IOException {
    final TimestampSpec timestampSpec = new TimestampSpec("t", null, null);
    final DimensionsSpec dimensionsSpec = new DimensionsSpec(ImmutableList.of(StringDimensionSchema.create("dim1PlusBar")));
    final TransformSpec transformSpec = new TransformSpec(null, ImmutableList.of(new ExpressionTransform("dim1PlusBar", "concat(dim1 + 'bar')", TestExprMacroTable.INSTANCE)));
    final AggregatorFactory[] aggregatorFactories = { new LongSumAggregatorFactory("met1", "met1") };
    final GranularitySpec granularitySpec = new UniformGranularitySpec(Granularities.DAY, Granularities.HOUR, true, null);
    final DataSchema dataSchema = createDataSchema(timestampSpec, dimensionsSpec, aggregatorFactories, granularitySpec, transformSpec);
    // 
    // add an invalid row to cause a parse exception when indexing
    // 
    Map<String, Object> rawColumns4ParseExceptionRow = ImmutableMap.of("t", "2019-04-22T12:00", "dim1", "foo2", "met1", "invalidNumber");
    final List<String> inputTestRows = Lists.newArrayList(getTestRows());
    inputTestRows.add(ParserType.STR_CSV.equals(parserType) ? "2019-04-22T12:00,foo2,,invalidNumber" : OBJECT_MAPPER.writeValueAsString(rawColumns4ParseExceptionRow));
    final InputSource inputSource = createInputSource(inputTestRows, dataSchema);
    final InputFormat inputFormat = createInputFormat();
    SamplerResponse response = inputSourceSampler.sample(inputSource, inputFormat, dataSchema, null);
    Assert.assertEquals(7, response.getNumRowsRead());
    Assert.assertEquals(5, response.getNumRowsIndexed());
    Assert.assertEquals(4, response.getData().size());
    List<SamplerResponseRow> data = response.getData();
    assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(0), new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>().put("__time", 1555934400000L).put("dim1PlusBar", "foobar").put("met1", 11L).build(), null, null), data.get(0));
    assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(3), new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>().put("__time", 1555934400000L).put("dim1PlusBar", "foo2bar").put("met1", 4L).build(), null, null), data.get(1));
    assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(5), null, true, getUnparseableTimestampString()), data.get(2));
    // 
    // the last row hits a parse exception when indexing; check that rawColumns and the exception message match the expected values
    // 
    String indexParseExceptionMessage = ParserType.STR_CSV.equals(parserType)
        ? "Found unparseable columns in row: [SamplerInputRow{row=TransformedInputRow{row=MapBasedInputRow{timestamp=2019-04-22T12:00:00.000Z, event={t=2019-04-22T12:00, dim1=foo2, dim2=null, met1=invalidNumber}, dimensions=[dim1PlusBar]}}}], exceptions: [Unable to parse value[invalidNumber] for field[met1]]"
        : "Found unparseable columns in row: [SamplerInputRow{row=TransformedInputRow{row=MapBasedInputRow{timestamp=2019-04-22T12:00:00.000Z, event={t=2019-04-22T12:00, dim1=foo2, met1=invalidNumber}, dimensions=[dim1PlusBar]}}}], exceptions: [Unable to parse value[invalidNumber] for field[met1]]";
    assertEqualsSamplerResponseRow(new SamplerResponseRow(rawColumns4ParseExceptionRow, null, true, indexParseExceptionMessage), data.get(3));
}
Also used: RecordSupplierInputSource(org.apache.druid.indexing.seekablestream.RecordSupplierInputSource) InlineInputSource(org.apache.druid.data.input.impl.InlineInputSource) InputSource(org.apache.druid.data.input.InputSource) SamplerResponse(org.apache.druid.client.indexing.SamplerResponse) LongSumAggregatorFactory(org.apache.druid.query.aggregation.LongSumAggregatorFactory) AggregatorFactory(org.apache.druid.query.aggregation.AggregatorFactory) TransformSpec(org.apache.druid.segment.transform.TransformSpec) DataSchema(org.apache.druid.segment.indexing.DataSchema) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec) JsonInputFormat(org.apache.druid.data.input.impl.JsonInputFormat) InputFormat(org.apache.druid.data.input.InputFormat) CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) SamplerResponseRow(org.apache.druid.client.indexing.SamplerResponse.SamplerResponseRow) ExpressionTransform(org.apache.druid.segment.transform.ExpressionTransform) InitializedNullHandlingTest(org.apache.druid.testing.InitializedNullHandlingTest) Test(org.junit.Test)

Example 18 with SamplerResponse

Use of org.apache.druid.client.indexing.SamplerResponse in project druid by druid-io.

From the class InputSourceSampler, the method sample.

public SamplerResponse sample(
    final InputSource inputSource,
    // inputFormat can be null only if inputSource.needsFormat() = false or parser is specified.
    @Nullable final InputFormat inputFormat,
    @Nullable final DataSchema dataSchema,
    @Nullable final SamplerConfig samplerConfig) {
    Preconditions.checkNotNull(inputSource, "inputSource required");
    if (inputSource.needsFormat()) {
        Preconditions.checkNotNull(inputFormat, "inputFormat required");
    }
    final DataSchema nonNullDataSchema = dataSchema == null ? DEFAULT_DATA_SCHEMA : dataSchema;
    final SamplerConfig nonNullSamplerConfig = samplerConfig == null ? SamplerConfig.empty() : samplerConfig;
    final Closer closer = Closer.create();
    final File tempDir = FileUtils.createTempDir();
    closer.register(() -> FileUtils.deleteDirectory(tempDir));
    try {
        final InputSourceReader reader = buildReader(nonNullSamplerConfig, nonNullDataSchema, inputSource, inputFormat, tempDir);
        try (final CloseableIterator<InputRowListPlusRawValues> iterator = reader.sample();
            final IncrementalIndex index = buildIncrementalIndex(nonNullSamplerConfig, nonNullDataSchema);
            final Closer closer1 = closer) {
            List<SamplerResponseRow> responseRows = new ArrayList<>(nonNullSamplerConfig.getNumRows());
            int numRowsIndexed = 0;
            while (responseRows.size() < nonNullSamplerConfig.getNumRows() && iterator.hasNext()) {
                final InputRowListPlusRawValues inputRowListPlusRawValues = iterator.next();
                final List<Map<String, Object>> rawColumnsList = inputRowListPlusRawValues.getRawValuesList();
                final ParseException parseException = inputRowListPlusRawValues.getParseException();
                if (parseException != null) {
                    if (rawColumnsList != null) {
                        // add all rows to response
                        responseRows.addAll(rawColumnsList.stream().map(rawColumns -> new SamplerResponseRow(rawColumns, null, true, parseException.getMessage())).collect(Collectors.toList()));
                    } else {
                        // no data parsed, add one response row
                        responseRows.add(new SamplerResponseRow(null, null, true, parseException.getMessage()));
                    }
                    continue;
                }
                List<InputRow> inputRows = inputRowListPlusRawValues.getInputRows();
                if (inputRows == null) {
                    continue;
                }
                for (int i = 0; i < inputRows.size(); i++) {
                    // InputRowListPlusRawValues guarantees that rawColumnsList and inputRows have the same size
                    Map<String, Object> rawColumns = rawColumnsList == null ? null : rawColumnsList.get(i);
                    InputRow row = inputRows.get(i);
                    // keep the index of the row to be added to responseRows for further use
                    final int rowIndex = responseRows.size();
                    IncrementalIndexAddResult addResult = index.add(new SamplerInputRow(row, rowIndex), true);
                    if (addResult.hasParseException()) {
                        responseRows.add(new SamplerResponseRow(rawColumns, null, true, addResult.getParseException().getMessage()));
                    } else {
                        // store the raw value; will be merged with the data from the IncrementalIndex later
                        responseRows.add(new SamplerResponseRow(rawColumns, null, null, null));
                        numRowsIndexed++;
                    }
                }
            }
            final List<String> columnNames = index.getColumnNames();
            columnNames.remove(SamplerInputRow.SAMPLER_ORDERING_COLUMN);
            for (Row row : index) {
                Map<String, Object> parsed = new LinkedHashMap<>();
                parsed.put(ColumnHolder.TIME_COLUMN_NAME, row.getTimestampFromEpoch());
                columnNames.forEach(k -> parsed.put(k, row.getRaw(k)));
                Number sortKey = row.getMetric(SamplerInputRow.SAMPLER_ORDERING_COLUMN);
                if (sortKey != null) {
                    responseRows.set(sortKey.intValue(), responseRows.get(sortKey.intValue()).withParsed(parsed));
                }
            }
            // make sure the size of responseRows does not exceed the requested number of rows
            if (responseRows.size() > nonNullSamplerConfig.getNumRows()) {
                responseRows = responseRows.subList(0, nonNullSamplerConfig.getNumRows());
            }
            int numRowsRead = responseRows.size();
            return new SamplerResponse(numRowsRead, numRowsIndexed, responseRows.stream().filter(Objects::nonNull).filter(x -> x.getParsed() != null || x.isUnparseable() != null).collect(Collectors.toList()));
        }
    } catch (Exception e) {
        throw new SamplerException(e, "Failed to sample data: %s", e.getMessage());
    }
}
Also used: ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) Closer(org.apache.druid.java.util.common.io.Closer) InputRowListPlusRawValues(org.apache.druid.data.input.InputRowListPlusRawValues) IncrementalIndex(org.apache.druid.segment.incremental.IncrementalIndex) OnheapIncrementalIndex(org.apache.druid.segment.incremental.OnheapIncrementalIndex) SamplerResponse(org.apache.druid.client.indexing.SamplerResponse) ParseException(org.apache.druid.java.util.common.parsers.ParseException) DataSchema(org.apache.druid.segment.indexing.DataSchema) InputSourceReader(org.apache.druid.data.input.InputSourceReader) TimedShutoffInputSourceReader(org.apache.druid.data.input.impl.TimedShutoffInputSourceReader) IncrementalIndexAddResult(org.apache.druid.segment.incremental.IncrementalIndexAddResult) InputRow(org.apache.druid.data.input.InputRow) SamplerResponseRow(org.apache.druid.client.indexing.SamplerResponse.SamplerResponseRow) Row(org.apache.druid.data.input.Row) File(java.io.File) Map(java.util.Map)
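
For orientation, here is a minimal, hedged usage sketch of sample() with an inline CSV source. The wiring mirrors the tests above, but the datasource name, the inline rows, and the no-argument InputSourceSampler constructor are assumptions rather than code from the repository; adjust the constructor if your version requires an ObjectMapper. Passing null for SamplerConfig falls back to SamplerConfig.empty(), as shown in the method above.

// Assumed, illustrative wiring; not taken from the repository.
TimestampSpec timestampSpec = new TimestampSpec("t", "auto", null);
DimensionsSpec dimensionsSpec = new DimensionsSpec(ImmutableList.of(StringDimensionSchema.create("dim1")));
DataSchema dataSchema = new DataSchema(
    "sampler-sketch",  // hypothetical datasource name
    timestampSpec,
    dimensionsSpec,
    new AggregatorFactory[]{new LongSumAggregatorFactory("met1", "met1")},
    new UniformGranularitySpec(Granularities.DAY, Granularities.HOUR, true, null),
    null  // no transforms
);
// two invented CSV rows matching the columns declared in the input format
InputSource inputSource = new InlineInputSource("2019-04-22T12:00,foo,1\n2019-04-22T13:00,bar,2\n");
InputFormat inputFormat = new CsvInputFormat(ImmutableList.of("t", "dim1", "met1"), null, null, false, 0);
// null SamplerConfig means SamplerConfig.empty() is used, per the method above
SamplerResponse response = new InputSourceSampler().sample(inputSource, inputFormat, dataSchema, null);
// response.getNumRowsRead(), response.getNumRowsIndexed() and response.getData() can then be inspected

Instead of null, a SamplerConfig can be supplied to cap the number of rows sampled, which is the getNumRows() limit used in the loop above.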

Example 19 with SamplerResponse

Use of org.apache.druid.client.indexing.SamplerResponse in project druid by druid-io.

From the class SamplerResponseTest, the method testSerde.

@Test
public void testSerde() throws IOException {
    List<SamplerResponse.SamplerResponseRow> data = ImmutableList.of(new SamplerResponse.SamplerResponseRow(ImmutableMap.of("row1", "val1"), ImmutableMap.of("t", 123456, "dim1", "foo", "met1", 6), null, null), new SamplerResponse.SamplerResponseRow(ImmutableMap.of("row2", "val2"), ImmutableMap.of("t", 123457, "dim1", "foo2", "met1", 7), null, null), new SamplerResponse.SamplerResponseRow(ImmutableMap.of("row3", "val3"), null, true, "Could not parse"));
    String out = MAPPER.writeValueAsString(new SamplerResponse(1123, 1112, data));
    String expected = "{\"numRowsRead\":1123,\"numRowsIndexed\":1112,\"data\":[{\"input\":{\"row1\":\"val1\"},\"parsed\":{\"t\":123456,\"dim1\":\"foo\",\"met1\":6}},{\"input\":{\"row2\":\"val2\"},\"parsed\":{\"t\":123457,\"dim1\":\"foo2\",\"met1\":7}},{\"input\":{\"row3\":\"val3\"},\"unparseable\":true,\"error\":\"Could not parse\"}]}";
    Assert.assertEquals(expected, out);
}
Also used: SamplerResponse(org.apache.druid.client.indexing.SamplerResponse) Test(org.junit.Test)
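
The serialized form can also be read back for inspection. A small sketch, assuming the same MAPPER instance; deserializing into a generic Map avoids relying on SamplerResponse exposing a JSON creator, and java.util.Map plus Jackson's com.fasterxml.jackson.core.type.TypeReference would need to be imported.

// Read the serialized response back as a generic structure and spot-check the fields.
Map<String, Object> parsed = MAPPER.readValue(out, new TypeReference<Map<String, Object>>() {});
Assert.assertEquals(1123, parsed.get("numRowsRead"));
Assert.assertEquals(1112, parsed.get("numRowsIndexed"));
Assert.assertEquals(3, ((List<?>) parsed.get("data")).size());

As the expected string shows, fully parsed rows carry only input and parsed, while the failing row carries input, unparseable and error.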

Aggregations

SamplerResponse (org.apache.druid.client.indexing.SamplerResponse): 19
Test (org.junit.Test): 18
SamplerResponseRow (org.apache.druid.client.indexing.SamplerResponse.SamplerResponseRow): 15
InputSource (org.apache.druid.data.input.InputSource): 14
DataSchema (org.apache.druid.segment.indexing.DataSchema): 14
InitializedNullHandlingTest (org.apache.druid.testing.InitializedNullHandlingTest): 14
InlineInputSource (org.apache.druid.data.input.impl.InlineInputSource): 13
JsonInputFormat (org.apache.druid.data.input.impl.JsonInputFormat): 13
RecordSupplierInputSource (org.apache.druid.indexing.seekablestream.RecordSupplierInputSource): 13
InputFormat (org.apache.druid.data.input.InputFormat): 12
DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec): 12
TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec): 12
CsvInputFormat (org.apache.druid.data.input.impl.CsvInputFormat): 11
AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory): 8
LongSumAggregatorFactory (org.apache.druid.query.aggregation.LongSumAggregatorFactory): 8
GranularitySpec (org.apache.druid.segment.indexing.granularity.GranularitySpec): 8
UniformGranularitySpec (org.apache.druid.segment.indexing.granularity.UniformGranularitySpec): 8
TransformSpec (org.apache.druid.segment.transform.TransformSpec): 5
ExpressionTransform (org.apache.druid.segment.transform.ExpressionTransform): 4
File (java.io.File): 2