Search in sources :

Example 31 with InputRowListPlusRawValues

use of org.apache.druid.data.input.InputRowListPlusRawValues in project druid by druid-io.

the class JsonReaderTest method testSamplInvalidJSONText.

@Test
public void testSamplInvalidJSONText() throws IOException {
    final JsonInputFormat format = new JsonInputFormat(new JSONPathSpec(true, ImmutableList.of(new JSONPathFieldSpec(JSONPathFieldType.ROOT, "root_baz", "baz"), new JSONPathFieldSpec(JSONPathFieldType.ROOT, "root_baz2", "baz2"), new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg"), new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg2", "$.o.mg2"), new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg", ".o.mg"), new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2"))), null, null, // make sure JsonReader is used
    false);
    // 2nd row is ill-formed
    final ByteEntity source = new ByteEntity(StringUtils.toUtf8("{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4,\"o\":{\"mg\":1}}" + "{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4xxx,\"o\":{\"mg\":2}}\n" + // value of baz is invalid
    "{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4,\"o\":{\"mg\":3}}\n"));
    final InputEntityReader reader = format.createReader(new InputRowSchema(new TimestampSpec("timestamp", "iso", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("bar", "foo"))), ColumnsFilter.all()), source, null);
    // the invalid character in line 2 stops parsing of the 3-line text in a whole
    // so the total num of iteration is 1
    final int numExpectedIterations = 1;
    try (CloseableIterator<InputRowListPlusRawValues> iterator = reader.sample()) {
        int numActualIterations = 0;
        while (iterator.hasNext()) {
            numActualIterations++;
            final InputRowListPlusRawValues rawValues = iterator.next();
            Assert.assertNotNull(rawValues.getParseException());
        }
        Assert.assertEquals(numExpectedIterations, numActualIterations);
    }
}
Also used : InputRowListPlusRawValues(org.apache.druid.data.input.InputRowListPlusRawValues) JSONPathSpec(org.apache.druid.java.util.common.parsers.JSONPathSpec) JSONPathFieldSpec(org.apache.druid.java.util.common.parsers.JSONPathFieldSpec) InputEntityReader(org.apache.druid.data.input.InputEntityReader) InputRowSchema(org.apache.druid.data.input.InputRowSchema) Test(org.junit.Test)

Example 32 with InputRowListPlusRawValues

use of org.apache.druid.data.input.InputRowListPlusRawValues in project druid by druid-io.

the class AvroOCFReaderTest method testSample.

@Test
public void testSample() throws Exception {
    final ObjectMapper mapper = new DefaultObjectMapper();
    mapper.setInjectableValues(new InjectableValues.Std().addValue(ObjectMapper.class, mapper));
    final InputEntityReader reader = createReader(mapper, null);
    try (CloseableIterator<InputRowListPlusRawValues> iterator = reader.sample()) {
        Assert.assertTrue(iterator.hasNext());
        final InputRowListPlusRawValues row = iterator.next();
        Assert.assertFalse(iterator.hasNext());
        final Map<String, Object> rawColumns = row.getRawValues();
        Assert.assertNotNull(rawColumns);
        Assert.assertEquals(20, rawColumns.size());
        final List<InputRow> inputRows = row.getInputRows();
        Assert.assertNotNull(inputRows);
        final InputRow inputRow = Iterables.getOnlyElement(inputRows);
        assertInputRow(inputRow);
    }
}
Also used : InputRowListPlusRawValues(org.apache.druid.data.input.InputRowListPlusRawValues) InputRow(org.apache.druid.data.input.InputRow) DefaultObjectMapper(org.apache.druid.jackson.DefaultObjectMapper) InputEntityReader(org.apache.druid.data.input.InputEntityReader) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) DefaultObjectMapper(org.apache.druid.jackson.DefaultObjectMapper) AvroStreamInputRowParserTest(org.apache.druid.data.input.AvroStreamInputRowParserTest) Test(org.junit.Test) AvroHadoopInputRowParserTest(org.apache.druid.data.input.AvroHadoopInputRowParserTest)

Example 33 with InputRowListPlusRawValues

use of org.apache.druid.data.input.InputRowListPlusRawValues in project druid by druid-io.

the class InputSourceSampler method sample.

public SamplerResponse sample(final InputSource inputSource, // inputFormat can be null only if inputSource.needsFormat() = false or parser is specified.
@Nullable final InputFormat inputFormat, @Nullable final DataSchema dataSchema, @Nullable final SamplerConfig samplerConfig) {
    Preconditions.checkNotNull(inputSource, "inputSource required");
    if (inputSource.needsFormat()) {
        Preconditions.checkNotNull(inputFormat, "inputFormat required");
    }
    final DataSchema nonNullDataSchema = dataSchema == null ? DEFAULT_DATA_SCHEMA : dataSchema;
    final SamplerConfig nonNullSamplerConfig = samplerConfig == null ? SamplerConfig.empty() : samplerConfig;
    final Closer closer = Closer.create();
    final File tempDir = FileUtils.createTempDir();
    closer.register(() -> FileUtils.deleteDirectory(tempDir));
    try {
        final InputSourceReader reader = buildReader(nonNullSamplerConfig, nonNullDataSchema, inputSource, inputFormat, tempDir);
        try (final CloseableIterator<InputRowListPlusRawValues> iterator = reader.sample();
            final IncrementalIndex index = buildIncrementalIndex(nonNullSamplerConfig, nonNullDataSchema);
            final Closer closer1 = closer) {
            List<SamplerResponseRow> responseRows = new ArrayList<>(nonNullSamplerConfig.getNumRows());
            int numRowsIndexed = 0;
            while (responseRows.size() < nonNullSamplerConfig.getNumRows() && iterator.hasNext()) {
                final InputRowListPlusRawValues inputRowListPlusRawValues = iterator.next();
                final List<Map<String, Object>> rawColumnsList = inputRowListPlusRawValues.getRawValuesList();
                final ParseException parseException = inputRowListPlusRawValues.getParseException();
                if (parseException != null) {
                    if (rawColumnsList != null) {
                        // add all rows to response
                        responseRows.addAll(rawColumnsList.stream().map(rawColumns -> new SamplerResponseRow(rawColumns, null, true, parseException.getMessage())).collect(Collectors.toList()));
                    } else {
                        // no data parsed, add one response row
                        responseRows.add(new SamplerResponseRow(null, null, true, parseException.getMessage()));
                    }
                    continue;
                }
                List<InputRow> inputRows = inputRowListPlusRawValues.getInputRows();
                if (inputRows == null) {
                    continue;
                }
                for (int i = 0; i < inputRows.size(); i++) {
                    // InputRowListPlusRawValues guarantees the size of rawColumnsList and inputRows are the same
                    Map<String, Object> rawColumns = rawColumnsList == null ? null : rawColumnsList.get(i);
                    InputRow row = inputRows.get(i);
                    // keep the index of the row to be added to responseRows for further use
                    final int rowIndex = responseRows.size();
                    IncrementalIndexAddResult addResult = index.add(new SamplerInputRow(row, rowIndex), true);
                    if (addResult.hasParseException()) {
                        responseRows.add(new SamplerResponseRow(rawColumns, null, true, addResult.getParseException().getMessage()));
                    } else {
                        // store the raw value; will be merged with the data from the IncrementalIndex later
                        responseRows.add(new SamplerResponseRow(rawColumns, null, null, null));
                        numRowsIndexed++;
                    }
                }
            }
            final List<String> columnNames = index.getColumnNames();
            columnNames.remove(SamplerInputRow.SAMPLER_ORDERING_COLUMN);
            for (Row row : index) {
                Map<String, Object> parsed = new LinkedHashMap<>();
                parsed.put(ColumnHolder.TIME_COLUMN_NAME, row.getTimestampFromEpoch());
                columnNames.forEach(k -> parsed.put(k, row.getRaw(k)));
                Number sortKey = row.getMetric(SamplerInputRow.SAMPLER_ORDERING_COLUMN);
                if (sortKey != null) {
                    responseRows.set(sortKey.intValue(), responseRows.get(sortKey.intValue()).withParsed(parsed));
                }
            }
            // make sure size of responseRows meets the input
            if (responseRows.size() > nonNullSamplerConfig.getNumRows()) {
                responseRows = responseRows.subList(0, nonNullSamplerConfig.getNumRows());
            }
            int numRowsRead = responseRows.size();
            return new SamplerResponse(numRowsRead, numRowsIndexed, responseRows.stream().filter(Objects::nonNull).filter(x -> x.getParsed() != null || x.isUnparseable() != null).collect(Collectors.toList()));
        }
    } catch (Exception e) {
        throw new SamplerException(e, "Failed to sample data: %s", e.getMessage());
    }
}
Also used : ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) Closer(org.apache.druid.java.util.common.io.Closer) InputRowListPlusRawValues(org.apache.druid.data.input.InputRowListPlusRawValues) IncrementalIndex(org.apache.druid.segment.incremental.IncrementalIndex) OnheapIncrementalIndex(org.apache.druid.segment.incremental.OnheapIncrementalIndex) SamplerResponse(org.apache.druid.client.indexing.SamplerResponse) ParseException(org.apache.druid.java.util.common.parsers.ParseException) DataSchema(org.apache.druid.segment.indexing.DataSchema) InputSourceReader(org.apache.druid.data.input.InputSourceReader) TimedShutoffInputSourceReader(org.apache.druid.data.input.impl.TimedShutoffInputSourceReader) IncrementalIndexAddResult(org.apache.druid.segment.incremental.IncrementalIndexAddResult) InputRow(org.apache.druid.data.input.InputRow) SamplerResponseRow(org.apache.druid.client.indexing.SamplerResponse.SamplerResponseRow) ParseException(org.apache.druid.java.util.common.parsers.ParseException) Row(org.apache.druid.data.input.Row) SamplerResponseRow(org.apache.druid.client.indexing.SamplerResponse.SamplerResponseRow) InputRow(org.apache.druid.data.input.InputRow) File(java.io.File) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)

Example 34 with InputRowListPlusRawValues

use of org.apache.druid.data.input.InputRowListPlusRawValues in project druid by druid-io.

the class Transformer method transform.

@Nullable
public InputRowListPlusRawValues transform(@Nullable final InputRowListPlusRawValues row) {
    if (row == null) {
        return null;
    }
    final InputRowListPlusRawValues inputRowListPlusRawValues;
    if (transforms.isEmpty() || row.getInputRows() == null) {
        inputRowListPlusRawValues = row;
    } else {
        final List<InputRow> originalRows = row.getInputRows();
        final List<InputRow> transformedRows = new ArrayList<>(originalRows.size());
        for (InputRow originalRow : originalRows) {
            transformedRows.add(new TransformedInputRow(originalRow, transforms));
        }
        inputRowListPlusRawValues = InputRowListPlusRawValues.ofList(row.getRawValuesList(), transformedRows);
    }
    if (valueMatcher != null) {
        if (inputRowListPlusRawValues.getInputRows() != null) {
            // size of inputRows and rawValues are the same
            int size = inputRowListPlusRawValues.getInputRows().size();
            final List<InputRow> matchedRows = new ArrayList<>(size);
            final List<Map<String, Object>> matchedVals = new ArrayList<>(size);
            final List<InputRow> inputRows = inputRowListPlusRawValues.getInputRows();
            final List<Map<String, Object>> inputVals = inputRowListPlusRawValues.getRawValuesList();
            for (int i = 0; i < size; i++) {
                rowSupplierForValueMatcher.set(inputRows.get(i));
                if (valueMatcher.matches()) {
                    matchedRows.add(inputRows.get(i));
                    matchedVals.add(inputVals.get(i));
                }
            }
            return InputRowListPlusRawValues.ofList(matchedVals, matchedRows);
        }
    }
    return inputRowListPlusRawValues;
}
Also used : InputRowListPlusRawValues(org.apache.druid.data.input.InputRowListPlusRawValues) InputRow(org.apache.druid.data.input.InputRow) ArrayList(java.util.ArrayList) HashMap(java.util.HashMap) Map(java.util.Map) Nullable(javax.annotation.Nullable)

Aggregations

InputRowListPlusRawValues (org.apache.druid.data.input.InputRowListPlusRawValues)34 Test (org.junit.Test)31 InputRow (org.apache.druid.data.input.InputRow)29 InputEntityReader (org.apache.druid.data.input.InputEntityReader)26 InputRowSchema (org.apache.druid.data.input.InputRowSchema)24 DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec)21 TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec)21 JSONPathSpec (org.apache.druid.java.util.common.parsers.JSONPathSpec)17 JSONPathFieldSpec (org.apache.druid.java.util.common.parsers.JSONPathFieldSpec)15 HashMap (java.util.HashMap)4 BigDecimal (java.math.BigDecimal)3 ArrayList (java.util.ArrayList)3 Map (java.util.Map)3 ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper)2 File (java.io.File)2 AvroHadoopInputRowParserTest (org.apache.druid.data.input.AvroHadoopInputRowParserTest)2 AvroStreamInputRowParserTest (org.apache.druid.data.input.AvroStreamInputRowParserTest)2 InputSourceReader (org.apache.druid.data.input.InputSourceReader)2 DefaultObjectMapper (org.apache.druid.jackson.DefaultObjectMapper)2 ImmutableMap (com.google.common.collect.ImmutableMap)1