Search in sources :

Example 41 with JSONPathSpec

use of org.apache.druid.java.util.common.parsers.JSONPathSpec in project druid by druid-io.

the class JsonReaderTest method testParseMultipleRows.

/**
 * Reads a payload of three JSON objects (the first two are concatenated without a separating
 * newline) through {@code reader.read()} and checks the timestamp, the plain dimensions and the
 * flattened ROOT / PATH / JQ fields of every returned row.
 *
 * <p>The {@code o.mg} value of each object equals its 1-based position in the payload, so the
 * running row counter doubles as the expected value for {@code path_omg} and {@code jq_omg}.
 *
 * @throws IOException if reading from the entity or closing the iterator fails
 */
@Test
public void testParseMultipleRows() throws IOException {
    final JSONPathSpec flattenSpec = new JSONPathSpec(
        true,
        ImmutableList.of(
            new JSONPathFieldSpec(JSONPathFieldType.ROOT, "root_baz", "baz"),
            new JSONPathFieldSpec(JSONPathFieldType.ROOT, "root_baz2", "baz2"),
            new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg"),
            new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg2", "$.o.mg2"),
            new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg", ".o.mg"),
            new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2")));
    // The trailing 'false' makes sure JsonReader is used.
    final JsonInputFormat format = new JsonInputFormat(flattenSpec, null, null, false);

    // Objects one and two share a physical line; only objects two and three end with '\n'.
    final String payload =
        "{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4,\"o\":{\"mg\":1}}"
            + "{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4,\"o\":{\"mg\":2}}\n"
            + "{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4,\"o\":{\"mg\":3}}\n";
    final ByteEntity source = new ByteEntity(StringUtils.toUtf8(payload));

    final InputRowSchema schema = new InputRowSchema(
        new TimestampSpec("timestamp", "iso", null),
        new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("bar", "foo"))),
        ColumnsFilter.all());
    final InputEntityReader reader = format.createReader(schema, source, null);

    // Flattened fields whose source keys (baz2, o.mg2) do not occur in the payload.
    final String[] absentFields = {"root_baz2", "path_omg2", "jq_omg2"};

    try (CloseableIterator<InputRow> rows = reader.read()) {
        int rowsSeen = 0;
        while (rows.hasNext()) {
            final InputRow row = rows.next();
            rowsSeen++;
            final String expectedMg = String.valueOf(rowsSeen);

            Assert.assertEquals(DateTimes.of("2019-01-01"), row.getTimestamp());
            Assert.assertEquals("x", Iterables.getOnlyElement(row.getDimension("foo")));
            Assert.assertEquals("4", Iterables.getOnlyElement(row.getDimension("baz")));
            Assert.assertEquals("4", Iterables.getOnlyElement(row.getDimension("root_baz")));
            Assert.assertEquals(expectedMg, Iterables.getOnlyElement(row.getDimension("path_omg")));
            Assert.assertEquals(expectedMg, Iterables.getOnlyElement(row.getDimension("jq_omg")));
            for (String absent : absentFields) {
                Assert.assertTrue(row.getDimension(absent).isEmpty());
            }
        }
        // One row per JSON object in the payload.
        Assert.assertEquals(3, rowsSeen);
    }
}
Also used : InputRow(org.apache.druid.data.input.InputRow) JSONPathSpec(org.apache.druid.java.util.common.parsers.JSONPathSpec) JSONPathFieldSpec(org.apache.druid.java.util.common.parsers.JSONPathFieldSpec) InputEntityReader(org.apache.druid.data.input.InputEntityReader) InputRowSchema(org.apache.druid.data.input.InputRowSchema) Test(org.junit.Test)

Example 42 with JSONPathSpec

use of org.apache.druid.java.util.common.parsers.JSONPathSpec in project druid by druid-io.

the class JsonReaderTest method testSampleMultipleRows.

/**
 * Samples a payload of three JSON objects (the first two are concatenated without a separating
 * newline) through {@code reader.sample()} and checks that every sampled entry carries exactly
 * three input rows with the expected timestamp, dimensions and flattened fields.
 *
 * <p>The {@code o.mg} value of each object equals its 1-based position in the payload, so the
 * running row counter doubles as the expected value for {@code path_omg} and {@code jq_omg}.
 *
 * @throws IOException if sampling from the entity or closing the iterator fails
 */
@Test
public void testSampleMultipleRows() throws IOException {
    final JSONPathSpec flattenSpec = new JSONPathSpec(
        true,
        ImmutableList.of(
            new JSONPathFieldSpec(JSONPathFieldType.ROOT, "root_baz", "baz"),
            new JSONPathFieldSpec(JSONPathFieldType.ROOT, "root_baz2", "baz2"),
            new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg"),
            new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg2", "$.o.mg2"),
            new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg", ".o.mg"),
            new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2")));
    // The trailing 'false' makes sure JsonReader is used.
    final JsonInputFormat format = new JsonInputFormat(flattenSpec, null, null, false);

    // Objects one and two share a physical line; only objects two and three end with '\n'.
    final String payload =
        "{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4,\"o\":{\"mg\":1}}"
            + "{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4,\"o\":{\"mg\":2}}\n"
            + "{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4,\"o\":{\"mg\":3}}\n";
    final ByteEntity source = new ByteEntity(StringUtils.toUtf8(payload));

    final InputRowSchema schema = new InputRowSchema(
        new TimestampSpec("timestamp", "iso", null),
        new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("bar", "foo"))),
        ColumnsFilter.all());
    final InputEntityReader reader = format.createReader(schema, source, null);

    // Flattened fields whose source keys (baz2, o.mg2) do not occur in the payload.
    final String[] absentFields = {"root_baz2", "path_omg2", "jq_omg2"};

    int sampledRowCount = 0;
    try (CloseableIterator<InputRowListPlusRawValues> samples = reader.sample()) {
        while (samples.hasNext()) {
            final InputRowListPlusRawValues sample = samples.next();
            // 3 rows returned together
            Assert.assertEquals(3, sample.getInputRows().size());
            for (InputRow row : sample.getInputRows()) {
                sampledRowCount++;
                final String expectedMg = String.valueOf(sampledRowCount);

                Assert.assertEquals(DateTimes.of("2019-01-01"), row.getTimestamp());
                Assert.assertEquals("x", Iterables.getOnlyElement(row.getDimension("foo")));
                Assert.assertEquals("4", Iterables.getOnlyElement(row.getDimension("baz")));
                Assert.assertEquals("4", Iterables.getOnlyElement(row.getDimension("root_baz")));
                Assert.assertEquals(expectedMg, Iterables.getOnlyElement(row.getDimension("path_omg")));
                Assert.assertEquals(expectedMg, Iterables.getOnlyElement(row.getDimension("jq_omg")));
                for (String absent : absentFields) {
                    Assert.assertTrue(row.getDimension(absent).isEmpty());
                }
            }
        }
    }
    // Across all sampled entries exactly three rows must have been inspected.
    Assert.assertEquals(3, sampledRowCount);
}
Also used : InputRowListPlusRawValues(org.apache.druid.data.input.InputRowListPlusRawValues) JSONPathFieldSpec(org.apache.druid.java.util.common.parsers.JSONPathFieldSpec) InputEntityReader(org.apache.druid.data.input.InputEntityReader) InputRow(org.apache.druid.data.input.InputRow) JSONPathSpec(org.apache.druid.java.util.common.parsers.JSONPathSpec) InputRowSchema(org.apache.druid.data.input.InputRowSchema) Test(org.junit.Test)

Example 43 with JSONPathSpec

use of org.apache.druid.java.util.common.parsers.JSONPathSpec in project druid by druid-io.

the class JsonReaderTest method testSamplInvalidJSONText.

/**
 * Samples a payload whose second JSON object is malformed ({@code "baz":4xxx}) and checks that
 * {@code reader.sample()} yields exactly one entry and that the entry carries a parse exception.
 *
 * @throws IOException if sampling from the entity or closing the iterator fails
 */
@Test
public void testSamplInvalidJSONText() throws IOException {
    final JSONPathSpec flattenSpec = new JSONPathSpec(
        true,
        ImmutableList.of(
            new JSONPathFieldSpec(JSONPathFieldType.ROOT, "root_baz", "baz"),
            new JSONPathFieldSpec(JSONPathFieldType.ROOT, "root_baz2", "baz2"),
            new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg"),
            new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg2", "$.o.mg2"),
            new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg", ".o.mg"),
            new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2")));
    // The trailing 'false' makes sure JsonReader is used.
    final JsonInputFormat format = new JsonInputFormat(flattenSpec, null, null, false);

    // The second object is ill-formed: the value of baz is invalid.
    final String payload =
        "{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4,\"o\":{\"mg\":1}}"
            + "{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4xxx,\"o\":{\"mg\":2}}\n"
            + "{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4,\"o\":{\"mg\":3}}\n";
    final ByteEntity source = new ByteEntity(StringUtils.toUtf8(payload));

    final InputRowSchema schema = new InputRowSchema(
        new TimestampSpec("timestamp", "iso", null),
        new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("bar", "foo"))),
        ColumnsFilter.all());
    final InputEntityReader reader = format.createReader(schema, source, null);

    try (CloseableIterator<InputRowListPlusRawValues> samples = reader.sample()) {
        int iterations = 0;
        for (; samples.hasNext(); iterations++) {
            final InputRowListPlusRawValues sample = samples.next();
            Assert.assertNotNull(sample.getParseException());
        }
        // The invalid character in the second object stops parsing of the text as a whole,
        // so only a single iteration is expected.
        Assert.assertEquals(1, iterations);
    }
}
Also used : InputRowListPlusRawValues(org.apache.druid.data.input.InputRowListPlusRawValues) JSONPathSpec(org.apache.druid.java.util.common.parsers.JSONPathSpec) JSONPathFieldSpec(org.apache.druid.java.util.common.parsers.JSONPathFieldSpec) InputEntityReader(org.apache.druid.data.input.InputEntityReader) InputRowSchema(org.apache.druid.data.input.InputRowSchema) Test(org.junit.Test)

Example 44 with JSONPathSpec

use of org.apache.druid.java.util.common.parsers.JSONPathSpec in project druid by druid-io.

the class SinglePhaseParallelIndexingTest method testIngestBothExplicitAndImplicitDims.

/**
 * Writes five JSON input files, each holding two records with the keys {@code ts},
 * {@code implicitDim} and {@code explicitDim}, then runs a {@link ParallelIndexSupervisorTask}
 * whose dimensions spec names only {@code ts} and {@code explicitDim} but is built with
 * {@code setIncludeAllDimensions(true)}.
 *
 * <p>The task must finish with {@code TaskState.SUCCESS}, and every published segment must report
 * the dimensions {@code [ts, explicitDim, implicitDim]} in that order.
 *
 * @throws IOException if an input file cannot be written
 */
@Test
public void testIngestBothExplicitAndImplicitDims() throws IOException {
    final Interval interval = Intervals.of("2017-12/P1M");

    for (int i = 0; i < 5; i++) {
        final File inputFile = new File(inputDir, "test_" + i + ".json");
        try (final Writer writer = Files.newBufferedWriter(inputFile.toPath(), StandardCharsets.UTF_8)) {
            // Two records per file, written back to back with no separator; their "ts" day is
            // 24 + i and 25 + i respectively.
            for (int baseDay = 24; baseDay <= 25; baseDay++) {
                writer.write(getObjectMapper().writeValueAsString(ImmutableMap.of(
                    "ts", StringUtils.format("2017-12-%d", baseDay + i),
                    "implicitDim", "implicit_" + i,
                    "explicitDim", "explicit_" + i)));
            }
        }
    }

    final DimensionsSpec dimensionsSpec = DimensionsSpec.builder()
        .setDefaultSchemaDimensions(ImmutableList.of("ts", "explicitDim"))
        .setIncludeAllDimensions(true)
        .build();
    final AggregatorFactory[] aggregators = new AggregatorFactory[] { new CountAggregatorFactory("cnt") };
    final UniformGranularitySpec granularitySpec = new UniformGranularitySpec(
        Granularities.DAY,
        Granularities.MINUTE,
        Collections.singletonList(interval));
    final DataSchema dataSchema = new DataSchema(
        "dataSource",
        DEFAULT_TIMESTAMP_SPEC,
        dimensionsSpec,
        aggregators,
        granularitySpec,
        null);

    final ParallelIndexIOConfig ioConfig = new ParallelIndexIOConfig(
        null,
        new SettableSplittableLocalInputSource(inputDir, "*.json", true),
        new JsonInputFormat(new JSONPathSpec(true, null), null, null),
        false,
        null);

    final ParallelIndexIngestionSpec ingestionSpec = new ParallelIndexIngestionSpec(
        dataSchema,
        ioConfig,
        AbstractParallelIndexSupervisorTaskTest.DEFAULT_TUNING_CONFIG_FOR_PARALLEL_INDEXING);

    final ParallelIndexSupervisorTask task = new ParallelIndexSupervisorTask(null, null, null, ingestionSpec, null);
    task.addToContext(Tasks.FORCE_TIME_CHUNK_LOCK_KEY, lockGranularity == LockGranularity.TIME_CHUNK);

    Assert.assertEquals(TaskState.SUCCESS, getIndexingServiceClient().runAndWait(task).getStatusCode());

    // Explicitly listed dimensions come first, followed by the one that was not listed.
    final ImmutableList<String> expectedDimensions = ImmutableList.of("ts", "explicitDim", "implicitDim");
    for (DataSegment segment : getIndexingServiceClient().getPublishedSegments(task)) {
        Assert.assertEquals(expectedDimensions, segment.getDimensions());
    }
}
Also used : DataSegment(org.apache.druid.timeline.DataSegment) DataSchema(org.apache.druid.segment.indexing.DataSchema) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) JsonInputFormat(org.apache.druid.data.input.impl.JsonInputFormat) CountAggregatorFactory(org.apache.druid.query.aggregation.CountAggregatorFactory) JSONPathSpec(org.apache.druid.java.util.common.parsers.JSONPathSpec) File(java.io.File) Writer(java.io.Writer) Interval(org.joda.time.Interval) Test(org.junit.Test)

Example 45 with JSONPathSpec

use of org.apache.druid.java.util.common.parsers.JSONPathSpec in project druid by druid-io.

the class CompatParquetReaderTest method testParquetThriftCompat.

/**
 * Reads {@code example/compat/parquet-thrift-compat.snappy.parquet} with a flatten spec that adds
 * two PATH fields ({@code extractByLogicalMap}, {@code extractByComplexLogicalMap}) and verifies
 * the first row's timestamp and dimension values. A second reader over the same file is then
 * sampled, and the raw values of the first sampled entry, serialized with
 * {@code DEFAULT_JSON_WRITER}, must match the expected JSON text exactly.
 *
 * @throws IOException propagated from the reader helpers or from JSON serialization
 */
@Test
public void testParquetThriftCompat() throws IOException {
    /*
      message ParquetSchema {
        required boolean boolColumn;
        required int32 byteColumn;
        required int32 shortColumn;
        required int32 intColumn;
        required int64 longColumn;
        required double doubleColumn;
        required binary binaryColumn (UTF8);
        required binary stringColumn (UTF8);
        required binary enumColumn (ENUM);
        optional boolean maybeBoolColumn;
        optional int32 maybeByteColumn;
        optional int32 maybeShortColumn;
        optional int32 maybeIntColumn;
        optional int64 maybeLongColumn;
        optional double maybeDoubleColumn;
        optional binary maybeBinaryColumn (UTF8);
        optional binary maybeStringColumn (UTF8);
        optional binary maybeEnumColumn (ENUM);
        required group stringsColumn (LIST) {
          repeated binary stringsColumn_tuple (UTF8);
        }
        required group intSetColumn (LIST) {
          repeated int32 intSetColumn_tuple;
        }
        required group intToStringColumn (MAP) {
          repeated group map (MAP_KEY_VALUE) {
            required int32 key;
            optional binary value (UTF8);
          }
        }
        required group complexColumn (MAP) {
          repeated group map (MAP_KEY_VALUE) {
            required int32 key;
            optional group value (LIST) {
              repeated group value_tuple {
                required group nestedIntsColumn (LIST) {
                  repeated int32 nestedIntsColumn_tuple;
                }
                required binary nestedStringColumn (UTF8);
              }
            }
          }
        }
      }
     */
    final String file = "example/compat/parquet-thrift-compat.snappy.parquet";
    InputRowSchema schema = new InputRowSchema(
        new TimestampSpec("timestamp", "auto", DateTimes.of("2018-09-01T00:00:00.000Z")),
        new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of())),
        ColumnsFilter.all());
    List<JSONPathFieldSpec> flattenExpr = ImmutableList.of(
        new JSONPathFieldSpec(JSONPathFieldType.PATH, "extractByLogicalMap", "$.intToStringColumn.1"),
        new JSONPathFieldSpec(
            JSONPathFieldType.PATH,
            "extractByComplexLogicalMap",
            "$.complexColumn.1[0].nestedIntsColumn[1]"));
    JSONPathSpec flattenSpec = new JSONPathSpec(true, flattenExpr);

    // ---- parsed rows: only the first row is inspected ----
    InputEntityReader reader = createReader(file, schema, flattenSpec);
    List<InputRow> rows = readAllRows(reader);
    final InputRow firstRow = rows.get(0);

    Assert.assertEquals("2018-09-01T00:00:00.000Z", firstRow.getTimestamp().toString());

    // {column, expected first value} for the required scalar columns.
    final String[][] requiredColumns = {
        {"boolColumn", "true"},
        {"byteColumn", "0"},
        {"shortColumn", "1"},
        {"intColumn", "2"},
        {"longColumn", "0"},
        {"doubleColumn", "0.2"},
        {"binaryColumn", "val_0"},
        {"stringColumn", "val_0"},
        {"enumColumn", "SPADES"},
    };
    for (String[] column : requiredColumns) {
        Assert.assertEquals(column[1], firstRow.getDimension(column[0]).get(0));
    }

    // The optional columns are expected to yield no dimension values for this row.
    final String[] optionalColumns = {
        "maybeBoolColumn",
        "maybeByteColumn",
        "maybeShortColumn",
        "maybeIntColumn",
        "maybeLongColumn",
        "maybeDoubleColumn",
        "maybeBinaryColumn",
        "maybeStringColumn",
        "maybeEnumColumn",
    };
    for (String column : optionalColumns) {
        Assert.assertTrue(firstRow.getDimension(column).isEmpty());
    }

    Assert.assertEquals("arr_0", firstRow.getDimension("stringsColumn").get(0));
    Assert.assertEquals("arr_1", firstRow.getDimension("stringsColumn").get(1));
    Assert.assertEquals("0", firstRow.getDimension("intSetColumn").get(0));
    Assert.assertEquals("val_1", firstRow.getDimension("extractByLogicalMap").get(0));
    Assert.assertEquals("1", firstRow.getDimension("extractByComplexLogicalMap").get(0));

    // ---- sampled rows: a fresh reader is created for the sampling pass ----
    reader = createReader(file, schema, flattenSpec);
    List<InputRowListPlusRawValues> sampled = sampleAllRows(reader);
    final String expectedJson = "{\n"
        + "  \"enumColumn\" : \"SPADES\",\n"
        + "  \"maybeStringColumn\" : { },\n"
        + "  \"maybeBinaryColumn\" : { },\n"
        + "  \"shortColumn\" : 1,\n"
        + "  \"byteColumn\" : 0,\n"
        + "  \"maybeBoolColumn\" : { },\n"
        + "  \"intColumn\" : 2,\n"
        + "  \"doubleColumn\" : 0.2,\n"
        + "  \"maybeByteColumn\" : { },\n"
        + "  \"intSetColumn\" : [ 0 ],\n"
        + "  \"boolColumn\" : true,\n"
        + "  \"binaryColumn\" : \"val_0\",\n"
        + "  \"maybeIntColumn\" : { },\n"
        + "  \"intToStringColumn\" : {\n"
        + "    \"0\" : \"val_0\",\n"
        + "    \"1\" : \"val_1\",\n"
        + "    \"2\" : \"val_2\"\n"
        + "  },\n"
        + "  \"maybeDoubleColumn\" : { },\n"
        + "  \"maybeEnumColumn\" : { },\n"
        + "  \"maybeLongColumn\" : { },\n"
        + "  \"stringsColumn\" : [ \"arr_0\", \"arr_1\", \"arr_2\" ],\n"
        + "  \"longColumn\" : 0,\n"
        + "  \"stringColumn\" : \"val_0\",\n"
        + "  \"maybeShortColumn\" : { },\n"
        + "  \"complexColumn\" : {\n"
        + "    \"0\" : [ {\n"
        + "      \"nestedStringColumn\" : \"val_0\",\n"
        + "      \"nestedIntsColumn\" : [ 0, 1, 2 ]\n"
        + "    }, {\n"
        + "      \"nestedStringColumn\" : \"val_1\",\n"
        + "      \"nestedIntsColumn\" : [ 1, 2, 3 ]\n"
        + "    }, {\n"
        + "      \"nestedStringColumn\" : \"val_2\",\n"
        + "      \"nestedIntsColumn\" : [ 2, 3, 4 ]\n"
        + "    } ],\n"
        + "    \"1\" : [ {\n"
        + "      \"nestedStringColumn\" : \"val_0\",\n"
        + "      \"nestedIntsColumn\" : [ 0, 1, 2 ]\n"
        + "    }, {\n"
        + "      \"nestedStringColumn\" : \"val_1\",\n"
        + "      \"nestedIntsColumn\" : [ 1, 2, 3 ]\n"
        + "    }, {\n"
        + "      \"nestedStringColumn\" : \"val_2\",\n"
        + "      \"nestedIntsColumn\" : [ 2, 3, 4 ]\n"
        + "    } ],\n"
        + "    \"2\" : [ {\n"
        + "      \"nestedStringColumn\" : \"val_0\",\n"
        + "      \"nestedIntsColumn\" : [ 0, 1, 2 ]\n"
        + "    }, {\n"
        + "      \"nestedStringColumn\" : \"val_1\",\n"
        + "      \"nestedIntsColumn\" : [ 1, 2, 3 ]\n"
        + "    }, {\n"
        + "      \"nestedStringColumn\" : \"val_2\",\n"
        + "      \"nestedIntsColumn\" : [ 2, 3, 4 ]\n"
        + "    } ]\n"
        + "  }\n"
        + "}";
    Assert.assertEquals(expectedJson, DEFAULT_JSON_WRITER.writeValueAsString(sampled.get(0).getRawValues()));
}
Also used : InputRowListPlusRawValues(org.apache.druid.data.input.InputRowListPlusRawValues) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) InputRow(org.apache.druid.data.input.InputRow) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) JSONPathSpec(org.apache.druid.java.util.common.parsers.JSONPathSpec) JSONPathFieldSpec(org.apache.druid.java.util.common.parsers.JSONPathFieldSpec) InputRowSchema(org.apache.druid.data.input.InputRowSchema) InputEntityReader(org.apache.druid.data.input.InputEntityReader) Test(org.junit.Test)

Aggregations

JSONPathSpec (org.apache.druid.java.util.common.parsers.JSONPathSpec): 53; JSONPathFieldSpec (org.apache.druid.java.util.common.parsers.JSONPathFieldSpec): 44; Test (org.junit.Test): 42; InputEntityReader (org.apache.druid.data.input.InputEntityReader): 33; InputRow (org.apache.druid.data.input.InputRow): 32; TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec): 30; InputRowSchema (org.apache.druid.data.input.InputRowSchema): 28; DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec): 24; InputRowListPlusRawValues (org.apache.druid.data.input.InputRowListPlusRawValues): 17; ArrayList (java.util.ArrayList): 7; JSONParseSpec (org.apache.druid.data.input.impl.JSONParseSpec): 6; JsonInputFormat (org.apache.druid.data.input.impl.JsonInputFormat): 6; Configuration (org.apache.hadoop.conf.Configuration): 6; ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper): 5; Before (org.junit.Before): 5; StringDimensionSchema (org.apache.druid.data.input.impl.StringDimensionSchema): 4; Module (com.fasterxml.jackson.databind.Module): 3; BigDecimal (java.math.BigDecimal): 3; DefaultObjectMapper (org.apache.druid.jackson.DefaultObjectMapper): 3; HashMap (java.util.HashMap): 2