Search in sources :

Example 41 with InputEntityReader

use of org.apache.druid.data.input.InputEntityReader in project druid by druid-io.

the class JsonReaderTest method testSampleEmptyText.

/**
 * Samples an empty input through the reader produced by {@link JsonInputFormat} and verifies
 * that exactly one sample result is returned and that it carries a parse exception.
 */
@Test
public void testSampleEmptyText() throws IOException {
    final JSONPathSpec flattenSpec = new JSONPathSpec(
        true,
        ImmutableList.of(
            new JSONPathFieldSpec(JSONPathFieldType.ROOT, "root_baz", "baz"),
            new JSONPathFieldSpec(JSONPathFieldType.ROOT, "root_baz2", "baz2"),
            new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg"),
            new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg2", "$.o.mg2"),
            new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg", ".o.mg"),
            new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2")
        )
    );
    // The trailing 'false' makes sure JsonReader is used.
    final JsonInputFormat format = new JsonInputFormat(flattenSpec, null, null, false);

    // The input is empty.
    final ByteEntity source = new ByteEntity(StringUtils.toUtf8(""));

    final TimestampSpec timestampSpec = new TimestampSpec("timestamp", "iso", null);
    final DimensionsSpec dimensionsSpec = new DimensionsSpec(
        DimensionsSpec.getDefaultSchemas(ImmutableList.of("bar", "foo"))
    );
    final InputRowSchema schema = new InputRowSchema(timestampSpec, dimensionsSpec, ColumnsFilter.all());
    final InputEntityReader reader = format.createReader(schema, source, null);

    // Exactly one iteration is expected.
    final int expectedSamples = 1;
    try (CloseableIterator<InputRowListPlusRawValues> samples = reader.sample()) {
        int seenSamples = 0;
        for (; samples.hasNext(); seenSamples++) {
            final InputRowListPlusRawValues sample = samples.next();
            Assert.assertNotNull(sample.getParseException());
        }
        Assert.assertEquals(expectedSamples, seenSamples);
    }
}
Also used : InputRowListPlusRawValues(org.apache.druid.data.input.InputRowListPlusRawValues) JSONPathSpec(org.apache.druid.java.util.common.parsers.JSONPathSpec) JSONPathFieldSpec(org.apache.druid.java.util.common.parsers.JSONPathFieldSpec) InputEntityReader(org.apache.druid.data.input.InputEntityReader) InputRowSchema(org.apache.druid.data.input.InputRowSchema) Test(org.junit.Test)

Example 42 with InputEntityReader

use of org.apache.druid.data.input.InputEntityReader in project druid by druid-io.

the class JsonReaderTest method testParseMultipleRows.

/**
 * Reads three JSON objects (the first two are not separated by a newline) and verifies that
 * {@code read()} yields three rows whose plain and flattened dimensions match the input.
 * The {@code o.mg} value of each object equals its 1-based position, which is what the
 * {@code path_omg} / {@code jq_omg} assertions rely on.
 */
@Test
public void testParseMultipleRows() throws IOException {
    final JSONPathSpec flattenSpec = new JSONPathSpec(
        true,
        ImmutableList.of(
            new JSONPathFieldSpec(JSONPathFieldType.ROOT, "root_baz", "baz"),
            new JSONPathFieldSpec(JSONPathFieldType.ROOT, "root_baz2", "baz2"),
            new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg"),
            new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg2", "$.o.mg2"),
            new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg", ".o.mg"),
            new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2")
        )
    );
    // The trailing 'false' makes sure JsonReader is used.
    final JsonInputFormat format = new JsonInputFormat(flattenSpec, null, null, false);

    final String payload =
        "{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4,\"o\":{\"mg\":1}}"
        + "{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4,\"o\":{\"mg\":2}}\n"
        + "{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4,\"o\":{\"mg\":3}}\n";
    final ByteEntity source = new ByteEntity(StringUtils.toUtf8(payload));

    final TimestampSpec timestampSpec = new TimestampSpec("timestamp", "iso", null);
    final DimensionsSpec dimensionsSpec = new DimensionsSpec(
        DimensionsSpec.getDefaultSchemas(ImmutableList.of("bar", "foo"))
    );
    final InputRowSchema schema = new InputRowSchema(timestampSpec, dimensionsSpec, ColumnsFilter.all());
    final InputEntityReader reader = format.createReader(schema, source, null);

    final int expectedRows = 3;
    try (CloseableIterator<InputRow> rows = reader.read()) {
        int seenRows = 0;
        while (rows.hasNext()) {
            final InputRow row = rows.next();
            seenRows++;
            // The n-th row is expected to carry o.mg == n.
            final String expectedMg = String.valueOf(seenRows);

            Assert.assertEquals(DateTimes.of("2019-01-01"), row.getTimestamp());
            Assert.assertEquals("x", Iterables.getOnlyElement(row.getDimension("foo")));
            Assert.assertEquals("4", Iterables.getOnlyElement(row.getDimension("baz")));
            Assert.assertEquals("4", Iterables.getOnlyElement(row.getDimension("root_baz")));
            Assert.assertEquals(expectedMg, Iterables.getOnlyElement(row.getDimension("path_omg")));
            Assert.assertEquals(expectedMg, Iterables.getOnlyElement(row.getDimension("jq_omg")));

            // Flattened fields that point at keys absent from the input must come back empty.
            Assert.assertTrue(row.getDimension("root_baz2").isEmpty());
            Assert.assertTrue(row.getDimension("path_omg2").isEmpty());
            Assert.assertTrue(row.getDimension("jq_omg2").isEmpty());
        }
        Assert.assertEquals(expectedRows, seenRows);
    }
}
Also used : InputRow(org.apache.druid.data.input.InputRow) JSONPathSpec(org.apache.druid.java.util.common.parsers.JSONPathSpec) JSONPathFieldSpec(org.apache.druid.java.util.common.parsers.JSONPathFieldSpec) InputEntityReader(org.apache.druid.data.input.InputEntityReader) InputRowSchema(org.apache.druid.data.input.InputRowSchema) Test(org.junit.Test)

Example 43 with InputEntityReader

use of org.apache.druid.data.input.InputEntityReader in project druid by druid-io.

the class JsonReaderTest method testSampleMultipleRows.

/**
 * Samples three JSON objects (the first two are not separated by a newline) and verifies that
 * every sample result holds all three rows together, that each row's plain and flattened
 * dimensions match the input, and that three rows are seen in total.
 */
@Test
public void testSampleMultipleRows() throws IOException {
    final JSONPathSpec flattenSpec = new JSONPathSpec(
        true,
        ImmutableList.of(
            new JSONPathFieldSpec(JSONPathFieldType.ROOT, "root_baz", "baz"),
            new JSONPathFieldSpec(JSONPathFieldType.ROOT, "root_baz2", "baz2"),
            new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg"),
            new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg2", "$.o.mg2"),
            new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg", ".o.mg"),
            new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2")
        )
    );
    // The trailing 'false' makes sure JsonReader is used.
    final JsonInputFormat format = new JsonInputFormat(flattenSpec, null, null, false);

    final String payload =
        "{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4,\"o\":{\"mg\":1}}"
        + "{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4,\"o\":{\"mg\":2}}\n"
        + "{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4,\"o\":{\"mg\":3}}\n";
    final ByteEntity source = new ByteEntity(StringUtils.toUtf8(payload));

    final TimestampSpec timestampSpec = new TimestampSpec("timestamp", "iso", null);
    final DimensionsSpec dimensionsSpec = new DimensionsSpec(
        DimensionsSpec.getDefaultSchemas(ImmutableList.of("bar", "foo"))
    );
    final InputRowSchema schema = new InputRowSchema(timestampSpec, dimensionsSpec, ColumnsFilter.all());
    final InputEntityReader reader = format.createReader(schema, source, null);

    int actualRowCount = 0;
    try (CloseableIterator<InputRowListPlusRawValues> samples = reader.sample()) {
        while (samples.hasNext()) {
            final InputRowListPlusRawValues sample = samples.next();
            // All 3 rows are returned together in one sample result.
            Assert.assertEquals(3, sample.getInputRows().size());
            for (InputRow row : sample.getInputRows()) {
                actualRowCount++;
                // The n-th row is expected to carry o.mg == n.
                final String expectedMg = String.valueOf(actualRowCount);

                Assert.assertEquals(DateTimes.of("2019-01-01"), row.getTimestamp());
                Assert.assertEquals("x", Iterables.getOnlyElement(row.getDimension("foo")));
                Assert.assertEquals("4", Iterables.getOnlyElement(row.getDimension("baz")));
                Assert.assertEquals("4", Iterables.getOnlyElement(row.getDimension("root_baz")));
                Assert.assertEquals(expectedMg, Iterables.getOnlyElement(row.getDimension("path_omg")));
                Assert.assertEquals(expectedMg, Iterables.getOnlyElement(row.getDimension("jq_omg")));

                // Flattened fields that point at keys absent from the input must come back empty.
                Assert.assertTrue(row.getDimension("root_baz2").isEmpty());
                Assert.assertTrue(row.getDimension("path_omg2").isEmpty());
                Assert.assertTrue(row.getDimension("jq_omg2").isEmpty());
            }
        }
    }
    Assert.assertEquals(3, actualRowCount);
}
Also used : InputRowListPlusRawValues(org.apache.druid.data.input.InputRowListPlusRawValues) JSONPathFieldSpec(org.apache.druid.java.util.common.parsers.JSONPathFieldSpec) InputEntityReader(org.apache.druid.data.input.InputEntityReader) InputRow(org.apache.druid.data.input.InputRow) JSONPathSpec(org.apache.druid.java.util.common.parsers.JSONPathSpec) InputRowSchema(org.apache.druid.data.input.InputRowSchema) Test(org.junit.Test)

Example 44 with InputEntityReader

use of org.apache.druid.data.input.InputEntityReader in project druid by druid-io.

the class JsonReaderTest method testSamplInvalidJSONText.

/**
 * Samples a three-object input whose second object is malformed ({@code "baz":4xxx}) and
 * verifies that exactly one sample result is returned and that it carries a parse exception.
 */
@Test
public void testSamplInvalidJSONText() throws IOException {
    final JSONPathSpec flattenSpec = new JSONPathSpec(
        true,
        ImmutableList.of(
            new JSONPathFieldSpec(JSONPathFieldType.ROOT, "root_baz", "baz"),
            new JSONPathFieldSpec(JSONPathFieldType.ROOT, "root_baz2", "baz2"),
            new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg", "$.o.mg"),
            new JSONPathFieldSpec(JSONPathFieldType.PATH, "path_omg2", "$.o.mg2"),
            new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg", ".o.mg"),
            new JSONPathFieldSpec(JSONPathFieldType.JQ, "jq_omg2", ".o.mg2")
        )
    );
    // The trailing 'false' makes sure JsonReader is used.
    final JsonInputFormat format = new JsonInputFormat(flattenSpec, null, null, false);

    // The 2nd row is ill-formed: the value of baz is invalid.
    final String payload =
        "{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4,\"o\":{\"mg\":1}}"
        + "{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4xxx,\"o\":{\"mg\":2}}\n"
        + "{\"timestamp\":\"2019-01-01\",\"bar\":null,\"foo\":\"x\",\"baz\":4,\"o\":{\"mg\":3}}\n";
    final ByteEntity source = new ByteEntity(StringUtils.toUtf8(payload));

    final TimestampSpec timestampSpec = new TimestampSpec("timestamp", "iso", null);
    final DimensionsSpec dimensionsSpec = new DimensionsSpec(
        DimensionsSpec.getDefaultSchemas(ImmutableList.of("bar", "foo"))
    );
    final InputRowSchema schema = new InputRowSchema(timestampSpec, dimensionsSpec, ColumnsFilter.all());
    final InputEntityReader reader = format.createReader(schema, source, null);

    // The invalid character in the 2nd row stops parsing of the whole 3-row text,
    // so exactly one iteration is expected.
    final int expectedSamples = 1;
    try (CloseableIterator<InputRowListPlusRawValues> samples = reader.sample()) {
        int seenSamples = 0;
        for (; samples.hasNext(); seenSamples++) {
            final InputRowListPlusRawValues sample = samples.next();
            Assert.assertNotNull(sample.getParseException());
        }
        Assert.assertEquals(expectedSamples, seenSamples);
    }
}
Also used : InputRowListPlusRawValues(org.apache.druid.data.input.InputRowListPlusRawValues) JSONPathSpec(org.apache.druid.java.util.common.parsers.JSONPathSpec) JSONPathFieldSpec(org.apache.druid.java.util.common.parsers.JSONPathFieldSpec) InputEntityReader(org.apache.druid.data.input.InputEntityReader) InputRowSchema(org.apache.druid.data.input.InputRowSchema) Test(org.junit.Test)

Example 45 with InputEntityReader

use of org.apache.druid.data.input.InputEntityReader in project druid by druid-io.

the class TimestampsParquetReaderTest method testParseInt96Timestamp.

/**
 * Reads and samples a parquet file with a single int96 timestamp column and verifies both the
 * parsed row timestamp and the JSON rendering of the sampled raw values.
 */
@Test
public void testParseInt96Timestamp() throws IOException {
    // The source parquet file comes from the Apache Spark SQL repo tests, where it is known as
    // impala_timestamp.parq. It has a single column, "ts", which is an int96 timestamp.
    final String parquetPath = "example/timestamps/int96_timestamp.parquet";

    final TimestampSpec timestampSpec = new TimestampSpec("ts", "auto", null);
    final DimensionsSpec dimensionsSpec = new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of()));
    final InputRowSchema rowSchema = new InputRowSchema(timestampSpec, dimensionsSpec, ColumnsFilter.all());

    // First pass: read the rows and check the parsed timestamp of the first one.
    final InputEntityReader rowReader = createReader(parquetPath, rowSchema, JSONPathSpec.DEFAULT);
    final List<InputRow> parsedRows = readAllRows(rowReader);
    Assert.assertEquals("2001-01-01T01:01:01.000Z", parsedRows.get(0).getTimestamp().toString());

    // Second pass: a fresh reader is created for sampling, then the first raw value is checked.
    final InputEntityReader sampleReader = createReader(parquetPath, rowSchema, JSONPathSpec.DEFAULT);
    final List<InputRowListPlusRawValues> sampledRows = sampleAllRows(sampleReader);
    final String expectedRawJson = "{\n"
        + "  \"ts\" : 978310861000\n"
        + "}";
    final String actualRawJson = DEFAULT_JSON_WRITER.writeValueAsString(sampledRows.get(0).getRawValues());
    Assert.assertEquals(expectedRawJson, actualRawJson);
}
Also used : InputRowListPlusRawValues(org.apache.druid.data.input.InputRowListPlusRawValues) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) InputRow(org.apache.druid.data.input.InputRow) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) InputRowSchema(org.apache.druid.data.input.InputRowSchema) InputEntityReader(org.apache.druid.data.input.InputEntityReader) Test(org.junit.Test)

Aggregations

InputEntityReader (org.apache.druid.data.input.InputEntityReader) 58, Test (org.junit.Test) 56, InputRow (org.apache.druid.data.input.InputRow) 54, InputRowSchema (org.apache.druid.data.input.InputRowSchema) 39, TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec) 33, JSONPathSpec (org.apache.druid.java.util.common.parsers.JSONPathSpec) 33, DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec) 31, JSONPathFieldSpec (org.apache.druid.java.util.common.parsers.JSONPathFieldSpec) 28, InputRowListPlusRawValues (org.apache.druid.data.input.InputRowListPlusRawValues) 26, MapBasedInputRow (org.apache.druid.data.input.MapBasedInputRow) 8, Configuration (org.apache.hadoop.conf.Configuration) 8, ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper) 5, AvroHadoopInputRowParserTest (org.apache.druid.data.input.AvroHadoopInputRowParserTest) 5, AvroStreamInputRowParserTest (org.apache.druid.data.input.AvroStreamInputRowParserTest) 5, DefaultObjectMapper (org.apache.druid.jackson.DefaultObjectMapper) 5, KafkaRecordEntity (org.apache.druid.data.input.kafka.KafkaRecordEntity) 4, ConsumerRecord (org.apache.kafka.clients.consumer.ConsumerRecord) 4, Headers (org.apache.kafka.common.header.Headers) 4, RecordHeaders (org.apache.kafka.common.header.internals.RecordHeaders) 4, BigDecimal (java.math.BigDecimal) 3