
Example 46 with InputRowSchema

use of org.apache.druid.data.input.InputRowSchema in project druid by druid-io.

the class WikiParquetReaderTest method testWiki.

@Test
public void testWiki() throws IOException {
    InputRowSchema schema = new InputRowSchema(
        new TimestampSpec("timestamp", "iso", null),
        new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("page", "language", "user", "unpatrolled"))),
        ColumnsFilter.all()
    );
    InputEntityReader reader = createReader("example/wiki/wiki.parquet", schema, JSONPathSpec.DEFAULT);
    List<InputRow> rows = readAllRows(reader);
    Assert.assertEquals("Gypsy Danger", rows.get(0).getDimension("page").get(0));
    String s1 = rows.get(0).getDimension("language").get(0);
    String s2 = rows.get(0).getDimension("language").get(1);
    Assert.assertEquals("en", s1);
    Assert.assertEquals("zh", s2);
    reader = createReader("example/wiki/wiki.parquet", schema, JSONPathSpec.DEFAULT);
    List<InputRowListPlusRawValues> sampled = sampleAllRows(reader);
    final String expectedJson = "{\n"
        + "  \"continent\" : \"North America\",\n"
        + "  \"country\" : \"United States\",\n"
        + "  \"added\" : 57,\n"
        + "  \"city\" : \"San Francisco\",\n"
        + "  \"unpatrolled\" : \"true\",\n"
        + "  \"delta\" : -143,\n"
        + "  \"language\" : [ \"en\", \"zh\" ],\n"
        + "  \"robot\" : \"false\",\n"
        + "  \"deleted\" : 200,\n"
        + "  \"newPage\" : \"true\",\n"
        + "  \"namespace\" : \"article\",\n"
        + "  \"anonymous\" : \"false\",\n"
        + "  \"page\" : \"Gypsy Danger\",\n"
        + "  \"region\" : \"Bay Area\",\n"
        + "  \"user\" : \"nuclear\",\n"
        + "  \"timestamp\" : \"2013-08-31T01:02:33Z\"\n"
        + "}";
    Assert.assertEquals(expectedJson, DEFAULT_JSON_WRITER.writeValueAsString(sampled.get(0).getRawValues()));
}
Also used : InputRowListPlusRawValues(org.apache.druid.data.input.InputRowListPlusRawValues), TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec), InputRow(org.apache.druid.data.input.InputRow), DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec), InputRowSchema(org.apache.druid.data.input.InputRowSchema), InputEntityReader(org.apache.druid.data.input.InputEntityReader), Test(org.junit.Test)
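The helpers createReader, readAllRows, and sampleAllRows used throughout these Parquet examples live in the test base class and are not shown on this page. Below is a minimal sketch of what they might look like, assuming Druid's ParquetInputFormat(flattenSpec, binaryAsString, conf) constructor, a FileEntity source, and a JUnit TemporaryFolder rule; these are assumptions for illustration, not the actual test code.

private InputEntityReader createReader(String path, InputRowSchema schema, JSONPathSpec flattenSpec) throws IOException {
    // Hypothetical plumbing: wrap the file and hand it to the Parquet input format
    // (binaryAsString=false here; the four-argument variant below toggles it).
    ParquetInputFormat format = new ParquetInputFormat(flattenSpec, false, new Configuration());
    return format.createReader(schema, new FileEntity(new File(path)), temporaryFolder.newFolder());
}

private static List<InputRow> readAllRows(InputEntityReader reader) throws IOException {
    // Drain reader.read() into a list, closing the iterator when done.
    List<InputRow> rows = new ArrayList<>();
    try (CloseableIterator<InputRow> iterator = reader.read()) {
        iterator.forEachRemaining(rows::add);
    }
    return rows;
}

private static List<InputRowListPlusRawValues> sampleAllRows(InputEntityReader reader) throws IOException {
    // Same idea via reader.sample(), which also surfaces the raw pre-parse values.
    List<InputRowListPlusRawValues> rows = new ArrayList<>();
    try (CloseableIterator<InputRowListPlusRawValues> iterator = reader.sample()) {
        iterator.forEachRemaining(rows::add);
    }
    return rows;
}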

Example 47 with InputRowSchema

use of org.apache.druid.data.input.InputRowSchema in project druid by druid-io.

the class GoogleCloudStorageInputSourceTest method testReader.

@Test
public void testReader() throws IOException {
    EasyMock.reset(STORAGE);
    EasyMock.reset(INPUT_DATA_CONFIG);
    addExpectedPrefixObjects(PREFIXES.get(0), ImmutableList.of(EXPECTED_URIS.get(0)));
    addExpectedGetObjectMock(EXPECTED_URIS.get(0));
    addExpectedPrefixObjects(PREFIXES.get(1), ImmutableList.of(EXPECTED_URIS.get(1)));
    addExpectedGetObjectMock(EXPECTED_URIS.get(1));
    EasyMock.expect(INPUT_DATA_CONFIG.getMaxListingLength()).andReturn(MAX_LISTING_LENGTH);
    EasyMock.replay(STORAGE);
    EasyMock.replay(INPUT_DATA_CONFIG);
    GoogleCloudStorageInputSource inputSource = new GoogleCloudStorageInputSource(STORAGE, INPUT_DATA_CONFIG, null, PREFIXES, null);
    InputRowSchema someSchema = new InputRowSchema(
        new TimestampSpec("time", "auto", null),
        new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2"))),
        ColumnsFilter.all()
    );
    InputSourceReader reader = inputSource.reader(
        someSchema,
        new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0),
        null
    );
    CloseableIterator<InputRow> iterator = reader.read();
    while (iterator.hasNext()) {
        InputRow nextRow = iterator.next();
        Assert.assertEquals(NOW, nextRow.getTimestamp());
        Assert.assertEquals("hello", nextRow.getDimension("dim1").get(0));
        Assert.assertEquals("world", nextRow.getDimension("dim2").get(0));
    }
}
Also used : InputSourceReader(org.apache.druid.data.input.InputSourceReader), TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec), InputRow(org.apache.druid.data.input.InputRow), DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec), CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat), InputRowSchema(org.apache.druid.data.input.InputRowSchema), InitializedNullHandlingTest(org.apache.druid.testing.InitializedNullHandlingTest), Test(org.junit.Test)
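A note on the CsvInputFormat constructor above: the second argument "|" is the listDelimiter, the separator for multi-value fields within a single column; the CSV fields themselves stay comma-separated. The following self-contained sketch reads one in-memory line with the same schema and format, bypassing the mocked GCS plumbing; the timestamp value is illustrative, and it assumes the text-format reader ignores the temporary-directory argument.

InputRowSchema someSchema = new InputRowSchema(
    new TimestampSpec("time", "auto", null),
    new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2"))),
    ColumnsFilter.all()
);
CsvInputFormat format = new CsvInputFormat(
    ImmutableList.of("time", "dim1", "dim2"), // column names, since the data carries no header row
    "|",   // listDelimiter: splits multi-value fields, not the CSV field separator
    false, // hasHeaderRow (deprecated flag)
    null,  // findColumnsFromHeader
    0      // skipHeaderRows
);
ByteEntity entity = new ByteEntity(StringUtils.toUtf8("2021-01-01T00:00:00Z,hello,world\n"));
try (CloseableIterator<InputRow> iterator = format.createReader(someSchema, entity, null).read()) {
    InputRow row = iterator.next();
    // row.getDimension("dim1") -> ["hello"], row.getDimension("dim2") -> ["world"]
}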

Example 48 with InputRowSchema

use of org.apache.druid.data.input.InputRowSchema in project druid by druid-io.

the class CompatParquetReaderTest method testBinaryAsString.

@Test
public void testBinaryAsString() throws IOException {
    final String file = "example/compat/284a0e001476716b-56d5676f53bd6e85_115466471_data.0.parq";
    InputRowSchema schema = new InputRowSchema(
        new TimestampSpec("ts", "auto", null),
        new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("field"))),
        ColumnsFilter.all()
    );
    InputEntityReader reader = createReader(file, schema, JSONPathSpec.DEFAULT, true);
    InputEntityReader readerNotAsString = createReader(file, schema, JSONPathSpec.DEFAULT, false);
    List<InputRow> rows = readAllRows(reader);
    List<InputRow> rowsAsBinary = readAllRows(readerNotAsString);
    Assert.assertEquals("hey this is &é(-è_çà)=^$ù*! Ω^^", rows.get(0).getDimension("field").get(0));
    Assert.assertEquals(1471800234, rows.get(0).getTimestampFromEpoch());
    Assert.assertEquals("aGV5IHRoaXMgaXMgJsOpKC3DqF/Dp8OgKT1eJMO5KiEgzqleXg==", rowsAsBinary.get(0).getDimension("field").get(0));
    Assert.assertEquals(1471800234, rowsAsBinary.get(0).getTimestampFromEpoch());
    reader = createReader(file, schema, JSONPathSpec.DEFAULT, true);
    readerNotAsString = createReader(file, schema, JSONPathSpec.DEFAULT, false);
    List<InputRowListPlusRawValues> sampled = sampleAllRows(reader);
    List<InputRowListPlusRawValues> sampledAsBinary = sampleAllRows(readerNotAsString);
    final String expectedJson = "{\n"
        + "  \"field\" : \"hey this is &é(-è_çà)=^$ù*! Ω^^\",\n"
        + "  \"ts\" : 1471800234\n"
        + "}";
    Assert.assertEquals(expectedJson, DEFAULT_JSON_WRITER.writeValueAsString(sampled.get(0).getRawValues()));
    final String expectedJsonBinary = "{\n"
        + "  \"field\" : \"aGV5IHRoaXMgaXMgJsOpKC3DqF/Dp8OgKT1eJMO5KiEgzqleXg==\",\n"
        + "  \"ts\" : 1471800234\n"
        + "}";
    Assert.assertEquals(expectedJsonBinary, DEFAULT_JSON_WRITER.writeValueAsString(sampledAsBinary.get(0).getRawValues()));
}
Also used : InputRowListPlusRawValues(org.apache.druid.data.input.InputRowListPlusRawValues), TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec), InputRow(org.apache.druid.data.input.InputRow), DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec), InputRowSchema(org.apache.druid.data.input.InputRowSchema), InputEntityReader(org.apache.druid.data.input.InputEntityReader), Test(org.junit.Test)
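The two variants are views of the same bytes: with binaryAsString disabled, the field surfaces as Base64. A quick plain-JDK check, separate from the test itself, confirms the Base64 form decodes back to the UTF-8 text:

import java.nio.charset.StandardCharsets;
import java.util.Base64;

byte[] decoded = Base64.getDecoder().decode("aGV5IHRoaXMgaXMgJsOpKC3DqF/Dp8OgKT1eJMO5KiEgzqleXg==");
String text = new String(decoded, StandardCharsets.UTF_8);
// text.equals("hey this is &é(-è_çà)=^$ù*! Ω^^") -> true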

Example 49 with InputRowSchema

use of org.apache.druid.data.input.InputRowSchema in project druid by druid-io.

the class CompatParquetReaderTest method testParquetThriftCompat.

@Test
public void testParquetThriftCompat() throws IOException {
    /*
      message ParquetSchema {
        required boolean boolColumn;
        required int32 byteColumn;
        required int32 shortColumn;
        required int32 intColumn;
        required int64 longColumn;
        required double doubleColumn;
        required binary binaryColumn (UTF8);
        required binary stringColumn (UTF8);
        required binary enumColumn (ENUM);
        optional boolean maybeBoolColumn;
        optional int32 maybeByteColumn;
        optional int32 maybeShortColumn;
        optional int32 maybeIntColumn;
        optional int64 maybeLongColumn;
        optional double maybeDoubleColumn;
        optional binary maybeBinaryColumn (UTF8);
        optional binary maybeStringColumn (UTF8);
        optional binary maybeEnumColumn (ENUM);
        required group stringsColumn (LIST) {
          repeated binary stringsColumn_tuple (UTF8);
        }
        required group intSetColumn (LIST) {
          repeated int32 intSetColumn_tuple;
        }
        required group intToStringColumn (MAP) {
          repeated group map (MAP_KEY_VALUE) {
            required int32 key;
            optional binary value (UTF8);
          }
        }
        required group complexColumn (MAP) {
          repeated group map (MAP_KEY_VALUE) {
            required int32 key;
            optional group value (LIST) {
              repeated group value_tuple {
                required group nestedIntsColumn (LIST) {
                  repeated int32 nestedIntsColumn_tuple;
                }
                required binary nestedStringColumn (UTF8);
              }
            }
          }
        }
      }
     */
    final String file = "example/compat/parquet-thrift-compat.snappy.parquet";
    InputRowSchema schema = new InputRowSchema(
        new TimestampSpec("timestamp", "auto", DateTimes.of("2018-09-01T00:00:00.000Z")),
        new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of())),
        ColumnsFilter.all()
    );
    List<JSONPathFieldSpec> flattenExpr = ImmutableList.of(
        new JSONPathFieldSpec(JSONPathFieldType.PATH, "extractByLogicalMap", "$.intToStringColumn.1"),
        new JSONPathFieldSpec(JSONPathFieldType.PATH, "extractByComplexLogicalMap", "$.complexColumn.1[0].nestedIntsColumn[1]")
    );
    JSONPathSpec flattenSpec = new JSONPathSpec(true, flattenExpr);
    InputEntityReader reader = createReader(file, schema, flattenSpec);
    List<InputRow> rows = readAllRows(reader);
    Assert.assertEquals("2018-09-01T00:00:00.000Z", rows.get(0).getTimestamp().toString());
    Assert.assertEquals("true", rows.get(0).getDimension("boolColumn").get(0));
    Assert.assertEquals("0", rows.get(0).getDimension("byteColumn").get(0));
    Assert.assertEquals("1", rows.get(0).getDimension("shortColumn").get(0));
    Assert.assertEquals("2", rows.get(0).getDimension("intColumn").get(0));
    Assert.assertEquals("0", rows.get(0).getDimension("longColumn").get(0));
    Assert.assertEquals("0.2", rows.get(0).getDimension("doubleColumn").get(0));
    Assert.assertEquals("val_0", rows.get(0).getDimension("binaryColumn").get(0));
    Assert.assertEquals("val_0", rows.get(0).getDimension("stringColumn").get(0));
    Assert.assertEquals("SPADES", rows.get(0).getDimension("enumColumn").get(0));
    Assert.assertTrue(rows.get(0).getDimension("maybeBoolColumn").isEmpty());
    Assert.assertTrue(rows.get(0).getDimension("maybeByteColumn").isEmpty());
    Assert.assertTrue(rows.get(0).getDimension("maybeShortColumn").isEmpty());
    Assert.assertTrue(rows.get(0).getDimension("maybeIntColumn").isEmpty());
    Assert.assertTrue(rows.get(0).getDimension("maybeLongColumn").isEmpty());
    Assert.assertTrue(rows.get(0).getDimension("maybeDoubleColumn").isEmpty());
    Assert.assertTrue(rows.get(0).getDimension("maybeBinaryColumn").isEmpty());
    Assert.assertTrue(rows.get(0).getDimension("maybeStringColumn").isEmpty());
    Assert.assertTrue(rows.get(0).getDimension("maybeEnumColumn").isEmpty());
    Assert.assertEquals("arr_0", rows.get(0).getDimension("stringsColumn").get(0));
    Assert.assertEquals("arr_1", rows.get(0).getDimension("stringsColumn").get(1));
    Assert.assertEquals("0", rows.get(0).getDimension("intSetColumn").get(0));
    Assert.assertEquals("val_1", rows.get(0).getDimension("extractByLogicalMap").get(0));
    Assert.assertEquals("1", rows.get(0).getDimension("extractByComplexLogicalMap").get(0));
    reader = createReader(file, schema, flattenSpec);
    List<InputRowListPlusRawValues> sampled = sampleAllRows(reader);
    final String expectedJson = "{\n"
        + "  \"enumColumn\" : \"SPADES\",\n"
        + "  \"maybeStringColumn\" : { },\n"
        + "  \"maybeBinaryColumn\" : { },\n"
        + "  \"shortColumn\" : 1,\n"
        + "  \"byteColumn\" : 0,\n"
        + "  \"maybeBoolColumn\" : { },\n"
        + "  \"intColumn\" : 2,\n"
        + "  \"doubleColumn\" : 0.2,\n"
        + "  \"maybeByteColumn\" : { },\n"
        + "  \"intSetColumn\" : [ 0 ],\n"
        + "  \"boolColumn\" : true,\n"
        + "  \"binaryColumn\" : \"val_0\",\n"
        + "  \"maybeIntColumn\" : { },\n"
        + "  \"intToStringColumn\" : {\n"
        + "    \"0\" : \"val_0\",\n"
        + "    \"1\" : \"val_1\",\n"
        + "    \"2\" : \"val_2\"\n"
        + "  },\n"
        + "  \"maybeDoubleColumn\" : { },\n"
        + "  \"maybeEnumColumn\" : { },\n"
        + "  \"maybeLongColumn\" : { },\n"
        + "  \"stringsColumn\" : [ \"arr_0\", \"arr_1\", \"arr_2\" ],\n"
        + "  \"longColumn\" : 0,\n"
        + "  \"stringColumn\" : \"val_0\",\n"
        + "  \"maybeShortColumn\" : { },\n"
        + "  \"complexColumn\" : {\n"
        + "    \"0\" : [ {\n"
        + "      \"nestedStringColumn\" : \"val_0\",\n"
        + "      \"nestedIntsColumn\" : [ 0, 1, 2 ]\n"
        + "    }, {\n"
        + "      \"nestedStringColumn\" : \"val_1\",\n"
        + "      \"nestedIntsColumn\" : [ 1, 2, 3 ]\n"
        + "    }, {\n"
        + "      \"nestedStringColumn\" : \"val_2\",\n"
        + "      \"nestedIntsColumn\" : [ 2, 3, 4 ]\n"
        + "    } ],\n"
        + "    \"1\" : [ {\n"
        + "      \"nestedStringColumn\" : \"val_0\",\n"
        + "      \"nestedIntsColumn\" : [ 0, 1, 2 ]\n"
        + "    }, {\n"
        + "      \"nestedStringColumn\" : \"val_1\",\n"
        + "      \"nestedIntsColumn\" : [ 1, 2, 3 ]\n"
        + "    }, {\n"
        + "      \"nestedStringColumn\" : \"val_2\",\n"
        + "      \"nestedIntsColumn\" : [ 2, 3, 4 ]\n"
        + "    } ],\n"
        + "    \"2\" : [ {\n"
        + "      \"nestedStringColumn\" : \"val_0\",\n"
        + "      \"nestedIntsColumn\" : [ 0, 1, 2 ]\n"
        + "    }, {\n"
        + "      \"nestedStringColumn\" : \"val_1\",\n"
        + "      \"nestedIntsColumn\" : [ 1, 2, 3 ]\n"
        + "    }, {\n"
        + "      \"nestedStringColumn\" : \"val_2\",\n"
        + "      \"nestedIntsColumn\" : [ 2, 3, 4 ]\n"
        + "    } ]\n"
        + "  }\n"
        + "}";
    Assert.assertEquals(expectedJson, DEFAULT_JSON_WRITER.writeValueAsString(sampled.get(0).getRawValues()));
}
Also used : InputRowListPlusRawValues(org.apache.druid.data.input.InputRowListPlusRawValues), TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec), InputRow(org.apache.druid.data.input.InputRow), DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec), JSONPathSpec(org.apache.druid.java.util.common.parsers.JSONPathSpec), JSONPathFieldSpec(org.apache.druid.java.util.common.parsers.JSONPathFieldSpec), InputRowSchema(org.apache.druid.data.input.InputRowSchema), InputEntityReader(org.apache.druid.data.input.InputEntityReader), Test(org.junit.Test)
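To make the two flatten expressions concrete, here is a plain-Java walk over a hand-built copy of the sampled structure above; this illustrates the path semantics only and is not Druid's JSONPath implementation:

// "$.intToStringColumn.1" selects the value under key "1":
Map<String, String> intToStringColumn = ImmutableMap.of("0", "val_0", "1", "val_1", "2", "val_2");
String extractByLogicalMap = intToStringColumn.get("1"); // "val_1"

// "$.complexColumn.1[0].nestedIntsColumn[1]" selects key "1", then the first
// list entry, then the second element of its nestedIntsColumn:
Map<String, Object> entry = ImmutableMap.of(
    "nestedStringColumn", "val_0",
    "nestedIntsColumn", ImmutableList.of(0, 1, 2));
Map<String, List<Map<String, Object>>> complexColumn = ImmutableMap.of("1", ImmutableList.of(entry));
List<?> nestedInts = (List<?>) complexColumn.get("1").get(0).get("nestedIntsColumn");
int extractByComplexLogicalMap = (Integer) nestedInts.get(1); // 1, matching the assertion above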

Example 50 with InputRowSchema

use of org.apache.druid.data.input.InputRowSchema in project druid by druid-io.

the class CompatParquetReaderTest method testProtoStructWithArray.

@Test
public void testProtoStructWithArray() throws IOException {
    final String file = "example/compat/proto-struct-with-array.parquet";
    InputRowSchema schema = new InputRowSchema(
        new TimestampSpec("timestamp", "auto", DateTimes.of("2018-09-01T00:00:00.000Z")),
        new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of())),
        ColumnsFilter.all()
    );
    List<JSONPathFieldSpec> flattenExpr = ImmutableList.of(
        new JSONPathFieldSpec(JSONPathFieldType.PATH, "extractedOptional", "$.optionalMessage.someId"),
        new JSONPathFieldSpec(JSONPathFieldType.PATH, "extractedRequired", "$.requiredMessage.someId"),
        new JSONPathFieldSpec(JSONPathFieldType.PATH, "extractedRepeated", "$.repeatedMessage[*]")
    );
    JSONPathSpec flattenSpec = new JSONPathSpec(true, flattenExpr);
    InputEntityReader reader = createReader(file, schema, flattenSpec);
    List<InputRow> rows = readAllRows(reader);
    Assert.assertEquals("2018-09-01T00:00:00.000Z", rows.get(0).getTimestamp().toString());
    Assert.assertEquals("10", rows.get(0).getDimension("optionalPrimitive").get(0));
    Assert.assertEquals("9", rows.get(0).getDimension("requiredPrimitive").get(0));
    Assert.assertTrue(rows.get(0).getDimension("repeatedPrimitive").isEmpty());
    Assert.assertTrue(rows.get(0).getDimension("extractedOptional").isEmpty());
    Assert.assertEquals("9", rows.get(0).getDimension("extractedRequired").get(0));
    Assert.assertEquals("9", rows.get(0).getDimension("extractedRepeated").get(0));
    Assert.assertEquals("10", rows.get(0).getDimension("extractedRepeated").get(1));
    reader = createReader(file, schema, flattenSpec);
    List<InputRowListPlusRawValues> sampled = sampleAllRows(reader);
    final String expectedJson = "{\n"
        + "  \"optionalMessage\" : { },\n"
        + "  \"requiredPrimitive\" : 9,\n"
        + "  \"repeatedPrimitive\" : { },\n"
        + "  \"repeatedMessage\" : [ 9, 10 ],\n"
        + "  \"optionalPrimitive\" : 10,\n"
        + "  \"requiredMessage\" : {\n"
        + "    \"someId\" : 9\n"
        + "  }\n"
        + "}";
    Assert.assertEquals(expectedJson, DEFAULT_JSON_WRITER.writeValueAsString(sampled.get(0).getRawValues()));
}
Also used : InputRowListPlusRawValues(org.apache.druid.data.input.InputRowListPlusRawValues), TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec), InputRow(org.apache.druid.data.input.InputRow), DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec), JSONPathSpec(org.apache.druid.java.util.common.parsers.JSONPathSpec), JSONPathFieldSpec(org.apache.druid.java.util.common.parsers.JSONPathFieldSpec), InputRowSchema(org.apache.druid.data.input.InputRowSchema), InputEntityReader(org.apache.druid.data.input.InputEntityReader), Test(org.junit.Test)
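One detail worth calling out: the wildcard path "$.repeatedMessage[*]" selects every element of the array, which is why extractedRepeated comes back as a multi-value dimension (["9", "10"]), while extractedOptional is empty because optionalMessage carries no someId in this file. A minimal sketch of the wildcard semantics (illustration only, not Druid's flattener):

// "$.repeatedMessage[*]" keeps every array element; Druid then surfaces the
// result as a stringified multi-value dimension:
List<Integer> repeatedMessage = ImmutableList.of(9, 10);
List<String> asDimension = repeatedMessage.stream()
    .map(String::valueOf)
    .collect(Collectors.toList()); // ["9", "10"]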

Aggregations

InputRowSchema (org.apache.druid.data.input.InputRowSchema): 63 uses
Test (org.junit.Test): 55 uses
InputRow (org.apache.druid.data.input.InputRow): 52 uses
InputEntityReader (org.apache.druid.data.input.InputEntityReader): 39 uses
TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec): 37 uses
DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec): 36 uses
JSONPathSpec (org.apache.druid.java.util.common.parsers.JSONPathSpec): 29 uses
JSONPathFieldSpec (org.apache.druid.java.util.common.parsers.JSONPathFieldSpec): 26 uses
InputRowListPlusRawValues (org.apache.druid.data.input.InputRowListPlusRawValues): 24 uses
InputSourceReader (org.apache.druid.data.input.InputSourceReader): 10 uses
ByteEntity (org.apache.druid.data.input.impl.ByteEntity): 9 uses
CsvInputFormat (org.apache.druid.data.input.impl.CsvInputFormat): 9 uses
InitializedNullHandlingTest (org.apache.druid.testing.InitializedNullHandlingTest): 9 uses
File (java.io.File): 7 uses
KafkaRecordEntity (org.apache.druid.data.input.kafka.KafkaRecordEntity): 5 uses
ArrayList (java.util.ArrayList): 4 uses
Collections (java.util.Collections): 4 uses
List (java.util.List): 4 uses
Map (java.util.Map): 4 uses
Nullable (javax.annotation.Nullable): 4 uses