Search in sources :

Example 31 with DimensionsSpec

use of org.apache.druid.data.input.impl.DimensionsSpec in project druid by druid-io.

the class FlattenSpecParquetReaderTest method testFlat1Flatten.

@Test
public void testFlat1Flatten() throws IOException {
    final String file = "example/flattening/test_flat_1.parquet";
    InputRowSchema schema = new InputRowSchema(new TimestampSpec("timestamp", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2", "dim3", "list"))), ColumnsFilter.all());
    List<JSONPathFieldSpec> flattenExpr = ImmutableList.of(new JSONPathFieldSpec(JSONPathFieldType.ROOT, "timestamp", null), new JSONPathFieldSpec(JSONPathFieldType.ROOT, "dim1", null), new JSONPathFieldSpec(JSONPathFieldType.ROOT, "dim2", null), new JSONPathFieldSpec(JSONPathFieldType.ROOT, "dim3", null), new JSONPathFieldSpec(JSONPathFieldType.PATH, "list", "$.listDim"));
    JSONPathSpec flattenSpec = new JSONPathSpec(false, flattenExpr);
    InputEntityReader reader = createReader(file, schema, flattenSpec);
    List<InputRow> rows = readAllRows(reader);
    Assert.assertEquals(FlattenSpecParquetInputTest.TS1, rows.get(0).getTimestamp().toString());
    Assert.assertEquals("d1v1", rows.get(0).getDimension("dim1").get(0));
    Assert.assertEquals("d2v1", rows.get(0).getDimension("dim2").get(0));
    Assert.assertEquals("1", rows.get(0).getDimension("dim3").get(0));
    Assert.assertEquals("listDim1v1", rows.get(0).getDimension("list").get(0));
    Assert.assertEquals("listDim1v2", rows.get(0).getDimension("list").get(1));
    Assert.assertEquals(1, rows.get(0).getMetric("metric1").longValue());
    reader = createReader(file, schema, flattenSpec);
    List<InputRowListPlusRawValues> sampled = sampleAllRows(reader);
    Assert.assertEquals(FLAT_JSON, DEFAULT_JSON_WRITER.writeValueAsString(sampled.get(0).getRawValues()));
}
Also used : InputRowListPlusRawValues(org.apache.druid.data.input.InputRowListPlusRawValues) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) InputRow(org.apache.druid.data.input.InputRow) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) JSONPathSpec(org.apache.druid.java.util.common.parsers.JSONPathSpec) JSONPathFieldSpec(org.apache.druid.java.util.common.parsers.JSONPathFieldSpec) InputRowSchema(org.apache.druid.data.input.InputRowSchema) InputEntityReader(org.apache.druid.data.input.InputEntityReader) Test(org.junit.Test)

Example 32 with DimensionsSpec

use of org.apache.druid.data.input.impl.DimensionsSpec in project druid by druid-io.

the class TimestampsParquetReaderTest method testDateHandling.

@Test
public void testDateHandling() throws IOException {
    final String file = "example/timestamps/test_date_data.snappy.parquet";
    InputRowSchema schemaAsString = new InputRowSchema(new TimestampSpec("date_as_string", "Y-M-d", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of())), ColumnsFilter.all());
    InputRowSchema schemaAsDate = new InputRowSchema(new TimestampSpec("date_as_date", null, null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of())), ColumnsFilter.all());
    InputEntityReader readerAsString = createReader(file, schemaAsString, JSONPathSpec.DEFAULT);
    InputEntityReader readerAsDate = createReader(file, schemaAsDate, JSONPathSpec.DEFAULT);
    List<InputRow> rowsWithString = readAllRows(readerAsString);
    List<InputRow> rowsWithDate = readAllRows(readerAsDate);
    Assert.assertEquals(rowsWithDate.size(), rowsWithString.size());
    for (int i = 0; i < rowsWithDate.size(); i++) {
        Assert.assertEquals(rowsWithString.get(i).getTimestamp(), rowsWithDate.get(i).getTimestamp());
    }
    readerAsString = createReader(file, schemaAsString, JSONPathSpec.DEFAULT);
    readerAsDate = createReader(file, schemaAsDate, JSONPathSpec.DEFAULT);
    List<InputRowListPlusRawValues> sampledAsString = sampleAllRows(readerAsString);
    List<InputRowListPlusRawValues> sampledAsDate = sampleAllRows(readerAsDate);
    final String expectedJson = "{\n" + "  \"date_as_string\" : \"2017-06-18\",\n" + "  \"timestamp_as_timestamp\" : 1497702471815,\n" + "  \"timestamp_as_string\" : \"2017-06-17 14:27:51.815\",\n" + "  \"idx\" : 1,\n" + "  \"date_as_date\" : 1497744000000\n" + "}";
    Assert.assertEquals(expectedJson, DEFAULT_JSON_WRITER.writeValueAsString(sampledAsString.get(0).getRawValues()));
    Assert.assertEquals(expectedJson, DEFAULT_JSON_WRITER.writeValueAsString(sampledAsDate.get(0).getRawValues()));
}
Also used : InputRowListPlusRawValues(org.apache.druid.data.input.InputRowListPlusRawValues) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) InputRow(org.apache.druid.data.input.InputRow) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) InputRowSchema(org.apache.druid.data.input.InputRowSchema) InputEntityReader(org.apache.druid.data.input.InputEntityReader) Test(org.junit.Test)

Example 33 with DimensionsSpec

use of org.apache.druid.data.input.impl.DimensionsSpec in project druid by druid-io.

the class S3InputSourceTest method testReader.

@Test
public void testReader() throws IOException {
    EasyMock.reset(S3_CLIENT);
    expectListObjects(PREFIXES.get(0), ImmutableList.of(EXPECTED_URIS.get(0)), CONTENT);
    expectListObjects(EXPECTED_URIS.get(1), ImmutableList.of(EXPECTED_URIS.get(1)), CONTENT);
    expectGetObject(EXPECTED_URIS.get(0));
    expectGetObject(EXPECTED_URIS.get(1));
    EasyMock.replay(S3_CLIENT);
    S3InputSource inputSource = new S3InputSource(SERVICE, SERVER_SIDE_ENCRYPTING_AMAZON_S3_BUILDER, INPUT_DATA_CONFIG, null, ImmutableList.of(PREFIXES.get(0), EXPECTED_URIS.get(1)), null, null);
    InputRowSchema someSchema = new InputRowSchema(new TimestampSpec("time", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2"))), ColumnsFilter.all());
    InputSourceReader reader = inputSource.reader(someSchema, new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0), temporaryFolder.newFolder());
    CloseableIterator<InputRow> iterator = reader.read();
    while (iterator.hasNext()) {
        InputRow nextRow = iterator.next();
        Assert.assertEquals(NOW, nextRow.getTimestamp());
        Assert.assertEquals("hello", nextRow.getDimension("dim1").get(0));
        Assert.assertEquals("world", nextRow.getDimension("dim2").get(0));
    }
    EasyMock.verify(S3_CLIENT);
}
Also used : InputSourceReader(org.apache.druid.data.input.InputSourceReader) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) InputRow(org.apache.druid.data.input.InputRow) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat) InputRowSchema(org.apache.druid.data.input.InputRowSchema) InitializedNullHandlingTest(org.apache.druid.testing.InitializedNullHandlingTest) Test(org.junit.Test)

Example 34 with DimensionsSpec

use of org.apache.druid.data.input.impl.DimensionsSpec in project druid by druid-io.

the class S3InputSourceTest method testReaderRetriesOnSdkClientExceptionButNeverSucceedsThenThrows.

@Test(expected = SdkClientException.class)
public void testReaderRetriesOnSdkClientExceptionButNeverSucceedsThenThrows() throws Exception {
    EasyMock.reset(S3_CLIENT);
    expectListObjects(PREFIXES.get(0), ImmutableList.of(EXPECTED_URIS.get(0)), CONTENT);
    expectSdkClientException(EXPECTED_URIS.get(0));
    EasyMock.replay(S3_CLIENT);
    S3InputSource inputSource = new S3InputSource(SERVICE, SERVER_SIDE_ENCRYPTING_AMAZON_S3_BUILDER, INPUT_DATA_CONFIG, null, ImmutableList.of(PREFIXES.get(0)), null, null, // only have three retries since they are slow
    3);
    InputRowSchema someSchema = new InputRowSchema(new TimestampSpec("time", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2"))), ColumnsFilter.all());
    InputSourceReader reader = inputSource.reader(someSchema, new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0), temporaryFolder.newFolder());
    CloseableIterator<InputRow> iterator = reader.read();
    while (iterator.hasNext()) {
        InputRow nextRow = iterator.next();
        Assert.assertEquals(NOW, nextRow.getTimestamp());
        Assert.assertEquals("hello", nextRow.getDimension("dim1").get(0));
        Assert.assertEquals("world", nextRow.getDimension("dim2").get(0));
    }
    EasyMock.verify(S3_CLIENT);
}
Also used : InputSourceReader(org.apache.druid.data.input.InputSourceReader) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) InputRow(org.apache.druid.data.input.InputRow) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat) InputRowSchema(org.apache.druid.data.input.InputRowSchema) InitializedNullHandlingTest(org.apache.druid.testing.InitializedNullHandlingTest) Test(org.junit.Test)

Example 35 with DimensionsSpec

use of org.apache.druid.data.input.impl.DimensionsSpec in project druid by druid-io.

the class BatchDeltaIngestionTest method makeHadoopDruidIndexerConfig.

private HadoopDruidIndexerConfig makeHadoopDruidIndexerConfig(Map<String, Object> inputSpec, File tmpDir, AggregatorFactory[] aggregators) throws Exception {
    HadoopDruidIndexerConfig config = new HadoopDruidIndexerConfig(new HadoopIngestionSpec(new DataSchema("website", MAPPER.convertValue(new StringInputRowParser(new CSVParseSpec(new TimestampSpec("timestamp", "yyyyMMddHH", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host"))), null, ImmutableList.of("timestamp", "host", "host2", "visited_num"), false, 0), null), Map.class), aggregators != null ? aggregators : new AggregatorFactory[] { new LongSumAggregatorFactory("visited_sum", "visited_num"), new HyperUniquesAggregatorFactory("unique_hosts", "host2") }, new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, ImmutableList.of(INTERVAL_FULL)), null, MAPPER), new HadoopIOConfig(inputSpec, null, tmpDir.getCanonicalPath()), new HadoopTuningConfig(tmpDir.getCanonicalPath(), null, null, null, null, null, null, null, null, false, false, false, false, null, false, false, null, null, false, false, null, null, null, null, null)));
    config.setShardSpecs(ImmutableMap.of(INTERVAL_FULL.getStartMillis(), ImmutableList.of(new HadoopyShardSpec(new HashBasedNumberedShardSpec(0, 1, 0, 1, null, HashPartitionFunction.MURMUR3_32_ABS, HadoopDruidIndexerConfig.JSON_MAPPER), 0))));
    config = HadoopDruidIndexerConfig.fromSpec(config.getSchema());
    return config;
}
Also used : HashBasedNumberedShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedShardSpec) LongSumAggregatorFactory(org.apache.druid.query.aggregation.LongSumAggregatorFactory) LongSumAggregatorFactory(org.apache.druid.query.aggregation.LongSumAggregatorFactory) HyperUniquesAggregatorFactory(org.apache.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory) AggregatorFactory(org.apache.druid.query.aggregation.AggregatorFactory) DataSchema(org.apache.druid.segment.indexing.DataSchema) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) CSVParseSpec(org.apache.druid.data.input.impl.CSVParseSpec) StringInputRowParser(org.apache.druid.data.input.impl.StringInputRowParser) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) HyperUniquesAggregatorFactory(org.apache.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap)

Aggregations

DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec)169 Test (org.junit.Test)129 TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec)114 InputRow (org.apache.druid.data.input.InputRow)52 AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory)47 LongSumAggregatorFactory (org.apache.druid.query.aggregation.LongSumAggregatorFactory)47 UniformGranularitySpec (org.apache.druid.segment.indexing.granularity.UniformGranularitySpec)42 DataSchema (org.apache.druid.segment.indexing.DataSchema)39 StringDimensionSchema (org.apache.druid.data.input.impl.StringDimensionSchema)37 CountAggregatorFactory (org.apache.druid.query.aggregation.CountAggregatorFactory)37 InputRowSchema (org.apache.druid.data.input.InputRowSchema)36 Map (java.util.Map)32 InitializedNullHandlingTest (org.apache.druid.testing.InitializedNullHandlingTest)32 InputEntityReader (org.apache.druid.data.input.InputEntityReader)31 ArrayList (java.util.ArrayList)29 CsvInputFormat (org.apache.druid.data.input.impl.CsvInputFormat)25 MapBasedInputRow (org.apache.druid.data.input.MapBasedInputRow)24 JSONPathSpec (org.apache.druid.java.util.common.parsers.JSONPathSpec)24 HashMap (java.util.HashMap)23 ImmutableMap (com.google.common.collect.ImmutableMap)21