Example 1 with CSVParseSpec

Use of org.apache.druid.data.input.impl.CSVParseSpec in project druid by druid-io.

From the class LocalFirehoseFactoryTest, method testConnect:

@Test
public void testConnect() throws IOException {
    try (final Firehose firehose = factory.connect(
        new StringInputRowParser(
            new CSVParseSpec(
                new TimestampSpec("timestamp", "auto", null),
                new DimensionsSpec(DimensionsSpec.getDefaultSchemas(Arrays.asList("timestamp", "a"))),
                ",",
                Arrays.asList("timestamp", "a"),
                false,
                0
            ),
            StandardCharsets.UTF_8.name()
        ),
        null
    )) {
        final List<Row> rows = new ArrayList<>();
        while (firehose.hasMore()) {
            rows.add(firehose.nextRow());
        }
        Assert.assertEquals(5, rows.size());
        rows.sort(Comparator.comparing(Row::getTimestamp));
        for (int i = 0; i < 5; i++) {
            final List<String> dimVals = rows.get(i).getDimension("a");
            Assert.assertEquals(1, dimVals.size());
            Assert.assertEquals(i + "th test file", dimVals.get(0));
        }
    }
}
Also used : Firehose(org.apache.druid.data.input.Firehose) CSVParseSpec(org.apache.druid.data.input.impl.CSVParseSpec) StringInputRowParser(org.apache.druid.data.input.impl.StringInputRowParser) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) ArrayList(java.util.ArrayList) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) Row(org.apache.druid.data.input.Row) InitializedNullHandlingTest(org.apache.druid.testing.InitializedNullHandlingTest) Test(org.junit.Test)
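
Because every argument to these constructors is positional, calls like the one above are easy to misread. The following is a minimal annotated sketch, not code taken from the project: the comments label each argument's role (list delimiter, column list, header flag, rows to skip) as inferred from the test usage on this page, so verify them against the CSVParseSpec constructor in your Druid version.

// Sketch only: argument roles inferred from the calls above, not confirmed against every Druid release.
CSVParseSpec parseSpec = new CSVParseSpec(
    new TimestampSpec("timestamp", "auto", null),                                          // timestamp column and format
    new DimensionsSpec(DimensionsSpec.getDefaultSchemas(Arrays.asList("timestamp", "a"))), // dimensions to index
    ",",                                                                                   // listDelimiter: separator used inside a field for multi-value dimensions
    Arrays.asList("timestamp", "a"),                                                       // column names in file order
    false,                                                                                 // hasHeaderRow
    0                                                                                      // skipHeaderRows
);
StringInputRowParser rowParser = new StringInputRowParser(parseSpec, StandardCharsets.UTF_8.name());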

Example 2 with CSVParseSpec

Use of org.apache.druid.data.input.impl.CSVParseSpec in project druid by druid-io.

From the class MultiValuedDimensionTest, method setup:

@Before
public void setup() throws Exception {
    incrementalIndex = new OnheapIncrementalIndex.Builder().setSimpleTestingIndexSchema(new CountAggregatorFactory("count")).setMaxRowCount(5000).build();
    StringInputRowParser parser = new StringInputRowParser(
        new CSVParseSpec(
            new TimestampSpec("timestamp", "iso", null),
            new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("product", "tags", "othertags"))),
            "\t",
            ImmutableList.of("timestamp", "product", "tags", "othertags"),
            false,
            0
        ),
        "UTF-8"
    );
    String[] rows = new String[] {
        "2011-01-12T00:00:00.000Z,product_1,t1\tt2\tt3,u1\tu2",
        "2011-01-13T00:00:00.000Z,product_2,t3\tt4\tt5,u3\tu4",
        "2011-01-14T00:00:00.000Z,product_3,t5\tt6\tt7,u1\tu5",
        "2011-01-14T00:00:00.000Z,product_4,\"\",u2"
    };
    for (String row : rows) {
        incrementalIndex.add(parser.parse(row));
    }
    persistedSegmentDir = FileUtils.createTempDir();
    TestHelper.getTestIndexMergerV9(segmentWriteOutMediumFactory).persist(incrementalIndex, persistedSegmentDir, new IndexSpec(), null);
    queryableIndex = TestHelper.getTestIndexIO().loadIndex(persistedSegmentDir);
    StringInputRowParser parserNullSampler = new StringInputRowParser(
        new JSONParseSpec(
            new TimestampSpec("time", "iso", null),
            new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("product", "tags", "othertags")))
        ),
        "UTF-8"
    );
    incrementalIndexNullSampler = new OnheapIncrementalIndex.Builder().setSimpleTestingIndexSchema(new CountAggregatorFactory("count")).setMaxRowCount(5000).build();
    String[] rowsNullSampler = new String[] {
        "{\"time\":\"2011-01-13T00:00:00.000Z\",\"product\":\"product_1\",\"tags\":[],\"othertags\":[\"u1\", \"u2\"]}",
        "{\"time\":\"2011-01-12T00:00:00.000Z\",\"product\":\"product_2\",\"othertags\":[\"u3\", \"u4\"]}",
        "{\"time\":\"2011-01-14T00:00:00.000Z\",\"product\":\"product_3\",\"tags\":[\"\"],\"othertags\":[\"u1\", \"u5\"]}",
        "{\"time\":\"2011-01-15T00:00:00.000Z\",\"product\":\"product_4\",\"tags\":[\"t1\", \"t2\", \"\"],\"othertags\":[\"u6\", \"u7\"]}",
        "{\"time\":\"2011-01-16T00:00:00.000Z\",\"product\":\"product_5\",\"tags\":[],\"othertags\":[]}",
        "{\"time\":\"2011-01-16T00:00:00.000Z\",\"product\":\"product_6\"}",
        "{\"time\":\"2011-01-16T00:00:00.000Z\",\"product\":\"product_7\",\"othertags\":[]}",
        "{\"time\":\"2011-01-16T00:00:00.000Z\",\"product\":\"product_8\",\"tags\":[\"\"],\"othertags\":[]}"
    };
    for (String row : rowsNullSampler) {
        incrementalIndexNullSampler.add(parserNullSampler.parse(row));
    }
    persistedSegmentDirNullSampler = FileUtils.createTempDir();
    TestHelper.getTestIndexMergerV9(segmentWriteOutMediumFactory).persist(incrementalIndexNullSampler, persistedSegmentDirNullSampler, new IndexSpec(), null);
    queryableIndexNullSampler = TestHelper.getTestIndexIO().loadIndex(persistedSegmentDirNullSampler);
}
Also used : IndexSpec(org.apache.druid.segment.IndexSpec) CountAggregatorFactory(org.apache.druid.query.aggregation.CountAggregatorFactory) TopNQueryBuilder(org.apache.druid.query.topn.TopNQueryBuilder) CSVParseSpec(org.apache.druid.data.input.impl.CSVParseSpec) StringInputRowParser(org.apache.druid.data.input.impl.StringInputRowParser) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) JSONParseSpec(org.apache.druid.data.input.impl.JSONParseSpec) Before(org.junit.Before)
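
One detail that is easy to miss in the setup above: the "\t" passed to CSVParseSpec is not the column separator (the rows are still comma-separated); it is the list delimiter that splits a single field such as "t1\tt2\tt3" into a multi-value dimension. Below is a minimal sketch of that behaviour, assuming the same constructor usage as the test; the expected dimension values are my reading of the test data, not an assertion copied from the project.

// Sketch only: illustrates the multi-value split implied by the test data above.
StringInputRowParser tagParser = new StringInputRowParser(
    new CSVParseSpec(
        new TimestampSpec("timestamp", "iso", null),
        new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("product", "tags"))),
        "\t",   // list delimiter: splits one CSV field into several dimension values
        ImmutableList.of("timestamp", "product", "tags"),
        false,
        0
    ),
    "UTF-8"
);
// "tags" should come back as the multi-value dimension ["t1", "t2", "t3"].
InputRow parsed = tagParser.parse("2011-01-12T00:00:00.000Z,product_1,t1\tt2\tt3");
List<String> tags = parsed.getDimension("tags");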

Example 3 with CSVParseSpec

Use of org.apache.druid.data.input.impl.CSVParseSpec in project druid by druid-io.

From the class BatchDeltaIngestionTest, method makeHadoopDruidIndexerConfig:

private HadoopDruidIndexerConfig makeHadoopDruidIndexerConfig(Map<String, Object> inputSpec, File tmpDir, AggregatorFactory[] aggregators) throws Exception {
    HadoopDruidIndexerConfig config = new HadoopDruidIndexerConfig(new HadoopIngestionSpec(
        new DataSchema(
            "website",
            MAPPER.convertValue(
                new StringInputRowParser(
                    new CSVParseSpec(
                        new TimestampSpec("timestamp", "yyyyMMddHH", null),
                        new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host"))),
                        null,
                        ImmutableList.of("timestamp", "host", "host2", "visited_num"),
                        false,
                        0
                    ),
                    null
                ),
                Map.class
            ),
            aggregators != null
                ? aggregators
                : new AggregatorFactory[] {
                    new LongSumAggregatorFactory("visited_sum", "visited_num"),
                    new HyperUniquesAggregatorFactory("unique_hosts", "host2")
                },
            new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, ImmutableList.of(INTERVAL_FULL)),
            null,
            MAPPER
        ),
        new HadoopIOConfig(inputSpec, null, tmpDir.getCanonicalPath()),
        new HadoopTuningConfig(
            tmpDir.getCanonicalPath(),
            null, null, null, null, null, null, null, null,
            false, false, false, false,
            null,
            false, false,
            null, null,
            false, false,
            null, null, null, null, null
        )
    ));
    config.setShardSpecs(ImmutableMap.of(
        INTERVAL_FULL.getStartMillis(),
        ImmutableList.of(new HadoopyShardSpec(
            new HashBasedNumberedShardSpec(0, 1, 0, 1, null, HashPartitionFunction.MURMUR3_32_ABS, HadoopDruidIndexerConfig.JSON_MAPPER),
            0
        ))
    ));
    config = HadoopDruidIndexerConfig.fromSpec(config.getSchema());
    return config;
}
Also used : HashBasedNumberedShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedShardSpec) LongSumAggregatorFactory(org.apache.druid.query.aggregation.LongSumAggregatorFactory) LongSumAggregatorFactory(org.apache.druid.query.aggregation.LongSumAggregatorFactory) HyperUniquesAggregatorFactory(org.apache.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory) AggregatorFactory(org.apache.druid.query.aggregation.AggregatorFactory) DataSchema(org.apache.druid.segment.indexing.DataSchema) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) CSVParseSpec(org.apache.druid.data.input.impl.CSVParseSpec) StringInputRowParser(org.apache.druid.data.input.impl.StringInputRowParser) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) HyperUniquesAggregatorFactory(org.apache.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap)

Example 4 with CSVParseSpec

Use of org.apache.druid.data.input.impl.CSVParseSpec in project druid by druid-io.

From the class IndexTaskTest, method testCSVFileWithHeaderColumnOverride:

@Test
public void testCSVFileWithHeaderColumnOverride() throws Exception {
    File tmpDir = temporaryFolder.newFolder();
    File tmpFile = File.createTempFile("druid", "index", tmpDir);
    try (BufferedWriter writer = Files.newWriter(tmpFile, StandardCharsets.UTF_8)) {
        writer.write("time,d,val\n");
        writer.write("2014-01-01T00:00:10Z,a,1\n");
    }
    final TimestampSpec timestampSpec = new TimestampSpec("time", "auto", null);
    final List<String> columns = Arrays.asList("time", "dim", "val");
    final IndexTuningConfig tuningConfig = createTuningConfigWithMaxRowsPerSegment(2, true);
    final IndexIngestionSpec ingestionSpec;
    if (useInputFormatApi) {
        ingestionSpec = createIngestionSpec(jsonMapper, tmpDir, new CSVParseSpec(timestampSpec, DimensionsSpec.EMPTY, null, columns, true, 0), null, null, tuningConfig, false, false);
    } else {
        ingestionSpec = createIngestionSpec(jsonMapper, tmpDir, timestampSpec, DimensionsSpec.EMPTY, new CsvInputFormat(columns, null, null, true, 0), null, null, tuningConfig, false, false);
    }
    IndexTask indexTask = new IndexTask(null, null, ingestionSpec, null);
    final List<DataSegment> segments = runTask(indexTask).rhs;
    Assert.assertEquals(1, segments.size());
    Assert.assertEquals(Collections.singletonList("d"), segments.get(0).getDimensions());
    Assert.assertEquals(Collections.singletonList("val"), segments.get(0).getMetrics());
    Assert.assertEquals(Intervals.of("2014/P1D"), segments.get(0).getInterval());
}
Also used : IndexIngestionSpec(org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec) CSVParseSpec(org.apache.druid.data.input.impl.CSVParseSpec) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat) File(java.io.File) DataSegment(org.apache.druid.timeline.DataSegment) BufferedWriter(java.io.BufferedWriter) IndexTuningConfig(org.apache.druid.indexing.common.task.IndexTask.IndexTuningConfig) Test(org.junit.Test)

Example 5 with CSVParseSpec

Use of org.apache.druid.data.input.impl.CSVParseSpec in project druid by druid-io.

From the class IndexTaskTest, method testCSVFileWithHeader:

@Test
public void testCSVFileWithHeader() throws Exception {
    File tmpDir = temporaryFolder.newFolder();
    File tmpFile = File.createTempFile("druid", "index", tmpDir);
    try (BufferedWriter writer = Files.newWriter(tmpFile, StandardCharsets.UTF_8)) {
        writer.write("time,d,val\n");
        writer.write("2014-01-01T00:00:10Z,a,1\n");
    }
    final TimestampSpec timestampSpec = new TimestampSpec("time", "auto", null);
    final IndexTuningConfig tuningConfig = createTuningConfigWithMaxRowsPerSegment(2, true);
    final IndexIngestionSpec ingestionSpec;
    if (useInputFormatApi) {
        ingestionSpec = createIngestionSpec(jsonMapper, tmpDir, new CSVParseSpec(timestampSpec, DimensionsSpec.EMPTY, null, null, true, 0), null, null, tuningConfig, false, false);
    } else {
        ingestionSpec = createIngestionSpec(jsonMapper, tmpDir, timestampSpec, DimensionsSpec.EMPTY, new CsvInputFormat(null, null, null, true, 0), null, null, tuningConfig, false, false);
    }
    IndexTask indexTask = new IndexTask(null, null, ingestionSpec, null);
    final List<DataSegment> segments = runTask(indexTask).rhs;
    Assert.assertEquals(1, segments.size());
    Assert.assertEquals(Collections.singletonList("d"), segments.get(0).getDimensions());
    Assert.assertEquals(Collections.singletonList("val"), segments.get(0).getMetrics());
    Assert.assertEquals(Intervals.of("2014/P1D"), segments.get(0).getInterval());
}
Also used : IndexIngestionSpec(org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec) CSVParseSpec(org.apache.druid.data.input.impl.CSVParseSpec) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat) File(java.io.File) DataSegment(org.apache.druid.timeline.DataSegment) BufferedWriter(java.io.BufferedWriter) IndexTuningConfig(org.apache.druid.indexing.common.task.IndexTask.IndexTuningConfig) Test(org.junit.Test)
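
Examples 4 and 5 build the same ingestion spec through two paths: the older parse-spec API (CSVParseSpec wrapped in a StringInputRowParser) and the newer input-format API (CsvInputFormat), selected by the tests' useInputFormatApi flag. As a reading aid for the CsvInputFormat calls, here is a minimal sketch with the argument roles labelled; the names (columns, listDelimiter, the legacy header flag, findColumnsFromHeader, skipHeaderRows) are inferred from these call sites and should be treated as an assumption rather than as documentation.

// Sketch only: argument roles inferred from the calls in the two tests above.
CsvInputFormat headerFormat = new CsvInputFormat(
    null,    // columns: omitted here because they are discovered from the header row
    null,    // listDelimiter: none, so no multi-value splitting
    null,    // legacy header flag, unused by these tests
    true,    // findColumnsFromHeader: take column names from the first row of the file
    0        // skipHeaderRows
);

Example 4 passes an explicit columns list as the first argument instead of null; on the input-format path that is the only difference between the two variants.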

Aggregations

CSVParseSpec (org.apache.druid.data.input.impl.CSVParseSpec): 16
TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec): 15
DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec): 12
StringInputRowParser (org.apache.druid.data.input.impl.StringInputRowParser): 12
Test (org.junit.Test): 11
BufferedWriter (java.io.BufferedWriter): 9
File (java.io.File): 9
CsvInputFormat (org.apache.druid.data.input.impl.CsvInputFormat): 9
IndexIngestionSpec (org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec): 9
IndexTuningConfig (org.apache.druid.indexing.common.task.IndexTask.IndexTuningConfig): 9
DataSegment (org.apache.druid.timeline.DataSegment): 9
ArrayList (java.util.ArrayList): 8
JSONParseSpec (org.apache.druid.data.input.impl.JSONParseSpec): 7
AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory): 7
LongSumAggregatorFactory (org.apache.druid.query.aggregation.LongSumAggregatorFactory): 7
DataSchema (org.apache.druid.segment.indexing.DataSchema): 7
UniformGranularitySpec (org.apache.druid.segment.indexing.granularity.UniformGranularitySpec): 7
Before (org.junit.Before): 7
ImmutableMap (com.google.common.collect.ImmutableMap): 6
HashSet (java.util.HashSet): 6