Search in sources :

Example 1 with CsvInputFormat

use of org.apache.druid.data.input.impl.CsvInputFormat in project druid by druid-io.

the class S3InputSourceTest method testReader.

@Test
public void testReader() throws IOException {
    EasyMock.reset(S3_CLIENT);
    expectListObjects(PREFIXES.get(0), ImmutableList.of(EXPECTED_URIS.get(0)), CONTENT);
    expectListObjects(EXPECTED_URIS.get(1), ImmutableList.of(EXPECTED_URIS.get(1)), CONTENT);
    expectGetObject(EXPECTED_URIS.get(0));
    expectGetObject(EXPECTED_URIS.get(1));
    EasyMock.replay(S3_CLIENT);
    S3InputSource inputSource = new S3InputSource(SERVICE, SERVER_SIDE_ENCRYPTING_AMAZON_S3_BUILDER, INPUT_DATA_CONFIG, null, ImmutableList.of(PREFIXES.get(0), EXPECTED_URIS.get(1)), null, null);
    InputRowSchema someSchema = new InputRowSchema(new TimestampSpec("time", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2"))), ColumnsFilter.all());
    InputSourceReader reader = inputSource.reader(someSchema, new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0), temporaryFolder.newFolder());
    CloseableIterator<InputRow> iterator = reader.read();
    while (iterator.hasNext()) {
        InputRow nextRow = iterator.next();
        Assert.assertEquals(NOW, nextRow.getTimestamp());
        Assert.assertEquals("hello", nextRow.getDimension("dim1").get(0));
        Assert.assertEquals("world", nextRow.getDimension("dim2").get(0));
    }
    EasyMock.verify(S3_CLIENT);
}
Also used : InputSourceReader(org.apache.druid.data.input.InputSourceReader) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) InputRow(org.apache.druid.data.input.InputRow) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat) InputRowSchema(org.apache.druid.data.input.InputRowSchema) InitializedNullHandlingTest(org.apache.druid.testing.InitializedNullHandlingTest) Test(org.junit.Test)

Example 2 with CsvInputFormat

use of org.apache.druid.data.input.impl.CsvInputFormat in project druid by druid-io.

the class S3InputSourceTest method testReaderRetriesOnSdkClientExceptionButNeverSucceedsThenThrows.

@Test(expected = SdkClientException.class)
public void testReaderRetriesOnSdkClientExceptionButNeverSucceedsThenThrows() throws Exception {
    EasyMock.reset(S3_CLIENT);
    expectListObjects(PREFIXES.get(0), ImmutableList.of(EXPECTED_URIS.get(0)), CONTENT);
    expectSdkClientException(EXPECTED_URIS.get(0));
    EasyMock.replay(S3_CLIENT);
    S3InputSource inputSource = new S3InputSource(SERVICE, SERVER_SIDE_ENCRYPTING_AMAZON_S3_BUILDER, INPUT_DATA_CONFIG, null, ImmutableList.of(PREFIXES.get(0)), null, null, // only have three retries since they are slow
    3);
    InputRowSchema someSchema = new InputRowSchema(new TimestampSpec("time", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2"))), ColumnsFilter.all());
    InputSourceReader reader = inputSource.reader(someSchema, new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0), temporaryFolder.newFolder());
    CloseableIterator<InputRow> iterator = reader.read();
    while (iterator.hasNext()) {
        InputRow nextRow = iterator.next();
        Assert.assertEquals(NOW, nextRow.getTimestamp());
        Assert.assertEquals("hello", nextRow.getDimension("dim1").get(0));
        Assert.assertEquals("world", nextRow.getDimension("dim2").get(0));
    }
    EasyMock.verify(S3_CLIENT);
}
Also used : InputSourceReader(org.apache.druid.data.input.InputSourceReader) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) InputRow(org.apache.druid.data.input.InputRow) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat) InputRowSchema(org.apache.druid.data.input.InputRowSchema) InitializedNullHandlingTest(org.apache.druid.testing.InitializedNullHandlingTest) Test(org.junit.Test)

Example 3 with CsvInputFormat

use of org.apache.druid.data.input.impl.CsvInputFormat in project druid by druid-io.

the class CompactionTaskParallelRunTest method runIndexTask.

private void runIndexTask(@Nullable PartitionsSpec partitionsSpec, boolean appendToExisting) {
    ParallelIndexIOConfig ioConfig = new ParallelIndexIOConfig(null, new LocalInputSource(inputDir, "druid*"), new CsvInputFormat(Arrays.asList("ts", "dim", "val"), "|", null, false, 0), appendToExisting, null);
    ParallelIndexTuningConfig tuningConfig = newTuningConfig(partitionsSpec, 2, !appendToExisting);
    ParallelIndexSupervisorTask indexTask = new ParallelIndexSupervisorTask(null, null, null, new ParallelIndexIngestionSpec(new DataSchema(DATA_SOURCE, new TimestampSpec("ts", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(Arrays.asList("ts", "dim"))), new AggregatorFactory[] { new LongSumAggregatorFactory("val", "val") }, new UniformGranularitySpec(Granularities.HOUR, Granularities.MINUTE, ImmutableList.of(INTERVAL_TO_INDEX)), null), ioConfig, tuningConfig), null);
    runTask(indexTask);
}
Also used : DataSchema(org.apache.druid.segment.indexing.DataSchema) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) ParallelIndexIOConfig(org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexIOConfig) ParallelIndexSupervisorTask(org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexSupervisorTask) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) LongSumAggregatorFactory(org.apache.druid.query.aggregation.LongSumAggregatorFactory) ParallelIndexIngestionSpec(org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexIngestionSpec) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat) ParallelIndexTuningConfig(org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexTuningConfig) LocalInputSource(org.apache.druid.data.input.impl.LocalInputSource)

Example 4 with CsvInputFormat

use of org.apache.druid.data.input.impl.CsvInputFormat in project druid by druid-io.

the class IndexTaskTest method testCSVFileWithHeaderColumnOverride.

@Test
public void testCSVFileWithHeaderColumnOverride() throws Exception {
    File tmpDir = temporaryFolder.newFolder();
    File tmpFile = File.createTempFile("druid", "index", tmpDir);
    try (BufferedWriter writer = Files.newWriter(tmpFile, StandardCharsets.UTF_8)) {
        writer.write("time,d,val\n");
        writer.write("2014-01-01T00:00:10Z,a,1\n");
    }
    final TimestampSpec timestampSpec = new TimestampSpec("time", "auto", null);
    final List<String> columns = Arrays.asList("time", "dim", "val");
    final IndexTuningConfig tuningConfig = createTuningConfigWithMaxRowsPerSegment(2, true);
    final IndexIngestionSpec ingestionSpec;
    if (useInputFormatApi) {
        ingestionSpec = createIngestionSpec(jsonMapper, tmpDir, new CSVParseSpec(timestampSpec, DimensionsSpec.EMPTY, null, columns, true, 0), null, null, tuningConfig, false, false);
    } else {
        ingestionSpec = createIngestionSpec(jsonMapper, tmpDir, timestampSpec, DimensionsSpec.EMPTY, new CsvInputFormat(columns, null, null, true, 0), null, null, tuningConfig, false, false);
    }
    IndexTask indexTask = new IndexTask(null, null, ingestionSpec, null);
    final List<DataSegment> segments = runTask(indexTask).rhs;
    Assert.assertEquals(1, segments.size());
    Assert.assertEquals(Collections.singletonList("d"), segments.get(0).getDimensions());
    Assert.assertEquals(Collections.singletonList("val"), segments.get(0).getMetrics());
    Assert.assertEquals(Intervals.of("2014/P1D"), segments.get(0).getInterval());
}
Also used : IndexIngestionSpec(org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec) CSVParseSpec(org.apache.druid.data.input.impl.CSVParseSpec) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat) File(java.io.File) DataSegment(org.apache.druid.timeline.DataSegment) BufferedWriter(java.io.BufferedWriter) IndexTuningConfig(org.apache.druid.indexing.common.task.IndexTask.IndexTuningConfig) Test(org.junit.Test)

Example 5 with CsvInputFormat

use of org.apache.druid.data.input.impl.CsvInputFormat in project druid by druid-io.

the class IndexTaskTest method testCSVFileWithHeader.

@Test
public void testCSVFileWithHeader() throws Exception {
    File tmpDir = temporaryFolder.newFolder();
    File tmpFile = File.createTempFile("druid", "index", tmpDir);
    try (BufferedWriter writer = Files.newWriter(tmpFile, StandardCharsets.UTF_8)) {
        writer.write("time,d,val\n");
        writer.write("2014-01-01T00:00:10Z,a,1\n");
    }
    final TimestampSpec timestampSpec = new TimestampSpec("time", "auto", null);
    final IndexTuningConfig tuningConfig = createTuningConfigWithMaxRowsPerSegment(2, true);
    final IndexIngestionSpec ingestionSpec;
    if (useInputFormatApi) {
        ingestionSpec = createIngestionSpec(jsonMapper, tmpDir, new CSVParseSpec(timestampSpec, DimensionsSpec.EMPTY, null, null, true, 0), null, null, tuningConfig, false, false);
    } else {
        ingestionSpec = createIngestionSpec(jsonMapper, tmpDir, timestampSpec, DimensionsSpec.EMPTY, new CsvInputFormat(null, null, null, true, 0), null, null, tuningConfig, false, false);
    }
    IndexTask indexTask = new IndexTask(null, null, ingestionSpec, null);
    final List<DataSegment> segments = runTask(indexTask).rhs;
    Assert.assertEquals(1, segments.size());
    Assert.assertEquals(Collections.singletonList("d"), segments.get(0).getDimensions());
    Assert.assertEquals(Collections.singletonList("val"), segments.get(0).getMetrics());
    Assert.assertEquals(Intervals.of("2014/P1D"), segments.get(0).getInterval());
}
Also used : IndexIngestionSpec(org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec) CSVParseSpec(org.apache.druid.data.input.impl.CSVParseSpec) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat) File(java.io.File) DataSegment(org.apache.druid.timeline.DataSegment) BufferedWriter(java.io.BufferedWriter) IndexTuningConfig(org.apache.druid.indexing.common.task.IndexTask.IndexTuningConfig) Test(org.junit.Test)

Aggregations

CsvInputFormat (org.apache.druid.data.input.impl.CsvInputFormat)19 TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec)18 Test (org.junit.Test)18 DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec)15 BufferedWriter (java.io.BufferedWriter)9 File (java.io.File)9 CSVParseSpec (org.apache.druid.data.input.impl.CSVParseSpec)9 IndexIngestionSpec (org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec)9 IndexTuningConfig (org.apache.druid.indexing.common.task.IndexTask.IndexTuningConfig)9 DataSegment (org.apache.druid.timeline.DataSegment)9 InputFormat (org.apache.druid.data.input.InputFormat)7 IOException (java.io.IOException)6 ArrayList (java.util.ArrayList)6 Collections (java.util.Collections)6 List (java.util.List)6 Map (java.util.Map)6 Set (java.util.Set)6 Collectors (java.util.stream.Collectors)6 Nullable (javax.annotation.Nullable)6 LocalInputSource (org.apache.druid.data.input.impl.LocalInputSource)6