Use of org.apache.druid.data.input.impl.CsvInputFormat in project druid by druid-io.
The class OssInputSourceTest, method testCompressedReader:
@Test
public void testCompressedReader() throws IOException {
  EasyMock.reset(OSSCLIENT);
  expectListObjects(PREFIXES.get(0), ImmutableList.of(EXPECTED_COMPRESSED_URIS.get(0)), CONTENT);
  expectListObjects(EXPECTED_COMPRESSED_URIS.get(1), ImmutableList.of(EXPECTED_COMPRESSED_URIS.get(1)), CONTENT);
  expectGetObjectCompressed(EXPECTED_COMPRESSED_URIS.get(0));
  expectGetObjectCompressed(EXPECTED_COMPRESSED_URIS.get(1));
  EasyMock.replay(OSSCLIENT);
  OssInputSource inputSource = new OssInputSource(OSSCLIENT, INPUT_DATA_CONFIG, null, ImmutableList.of(PREFIXES.get(0), EXPECTED_COMPRESSED_URIS.get(1)), null, null);
  InputRowSchema someSchema = new InputRowSchema(new TimestampSpec("time", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2"))), ColumnsFilter.all());
  InputSourceReader reader = inputSource.reader(someSchema, new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0), temporaryFolder.newFolder());
  CloseableIterator<InputRow> iterator = reader.read();
  while (iterator.hasNext()) {
    InputRow nextRow = iterator.next();
    Assert.assertEquals(NOW, nextRow.getTimestamp());
    Assert.assertEquals("hello", nextRow.getDimension("dim1").get(0));
    Assert.assertEquals("world", nextRow.getDimension("dim2").get(0));
  }
  EasyMock.verify(OSSCLIENT);
}
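The five positional arguments to the CsvInputFormat constructor in the reader call above correspond, in the Druid code base this test comes from, to the column list, the list delimiter for multi-value cells, the deprecated hasHeaderRow flag, findColumnsFromHeader, and skipHeaderRows. The annotated restatement below is a sketch based on that assumption about the parameter order and names; it is not additional test code.

// Sketch only: parameter roles assumed to be
// (columns, listDelimiter, hasHeaderRow, findColumnsFromHeader, skipHeaderRows).
CsvInputFormat csvFormat = new CsvInputFormat(
    ImmutableList.of("time", "dim1", "dim2"), // columns: declared explicitly, so no header row is needed
    "|",                                      // listDelimiter: separator inside multi-value cells, not the CSV field delimiter
    false,                                    // hasHeaderRow (deprecated): the data has no header row
    null,                                     // findColumnsFromHeader: left unset because columns are given
    0                                         // skipHeaderRows: nothing to skip
);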
Use of org.apache.druid.data.input.impl.CsvInputFormat in project druid by druid-io.
The class GoogleCloudStorageInputSourceTest, method testReader:
@Test
public void testReader() throws IOException {
  EasyMock.reset(STORAGE);
  EasyMock.reset(INPUT_DATA_CONFIG);
  addExpectedPrefixObjects(PREFIXES.get(0), ImmutableList.of(EXPECTED_URIS.get(0)));
  addExpectedGetObjectMock(EXPECTED_URIS.get(0));
  addExpectedPrefixObjects(PREFIXES.get(1), ImmutableList.of(EXPECTED_URIS.get(1)));
  addExpectedGetObjectMock(EXPECTED_URIS.get(1));
  EasyMock.expect(INPUT_DATA_CONFIG.getMaxListingLength()).andReturn(MAX_LISTING_LENGTH);
  EasyMock.replay(STORAGE);
  EasyMock.replay(INPUT_DATA_CONFIG);
  GoogleCloudStorageInputSource inputSource = new GoogleCloudStorageInputSource(STORAGE, INPUT_DATA_CONFIG, null, PREFIXES, null);
  InputRowSchema someSchema = new InputRowSchema(new TimestampSpec("time", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2"))), ColumnsFilter.all());
  InputSourceReader reader = inputSource.reader(someSchema, new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0), null);
  CloseableIterator<InputRow> iterator = reader.read();
  while (iterator.hasNext()) {
    InputRow nextRow = iterator.next();
    Assert.assertEquals(NOW, nextRow.getTimestamp());
    Assert.assertEquals("hello", nextRow.getDimension("dim1").get(0));
    Assert.assertEquals("world", nextRow.getDimension("dim2").get(0));
  }
}
Use of org.apache.druid.data.input.impl.CsvInputFormat in project druid by druid-io.
The class S3InputSourceTest, method testCompressedReader:
@Test
public void testCompressedReader() throws IOException {
  EasyMock.reset(S3_CLIENT);
  expectListObjects(PREFIXES.get(0), ImmutableList.of(EXPECTED_COMPRESSED_URIS.get(0)), CONTENT);
  expectListObjects(EXPECTED_COMPRESSED_URIS.get(1), ImmutableList.of(EXPECTED_COMPRESSED_URIS.get(1)), CONTENT);
  expectGetObjectCompressed(EXPECTED_COMPRESSED_URIS.get(0));
  expectGetObjectCompressed(EXPECTED_COMPRESSED_URIS.get(1));
  EasyMock.replay(S3_CLIENT);
  S3InputSource inputSource = new S3InputSource(SERVICE, SERVER_SIDE_ENCRYPTING_AMAZON_S3_BUILDER, INPUT_DATA_CONFIG, null, ImmutableList.of(PREFIXES.get(0), EXPECTED_COMPRESSED_URIS.get(1)), null, null);
  InputRowSchema someSchema = new InputRowSchema(new TimestampSpec("time", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2"))), ColumnsFilter.all());
  InputSourceReader reader = inputSource.reader(someSchema, new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0), temporaryFolder.newFolder());
  CloseableIterator<InputRow> iterator = reader.read();
  while (iterator.hasNext()) {
    InputRow nextRow = iterator.next();
    Assert.assertEquals(NOW, nextRow.getTimestamp());
    Assert.assertEquals("hello", nextRow.getDimension("dim1").get(0));
    Assert.assertEquals("world", nextRow.getDimension("dim2").get(0));
  }
  EasyMock.verify(S3_CLIENT);
}
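The OSS, Google Cloud Storage, and S3 tests above share the same drain-and-assert loop over the reader. A small helper in that style, sketched here with illustrative names that are not part of the Druid test classes, could look like this:

// Illustrative helper (not Druid code): drains an InputSourceReader and
// applies the same assertion to every row, closing the iterator afterwards.
static void assertEveryRow(InputSourceReader reader, java.util.function.Consumer<InputRow> assertion) throws IOException {
  try (CloseableIterator<InputRow> iterator = reader.read()) {
    while (iterator.hasNext()) {
      assertion.accept(iterator.next());
    }
  }
}

Called as assertEveryRow(reader, row -> Assert.assertEquals(NOW, row.getTimestamp())), it also closes the CloseableIterator via try-with-resources, which the inline loops above do not do explicitly.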
Use of org.apache.druid.data.input.impl.CsvInputFormat in project druid by druid-io.
The class IndexTaskTest, method testCsvWithHeaderOfEmptyTimestamp:
@Test
public void testCsvWithHeaderOfEmptyTimestamp() throws Exception {
  final File tmpDir = temporaryFolder.newFolder();
  final File tmpFile = File.createTempFile("druid", "index", tmpDir);
  try (BufferedWriter writer = Files.newWriter(tmpFile, StandardCharsets.UTF_8)) {
    writer.write(",,\n");
    writer.write("2014-01-01T00:00:10Z,a,1\n");
  }
  final List<String> columns = Arrays.asList("ts", "", "");
  // report parse exception
  final IndexTuningConfig tuningConfig = createTuningConfig(2, null, null, null, null, false, true);
  final IndexIngestionSpec ingestionSpec;
  List<String> expectedMessages;
  if (useInputFormatApi) {
    ingestionSpec = createIngestionSpec(jsonMapper, tmpDir, DEFAULT_TIMESTAMP_SPEC, DimensionsSpec.EMPTY, new CsvInputFormat(columns, null, null, true, 0), null, null, tuningConfig, false, false);
    expectedMessages = ImmutableList.of(StringUtils.format("Timestamp[null] is unparseable! Event: {column_1=2014-01-01T00:00:10Z, column_2=a, column_3=1} (Path: %s, Record: 1, Line: 2)", tmpFile.toURI()));
  } else {
    ingestionSpec = createIngestionSpec(jsonMapper, tmpDir, new CSVParseSpec(DEFAULT_TIMESTAMP_SPEC, DimensionsSpec.EMPTY, null, columns, true, 0), null, null, tuningConfig, false, false);
    expectedMessages = ImmutableList.of("Timestamp[null] is unparseable! Event: {column_1=2014-01-01T00:00:10Z, column_2=a, column_3=1}");
  }
  IndexTask indexTask = new IndexTask(null, null, ingestionSpec, null);
  TaskStatus status = runTask(indexTask).lhs;
  Assert.assertEquals(TaskState.FAILED, status.getStatusCode());
  checkTaskStatusErrorMsgForParseExceptionsExceeded(status);
  IngestionStatsAndErrorsTaskReportData reportData = getTaskReportData();
  List<LinkedHashMap> parseExceptionReports = (List<LinkedHashMap>) reportData.getUnparseableEvents().get(RowIngestionMeters.BUILD_SEGMENTS);
  List<String> actualMessages = parseExceptionReports.stream().map((r) -> {
    return ((List<String>) r.get("details")).get(0);
  }).collect(Collectors.toList());
  Assert.assertEquals(expectedMessages, actualMessages);
  List<String> expectedInputs = ImmutableList.of("{column_1=2014-01-01T00:00:10Z, column_2=a, column_3=1}");
  List<String> actualInputs = parseExceptionReports.stream().map((r) -> {
    return (String) r.get("input");
  }).collect(Collectors.toList());
  Assert.assertEquals(expectedInputs, actualInputs);
}
Use of org.apache.druid.data.input.impl.CsvInputFormat in project druid by druid-io.
The class IndexTaskTest, method testCsvWithHeaderOfEmptyColumns:
@Test
public void testCsvWithHeaderOfEmptyColumns() throws Exception {
  final File tmpDir = temporaryFolder.newFolder();
  File tmpFile = File.createTempFile("druid", "index", tmpDir);
  try (BufferedWriter writer = Files.newWriter(tmpFile, StandardCharsets.UTF_8)) {
    writer.write("ts,,\n");
    writer.write("2014-01-01T00:00:10Z,a,1\n");
  }
  tmpFile = File.createTempFile("druid", "index", tmpDir);
  try (BufferedWriter writer = Files.newWriter(tmpFile, StandardCharsets.UTF_8)) {
    writer.write("ts,dim,\n");
    writer.write("2014-01-01T00:00:10Z,a,1\n");
  }
  tmpFile = File.createTempFile("druid", "index", tmpDir);
  try (BufferedWriter writer = Files.newWriter(tmpFile, StandardCharsets.UTF_8)) {
    writer.write("ts,,val\n");
    writer.write("2014-01-01T00:00:10Z,a,1\n");
  }
  // report parse exception
  final IndexTuningConfig tuningConfig = createTuningConfig(2, 1, null, null, null, true, true);
  final IndexIngestionSpec ingestionSpec;
  if (useInputFormatApi) {
    ingestionSpec = createIngestionSpec(jsonMapper, tmpDir, DEFAULT_TIMESTAMP_SPEC, DimensionsSpec.EMPTY, new CsvInputFormat(null, null, null, true, 0), null, null, tuningConfig, false, false);
  } else {
    ingestionSpec = createIngestionSpec(jsonMapper, tmpDir, new CSVParseSpec(DEFAULT_TIMESTAMP_SPEC, DimensionsSpec.EMPTY, null, null, true, 0), null, null, tuningConfig, false, false);
  }
  IndexTask indexTask = new IndexTask(null, null, ingestionSpec, null);
  final List<DataSegment> segments = runTask(indexTask).rhs;
  // the order of result segments can be changed because hash shardSpec is used.
  // the below loop is to make this test deterministic.
  Assert.assertEquals(2, segments.size());
  Assert.assertNotEquals(segments.get(0), segments.get(1));
  for (DataSegment segment : segments) {
    System.out.println(segment.getDimensions());
  }
  for (int i = 0; i < 2; i++) {
    final DataSegment segment = segments.get(i);
    final Set<String> dimensions = new HashSet<>(segment.getDimensions());
    Assert.assertTrue(StringUtils.format("Actual dimensions: %s", dimensions), dimensions.equals(Sets.newHashSet("column_2")) || dimensions.equals(Sets.newHashSet("dim", "column_2", "column_3")));
    Assert.assertEquals(Collections.singletonList("val"), segment.getMetrics());
    Assert.assertEquals(Intervals.of("2014/P1D"), segment.getInterval());
  }
}
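Both IndexTaskTest cases above set findColumnsFromHeader to true, so each file's first row determines the column names; the expected events and segment dimensions above show that empty header cells fall back to generated names such as column_1, column_2, and column_3 (that fallback naming is inferred from the assertions, not from CsvInputFormat documentation). A sketch of the header-discovery form used in testCsvWithHeaderOfEmptyColumns, with the parameter roles assumed as in the earlier sketch:

// Sketch only: header-discovery configuration as in testCsvWithHeaderOfEmptyColumns.
CsvInputFormat headerDrivenFormat = new CsvInputFormat(
    null,  // columns: none declared, so they are taken from each file's header row
    null,  // listDelimiter: default
    null,  // hasHeaderRow (deprecated): left null; findColumnsFromHeader supersedes it
    true,  // findColumnsFromHeader: the first row supplies names; empty cells appear above as column_N
    0      // skipHeaderRows: nothing skipped
);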