Use of org.apache.druid.data.input.impl.CsvInputFormat in project druid by druid-io.
Class IndexTaskTest, method testTransformSpec.
@Test
public void testTransformSpec() throws Exception
{
  File tmpDir = temporaryFolder.newFolder();
  File tmpFile = File.createTempFile("druid", "index", tmpDir);

  // Three CSV rows; the multi-value columns use "|" as the list delimiter.
  try (BufferedWriter writer = Files.newWriter(tmpFile, StandardCharsets.UTF_8)) {
    writer.write("2014-01-01T00:00:10Z,a,an|array,1|2|3,1\n");
    writer.write("2014-01-01T01:00:20Z,b,another|array,3|4,1\n");
    writer.write("2014-01-01T02:00:30Z,c,and|another,0|1,1\n");
  }

  final DimensionsSpec dimensionsSpec = new DimensionsSpec(
      DimensionsSpec.getDefaultSchemas(
          Arrays.asList("ts", "dim", "dim_array", "dim_num_array", "dimt", "dimtarray1", "dimtarray2", "dimtnum_array")
      )
  );
  final List<String> columns = Arrays.asList("ts", "dim", "dim_array", "dim_num_array", "val");
  final String listDelimiter = "|";

  // Keep only rows where dim == "b" and derive four transformed columns from dim and the array columns.
  final TransformSpec transformSpec = new TransformSpec(
      new SelectorDimFilter("dim", "b", null),
      ImmutableList.of(
          new ExpressionTransform("dimt", "concat(dim,dim)", ExprMacroTable.nil()),
          new ExpressionTransform("dimtarray1", "array(dim, dim)", ExprMacroTable.nil()),
          new ExpressionTransform("dimtarray2", "map(d -> concat(d, 'foo'), dim_array)", ExprMacroTable.nil()),
          new ExpressionTransform("dimtnum_array", "map(d -> d + 3, dim_num_array)", ExprMacroTable.nil())
      )
  );
  final IndexTuningConfig tuningConfig = createTuningConfigWithMaxRowsPerSegment(2, false);

  final IndexIngestionSpec indexIngestionSpec;
  if (useInputFormatApi) {
    indexIngestionSpec = createIngestionSpec(
        jsonMapper, tmpDir, DEFAULT_TIMESTAMP_SPEC, dimensionsSpec,
        new CsvInputFormat(columns, listDelimiter, null, false, 0),
        transformSpec, null, tuningConfig, false, false
    );
  } else {
    indexIngestionSpec = createIngestionSpec(
        jsonMapper, tmpDir,
        new CSVParseSpec(DEFAULT_TIMESTAMP_SPEC, dimensionsSpec, listDelimiter, columns, false, 0),
        transformSpec, null, tuningConfig, false, false
    );
  }

  IndexTask indexTask = new IndexTask(null, null, indexIngestionSpec, null);
  Assert.assertEquals(indexTask.getId(), indexTask.getGroupId());

  final List<DataSegment> segments = runTask(indexTask).rhs;
  Assert.assertEquals(1, segments.size());

  DataSegment segment = segments.get(0);
  final File segmentFile = segmentCacheManager.getSegmentFiles(segment);
  final WindowedStorageAdapter adapter = new WindowedStorageAdapter(
      new QueryableIndexStorageAdapter(indexIO.loadIndex(segmentFile)), segment.getInterval()
  );
  final Sequence<Cursor> cursorSequence = adapter.getAdapter().makeCursors(
      null, segment.getInterval(), VirtualColumns.EMPTY, Granularities.ALL, false, null
  );

  // Read the transformed columns back out of the published segment.
  final List<Map<String, Object>> transforms = cursorSequence.map(cursor -> {
    final DimensionSelector selector1 =
        cursor.getColumnSelectorFactory().makeDimensionSelector(new DefaultDimensionSpec("dimt", "dimt"));
    final DimensionSelector selector2 =
        cursor.getColumnSelectorFactory().makeDimensionSelector(new DefaultDimensionSpec("dimtarray1", "dimtarray1"));
    final DimensionSelector selector3 =
        cursor.getColumnSelectorFactory().makeDimensionSelector(new DefaultDimensionSpec("dimtarray2", "dimtarray2"));
    final DimensionSelector selector4 =
        cursor.getColumnSelectorFactory().makeDimensionSelector(new DefaultDimensionSpec("dimtnum_array", "dimtnum_array"));
    Map<String, Object> row = new HashMap<>();
    row.put("dimt", selector1.defaultGetObject());
    row.put("dimtarray1", selector2.defaultGetObject());
    row.put("dimtarray2", selector3.defaultGetObject());
    row.put("dimtnum_array", selector4.defaultGetObject());
    cursor.advance();
    return row;
  }).toList();

  // Only the row with dim == "b" survives the filter.
  Assert.assertEquals(1, transforms.size());
  Assert.assertEquals("bb", transforms.get(0).get("dimt"));
  Assert.assertEquals(ImmutableList.of("b", "b"), transforms.get(0).get("dimtarray1"));
  Assert.assertEquals(ImmutableList.of("anotherfoo", "arrayfoo"), transforms.get(0).get("dimtarray2"));
  Assert.assertEquals(ImmutableList.of("6.0", "7.0"), transforms.get(0).get("dimtnum_array"));

  Assert.assertEquals(DATASOURCE, segments.get(0).getDataSource());
  Assert.assertEquals(Intervals.of("2014/P1D"), segments.get(0).getInterval());
  Assert.assertEquals(NumberedShardSpec.class, segments.get(0).getShardSpec().getClass());
  Assert.assertEquals(0, segments.get(0).getShardSpec().getPartitionNum());
}
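In both branches above, CsvInputFormat and CSVParseSpec receive the same column list and the "|" list delimiter, which is what turns a single field such as "an|array" into a multi-value dimension that the map() transforms can iterate over. Below is a minimal sketch of the same CsvInputFormat call with the positional arguments named; the parameter names are inferred from how the constructor is used in these tests and should be treated as assumptions rather than the documented signature.

// Minimal sketch; parameter names assumed to be
// (columns, listDelimiter, hasHeaderRow, findColumnsFromHeader, skipHeaderRows).
List<String> columns = Arrays.asList("ts", "dim", "dim_array", "dim_num_array", "val");
String listDelimiter = "|";             // splits "an|array" into the multi-value ["an", "array"]
Boolean hasHeaderRow = null;            // legacy flag, left unset here
Boolean findColumnsFromHeader = false;  // column names are given explicitly, not read from a header
int skipHeaderRows = 0;                 // the file has no header rows to skip

CsvInputFormat format = new CsvInputFormat(columns, listDelimiter, hasHeaderRow, findColumnsFromHeader, skipHeaderRows);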
Use of org.apache.druid.data.input.impl.CsvInputFormat in project druid by druid-io.
Class CsvInputSourceSamplerTest, method testCSVColumnAllNull.
@Test
public void testCSVColumnAllNull()
{
  final TimestampSpec timestampSpec = new TimestampSpec(null, null, DateTimes.of("1970"));
  final DimensionsSpec dimensionsSpec = new DimensionsSpec(null);
  final DataSchema dataSchema = new DataSchema("sampler", timestampSpec, dimensionsSpec, null, null, null);

  // The header row supplies the column names; the "Number" field is empty in every data row.
  final List<String> strCsvRows = ImmutableList.of(
      "FirstName,LastName,Number,Gender",
      "J,G,,Male",
      "Kobe,Bryant,,Male",
      "Lisa, Krystal,,Female",
      "Michael,Jackson,,Male"
  );
  final InputSource inputSource = new InlineInputSource(String.join("\n", strCsvRows));
  final InputFormat inputFormat = new CsvInputFormat(null, null, null, true, 0);
  final InputSourceSampler inputSourceSampler = new InputSourceSampler();

  final SamplerResponse response = inputSourceSampler.sample(inputSource, inputFormat, dataSchema, null);

  Assert.assertEquals(4, response.getNumRowsRead());
  Assert.assertEquals(4, response.getNumRowsIndexed());
  Assert.assertEquals(4, response.getData().size());

  List<SamplerResponseRow> data = response.getData();
  Assert.assertEquals(
      new SamplerResponseRow(
          new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>()
              .put("Number", null).put("FirstName", "J").put("LastName", "G").put("Gender", "Male").build(),
          new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>()
              .put("__time", 0L).put("Number", null).put("FirstName", "J").put("LastName", "G").put("Gender", "Male").build(),
          null, null),
      data.get(0));
  Assert.assertEquals(
      new SamplerResponseRow(
          new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>()
              .put("Number", null).put("FirstName", "Kobe").put("LastName", "Bryant").put("Gender", "Male").build(),
          new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>()
              .put("Number", null).put("__time", 0L).put("FirstName", "Kobe").put("LastName", "Bryant").put("Gender", "Male").build(),
          null, null),
      data.get(1));
  Assert.assertEquals(
      new SamplerResponseRow(
          new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>()
              .put("Number", null).put("FirstName", "Lisa").put("LastName", " Krystal").put("Gender", "Female").build(),
          new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>()
              .put("Number", null).put("__time", 0L).put("FirstName", "Lisa").put("LastName", " Krystal").put("Gender", "Female").build(),
          null, null),
      data.get(2));
  Assert.assertEquals(
      new SamplerResponseRow(
          new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>()
              .put("Number", null).put("FirstName", "Michael").put("LastName", "Jackson").put("Gender", "Male").build(),
          new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>()
              .put("__time", 0L).put("Number", null).put("FirstName", "Michael").put("LastName", "Jackson").put("Gender", "Male").build(),
          null, null),
      data.get(3));
}
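Here the sampler is handed new CsvInputFormat(null, null, null, true, 0): there is no explicit column list, and the fourth argument (read as findColumnsFromHeader, per the same assumption about the constructor as above) is true, so the first line "FirstName,LastName,Number,Gender" supplies the column names and only the four data rows are counted. A minimal sketch of that setup:

// Minimal sketch, assuming the fourth constructor argument is findColumnsFromHeader.
String csv = String.join(
    "\n",
    "FirstName,LastName,Number,Gender",  // header row, becomes the column names
    "J,G,,Male",                         // the "Number" field is empty in every data row
    "Kobe,Bryant,,Male",
    "Lisa, Krystal,,Female",
    "Michael,Jackson,,Male"
);
InputSource source = new InlineInputSource(csv);
InputFormat format = new CsvInputFormat(null, null, null, true, 0); // columns discovered from the header
// With this format the sampler reads 4 rows and reports "Number" as null on each,
// which is what the assertions above verify.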
Use of org.apache.druid.data.input.impl.CsvInputFormat in project druid by druid-io.
Class RecordSupplierInputSourceTest, method testRead.
@Test
public void testRead() throws IOException {
  final RandomCsvSupplier supplier = new RandomCsvSupplier();
  final InputSource inputSource = new RecordSupplierInputSource<>("topic", supplier, false);
  final List<String> colNames = IntStream
      .range(0, NUM_COLS)
      .mapToObj(i -> StringUtils.format("col_%d", i))
      .collect(Collectors.toList());
  final InputFormat inputFormat = new CsvInputFormat(colNames, null, null, false, 0);
  final InputSourceReader reader = inputSource.reader(
      new InputRowSchema(
          new TimestampSpec("col_0", "auto", null),
          new DimensionsSpec(DimensionsSpec.getDefaultSchemas(colNames.subList(1, colNames.size()))),
          ColumnsFilter.all()
      ),
      inputFormat,
      temporaryFolder.newFolder()
  );

  int read = 0;
  try (CloseableIterator<InputRow> iterator = reader.read()) {
    for (; read < NUM_ROWS && iterator.hasNext(); read++) {
      final InputRow inputRow = iterator.next();
      Assert.assertEquals(DateTimes.of(TIMESTAMP_STRING), inputRow.getTimestamp());
      Assert.assertEquals(NUM_COLS - 1, inputRow.getDimensions().size());
    }
  }

  Assert.assertEquals(NUM_ROWS, read);
  Assert.assertTrue(supplier.isClosed());
}
Use of org.apache.druid.data.input.impl.CsvInputFormat in project druid by druid-io.
Class OssInputSourceTest, method testReader.
@Test
public void testReader() throws IOException {
  EasyMock.reset(OSSCLIENT);
  expectListObjects(PREFIXES.get(0), ImmutableList.of(EXPECTED_URIS.get(0)), CONTENT);
  expectListObjects(EXPECTED_URIS.get(1), ImmutableList.of(EXPECTED_URIS.get(1)), CONTENT);
  expectGetObject(EXPECTED_URIS.get(0));
  expectGetObject(EXPECTED_URIS.get(1));
  EasyMock.replay(OSSCLIENT);

  OssInputSource inputSource = new OssInputSource(
      OSSCLIENT, INPUT_DATA_CONFIG, null,
      ImmutableList.of(PREFIXES.get(0), EXPECTED_URIS.get(1)), null, null
  );
  InputRowSchema someSchema = new InputRowSchema(
      new TimestampSpec("time", "auto", null),
      new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2"))),
      ColumnsFilter.all()
  );
  InputSourceReader reader = inputSource.reader(
      someSchema,
      new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0),
      temporaryFolder.newFolder()
  );

  CloseableIterator<InputRow> iterator = reader.read();
  while (iterator.hasNext()) {
    InputRow nextRow = iterator.next();
    Assert.assertEquals(NOW, nextRow.getTimestamp());
    Assert.assertEquals("hello", nextRow.getDimension("dim1").get(0));
    Assert.assertEquals("world", nextRow.getDimension("dim2").get(0));
  }

  EasyMock.verify(OSSCLIENT);
}
Use of org.apache.druid.data.input.impl.CsvInputFormat in project druid by druid-io.
Class GoogleCloudStorageInputSourceTest, method testCompressedReader.
@Test
public void testCompressedReader() throws IOException {
  EasyMock.reset(STORAGE);
  EasyMock.reset(INPUT_DATA_CONFIG);
  addExpectedPrefixObjects(PREFIXES.get(0), ImmutableList.of(EXPECTED_COMPRESSED_URIS.get(0)));
  addExpectedGetCompressedObjectMock(EXPECTED_COMPRESSED_URIS.get(0));
  addExpectedPrefixObjects(PREFIXES.get(1), ImmutableList.of(EXPECTED_COMPRESSED_URIS.get(1)));
  addExpectedGetCompressedObjectMock(EXPECTED_COMPRESSED_URIS.get(1));
  EasyMock.expect(INPUT_DATA_CONFIG.getMaxListingLength()).andReturn(MAX_LISTING_LENGTH);
  EasyMock.replay(STORAGE);
  EasyMock.replay(INPUT_DATA_CONFIG);

  GoogleCloudStorageInputSource inputSource =
      new GoogleCloudStorageInputSource(STORAGE, INPUT_DATA_CONFIG, null, PREFIXES, null);
  InputRowSchema someSchema = new InputRowSchema(
      new TimestampSpec("time", "auto", null),
      new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2"))),
      ColumnsFilter.all()
  );
  InputSourceReader reader = inputSource.reader(
      someSchema,
      new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0),
      null
  );

  CloseableIterator<InputRow> iterator = reader.read();
  while (iterator.hasNext()) {
    InputRow nextRow = iterator.next();
    Assert.assertEquals(NOW, nextRow.getTimestamp());
    Assert.assertEquals("hello", nextRow.getDimension("dim1").get(0));
    Assert.assertEquals("world", nextRow.getDimension("dim2").get(0));
  }
}
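The OSS and Google Cloud Storage tests exercise the same reading pattern: build an InputRowSchema, pass it to inputSource.reader(...) together with a CsvInputFormat, then walk the resulting CloseableIterator of InputRow. The sketch below condenses that pattern; someInputSource and tmpDir are hypothetical placeholders, not names from the tests.

// Condensed sketch of the reader loop shared by the two cloud-storage tests above.
// someInputSource and tmpDir are hypothetical placeholders.
InputRowSchema schema = new InputRowSchema(
    new TimestampSpec("time", "auto", null),
    new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2"))),
    ColumnsFilter.all()
);
// The comma stays the CSV field separator; "|" is only the listDelimiter used to split
// a single field into a multi-value dimension.
InputFormat csv = new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0);

try (CloseableIterator<InputRow> rows = someInputSource.reader(schema, csv, tmpDir).read()) {
  while (rows.hasNext()) {
    InputRow row = rows.next();
    // row.getTimestamp() and row.getDimension("dim1") expose the parsed values
  }
}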