Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.
The class FlattenSpecParquetInputTest, method testFlat1Autodiscover.
@Test
public void testFlat1Autodiscover() throws IOException, InterruptedException {
  HadoopDruidIndexerConfig config = transformHadoopDruidIndexerConfig(
      "example/flattening/flat_1_autodiscover_fields.json",
      parserType,
      true
  );
  config.intoConfiguration(job);
  Object data = getFirstRow(job, parserType, ((StaticPathSpec) config.getPathSpec()).getPaths());

  List<InputRow> rows = (List<InputRow>) config.getParser().parseBatch(data);
  Assert.assertEquals(TS1, rows.get(0).getTimestamp().toString());
  Assert.assertEquals("d1v1", rows.get(0).getDimension("dim1").get(0));
  Assert.assertEquals("d2v1", rows.get(0).getDimension("dim2").get(0));
  Assert.assertEquals("1", rows.get(0).getDimension("dim3").get(0));
  Assert.assertEquals("listDim1v1", rows.get(0).getDimension("listDim").get(0));
  Assert.assertEquals("listDim1v2", rows.get(0).getDimension("listDim").get(1));
  Assert.assertEquals(1, rows.get(0).getMetric("metric1").longValue());
}
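The "autodiscover" variant exercises Druid's flattenSpec field discovery. The referenced flat_1_autodiscover_fields.json is not reproduced in this snippet, but a flattenSpec with discovery enabled can be sketched programmatically roughly as follows; this is an illustrative assumption using the standard JSONPathSpec and JSONPathFieldSpec classes, and the field name shown is not taken from the test resource:

import java.util.Collections;
import org.apache.druid.java.util.common.parsers.JSONPathFieldSpec;
import org.apache.druid.java.util.common.parsers.JSONPathSpec;

// useFieldDiscovery = true lets the parser pick up top-level fields automatically,
// in addition to any explicitly listed (possibly nested) fields.
JSONPathSpec flattenSpec = new JSONPathSpec(
    true,  // useFieldDiscovery
    Collections.singletonList(
        JSONPathFieldSpec.createNestedField("listDim", "$.listDim")  // illustrative field only
    )
);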
Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.
The class FlattenSpecParquetInputTest, method testFlat1NoFlattenSpec.
@Test
public void testFlat1NoFlattenSpec() throws IOException, InterruptedException {
  HadoopDruidIndexerConfig config = transformHadoopDruidIndexerConfig(
      "example/flattening/flat_1.json",
      parserType,
      false
  );
  config.intoConfiguration(job);
  Object data = getFirstRow(job, parserType, ((StaticPathSpec) config.getPathSpec()).getPaths());

  List<InputRow> rows = (List<InputRow>) config.getParser().parseBatch(data);
  Assert.assertEquals(TS1, rows.get(0).getTimestamp().toString());
  Assert.assertEquals("d1v1", rows.get(0).getDimension("dim1").get(0));
  Assert.assertEquals("d2v1", rows.get(0).getDimension("dim2").get(0));
  Assert.assertEquals("1", rows.get(0).getDimension("dim3").get(0));
  Assert.assertEquals("listDim1v1", rows.get(0).getDimension("listDim").get(0));
  Assert.assertEquals("listDim1v2", rows.get(0).getDimension("listDim").get(1));
  Assert.assertEquals(1, rows.get(0).getMetric("metric1").longValue());
}
Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.
The class DruidParquetAvroReadSupport, method getPartialReadSchema.
/**
 * Selects the columns from the parquet file schema that are actually used by the ingestion job.
 *
 * @param context the context of the file to be read
 *
 * @return the partial schema containing only the columns referenced by the ingestion schema
 */
private MessageType getPartialReadSchema(InitContext context) {
  MessageType fullSchema = context.getFileSchema();
  String name = fullSchema.getName();
  HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
  ParseSpec parseSpec = config.getParser().getParseSpec();
  // a flattenSpec may auto-discover or reference nested fields, so the full file schema is required
  if (parseSpec instanceof AvroParseSpec) {
    if (((AvroParseSpec) parseSpec).getFlattenSpec() != null) {
      return fullSchema;
    }
  }
  String tsField = config.getParser().getParseSpec().getTimestampSpec().getTimestampColumn();
  List<DimensionSchema> dimensionSchema = config.getParser().getParseSpec().getDimensionsSpec().getDimensions();
  Set<String> dimensions = new HashSet<>();
  for (DimensionSchema dim : dimensionSchema) {
    dimensions.add(dim.getName());
  }
  Set<String> metricsFields = new HashSet<>();
  for (AggregatorFactory agg : config.getSchema().getDataSchema().getAggregators()) {
    metricsFields.addAll(agg.requiredFields());
  }
  List<Type> partialFields = new ArrayList<>();
  for (Type type : fullSchema.getFields()) {
    // keep the timestamp column, metric inputs, and either the listed dimensions
    // or every column when the dimensions list is empty (schemaless dimensions)
    if (tsField.equals(type.getName())
        || metricsFields.contains(type.getName())
        || (dimensions.size() > 0 && dimensions.contains(type.getName()))
        || dimensions.size() == 0) {
      partialFields.add(type);
    }
  }
  return new MessageType(name, partialFields);
}
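For illustration, the pruned schema returned above can be pictured with parquet's Types builder. This is only a hypothetical sketch, assuming a timestamp column "ts", one dimension "dim1", and one metric input "metric1"; the names and primitive types are not taken from any spec shown in this snippet:

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Types;

// the equivalent of a projection that kept only ts, dim1 and metric1 out of a wider file schema
MessageType projected = Types.buildMessage()
    .required(PrimitiveTypeName.INT64).named("ts")
    .optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("dim1")
    .optional(PrimitiveTypeName.INT64).named("metric1")
    .named("file_schema");  // message name, carried over from the original schema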
Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.
The class DruidParquetReadSupport, method getPartialReadSchema.
/**
 * Selects the columns from the parquet file schema that are actually used by the ingestion job.
 *
 * @param context the context of the file to be read
 *
 * @return the partial schema containing only the columns referenced by the ingestion schema
 */
private MessageType getPartialReadSchema(InitContext context) {
  MessageType fullSchema = context.getFileSchema();
  String name = fullSchema.getName();
  HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
  ParseSpec parseSpec = config.getParser().getParseSpec();
  // if a flattenSpec is defined it may auto-discover or reference nested fields,
  // so the full file schema has to be read
  if (parseSpec instanceof ParquetParseSpec) {
    if (((ParquetParseSpec) parseSpec).getFlattenSpec() != null) {
      return fullSchema;
    }
  }
  String tsField = parseSpec.getTimestampSpec().getTimestampColumn();
  List<DimensionSchema> dimensionSchema = parseSpec.getDimensionsSpec().getDimensions();
  Set<String> dimensions = new HashSet<>();
  for (DimensionSchema dim : dimensionSchema) {
    dimensions.add(dim.getName());
  }
  Set<String> metricsFields = new HashSet<>();
  for (AggregatorFactory agg : config.getSchema().getDataSchema().getAggregators()) {
    metricsFields.addAll(agg.requiredFields());
  }
  List<Type> partialFields = new ArrayList<>();
  for (Type type : fullSchema.getFields()) {
    // keep the timestamp column, metric inputs, and either the listed dimensions
    // or every column when the dimensions list is empty (schemaless dimensions)
    if (tsField.equals(type.getName())
        || metricsFields.contains(type.getName())
        || (dimensions.size() > 0 && dimensions.contains(type.getName()))
        || dimensions.size() == 0) {
      partialFields.add(type);
    }
  }
  return new MessageType(name, partialFields);
}
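Both read supports recover the ingestion spec from the Hadoop Configuration. A minimal sketch of that round trip, using only calls already shown in this snippet (intoConfiguration on the driver or test side, fromConfiguration on the task side) and assuming config has already been loaded:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

// driver / test side: serialize the spec into the job's Configuration
Job job = Job.getInstance(new Configuration());
config.intoConfiguration(job);

// task side (e.g. inside a ReadSupport): deserialize it back from the Configuration
HadoopDruidIndexerConfig roundTripped = HadoopDruidIndexerConfig.fromConfiguration(job.getConfiguration());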
Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.
The class OrcHadoopInputRowParserTest, method testDate1900.
@Test
public void testDate1900() throws IOException {
  /*
    TestOrcFile.testDate1900.orc
    struct<time:timestamp,date:date>
    {1900-05-05 12:34:56.1, 1900-12-25}
   */
  HadoopDruidIndexerConfig config = loadHadoopDruidIndexerConfig("example/testDate1900_hadoop_job.json");
  Job job = Job.getInstance(new Configuration());
  config.intoConfiguration(job);

  OrcStruct data = getFirstRow(job, ((StaticPathSpec) config.getPathSpec()).getPaths());
  List<InputRow> rows = (List<InputRow>) config.getParser().parseBatch(data);

  Assert.assertEquals(1, rows.get(0).getDimensions().size());
  Assert.assertEquals("1900-12-25T00:00:00.000Z", rows.get(0).getDimension("date").get(0));
  Assert.assertEquals(DateTimes.of("1900-05-05T12:34:56.1Z"), rows.get(0).getTimestamp());
}
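The loadHadoopDruidIndexerConfig helper is not part of this snippet. A plausible minimal sketch of what it wraps, assuming the static HadoopDruidIndexerConfig.fromFile factory (an assumption; the real helper may also adjust the spec before returning it):

import java.io.File;

// assumption: deserialize the hadoop ingestion spec JSON straight into a config
HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromFile(new File("example/testDate1900_hadoop_job.json"));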