Use of io.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.
In class DruidParquetReadSupport, method getPartialReadSchema:
private MessageType getPartialReadSchema(InitContext context) {
  MessageType fullSchema = context.getFileSchema();
  String name = fullSchema.getName();

  // Rebuild the indexer config from the Hadoop configuration to learn which columns the ingestion needs.
  HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
  String tsField = config.getParser().getParseSpec().getTimestampSpec().getTimestampColumn();

  List<DimensionSchema> dimensionSchema = config.getParser().getParseSpec().getDimensionsSpec().getDimensions();
  Set<String> dimensions = Sets.newHashSet();
  for (DimensionSchema dim : dimensionSchema) {
    dimensions.add(dim.getName());
  }

  Set<String> metricsFields = Sets.newHashSet();
  for (AggregatorFactory agg : config.getSchema().getDataSchema().getAggregators()) {
    metricsFields.addAll(agg.requiredFields());
  }

  // Keep only the timestamp column, metric input fields, and configured dimensions;
  // if no dimensions are configured, keep every field.
  List<Type> partialFields = Lists.newArrayList();
  for (Type type : fullSchema.getFields()) {
    if (tsField.equals(type.getName())
        || metricsFields.contains(type.getName())
        || (dimensions.size() > 0 && dimensions.contains(type.getName()))
        || dimensions.size() == 0) {
      partialFields.add(type);
    }
  }
  return new MessageType(name, partialFields);
}
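For context, a method like this is typically wired into Parquet through the read support's init override, which hands the pruned schema back to the reader as the requested projection. A minimal sketch, assuming the standard parquet-mr ReadSupport API (the @Override and the ReadContext constructor are assumptions, not shown in the snippet above):

@Override
public ReadContext init(InitContext context) {
  // Hand Parquet only the columns Druid will actually use, so the reader can skip the rest.
  return new ReadContext(getPartialReadSchema(context));
}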
Use of io.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.
In class DatasourcePathSpecTest, method testAddInputPaths:
@Test
public void testAddInputPaths() throws Exception {
  HadoopDruidIndexerConfig hadoopIndexerConfig = makeHadoopDruidIndexerConfig();
  ObjectMapper mapper = new DefaultObjectMapper();
  DatasourcePathSpec pathSpec = new DatasourcePathSpec(mapper, segments, ingestionSpec, null);

  Configuration config = new Configuration();
  Job job = EasyMock.createNiceMock(Job.class);
  EasyMock.expect(job.getConfiguration()).andReturn(config).anyTimes();
  EasyMock.replay(job);

  pathSpec.addInputPaths(hadoopIndexerConfig, job);

  // The path spec should have serialized the segments and the ingestion spec into the job configuration.
  List<WindowedDataSegment> actualSegments = mapper.readValue(
      config.get(DatasourceInputFormat.CONF_INPUT_SEGMENTS),
      new TypeReference<List<WindowedDataSegment>>() {}
  );
  Assert.assertEquals(segments, actualSegments);

  DatasourceIngestionSpec actualIngestionSpec = mapper.readValue(
      config.get(DatasourceInputFormat.CONF_DRUID_SCHEMA),
      DatasourceIngestionSpec.class
  );
  Assert.assertEquals(
      ingestionSpec.withDimensions(ImmutableList.of("product")).withMetrics(ImmutableList.of("visited_sum")),
      actualIngestionSpec
  );
}
Use of io.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.
In class DruidParquetInputTest, method testBinaryAsString:
@Test
public void testBinaryAsString() throws IOException, InterruptedException {
  HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromFile(new File("example/impala_hadoop_parquet_job.json"));
  Job job = Job.getInstance(new Configuration());
  config.intoConfiguration(job);

  GenericRecord data = getFirstRecord(job, ((StaticPathSpec) config.getPathSpec()).getPaths());
  InputRow row = config.getParser().parse(data);

  // Without binaryAsString: true, the value would be something like
  // "[104, 101, 121, 32, 116, 104, 105, 115, 32, 105, 115, 3.... ]"
  assertEquals(row.getDimension("field").get(0), "hey this is &é(-è_çà)=^$ù*! Ω^^");
  assertEquals(row.getTimestampFromEpoch(), 1471800234);
}
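Both parquet tests call a getFirstRecord helper that is not shown in these snippets. A hypothetical sketch of what such a helper might look like, assuming the extension's DruidParquetInputFormat and the standard Hadoop mapreduce RecordReader API; the real helper in the Druid test suite may differ:

private static GenericRecord getFirstRecord(Job job, String parquetPath) throws Exception {
  // Point the job at the file and read the first record through the Parquet input format.
  FileInputFormat.setInputPaths(job, new Path(new File(parquetPath).toURI()));
  DruidParquetInputFormat inputFormat = new DruidParquetInputFormat(); // assumed input format class from the extension
  TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
  InputSplit split = inputFormat.getSplits(context).get(0);
  try (RecordReader<Void, GenericRecord> reader = inputFormat.createRecordReader(split, context)) {
    reader.initialize(split, context);
    reader.nextKeyValue();
    return reader.getCurrentValue();
  }
}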
Use of io.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.
In class DruidParquetInputTest, method test:
@Test
public void test() throws IOException, InterruptedException {
  HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromFile(new File("example/wikipedia_hadoop_parquet_job.json"));
  Job job = Job.getInstance(new Configuration());
  config.intoConfiguration(job);

  GenericRecord data = getFirstRecord(job, "example/wikipedia_list.parquet");

  // field not read, should return null
  assertEquals(data.get("added"), null);
  assertEquals(data.get("page"), new Utf8("Gypsy Danger"));
  assertEquals(config.getParser().parse(data).getDimension("page").get(0), "Gypsy Danger");
}
Use of io.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.
In class OrcIndexGeneratorJobTest, method setUp:
@Before
public void setUp() throws Exception {
  mapper = HadoopDruidIndexerConfig.JSON_MAPPER;
  mapper.registerSubtypes(new NamedType(HashBasedNumberedShardSpec.class, "hashed"));
  dataRoot = temporaryFolder.newFolder("data");
  outputRoot = temporaryFolder.newFolder("output");
  File dataFile = writeDataToLocalOrcFile(dataRoot, data);

  HashMap<String, Object> inputSpec = new HashMap<String, Object>();
  inputSpec.put("paths", dataFile.getCanonicalPath());
  inputSpec.put("type", "static");
  inputSpec.put("inputFormat", "org.apache.hadoop.hive.ql.io.orc.OrcNewInputFormat");

  config = new HadoopDruidIndexerConfig(new HadoopIngestionSpec(
      new DataSchema(
          dataSourceName,
          mapper.convertValue(inputRowParser, Map.class),
          aggs,
          new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, ImmutableList.of(this.interval)),
          mapper
      ),
      new HadoopIOConfig(ImmutableMap.copyOf(inputSpec), null, outputRoot.getCanonicalPath()),
      new HadoopTuningConfig(
          outputRoot.getCanonicalPath(), null, null, null, null, null, false, false, false, false,
          // verifies that a user-set number of reducers is ignored
          ImmutableMap.of(JobContext.NUM_REDUCES, "0"),
          false, true, null, true, null, false, false
      )
  ));
  config.setShardSpecs(loadShardSpecs(shardInfoForEachSegment));
  config = HadoopDruidIndexerConfig.fromSpec(config.getSchema());
}
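Taken together, these snippets show both directions of config transport: intoConfiguration writes the spec into a Hadoop job, and fromConfiguration rebuilds it from a task's Configuration (as getPartialReadSchema does). A minimal round-trip sketch using only the calls shown above:

// Serialize the indexer config into a Hadoop job, then rebuild it the way a task-side reader would.
HadoopDruidIndexerConfig config =
    HadoopDruidIndexerConfig.fromFile(new File("example/wikipedia_hadoop_parquet_job.json"));
Job job = Job.getInstance(new Configuration());
config.intoConfiguration(job);
HadoopDruidIndexerConfig restored =
    HadoopDruidIndexerConfig.fromConfiguration(job.getConfiguration());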