
Example 1 with HadoopDruidIndexerConfig

Use of io.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

In the class DruidParquetReadSupport, method getPartialReadSchema:

private MessageType getPartialReadSchema(InitContext context) {
    MessageType fullSchema = context.getFileSchema();
    String name = fullSchema.getName();
    HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
    String tsField = config.getParser().getParseSpec().getTimestampSpec().getTimestampColumn();
    List<DimensionSchema> dimensionSchema = config.getParser().getParseSpec().getDimensionsSpec().getDimensions();
    Set<String> dimensions = Sets.newHashSet();
    for (DimensionSchema dim : dimensionSchema) {
        dimensions.add(dim.getName());
    }
    Set<String> metricsFields = Sets.newHashSet();
    for (AggregatorFactory agg : config.getSchema().getDataSchema().getAggregators()) {
        metricsFields.addAll(agg.requiredFields());
    }
    List<Type> partialFields = Lists.newArrayList();
    // Keep the timestamp column, every field an aggregator reads, and either the explicitly
    // configured dimensions or, when no dimensions are configured, all remaining fields.
    for (Type type : fullSchema.getFields()) {
        if (tsField.equals(type.getName()) || metricsFields.contains(type.getName()) || (dimensions.size() > 0 && dimensions.contains(type.getName())) || dimensions.size() == 0) {
            partialFields.add(type);
        }
    }
    return new MessageType(name, partialFields);
}
Also used : MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) HadoopDruidIndexerConfig(io.druid.indexer.HadoopDruidIndexerConfig) AggregatorFactory(io.druid.query.aggregation.AggregatorFactory) DimensionSchema(io.druid.data.input.impl.DimensionSchema)
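
The read support above can rebuild the full indexer config from nothing but the Hadoop Configuration because the config is serialized into the job beforehand. A minimal sketch of that round trip (not part of the project source), assuming one of the example job spec files used in the tests below is available on disk:

import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import io.druid.indexer.HadoopDruidIndexerConfig;

public class ConfigRoundTripSketch {
    public static void main(String[] args) throws Exception {
        // Sketch, not project code: load an indexing spec from a job spec file
        // (path taken from the parquet tests below).
        HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromFile(new File("example/wikipedia_hadoop_parquet_job.json"));
        // Serialize the config into the Hadoop job configuration...
        Job job = Job.getInstance(new Configuration());
        config.intoConfiguration(job);
        // ...so that code running inside the job, like DruidParquetReadSupport above,
        // can recover it from the Configuration alone.
        HadoopDruidIndexerConfig rebuilt = HadoopDruidIndexerConfig.fromConfiguration(job.getConfiguration());
        System.out.println(rebuilt.getParser().getParseSpec().getTimestampSpec().getTimestampColumn());
    }
}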

Example 2 with HadoopDruidIndexerConfig

Use of io.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

In the class DatasourcePathSpecTest, method testAddInputPaths:

@Test
public void testAddInputPaths() throws Exception {
    HadoopDruidIndexerConfig hadoopIndexerConfig = makeHadoopDruidIndexerConfig();
    ObjectMapper mapper = new DefaultObjectMapper();
    DatasourcePathSpec pathSpec = new DatasourcePathSpec(mapper, segments, ingestionSpec, null);
    Configuration config = new Configuration();
    Job job = EasyMock.createNiceMock(Job.class);
    EasyMock.expect(job.getConfiguration()).andReturn(config).anyTimes();
    EasyMock.replay(job);
    pathSpec.addInputPaths(hadoopIndexerConfig, job);
    List<WindowedDataSegment> actualSegments = mapper.readValue(config.get(DatasourceInputFormat.CONF_INPUT_SEGMENTS), new TypeReference<List<WindowedDataSegment>>() {
    });
    Assert.assertEquals(segments, actualSegments);
    DatasourceIngestionSpec actualIngestionSpec = mapper.readValue(config.get(DatasourceInputFormat.CONF_DRUID_SCHEMA), DatasourceIngestionSpec.class);
    Assert.assertEquals(ingestionSpec.withDimensions(ImmutableList.of("product")).withMetrics(ImmutableList.of("visited_sum")), actualIngestionSpec);
}
Also used : DatasourceIngestionSpec(io.druid.indexer.hadoop.DatasourceIngestionSpec) WindowedDataSegment(io.druid.indexer.hadoop.WindowedDataSegment) Configuration(org.apache.hadoop.conf.Configuration) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) DefaultObjectMapper(io.druid.jackson.DefaultObjectMapper) HadoopDruidIndexerConfig(io.druid.indexer.HadoopDruidIndexerConfig) Job(org.apache.hadoop.mapreduce.Job) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) Test(org.junit.Test)
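
The readValue call above uses an anonymous TypeReference subclass so Jackson can recover the element type of the generic List stored as a plain string in the Hadoop Configuration. A minimal, self-contained sketch of the same pattern (our own illustration, using a simple String list instead of WindowedDataSegment):

import java.util.Arrays;
import java.util.List;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;

public class TypeReferenceSketch {
    public static void main(String[] args) throws Exception {
        // Sketch, not project code.
        ObjectMapper mapper = new ObjectMapper();
        String json = mapper.writeValueAsString(Arrays.asList("a", "b"));
        // The anonymous subclass captures the full generic type List<String>, which is
        // erased at runtime and would otherwise be unavailable to Jackson.
        List<String> back = mapper.readValue(json, new TypeReference<List<String>>() {});
        System.out.println(back);
    }
}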

Example 3 with HadoopDruidIndexerConfig

Use of io.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

In the class DruidParquetInputTest, method testBinaryAsString:

@Test
public void testBinaryAsString() throws IOException, InterruptedException {
    HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromFile(new File("example/impala_hadoop_parquet_job.json"));
    Job job = Job.getInstance(new Configuration());
    config.intoConfiguration(job);
    GenericRecord data = getFirstRecord(job, ((StaticPathSpec) config.getPathSpec()).getPaths());
    InputRow row = config.getParser().parse(data);
    // without binaryAsString: true, the value would be something like "[104, 101, 121, 32, 116, 104, 105, 115, 32, 105, 115, 3.... ]"
    assertEquals(row.getDimension("field").get(0), "hey this is &é(-è_çà)=^$ù*! Ω^^");
    assertEquals(row.getTimestampFromEpoch(), 1471800234);
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) InputRow(io.druid.data.input.InputRow) HadoopDruidIndexerConfig(io.druid.indexer.HadoopDruidIndexerConfig) Job(org.apache.hadoop.mapreduce.Job) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File) Test(org.junit.Test)

Example 4 with HadoopDruidIndexerConfig

Use of io.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

In the class DruidParquetInputTest, method test:

@Test
public void test() throws IOException, InterruptedException {
    HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromFile(new File("example/wikipedia_hadoop_parquet_job.json"));
    Job job = Job.getInstance(new Configuration());
    config.intoConfiguration(job);
    GenericRecord data = getFirstRecord(job, "example/wikipedia_list.parquet");
    // field not read, should return null
    assertEquals(data.get("added"), null);
    assertEquals(data.get("page"), new Utf8("Gypsy Danger"));
    assertEquals(config.getParser().parse(data).getDimension("page").get(0), "Gypsy Danger");
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) Utf8(org.apache.avro.util.Utf8) HadoopDruidIndexerConfig(io.druid.indexer.HadoopDruidIndexerConfig) Job(org.apache.hadoop.mapreduce.Job) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File) Test(org.junit.Test)
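
The two assertions above compare against different types because Avro exposes string fields as org.apache.avro.util.Utf8 rather than java.lang.String, while the Druid parser turns dimensions into ordinary Strings. A small standalone sketch of the distinction (our own illustration, not from the test):

import org.apache.avro.util.Utf8;

public class Utf8VsStringSketch {
    public static void main(String[] args) {
        // Sketch, not project code.
        Utf8 raw = new Utf8("Gypsy Danger");
        String parsed = "Gypsy Danger";
        // Utf8 only equals another Utf8, so comparing the raw Avro value against a
        // String fails even though the characters match.
        System.out.println(raw.equals(parsed));            // false
        System.out.println(raw.toString().equals(parsed)); // true
    }
}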

Example 5 with HadoopDruidIndexerConfig

Use of io.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

In the class OrcIndexGeneratorJobTest, method setUp:

@Before
public void setUp() throws Exception {
    mapper = HadoopDruidIndexerConfig.JSON_MAPPER;
    mapper.registerSubtypes(new NamedType(HashBasedNumberedShardSpec.class, "hashed"));
    dataRoot = temporaryFolder.newFolder("data");
    outputRoot = temporaryFolder.newFolder("output");
    File dataFile = writeDataToLocalOrcFile(dataRoot, data);
    HashMap<String, Object> inputSpec = new HashMap<String, Object>();
    inputSpec.put("paths", dataFile.getCanonicalPath());
    inputSpec.put("type", "static");
    inputSpec.put("inputFormat", "org.apache.hadoop.hive.ql.io.orc.OrcNewInputFormat");
    config = new HadoopDruidIndexerConfig(new HadoopIngestionSpec(
        new DataSchema(dataSourceName, mapper.convertValue(inputRowParser, Map.class), aggs,
            new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, ImmutableList.of(this.interval)), mapper),
        new HadoopIOConfig(ImmutableMap.copyOf(inputSpec), null, outputRoot.getCanonicalPath()),
        new HadoopTuningConfig(outputRoot.getCanonicalPath(), null, null, null, null, null, false, false, false, false,
            // verifies that set num reducers is ignored
            ImmutableMap.of(JobContext.NUM_REDUCES, "0"), false, true, null, true, null, false, false)));
    config.setShardSpecs(loadShardSpecs(shardInfoForEachSegment));
    config = HadoopDruidIndexerConfig.fromSpec(config.getSchema());
}
Also used : HashBasedNumberedShardSpec(io.druid.timeline.partition.HashBasedNumberedShardSpec) HadoopIngestionSpec(io.druid.indexer.HadoopIngestionSpec) HashMap(java.util.HashMap) NamedType(com.fasterxml.jackson.databind.jsontype.NamedType) HadoopTuningConfig(io.druid.indexer.HadoopTuningConfig) HadoopDruidIndexerConfig(io.druid.indexer.HadoopDruidIndexerConfig) HadoopIOConfig(io.druid.indexer.HadoopIOConfig) DataSchema(io.druid.segment.indexing.DataSchema) UniformGranularitySpec(io.druid.segment.indexing.granularity.UniformGranularitySpec) OrcFile(org.apache.orc.OrcFile) File(java.io.File) Before(org.junit.Before)
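
The last line of the setup rebuilds the config from its own ingestion spec via HadoopDruidIndexerConfig.fromSpec. A minimal sketch of that step in isolation (the helper name is ours), assuming a config constructed as above:

import io.druid.indexer.HadoopDruidIndexerConfig;

public class SpecRoundTripSketch {
    // Sketch, not project code: rebuilds a config from its own ingestion spec,
    // mirroring the final step of the setUp() method above.
    static HadoopDruidIndexerConfig respec(HadoopDruidIndexerConfig config) {
        return HadoopDruidIndexerConfig.fromSpec(config.getSchema());
    }
}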

Aggregations

HadoopDruidIndexerConfig (io.druid.indexer.HadoopDruidIndexerConfig) 8
Job (org.apache.hadoop.mapreduce.Job) 5
Test (org.junit.Test) 5
File (java.io.File) 4
Configuration (org.apache.hadoop.conf.Configuration) 4
HadoopIOConfig (io.druid.indexer.HadoopIOConfig) 3
HadoopIngestionSpec (io.druid.indexer.HadoopIngestionSpec) 3
DataSchema (io.druid.segment.indexing.DataSchema) 3
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper) 2
HadoopTuningConfig (io.druid.indexer.HadoopTuningConfig) 2
DefaultObjectMapper (io.druid.jackson.DefaultObjectMapper) 2
UniformGranularitySpec (io.druid.segment.indexing.granularity.UniformGranularitySpec) 2
GenericRecord (org.apache.avro.generic.GenericRecord) 2
Before (org.junit.Before) 2
NamedType (com.fasterxml.jackson.databind.jsontype.NamedType) 1
ImmutableList (com.google.common.collect.ImmutableList) 1
ImmutableMap (com.google.common.collect.ImmutableMap) 1
ByteSource (com.google.common.io.ByteSource) 1
InputRow (io.druid.data.input.InputRow) 1
DelimitedParseSpec (io.druid.data.input.impl.DelimitedParseSpec) 1