
Example 26 with DataSchema

Use of io.druid.segment.indexing.DataSchema in project druid by druid-io.

Class GranularityPathSpecTest, method testIntervalTrimming.

@Test
public void testIntervalTrimming() throws Exception {
    UserGroupInformation.setLoginUser(UserGroupInformation.createUserForTesting("test", new String[] { "testGroup" }));
    HadoopIngestionSpec spec = new HadoopIngestionSpec(
            new DataSchema(
                    "foo",
                    null,
                    new AggregatorFactory[0],
                    new UniformGranularitySpec(
                            Granularities.DAY,
                            Granularities.ALL,
                            ImmutableList.of(new Interval("2015-01-01T11Z/2015-01-02T05Z"))),
                    jsonMapper),
            new HadoopIOConfig(null, null, null),
            DEFAULT_TUNING_CONFIG);
    granularityPathSpec.setDataGranularity(Granularities.HOUR);
    granularityPathSpec.setPathFormat("yyyy/MM/dd/HH");
    granularityPathSpec.setFilePattern(".*");
    granularityPathSpec.setInputFormat(TextInputFormat.class);
    Job job = Job.getInstance();
    String formatStr = "file:%s/%s;org.apache.hadoop.mapreduce.lib.input.TextInputFormat";
    createFile(
            testFolder,
            "test/2015/01/01/00/file1", "test/2015/01/01/10/file2", "test/2015/01/01/18/file3",
            "test/2015/01/02/00/file1", "test/2015/01/02/03/file2", "test/2015/01/02/05/file3",
            "test/2015/01/02/07/file4", "test/2015/01/02/09/file5");
    granularityPathSpec.setInputPath(testFolder.getRoot().getPath() + "/test");
    granularityPathSpec.addInputPaths(HadoopDruidIndexerConfig.fromSpec(spec), job);
    String actual = job.getConfiguration().get("mapreduce.input.multipleinputs.dir.formats");
    String expected = Joiner.on(",").join(
            Lists.newArrayList(
                    String.format(formatStr, testFolder.getRoot(), "test/2015/01/01/18/file3"),
                    String.format(formatStr, testFolder.getRoot(), "test/2015/01/02/00/file1"),
                    String.format(formatStr, testFolder.getRoot(), "test/2015/01/02/03/file2")));
    Assert.assertEquals("Did not find expected input paths", expected, actual);
}
Also used : HadoopIngestionSpec(io.druid.indexer.HadoopIngestionSpec) DataSchema(io.druid.segment.indexing.DataSchema) UniformGranularitySpec(io.druid.segment.indexing.granularity.UniformGranularitySpec) AggregatorFactory(io.druid.query.aggregation.AggregatorFactory) Job(org.apache.hadoop.mapreduce.Job) HadoopIOConfig(io.druid.indexer.HadoopIOConfig) Interval(org.joda.time.Interval) Test(org.junit.Test)
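
Why only three of the eight created files survive: with HOUR data granularity and the "yyyy/MM/dd/HH" path format, GranularityPathSpec only resolves the hourly buckets that fall inside the trimmed ingestion interval 2015-01-01T11Z/2015-01-02T05Z, which among the files created above are 2015/01/01/18, 2015/01/02/00 and 2015/01/02/03. A minimal, illustrative Joda-Time sketch (class name is hypothetical, not part of the test) that enumerates those buckets:

import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;

public class HourlyBucketSketch {
    public static void main(String[] args) {
        // The trimmed ingestion interval from the test, in UTC.
        DateTime start = new DateTime("2015-01-01T11:00:00Z", DateTimeZone.UTC);
        DateTime end = new DateTime("2015-01-02T05:00:00Z", DateTimeZone.UTC);
        for (DateTime bucket = start; bucket.isBefore(end); bucket = bucket.plusHours(1)) {
            // Rendered with the same "yyyy/MM/dd/HH" pattern the path spec uses.
            System.out.println(bucket.toString("yyyy/MM/dd/HH"));
        }
    }
}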

Example 27 with DataSchema

Use of io.druid.segment.indexing.DataSchema in project druid by druid-io.

Class StaticPathSpecTest, method testAddingPaths.

@Test
public void testAddingPaths() throws Exception {
    Job job = new Job();
    StaticPathSpec pathSpec = new StaticPathSpec("/a/c,/a/b/{c,d}", null);
    DataSchema schema = new DataSchema("ds", null, new AggregatorFactory[0], null, jsonMapper);
    HadoopIOConfig io = new HadoopIOConfig(null, null, null);
    pathSpec.addInputPaths(new HadoopDruidIndexerConfig(new HadoopIngestionSpec(schema, io, null)), job);
    String paths = job.getConfiguration().get(MultipleInputs.DIR_FORMATS);
    String formatter = TextInputFormat.class.getName();
    String[] expected = { "/a/c;" + formatter, "/a/b/c;" + formatter, "/a/b/d;" + formatter };
    Assert.assertArrayEquals(expected, paths.split(","));
}
Also used : DataSchema(io.druid.segment.indexing.DataSchema) HadoopIngestionSpec(io.druid.indexer.HadoopIngestionSpec) Job(org.apache.hadoop.mapreduce.Job) HadoopDruidIndexerConfig(io.druid.indexer.HadoopDruidIndexerConfig) HadoopIOConfig(io.druid.indexer.HadoopIOConfig) Test(org.junit.Test)
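
The assertion depends on how Hadoop's MultipleInputs stores its configuration: MultipleInputs.DIR_FORMATS holds a comma-separated list of "path;InputFormatClass" entries, one per resolved input path, which is why the test splits on ',' and expects a TextInputFormat suffix on each path. A small, illustrative sketch (class name is hypothetical) that parses a string in that layout:

public class DirFormatsSketch {
    public static void main(String[] args) {
        // Same layout the test asserts against: "path;InputFormatClass" entries joined by ','.
        String dirFormats = "/a/c;org.apache.hadoop.mapreduce.lib.input.TextInputFormat,"
                + "/a/b/c;org.apache.hadoop.mapreduce.lib.input.TextInputFormat,"
                + "/a/b/d;org.apache.hadoop.mapreduce.lib.input.TextInputFormat";
        for (String entry : dirFormats.split(",")) {
            String[] parts = entry.split(";");
            System.out.println("path=" + parts[0] + ", format=" + parts[1]);
        }
    }
}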

Example 28 with DataSchema

Use of io.druid.segment.indexing.DataSchema in project druid by druid-io.

Class HadoopConverterJobTest, method setUp.

@Before
public void setUp() throws Exception {
    final MetadataStorageUpdaterJobSpec metadataStorageUpdaterJobSpec = new MetadataStorageUpdaterJobSpec() {

        @Override
        public String getSegmentTable() {
            return derbyConnectorRule.metadataTablesConfigSupplier().get().getSegmentsTable();
        }

        @Override
        public MetadataStorageConnectorConfig get() {
            return derbyConnectorRule.getMetadataConnectorConfig();
        }
    };
    final File scratchFileDir = temporaryFolder.newFolder();
    storageLocProperty = System.getProperty(STORAGE_PROPERTY_KEY);
    tmpSegmentDir = temporaryFolder.newFolder();
    System.setProperty(STORAGE_PROPERTY_KEY, tmpSegmentDir.getAbsolutePath());
    final URL url = Preconditions.checkNotNull(Query.class.getClassLoader().getResource("druid.sample.tsv"));
    final File tmpInputFile = temporaryFolder.newFile();
    FileUtils.retryCopy(new ByteSource() {

        @Override
        public InputStream openStream() throws IOException {
            return url.openStream();
        }
    }, tmpInputFile, FileUtils.IS_EXCEPTION, 3);
    final HadoopDruidIndexerConfig hadoopDruidIndexerConfig = new HadoopDruidIndexerConfig(
            new HadoopIngestionSpec(
                    new DataSchema(
                            DATASOURCE,
                            HadoopDruidIndexerConfig.JSON_MAPPER.convertValue(
                                    new StringInputRowParser(
                                            new DelimitedParseSpec(
                                                    new TimestampSpec("ts", "iso", null),
                                                    new DimensionsSpec(DimensionsSpec.getDefaultSchemas(Arrays.asList(TestIndex.DIMENSIONS)), null, null),
                                                    "\t",
                                                    "",
                                                    Arrays.asList(TestIndex.COLUMNS)),
                                            null),
                                    Map.class),
                            new AggregatorFactory[] {
                                    new DoubleSumAggregatorFactory(TestIndex.METRICS[0], TestIndex.METRICS[0]),
                                    new HyperUniquesAggregatorFactory("quality_uniques", "quality")
                            },
                            new UniformGranularitySpec(Granularities.MONTH, Granularities.DAY, ImmutableList.<Interval>of(interval)),
                            HadoopDruidIndexerConfig.JSON_MAPPER),
                    new HadoopIOConfig(
                            ImmutableMap.<String, Object>of("type", "static", "paths", tmpInputFile.getAbsolutePath()),
                            metadataStorageUpdaterJobSpec,
                            tmpSegmentDir.getAbsolutePath()),
                    new HadoopTuningConfig(
                            scratchFileDir.getAbsolutePath(),
                            null, null, null, null, null,
                            false, false, false, false,
                            null, false, false, null, null, null, false, false)));
    metadataStorageTablesConfigSupplier = derbyConnectorRule.metadataTablesConfigSupplier();
    connector = derbyConnectorRule.getConnector();
    try {
        connector.getDBI().withHandle(new HandleCallback<Void>() {

            @Override
            public Void withHandle(Handle handle) throws Exception {
                handle.execute("DROP TABLE druid_segments");
                return null;
            }
        });
    } catch (CallbackFailedException e) {
    // Who cares
    }
    List<Jobby> jobs = ImmutableList.of(new Jobby() {

        @Override
        public boolean run() {
            connector.createSegmentTable(metadataStorageUpdaterJobSpec.getSegmentTable());
            return true;
        }
    }, new HadoopDruidDetermineConfigurationJob(hadoopDruidIndexerConfig), new HadoopDruidIndexerJob(hadoopDruidIndexerConfig, new SQLMetadataStorageUpdaterJobHandler(connector)));
    JobHelper.runJobs(jobs, hadoopDruidIndexerConfig);
}
Also used : HadoopIngestionSpec(io.druid.indexer.HadoopIngestionSpec) HadoopTuningConfig(io.druid.indexer.HadoopTuningConfig) URL(java.net.URL) HadoopIOConfig(io.druid.indexer.HadoopIOConfig) UniformGranularitySpec(io.druid.segment.indexing.granularity.UniformGranularitySpec) TimestampSpec(io.druid.data.input.impl.TimestampSpec) SQLMetadataStorageUpdaterJobHandler(io.druid.indexer.SQLMetadataStorageUpdaterJobHandler) DoubleSumAggregatorFactory(io.druid.query.aggregation.DoubleSumAggregatorFactory) InputStream(java.io.InputStream) DelimitedParseSpec(io.druid.data.input.impl.DelimitedParseSpec) IOException(java.io.IOException) HadoopDruidIndexerConfig(io.druid.indexer.HadoopDruidIndexerConfig) CallbackFailedException(org.skife.jdbi.v2.exceptions.CallbackFailedException) Handle(org.skife.jdbi.v2.Handle) DataSchema(io.druid.segment.indexing.DataSchema) Jobby(io.druid.indexer.Jobby) HadoopDruidIndexerJob(io.druid.indexer.HadoopDruidIndexerJob) StringInputRowParser(io.druid.data.input.impl.StringInputRowParser) HyperUniquesAggregatorFactory(io.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory) ByteSource(com.google.common.io.ByteSource) DimensionsSpec(io.druid.data.input.impl.DimensionsSpec) File(java.io.File) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) HadoopDruidDetermineConfigurationJob(io.druid.indexer.HadoopDruidDetermineConfigurationJob) Interval(org.joda.time.Interval) Before(org.junit.Before)
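
Because the ingestion spec above is built in a single expression, it can help to see its DataSchema portion broken into named locals. This is a sketch only, assuming the same fields (DATASOURCE, interval), imports and constructors as the setUp() method above:

    // Same values as in setUp(), just bound to named locals for readability.
    ParseSpec parseSpec = new DelimitedParseSpec(
            new TimestampSpec("ts", "iso", null),
            new DimensionsSpec(DimensionsSpec.getDefaultSchemas(Arrays.asList(TestIndex.DIMENSIONS)), null, null),
            "\t",
            "",
            Arrays.asList(TestIndex.COLUMNS));
    Map<String, Object> parserMap = HadoopDruidIndexerConfig.JSON_MAPPER
            .convertValue(new StringInputRowParser(parseSpec, null), Map.class);
    AggregatorFactory[] aggregators = new AggregatorFactory[] {
            new DoubleSumAggregatorFactory(TestIndex.METRICS[0], TestIndex.METRICS[0]),
            new HyperUniquesAggregatorFactory("quality_uniques", "quality")
    };
    GranularitySpec granularitySpec = new UniformGranularitySpec(
            Granularities.MONTH,
            Granularities.DAY,
            ImmutableList.<Interval>of(interval));
    DataSchema dataSchema = new DataSchema(DATASOURCE, parserMap, aggregators, granularitySpec, HadoopDruidIndexerConfig.JSON_MAPPER);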

Example 29 with DataSchema

Use of io.druid.segment.indexing.DataSchema in project druid by druid-io.

Class HadoopDruidIndexerConfigTest, method testHashedBucketSelection.

@Test
public void testHashedBucketSelection() {
    List<HadoopyShardSpec> specs = Lists.newArrayList();
    final int partitionCount = 10;
    for (int i = 0; i < partitionCount; i++) {
        specs.add(new HadoopyShardSpec(new HashBasedNumberedShardSpec(i, partitionCount, null, new DefaultObjectMapper()), i));
    }
    HadoopIngestionSpec spec = new HadoopIngestionSpec(
            new DataSchema(
                    "foo",
                    null,
                    new AggregatorFactory[0],
                    new UniformGranularitySpec(
                            Granularities.MINUTE,
                            Granularities.MINUTE,
                            ImmutableList.of(new Interval("2010-01-01/P1D"))),
                    jsonMapper),
            new HadoopIOConfig(ImmutableMap.<String, Object>of("paths", "bar", "type", "static"), null, null),
            new HadoopTuningConfig(
                    null, null, null,
                    ImmutableMap.of(new DateTime("2010-01-01T01:00:00").getMillis(), specs),
                    null, null, false, false, false, false, null, false, false, null, null, null, false, false));
    HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromSpec(spec);
    final List<String> dims = Arrays.asList("diM1", "dIM2");
    final ImmutableMap<String, Object> values = ImmutableMap.<String, Object>of("Dim1", "1", "DiM2", "2", "dim1", "3", "dim2", "4");
    final long timestamp = new DateTime("2010-01-01T01:00:01").getMillis();
    final Bucket expectedBucket = config.getBucket(new MapBasedInputRow(timestamp, dims, values)).get();
    final long nextBucketTimestamp = Granularities.MINUTE.bucketEnd(new DateTime(timestamp)).getMillis();
    // check that all rows with the same set of dims and the same truncated timestamp hash to the same bucket
    for (int i = 0; timestamp + i < nextBucketTimestamp; i++) {
        Assert.assertEquals(expectedBucket.partitionNum, config.getBucket(new MapBasedInputRow(timestamp + i, dims, values)).get().partitionNum);
    }
}
Also used : HashBasedNumberedShardSpec(io.druid.timeline.partition.HashBasedNumberedShardSpec) AggregatorFactory(io.druid.query.aggregation.AggregatorFactory) DateTime(org.joda.time.DateTime) DataSchema(io.druid.segment.indexing.DataSchema) UniformGranularitySpec(io.druid.segment.indexing.granularity.UniformGranularitySpec) DefaultObjectMapper(io.druid.jackson.DefaultObjectMapper) MapBasedInputRow(io.druid.data.input.MapBasedInputRow) Interval(org.joda.time.Interval) Test(org.junit.Test)

Example 30 with DataSchema

Use of io.druid.segment.indexing.DataSchema in project druid by druid-io.

Class HadoopDruidIndexerConfigTest, method testNoneShardSpecBucketSelection.

@Test
public void testNoneShardSpecBucketSelection() {
    HadoopIngestionSpec spec = new HadoopIngestionSpec(
            new DataSchema(
                    "foo",
                    null,
                    new AggregatorFactory[0],
                    new UniformGranularitySpec(
                            Granularities.MINUTE,
                            Granularities.MINUTE,
                            ImmutableList.of(new Interval("2010-01-01/P1D"))),
                    jsonMapper),
            new HadoopIOConfig(ImmutableMap.<String, Object>of("paths", "bar", "type", "static"), null, null),
            new HadoopTuningConfig(
                    null, null, null,
                    ImmutableMap.<Long, List<HadoopyShardSpec>>of(
                            new DateTime("2010-01-01T01:00:00").getMillis(),
                            Lists.newArrayList(new HadoopyShardSpec(NoneShardSpec.instance(), 1)),
                            new DateTime("2010-01-01T02:00:00").getMillis(),
                            Lists.newArrayList(new HadoopyShardSpec(NoneShardSpec.instance(), 2))),
                    null, null, false, false, false, false, null, false, false, null, null, null, false, false));
    HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromSpec(spec);
    final List<String> dims = Arrays.asList("diM1", "dIM2");
    final ImmutableMap<String, Object> values = ImmutableMap.<String, Object>of("Dim1", "1", "DiM2", "2", "dim1", "3", "dim2", "4");
    final long ts1 = new DateTime("2010-01-01T01:00:01").getMillis();
    Assert.assertEquals(config.getBucket(new MapBasedInputRow(ts1, dims, values)).get().getShardNum(), 1);
    final long ts2 = new DateTime("2010-01-01T02:00:01").getMillis();
    Assert.assertEquals(config.getBucket(new MapBasedInputRow(ts2, dims, values)).get().getShardNum(), 2);
}
Also used : AggregatorFactory(io.druid.query.aggregation.AggregatorFactory) DateTime(org.joda.time.DateTime) DataSchema(io.druid.segment.indexing.DataSchema) UniformGranularitySpec(io.druid.segment.indexing.granularity.UniformGranularitySpec) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) MapBasedInputRow(io.druid.data.input.MapBasedInputRow) Interval(org.joda.time.Interval) Test(org.junit.Test)

Aggregations

DataSchema (io.druid.segment.indexing.DataSchema) 34
UniformGranularitySpec (io.druid.segment.indexing.granularity.UniformGranularitySpec) 29
Interval (org.joda.time.Interval) 18
Test (org.junit.Test) 18
RealtimeTuningConfig (io.druid.segment.indexing.RealtimeTuningConfig) 12
File (java.io.File) 11
DimensionsSpec (io.druid.data.input.impl.DimensionsSpec) 10
TimestampSpec (io.druid.data.input.impl.TimestampSpec) 10
AggregatorFactory (io.druid.query.aggregation.AggregatorFactory) 10
LongSumAggregatorFactory (io.druid.query.aggregation.LongSumAggregatorFactory) 9
DefaultObjectMapper (io.druid.jackson.DefaultObjectMapper) 8
RealtimeIOConfig (io.druid.segment.indexing.RealtimeIOConfig) 8
StringInputRowParser (io.druid.data.input.impl.StringInputRowParser) 7
CountAggregatorFactory (io.druid.query.aggregation.CountAggregatorFactory) 7
DoubleSumAggregatorFactory (io.druid.query.aggregation.DoubleSumAggregatorFactory) 7
Before (org.junit.Before) 7
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper) 6
ImmutableMap (com.google.common.collect.ImmutableMap) 6
FireDepartment (io.druid.segment.realtime.FireDepartment) 6
Period (org.joda.time.Period) 6
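
For orientation, the DataSchema constructor used throughout these examples takes five arguments: the datasource name, an optional parser map, the aggregators, a granularity spec, and an ObjectMapper used to deserialize the parser map. A minimal, illustrative sketch follows; the datasource name and interval are placeholders, and the import paths reflect the Druid version these examples were drawn from:

import com.google.common.collect.ImmutableList;
import io.druid.jackson.DefaultObjectMapper;
import io.druid.java.util.common.granularity.Granularities;
import io.druid.query.aggregation.AggregatorFactory;
import io.druid.segment.indexing.DataSchema;
import io.druid.segment.indexing.granularity.UniformGranularitySpec;
import org.joda.time.Interval;

public class DataSchemaSketch {
    public static void main(String[] args) {
        DataSchema schema = new DataSchema(
                "example_datasource",              // datasource name (placeholder)
                null,                              // no parser map
                new AggregatorFactory[0],          // no metrics
                new UniformGranularitySpec(
                        Granularities.DAY,         // segment granularity
                        Granularities.ALL,         // query granularity (as in Example 26 above)
                        ImmutableList.of(new Interval("2015-01-01/2015-01-02"))),
                new DefaultObjectMapper());        // mapper used to deserialize the parser map
        System.out.println(schema.getDataSource());
    }
}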