Example 1 with Job

Use of org.apache.hadoop.mapreduce.Job in project druid by druid-io.

The class IndexGeneratorJob, method run().

public boolean run() {
    try {
        Job job = Job.getInstance(new Configuration(), String.format("%s-index-generator-%s", config.getDataSource(), config.getIntervals()));
        job.getConfiguration().set("io.sort.record.percent", "0.23");
        JobHelper.injectSystemProperties(job);
        config.addJobProperties(job);
        job.setMapperClass(IndexGeneratorMapper.class);
        job.setMapOutputValueClass(BytesWritable.class);
        SortableBytes.useSortableBytesAsMapOutputKey(job);
        int numReducers = Iterables.size(config.getAllBuckets().get());
        if (numReducers == 0) {
            throw new RuntimeException("No buckets?? seems there is no data to index.");
        }
        if (config.getSchema().getTuningConfig().getUseCombiner()) {
            job.setCombinerClass(IndexGeneratorCombiner.class);
            job.setCombinerKeyGroupingComparatorClass(BytesWritable.Comparator.class);
        }
        job.setNumReduceTasks(numReducers);
        job.setPartitionerClass(IndexGeneratorPartitioner.class);
        setReducerClass(job);
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(IndexGeneratorOutputFormat.class);
        FileOutputFormat.setOutputPath(job, config.makeIntermediatePath());
        config.addInputPaths(job);
        config.intoConfiguration(job);
        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()), JobHelper.distributedClassPath(config.makeIntermediatePath()), job);
        job.submit();
        log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());
        boolean success = job.waitForCompletion(true);
        Counter invalidRowCount = job.getCounters().findCounter(HadoopDruidIndexerConfig.IndexJobCounters.INVALID_ROW_COUNTER);
        jobStats.setInvalidRowCount(invalidRowCount.getValue());
        return success;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Also used: Counter (org.apache.hadoop.mapreduce.Counter), Configuration (org.apache.hadoop.conf.Configuration), BytesWritable (org.apache.hadoop.io.BytesWritable), Job (org.apache.hadoop.mapreduce.Job), TimeoutException (java.util.concurrent.TimeoutException), InvalidJobConfException (org.apache.hadoop.mapred.InvalidJobConfException), FileNotFoundException (java.io.FileNotFoundException), RejectedExecutionException (java.util.concurrent.RejectedExecutionException), IOException (java.io.IOException), ExecutionException (java.util.concurrent.ExecutionException)
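The example above follows the standard MapReduce job lifecycle: build a Job from a Configuration, wire in mapper, reducer, and output classes, submit, wait for completion, then read counters. The following is a minimal, self-contained sketch of that same lifecycle; it is not Druid code, and the identity Mapper/Reducer, the class name, and the /tmp paths are illustrative assumptions.

// Minimal sketch of the Job configure/submit/wait/counter pattern used above.
// Not Druid code: the identity Mapper/Reducer and the /tmp paths are placeholders.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskCounter;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MinimalJobSkeleton {

    public static boolean run(Configuration conf) throws Exception {
        Job job = Job.getInstance(conf, "minimal-job-skeleton");
        job.setJarByClass(MinimalJobSkeleton.class);
        // Identity mapper/reducer keep the sketch self-contained; a real job plugs in
        // its own classes here, as IndexGeneratorJob does with IndexGeneratorMapper.
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        job.setNumReduceTasks(1);
        FileInputFormat.addInputPath(job, new Path("/tmp/job-input"));    // placeholder path
        FileOutputFormat.setOutputPath(job, new Path("/tmp/job-output")); // placeholder path
        // submit() returns immediately; waitForCompletion(true) then monitors the job
        // and prints progress, just as the Druid example does.
        job.submit();
        boolean success = job.waitForCompletion(true);
        // Counters are readable once the job has finished; the example reads its
        // custom INVALID_ROW_COUNTER the same way.
        long mapInputRecords =
            job.getCounters().findCounter(TaskCounter.MAP_INPUT_RECORDS).getValue();
        System.out.println("map input records: " + mapInputRecords);
        return success;
    }
}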

Example 2 with Job

Use of org.apache.hadoop.mapreduce.Job in project druid by druid-io.

The class DeterminePartitionsJob, method run().

public boolean run() {
    try {
        if (!(config.getPartitionsSpec() instanceof SingleDimensionPartitionsSpec)) {
            throw new ISE("DeterminePartitionsJob can only be run for SingleDimensionPartitionsSpec, partitionSpec found [%s]", config.getPartitionsSpec());
        }
        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            final Job groupByJob = Job.getInstance(new Configuration(), String.format("%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals()));
            JobHelper.injectSystemProperties(groupByJob);
            config.addJobProperties(groupByJob);
            groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
            groupByJob.setMapOutputKeyClass(BytesWritable.class);
            groupByJob.setMapOutputValueClass(NullWritable.class);
            groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setOutputKeyClass(BytesWritable.class);
            groupByJob.setOutputValueClass(NullWritable.class);
            groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
            JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()), JobHelper.distributedClassPath(config.makeIntermediatePath()), groupByJob);
            config.addInputPaths(groupByJob);
            config.intoConfiguration(groupByJob);
            FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());
            groupByJob.submit();
            log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(), groupByJob.getTrackingURL());
            if (!groupByJob.waitForCompletion(true)) {
                log.error("Job failed: %s", groupByJob.getJobID());
                return false;
            }
        } else {
            log.info("Skipping group-by job.");
        }
        /*
         * Read grouped data and determine appropriate partitions.
         */
        final Job dimSelectionJob = Job.getInstance(new Configuration(), String.format("%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals()));
        dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");
        JobHelper.injectSystemProperties(dimSelectionJob);
        config.addJobProperties(dimSelectionJob);
        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            // Read grouped data from the groupByJob.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
            dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
            FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
        } else {
            // Directly read the source data, since we assume it's already grouped.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
            config.addInputPaths(dimSelectionJob);
        }
        SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
        dimSelectionJob.setMapOutputValueClass(Text.class);
        dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
        dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
        dimSelectionJob.setOutputKeyClass(BytesWritable.class);
        dimSelectionJob.setOutputValueClass(Text.class);
        dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
        dimSelectionJob.setPartitionerClass(DeterminePartitionsDimSelectionPartitioner.class);
        dimSelectionJob.setNumReduceTasks(config.getGranularitySpec().bucketIntervals().get().size());
        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()), JobHelper.distributedClassPath(config.makeIntermediatePath()), dimSelectionJob);
        config.intoConfiguration(dimSelectionJob);
        FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());
        dimSelectionJob.submit();
        log.info("Job %s submitted, status available at: %s", dimSelectionJob.getJobName(), dimSelectionJob.getTrackingURL());
        if (!dimSelectionJob.waitForCompletion(true)) {
            log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
            return false;
        }
        /*
         * Load partitions determined by the previous job.
         */
        log.info("Job completed, loading up partitions for intervals[%s].", config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        Map<Long, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap();
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals().get()) {
            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
            }
            if (Utils.exists(dimSelectionJob, fileSystem, partitionInfoPath)) {
                List<ShardSpec> specs = config.JSON_MAPPER.readValue(Utils.openInputStream(dimSelectionJob, partitionInfoPath), new TypeReference<List<ShardSpec>>() {
                });
                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
                for (int i = 0; i < specs.size(); ++i) {
                    actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
                    log.info("DateTime[%s], partition[%d], spec[%s]", segmentGranularity, i, actualSpecs.get(i));
                }
                shardSpecs.put(segmentGranularity.getStartMillis(), actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);
        return true;
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), SingleDimensionPartitionsSpec (io.druid.indexer.partitions.SingleDimensionPartitionsSpec), SingleDimensionShardSpec (io.druid.timeline.partition.SingleDimensionShardSpec), NoneShardSpec (io.druid.timeline.partition.NoneShardSpec), ShardSpec (io.druid.timeline.partition.ShardSpec), InvalidJobConfException (org.apache.hadoop.mapred.InvalidJobConfException), IOException (java.io.IOException), FileSystem (org.apache.hadoop.fs.FileSystem), ISE (io.druid.java.util.common.ISE), List (java.util.List), Job (org.apache.hadoop.mapreduce.Job), Interval (org.joda.time.Interval)
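DeterminePartitionsJob chains two MapReduce jobs: the group-by job writes SequenceFile output to an intermediate path, and the dim-selection job reads that path back through SequenceFileInputFormat. Below is a stripped-down sketch of that chaining pattern; it is not Druid code, and the job names, paths, and reliance on the default identity mapper/reducer are illustrative assumptions.

// Sketch of chaining two jobs through an intermediate SequenceFile directory.
// Not Druid code: paths and the identity mapper/reducer defaults are placeholders.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class ChainedJobsSketch {

    public static boolean run(Configuration conf, Path input, Path intermediate, Path output)
            throws Exception {
        // Stage 1: process the raw input and persist the result as SequenceFiles.
        Job stageOne = Job.getInstance(conf, "stage-1-group-by");
        stageOne.setJarByClass(ChainedJobsSketch.class);
        stageOne.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileInputFormat.addInputPath(stageOne, input);
        FileOutputFormat.setOutputPath(stageOne, intermediate);
        if (!stageOne.waitForCompletion(true)) {
            return false;   // bail out, just as the example returns false on failure
        }
        // Stage 2: read the intermediate SequenceFiles back as this job's input,
        // mirroring dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class).
        Job stageTwo = Job.getInstance(conf, "stage-2-dim-selection");
        stageTwo.setJarByClass(ChainedJobsSketch.class);
        stageTwo.setInputFormatClass(SequenceFileInputFormat.class);
        FileInputFormat.addInputPath(stageTwo, intermediate);
        FileOutputFormat.setOutputPath(stageTwo, output);
        return stageTwo.waitForCompletion(true);
    }
}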

Example 3 with Job

Use of org.apache.hadoop.mapreduce.Job in project hbase by apache.

The class TestTableSnapshotInputFormat, method doTestWithMapReduce().

// this is also called by the IntegrationTestTableSnapshotInputFormat
public static void doTestWithMapReduce(HBaseTestingUtility util, TableName tableName, String snapshotName, byte[] startRow, byte[] endRow, Path tableDir, int numRegions, int expectedNumSplits, boolean shutdownCluster) throws Exception {
    // create the table and snapshot
    createTableAndSnapshot(util, tableName, snapshotName, startRow, endRow, numRegions);
    if (shutdownCluster) {
        util.shutdownMiniHBaseCluster();
    }
    try {
        // create the job
        Job job = new Job(util.getConfiguration());
        // limit the scan
        Scan scan = new Scan(startRow, endRow);
        job.setJarByClass(util.getClass());
        TableMapReduceUtil.addDependencyJarsForClasses(job.getConfiguration(), TestTableSnapshotInputFormat.class);
        TableMapReduceUtil.initTableSnapshotMapperJob(snapshotName, scan, TestTableSnapshotMapper.class, ImmutableBytesWritable.class, NullWritable.class, job, true, tableDir);
        job.setReducerClass(TestTableSnapshotInputFormat.TestTableSnapshotReducer.class);
        job.setNumReduceTasks(1);
        job.setOutputFormatClass(NullOutputFormat.class);
        Assert.assertTrue(job.waitForCompletion(true));
    } finally {
        if (!shutdownCluster) {
            util.getAdmin().deleteSnapshot(snapshotName);
            util.deleteTable(tableName);
        }
    }
}
Also used: Scan (org.apache.hadoop.hbase.client.Scan), Job (org.apache.hadoop.mapreduce.Job)
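The snapshot name and tableDir passed to initTableSnapshotMapperJob above come from the test's createTableAndSnapshot helper. As a hedged companion sketch (hypothetical code, not part of the HBase test), this is roughly how a snapshot could be taken and a temporary restore directory chosen beforehand; the restore path is a placeholder.

// Hypothetical companion to the test above: take the snapshot that
// initTableSnapshotMapperJob will read, then pick a temporary restore directory.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

public class SnapshotSetupSketch {

    public static Path prepareSnapshot(Configuration conf, TableName tableName, String snapshotName)
            throws Exception {
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Admin admin = connection.getAdmin()) {
            // Flush and snapshot the table; the MR job later reads the snapshot files directly.
            admin.flush(tableName);
            admin.snapshot(snapshotName, tableName);
        }
        // Directory where the snapshot is restored for the duration of the job
        // (the test passes tableDir for this); the path here is only a placeholder.
        return new Path("/tmp/snapshot-restore-" + snapshotName);
    }
}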

Example 4 with Job

Use of org.apache.hadoop.mapreduce.Job in project hbase by apache.

The class TestTableInputFormatScanBase, method testNumOfSplits().

/**
 * Tests a MR scan using data skew auto-balance.
 *
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public void testNumOfSplits(String ratio, int expectedNumOfSplits) throws IOException, InterruptedException, ClassNotFoundException {
    String jobName = "TestJobForNumOfSplits";
    LOG.info("Before map/reduce startup - job " + jobName);
    Configuration c = new Configuration(TEST_UTIL.getConfiguration());
    Scan scan = new Scan();
    scan.addFamily(INPUT_FAMILYS[0]);
    scan.addFamily(INPUT_FAMILYS[1]);
    c.set("hbase.mapreduce.input.autobalance", "true");
    c.set("hbase.mapreduce.input.autobalance.maxskewratio", ratio);
    c.set(KEY_STARTROW, "");
    c.set(KEY_LASTROW, "");
    Job job = new Job(c, jobName);
    TableMapReduceUtil.initTableMapperJob(TABLE_NAME.getNameAsString(), scan, ScanMapper.class, ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);
    TableInputFormat tif = new TableInputFormat();
    tif.setConf(job.getConfiguration());
    Assert.assertEquals(TABLE_NAME, table.getName());
    List<InputSplit> splits = tif.getSplits(job);
    Assert.assertEquals(expectedNumOfSplits, splits.size());
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), Scan (org.apache.hadoop.hbase.client.Scan), Job (org.apache.hadoop.mapreduce.Job), InputSplit (org.apache.hadoop.mapreduce.InputSplit)
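The test only asserts on the number of splits. If the evenness of the split sizes also needs checking, the same splits list can be inspected with InputSplit.getLength(), which is standard MapReduce API. A small hypothetical addition to the end of the test body:

    // Hypothetical follow-up to the assertion above: log each split's reported length
    // to eyeball the skew that hbase.mapreduce.input.autobalance is meant to limit.
    for (InputSplit split : splits) {
        LOG.info("split=" + split + " length=" + split.getLength());
    }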

Example 5 with Job

Use of org.apache.hadoop.mapreduce.Job in project hbase by apache.

The class TestHFileOutputFormat2, method runIncrementalPELoad().

private void runIncrementalPELoad(Configuration conf, HTableDescriptor tableDescriptor, RegionLocator regionLocator, Path outDir, boolean putSortReducer) throws IOException, UnsupportedEncodingException, InterruptedException, ClassNotFoundException {
    Job job = new Job(conf, "testLocalMRIncrementalLoad");
    job.setWorkingDirectory(util.getDataTestDirOnTestFS("runIncrementalPELoad"));
    job.getConfiguration().setStrings("io.serializations", conf.get("io.serializations"), MutationSerialization.class.getName(), ResultSerialization.class.getName(), KeyValueSerialization.class.getName());
    setupRandomGeneratorMapper(job, putSortReducer);
    HFileOutputFormat2.configureIncrementalLoad(job, tableDescriptor, regionLocator);
    FileOutputFormat.setOutputPath(job, outDir);
    assertFalse(util.getTestFileSystem().exists(outDir));
    assertEquals(regionLocator.getAllRegionLocations().size(), job.getNumReduceTasks());
    assertTrue(job.waitForCompletion(true));
}
Also used: Job (org.apache.hadoop.mapreduce.Job)
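configureIncrementalLoad() only produces HFiles under outDir; a separate step hands them to the region servers. The following is a hedged sketch of that follow-up, assuming an HBase version that ships org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles as a Tool; the output directory and table name are placeholders supplied by the caller.

// Hypothetical follow-up to runIncrementalPELoad: bulk-load the generated HFiles.
// Assumes org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles is on the classpath
// (present in HBase 1.x; relocated in later releases).
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.util.ToolRunner;

public class BulkLoadSketch {

    public static int bulkLoad(Configuration conf, String hfileOutputDir, String tableName)
            throws Exception {
        // LoadIncrementalHFiles implements Tool, so it can be driven like its CLI form:
        // the arguments are the HFile output directory and the target table name.
        return ToolRunner.run(conf, new LoadIncrementalHFiles(conf),
            new String[] { hfileOutputDir, tableName });
    }
}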

Aggregations

Job (org.apache.hadoop.mapreduce.Job): 886 usages
Path (org.apache.hadoop.fs.Path): 498 usages
Configuration (org.apache.hadoop.conf.Configuration): 434 usages
Test (org.junit.Test): 259 usages
IOException (java.io.IOException): 135 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 128 usages
File (java.io.File): 77 usages
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 58 usages
ArrayList (java.util.ArrayList): 55 usages
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 55 usages
Scan (org.apache.hadoop.hbase.client.Scan): 45 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 44 usages
NutchJob (org.apache.nutch.util.NutchJob): 43 usages
JobConf (org.apache.hadoop.mapred.JobConf): 42 usages
Text (org.apache.hadoop.io.Text): 39 usages
NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 36 usages
HBaseConfiguration (org.apache.hadoop.hbase.HBaseConfiguration): 35 usages
JobContext (org.apache.hadoop.mapreduce.JobContext): 35 usages
GenericOptionsParser (org.apache.hadoop.util.GenericOptionsParser): 35 usages
CommandLine (org.apache.commons.cli.CommandLine): 33 usages