Example 16 with Job

use of org.apache.hadoop.mapreduce.Job in project mavuno by metzlerd.

the class UpdateWeights method run.

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();
    String statsPath = MavunoUtils.getRequiredParam("Mavuno.UpdateWeights.StatsPath", conf);
    String scoresPath = MavunoUtils.getRequiredParam("Mavuno.UpdateWeights.ScoresPath", conf);
    String exampleType = MavunoUtils.getRequiredParam("Mavuno.UpdateWeights.ExampleType", conf).toLowerCase();
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.UpdateWeights.OutputPath", conf);
    sLogger.info("Tool name: UpdateWeights");
    sLogger.info(" - Stats path: " + statsPath);
    sLogger.info(" - Scores path: " + scoresPath);
    sLogger.info(" - Example type: " + exampleType);
    sLogger.info(" - Output path: " + outputPath);
    Job job = new Job(conf);
    job.setJobName("UpdateWeights");
    job.setJarByClass(UpdateWeights.class);
    FileInputFormat.addInputPath(job, new Path(statsPath));
    FileInputFormat.addInputPath(job, new Path(scoresPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(ContextPatternWritable.class);
    if ("pattern".equals(exampleType)) {
        job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class);
    } else if ("context".equals(exampleType)) {
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    } else {
        throw new RuntimeException("Invalid ExampleType in UpdateExampleWeight -- " + exampleType);
    }
    job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class);
    job.setMapOutputValueClass(ContextPatternStatsWritable.class);
    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(ContextPatternStatsWritable.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.waitForCompletion(true);
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) ContextPatternWritable(edu.isi.mavuno.util.ContextPatternWritable) Configuration(org.apache.hadoop.conf.Configuration) Job(org.apache.hadoop.mapreduce.Job)
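
The sort-comparator and partitioner pairing above is the standard secondary-sort wiring: the partitioner decides which reducer a record reaches, while the sort comparator controls the order in which records arrive there. A minimal generic sketch of that pattern follows; the CompositeKey and NaturalKeyPartitioner names are illustrative, not Mavuno's classes.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Partitioner;

// Composite key: the natural key picks the reducer, the order field sorts within it.
public class CompositeKey implements WritableComparable<CompositeKey> {
    public final Text naturalKey = new Text();
    public final LongWritable order = new LongWritable();

    public void write(DataOutput out) throws IOException {
        naturalKey.write(out);
        order.write(out);
    }

    public void readFields(DataInput in) throws IOException {
        naturalKey.readFields(in);
        order.readFields(in);
    }

    public int compareTo(CompositeKey other) {
        int cmp = naturalKey.compareTo(other.naturalKey);
        return cmp != 0 ? cmp : order.compareTo(other.order);
    }
}

// Partition on the natural key only, so every record for one key lands on one reducer.
class NaturalKeyPartitioner extends Partitioner<CompositeKey, Writable> {
    @Override
    public int getPartition(CompositeKey key, Writable value, int numPartitions) {
        return (key.naturalKey.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

In a driver this pairs with job.setPartitionerClass(NaturalKeyPartitioner.class), and often a grouping comparator (job.setGroupingComparatorClass) that compares only the natural key, mirroring the setSortComparatorClass and setPartitionerClass calls in the example above.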

Example 17 with Job

use of org.apache.hadoop.mapreduce.Job in project mavuno by metzlerd.

the class HarvestParseGraph method run.

@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestParseGraph.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestParseGraph.CorpusClass", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestParseGraph.OutputPath", conf);
    sLogger.info("Tool name: HarvestParseGraph");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Output path: " + outputPath);
    Job job = new Job(conf);
    job.setJobName("HarvestParseGraph");
    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.waitForCompletion(true);
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) Job(org.apache.hadoop.mapreduce.Job)
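
MavunoUtils.recursivelyAddInputPaths is a project helper whose body is not shown here. A plausible minimal equivalent, offered as a sketch rather than the actual Mavuno implementation, walks the directory tree and registers each file it finds:

import java.io.IOException;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public final class InputPathUtils {
    private InputPathUtils() {}

    // Recursively register every file under root as job input.
    public static void recursivelyAddInputPaths(Job job, String root) throws IOException {
        FileSystem fs = FileSystem.get(job.getConfiguration());
        for (FileStatus status : fs.listStatus(new Path(root))) {
            if (status.isDirectory()) {
                recursivelyAddInputPaths(job, status.getPath().toString());
            } else {
                FileInputFormat.addInputPath(job, status.getPath());
            }
        }
    }
}

On newer Hadoop versions, FileInputFormat.setInputDirRecursive(job, true) combined with a single addInputPath call achieves the same effect without a custom helper.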

Example 18 with Job

use of org.apache.hadoop.mapreduce.Job in project pinot by linkedin.

the class BackfillPhaseJob method run.

public Job run() throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJarByClass(BackfillPhaseJob.class);
    job.setJobName(name);
    FileSystem fs = FileSystem.get(getConf());
    Configuration configuration = job.getConfiguration();
    LOGGER.info("*******************************************************************************");
    String controllerHost = getAndSetConfiguration(configuration, BACKFILL_PHASE_CONTROLLER_HOST);
    String controllerPort = getAndSetConfiguration(configuration, BACKFILL_PHASE_CONTROLLER_PORT);
    LOGGER.info("Controller Host : {} Controller Port : {}", controllerHost, controllerPort);
    String segmentStartTime = getAndSetConfiguration(configuration, BACKFILL_PHASE_START_TIME);
    String segmentEndTime = getAndSetConfiguration(configuration, BACKFILL_PHASE_END_TIME);
    long startTime = Long.parseLong(segmentStartTime);
    long endTime = Long.parseLong(segmentEndTime);
    if (startTime > endTime) {
        throw new IllegalStateException("Start time cannot be greater than end time");
    }
    String tableName = getAndSetConfiguration(configuration, BACKFILL_PHASE_TABLE_NAME);
    LOGGER.info("Start time : {} End time : {} Table name : {}", segmentStartTime, segmentEndTime, tableName);
    String outputPath = getAndSetConfiguration(configuration, BACKFILL_PHASE_OUTPUT_PATH);
    LOGGER.info("Output path : {}", outputPath);
    Path backfillDir = new Path(outputPath);
    if (fs.exists(backfillDir)) {
        LOGGER.warn("Found the output folder deleting it");
        fs.delete(backfillDir, true);
    }
    Path downloadDir = new Path(backfillDir, DOWNLOAD);
    LOGGER.info("Creating download dir : {}", downloadDir);
    fs.mkdirs(downloadDir);
    Path inputDir = new Path(backfillDir, INPUT);
    LOGGER.info("Creating input dir : {}", inputDir);
    fs.mkdirs(inputDir);
    Path outputDir = new Path(backfillDir, OUTPUT);
    LOGGER.info("Creating output dir : {}", outputDir);
    BackfillControllerAPIs backfillControllerAPIs = new BackfillControllerAPIs(controllerHost, Integer.valueOf(controllerPort), tableName);
    LOGGER.info("Downloading segments in range {} to {}", startTime, endTime);
    List<String> allSegments = backfillControllerAPIs.getAllSegments(tableName);
    List<String> segmentsToDownload = backfillControllerAPIs.findSegmentsInRange(tableName, allSegments, startTime, endTime);
    for (String segmentName : segmentsToDownload) {
        backfillControllerAPIs.downloadSegment(segmentName, downloadDir);
    }
    LOGGER.info("Reading downloaded segment input files");
    List<FileStatus> inputDataFiles = new ArrayList<>();
    inputDataFiles.addAll(Lists.newArrayList(fs.listStatus(downloadDir)));
    LOGGER.info("size {}", inputDataFiles.size());
    try {
        LOGGER.info("Creating input files at {} for segment input files", inputDir);
        for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) {
            FileStatus file = inputDataFiles.get(seqId);
            String completeFilePath = " " + file.getPath().toString() + " " + seqId;
            Path newOutPutFile = new Path((inputDir + "/" + file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_') + ".txt"));
            FSDataOutputStream stream = fs.create(newOutPutFile);
            LOGGER.info("wrote {}", completeFilePath);
            stream.writeUTF(completeFilePath);
            stream.flush();
            stream.close();
        }
    } catch (Exception e) {
        LOGGER.error("Exception while reading input files ", e);
    }
    job.setMapperClass(BackfillPhaseMapJob.BackfillMapper.class);
    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        job.getConfiguration().set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, inputDir);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size());
    job.setMaxReduceAttempts(1);
    job.setMaxMapAttempts(0);
    job.setNumReduceTasks(0);
    for (Object key : props.keySet()) {
        job.getConfiguration().set(key.toString(), props.getProperty(key.toString()));
    }
    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed : " + job);
    }
    LOGGER.info("Cleanup the working directory");
    LOGGER.info("Deleting the dir: {}", downloadDir);
    fs.delete(downloadDir, true);
    LOGGER.info("Deleting the dir: {}", inputDir);
    fs.delete(inputDir, true);
    LOGGER.info("Deleting the dir: {}", outputDir);
    fs.delete(outputDir, true);
    return job;
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) FileSystem(org.apache.hadoop.fs.FileSystem) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Job(org.apache.hadoop.mapreduce.Job)
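
The loop above writes one tiny text file per downloaded segment so that each mapper receives exactly one work item, with NUM_MAPS pinned to the file count. A lighter-weight variant of the same pattern, sketched here rather than taken from the pinot code (the segments.txt name is illustrative), puts all items in a single file and lets NLineInputFormat hand each mapper one line:

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;

final class WorkListInput {
    // Write each work item on its own line, then give every mapper exactly one line.
    static void setOneItemPerMapper(Job job, FileSystem fs, Path inputDir,
                                    List<String> items) throws IOException {
        Path workList = new Path(inputDir, "segments.txt");
        try (FSDataOutputStream out = fs.create(workList)) {
            for (String item : items) {
                out.writeBytes(item + "\n");
            }
        }
        job.setInputFormatClass(NLineInputFormat.class);
        NLineInputFormat.addInputPath(job, workList);
        NLineInputFormat.setNumLinesPerSplit(job, 1);
    }
}

As in the example, the job stays map-only (setNumReduceTasks(0)); only the way work items reach the mappers changes.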

Example 19 with Job

use of org.apache.hadoop.mapreduce.Job in project pinot by linkedin.

the class DerivedColumnTransformationPhaseJob method run.

public Job run() throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName(name);
    job.setJarByClass(DerivedColumnTransformationPhaseJob.class);
    Configuration configuration = job.getConfiguration();
    FileSystem fs = FileSystem.get(configuration);
    // Input Path
    String inputPathDir = getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_INPUT_PATH);
    LOGGER.info("Input path dir: " + inputPathDir);
    for (String inputPath : inputPathDir.split(",")) {
        LOGGER.info("Adding input:" + inputPath);
        Path input = new Path(inputPath);
        FileInputFormat.addInputPath(job, input);
    }
    // Topk path
    String topkPath = getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_TOPK_PATH);
    LOGGER.info("Topk path : " + topkPath);
    // Output path
    Path outputPath = new Path(getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_PATH));
    LOGGER.info("Output path dir: " + outputPath.toString());
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }
    FileOutputFormat.setOutputPath(job, outputPath);
    // Schema
    Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir);
    LOGGER.info("Schema : {}", avroSchema.toString(true));
    // ThirdEyeConfig
    String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty(props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()), props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema);
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty);
    ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
    job.getConfiguration().set(DERIVED_COLUMN_TRANSFORMATION_PHASE_THIRDEYE_CONFIG.toString(), OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));
    LOGGER.info("ThirdEyeConfig {}", thirdeyeConfig.encode());
    // New schema
    Schema outputSchema = newSchema(thirdeyeConfig);
    job.getConfiguration().set(DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_SCHEMA.toString(), outputSchema.toString());
    // Map config
    job.setMapperClass(DerivedColumnTransformationPhaseMapper.class);
    job.setInputFormatClass(AvroKeyInputFormat.class);
    job.setMapOutputKeyClass(AvroKey.class);
    job.setMapOutputValueClass(NullWritable.class);
    AvroJob.setOutputKeySchema(job, outputSchema);
    LazyOutputFormat.setOutputFormatClass(job, AvroKeyOutputFormat.class);
    AvroMultipleOutputs.addNamedOutput(job, "avro", AvroKeyOutputFormat.class, outputSchema);
    job.setNumReduceTasks(0);
    job.waitForCompletion(true);
    return job;
}
Also used : Path(org.apache.hadoop.fs.Path) ThirdEyeConfig(com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig) Configuration(org.apache.hadoop.conf.Configuration) FileSystem(org.apache.hadoop.fs.FileSystem) Schema(org.apache.avro.Schema) Job(org.apache.hadoop.mapreduce.Job) AvroJob(org.apache.avro.mapreduce.AvroJob)
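
On the mapper side, a job configured this way reads AvroKey<GenericRecord> records and writes through AvroMultipleOutputs under the named output "avro". The skeleton below is a sketch of that arrangement; the real DerivedColumnTransformationPhaseMapper additionally applies ThirdEye's column transformations.

import java.io.IOException;

import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapreduce.AvroMultipleOutputs;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;

public class TransformMapper
        extends Mapper<AvroKey<GenericRecord>, NullWritable, AvroKey<GenericRecord>, NullWritable> {

    private AvroMultipleOutputs outputs;

    @Override
    protected void setup(Context context) {
        outputs = new AvroMultipleOutputs(context);
    }

    @Override
    protected void map(AvroKey<GenericRecord> key, NullWritable value, Context context)
            throws IOException, InterruptedException {
        GenericRecord record = key.datum();
        // ... transform `record` into the output schema here ...
        outputs.write("avro", new AvroKey<>(record), NullWritable.get());
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // Flush and close the named outputs; without this, output files may be empty.
        outputs.close();
    }
}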

Example 20 with Job

use of org.apache.hadoop.mapreduce.Job in project hadoop by apache.

the class TestCombineFileInputFormat method testSplitPlacement.

/**
   * The test suppresses unchecked warnings in
   * {@link org.mockito.Mockito#reset}. Although calling the method is
   * a bad manner, we call the method instead of splitting the test
   * (i.e. restarting MiniDFSCluster) to save time.
   */
@Test
@SuppressWarnings("unchecked")
public void testSplitPlacement() throws Exception {
    MiniDFSCluster dfs = null;
    FileSystem fileSys = null;
    try {
        /* Start 3 datanodes, one each in rack r1, r2, r3. Create five files
       * 1) file1 and file5, just after starting the datanode on r1, with 
       *    a repl factor of 1, and,
       * 2) file2, just after starting the datanode on r2, with 
       *    a repl factor of 2, and,
       * 3) file3 and file4, after starting all three datanodes, with a repl
       *    factor of 3.
       * At the end, file1 and file5 will be present only on datanode1,
       * file2 will be present on datanode1 and datanode2, and
       * file3 and file4 will be present on all datanodes.
       */
        Configuration conf = new Configuration();
        conf.setBoolean("dfs.replication.considerLoad", false);
        dfs = new MiniDFSCluster.Builder(conf).racks(rack1).hosts(hosts1).build();
        dfs.waitActive();
        fileSys = dfs.getFileSystem();
        if (!fileSys.mkdirs(inDir)) {
            throw new IOException("Mkdirs failed to create " + inDir.toString());
        }
        Path file1 = new Path(dir1 + "/file1");
        writeFile(conf, file1, (short) 1, 1);
        // create another file on the same datanode
        Path file5 = new Path(dir5 + "/file5");
        writeFile(conf, file5, (short) 1, 1);
        // split it using a CombinedFile input format
        DummyInputFormat inFormat = new DummyInputFormat();
        Job job = Job.getInstance(conf);
        FileInputFormat.setInputPaths(job, dir1 + "," + dir5);
        List<InputSplit> splits = inFormat.getSplits(job);
        System.out.println("Made splits(Test0): " + splits.size());
        for (InputSplit split : splits) {
            System.out.println("File split(Test0): " + split);
        }
        assertEquals(1, splits.size());
        CombineFileSplit fileSplit = (CombineFileSplit) splits.get(0);
        assertEquals(2, fileSplit.getNumPaths());
        assertEquals(1, fileSplit.getLocations().length);
        assertEquals(file1.getName(), fileSplit.getPath(0).getName());
        assertEquals(0, fileSplit.getOffset(0));
        assertEquals(BLOCKSIZE, fileSplit.getLength(0));
        assertEquals(file5.getName(), fileSplit.getPath(1).getName());
        assertEquals(0, fileSplit.getOffset(1));
        assertEquals(BLOCKSIZE, fileSplit.getLength(1));
        assertEquals(hosts1[0], fileSplit.getLocations()[0]);
        dfs.startDataNodes(conf, 1, true, null, rack2, hosts2, null);
        dfs.waitActive();
        // create file on two datanodes.
        Path file2 = new Path(dir2 + "/file2");
        writeFile(conf, file2, (short) 2, 2);
        // split it using a CombinedFile input format
        inFormat = new DummyInputFormat();
        FileInputFormat.setInputPaths(job, dir1 + "," + dir2);
        inFormat.setMinSplitSizeRack(BLOCKSIZE);
        splits = inFormat.getSplits(job);
        System.out.println("Made splits(Test1): " + splits.size());
        for (InputSplit split : splits) {
            System.out.println("File split(Test1): " + split);
        }
        for (InputSplit split : splits) {
            fileSplit = (CombineFileSplit) split;
            /**
         * If rack1 is processed first by
         * {@link CombineFileInputFormat#createSplits},
         * create only one split on rack1. Otherwise create two splits.
         */
            if (splits.size() == 2) {
                // first split is on rack2, contains file2
                if (split.equals(splits.get(0))) {
                    assertEquals(2, fileSplit.getNumPaths());
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(file2.getName(), fileSplit.getPath(0).getName());
                    assertEquals(0, fileSplit.getOffset(0));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(0));
                    assertEquals(file2.getName(), fileSplit.getPath(1).getName());
                    assertEquals(BLOCKSIZE, fileSplit.getOffset(1));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(1));
                    assertEquals(hosts2[0], fileSplit.getLocations()[0]);
                }
                // second split is on rack1, contains file1
                if (split.equals(splits.get(1))) {
                    assertEquals(1, fileSplit.getNumPaths());
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(file1.getName(), fileSplit.getPath(0).getName());
                    assertEquals(0, fileSplit.getOffset(0));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(0));
                    assertEquals(hosts1[0], fileSplit.getLocations()[0]);
                }
            } else if (splits.size() == 1) {
                // first split is on rack1, contains file1 and file2.
                assertEquals(3, fileSplit.getNumPaths());
                Set<Split> expected = new HashSet<>();
                expected.add(new Split(file1.getName(), BLOCKSIZE, 0));
                expected.add(new Split(file2.getName(), BLOCKSIZE, 0));
                expected.add(new Split(file2.getName(), BLOCKSIZE, BLOCKSIZE));
                List<Split> actual = new ArrayList<>();
                for (int i = 0; i < 3; i++) {
                    String name = fileSplit.getPath(i).getName();
                    long length = fileSplit.getLength(i);
                    long offset = fileSplit.getOffset(i);
                    actual.add(new Split(name, length, offset));
                }
                assertTrue(actual.containsAll(expected));
                assertEquals(1, fileSplit.getLocations().length);
                assertEquals(hosts1[0], fileSplit.getLocations()[0]);
            } else {
                fail("Expected split size is 1 or 2, but actual size is " + splits.size());
            }
        }
        // create another file on 3 datanodes and 3 racks.
        dfs.startDataNodes(conf, 1, true, null, rack3, hosts3, null);
        dfs.waitActive();
        Path file3 = new Path(dir3 + "/file3");
        writeFile(conf, file3, (short) 3, 3);
        inFormat = new DummyInputFormat();
        FileInputFormat.setInputPaths(job, dir1 + "," + dir2 + "," + dir3);
        inFormat.setMinSplitSizeRack(BLOCKSIZE);
        splits = inFormat.getSplits(job);
        for (InputSplit split : splits) {
            System.out.println("File split(Test2): " + split);
        }
        Set<Split> expected = new HashSet<>();
        expected.add(new Split(file1.getName(), BLOCKSIZE, 0));
        expected.add(new Split(file2.getName(), BLOCKSIZE, 0));
        expected.add(new Split(file2.getName(), BLOCKSIZE, BLOCKSIZE));
        expected.add(new Split(file3.getName(), BLOCKSIZE, 0));
        expected.add(new Split(file3.getName(), BLOCKSIZE, BLOCKSIZE));
        expected.add(new Split(file3.getName(), BLOCKSIZE, BLOCKSIZE * 2));
        List<Split> actual = new ArrayList<>();
        for (InputSplit split : splits) {
            fileSplit = (CombineFileSplit) split;
            /**
         * If rack1 is processed first by
         * {@link CombineFileInputFormat#createSplits},
         * create only one split on rack1.
         * If rack2 or rack3 is processed first and rack1 is processed second,
         * create one split on rack2 or rack3 and the other split is on rack1.
         * Otherwise create 3 splits for each rack.
         */
            if (splits.size() == 3) {
                // first split is on rack3, contains file3
                if (split.equals(splits.get(0))) {
                    assertEquals(3, fileSplit.getNumPaths());
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(file3.getName(), fileSplit.getPath(0).getName());
                    assertEquals(0, fileSplit.getOffset(0));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(0));
                    assertEquals(file3.getName(), fileSplit.getPath(1).getName());
                    assertEquals(BLOCKSIZE, fileSplit.getOffset(1));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(1));
                    assertEquals(file3.getName(), fileSplit.getPath(2).getName());
                    assertEquals(2 * BLOCKSIZE, fileSplit.getOffset(2));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(2));
                    assertEquals(hosts3[0], fileSplit.getLocations()[0]);
                }
                // second split is on rack2, contains file2
                if (split.equals(splits.get(1))) {
                    assertEquals(2, fileSplit.getNumPaths());
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(file2.getName(), fileSplit.getPath(0).getName());
                    assertEquals(0, fileSplit.getOffset(0));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(0));
                    assertEquals(file2.getName(), fileSplit.getPath(1).getName());
                    assertEquals(BLOCKSIZE, fileSplit.getOffset(1));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(1));
                    assertEquals(hosts2[0], fileSplit.getLocations()[0]);
                }
                // third split is on rack1, contains file1
                if (split.equals(splits.get(2))) {
                    assertEquals(1, fileSplit.getNumPaths());
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(file1.getName(), fileSplit.getPath(0).getName());
                    assertEquals(0, fileSplit.getOffset(0));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(0));
                    assertEquals(hosts1[0], fileSplit.getLocations()[0]);
                }
            } else if (splits.size() == 2) {
                // first split is on rack2 or rack3, contains one or two files.
                if (split.equals(splits.get(0))) {
                    assertEquals(1, fileSplit.getLocations().length);
                    if (fileSplit.getLocations()[0].equals(hosts2[0])) {
                        assertEquals(2, fileSplit.getNumPaths());
                    } else if (fileSplit.getLocations()[0].equals(hosts3[0])) {
                        assertEquals(3, fileSplit.getNumPaths());
                    } else {
                        fail("First split should be on rack2 or rack3.");
                    }
                }
                // second split is on rack1, contains the remaining files.
                if (split.equals(splits.get(1))) {
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(hosts1[0], fileSplit.getLocations()[0]);
                }
            } else if (splits.size() == 1) {
                // first split is on rack1, contains all three files.
                assertEquals(1, fileSplit.getLocations().length);
                assertEquals(6, fileSplit.getNumPaths());
                assertEquals(hosts1[0], fileSplit.getLocations()[0]);
            } else {
                fail("Split size should be 1, 2, or 3.");
            }
            for (int i = 0; i < fileSplit.getNumPaths(); i++) {
                String name = fileSplit.getPath(i).getName();
                long length = fileSplit.getLength(i);
                long offset = fileSplit.getOffset(i);
                actual.add(new Split(name, length, offset));
            }
        }
        assertEquals(6, actual.size());
        assertTrue(actual.containsAll(expected));
        // create file4 on all three racks
        Path file4 = new Path(dir4 + "/file4");
        writeFile(conf, file4, (short) 3, 3);
        inFormat = new DummyInputFormat();
        FileInputFormat.setInputPaths(job, dir1 + "," + dir2 + "," + dir3 + "," + dir4);
        inFormat.setMinSplitSizeRack(BLOCKSIZE);
        splits = inFormat.getSplits(job);
        for (InputSplit split : splits) {
            System.out.println("File split(Test3): " + split);
        }
        expected.add(new Split(file4.getName(), BLOCKSIZE, 0));
        expected.add(new Split(file4.getName(), BLOCKSIZE, BLOCKSIZE));
        expected.add(new Split(file4.getName(), BLOCKSIZE, BLOCKSIZE * 2));
        actual.clear();
        for (InputSplit split : splits) {
            fileSplit = (CombineFileSplit) split;
            /**
         * If rack1 is processed first by
         * {@link CombineFileInputFormat#createSplits},
         * create only one split on rack1.
         * If rack2 or rack3 is processed first and rack1 is processed second,
         * create one split on rack2 or rack3 and the other split is on rack1.
         * Otherwise create 3 splits for each rack.
         */
            if (splits.size() == 3) {
                // first split is on rack3, contains file3 and file4
                if (split.equals(splits.get(0))) {
                    assertEquals(6, fileSplit.getNumPaths());
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(hosts3[0], fileSplit.getLocations()[0]);
                }
                // second split is on rack2, contains file2
                if (split.equals(splits.get(1))) {
                    assertEquals(2, fileSplit.getNumPaths());
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(file2.getName(), fileSplit.getPath(0).getName());
                    assertEquals(0, fileSplit.getOffset(0));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(0));
                    assertEquals(file2.getName(), fileSplit.getPath(1).getName());
                    assertEquals(BLOCKSIZE, fileSplit.getOffset(1));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(1));
                    assertEquals(hosts2[0], fileSplit.getLocations()[0]);
                }
                // third split is on rack1, contains file1
                if (split.equals(splits.get(2))) {
                    assertEquals(1, fileSplit.getNumPaths());
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(file1.getName(), fileSplit.getPath(0).getName());
                    assertEquals(0, fileSplit.getOffset(0));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(0));
                    assertEquals(hosts1[0], fileSplit.getLocations()[0]);
                }
            } else if (splits.size() == 2) {
                // first split is on rack2 or rack3, contains two or three files.
                if (split.equals(splits.get(0))) {
                    assertEquals(1, fileSplit.getLocations().length);
                    if (fileSplit.getLocations()[0].equals(hosts2[0])) {
                        assertEquals(5, fileSplit.getNumPaths());
                    } else if (fileSplit.getLocations()[0].equals(hosts3[0])) {
                        assertEquals(6, fileSplit.getNumPaths());
                    } else {
                        fail("First split should be on rack2 or rack3.");
                    }
                }
                // second split is on rack1, contains the remaining files.
                if (split.equals(splits.get(1))) {
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(hosts1[0], fileSplit.getLocations()[0]);
                }
            } else if (splits.size() == 1) {
                // first split is on rack1, contains all four files.
                assertEquals(1, fileSplit.getLocations().length);
                assertEquals(9, fileSplit.getNumPaths());
                assertEquals(hosts1[0], fileSplit.getLocations()[0]);
            } else {
                fail("Split size should be 1, 2, or 3.");
            }
            for (int i = 0; i < fileSplit.getNumPaths(); i++) {
                String name = fileSplit.getPath(i).getName();
                long length = fileSplit.getLength(i);
                long offset = fileSplit.getOffset(i);
                actual.add(new Split(name, length, offset));
            }
        }
        assertEquals(9, actual.size());
        assertTrue(actual.containsAll(expected));
        // maximum split size is 2 blocks 
        inFormat = new DummyInputFormat();
        inFormat.setMinSplitSizeNode(BLOCKSIZE);
        inFormat.setMaxSplitSize(2 * BLOCKSIZE);
        FileInputFormat.setInputPaths(job, dir1 + "," + dir2 + "," + dir3 + "," + dir4);
        splits = inFormat.getSplits(job);
        for (InputSplit split : splits) {
            System.out.println("File split(Test4): " + split);
        }
        assertEquals(5, splits.size());
        actual.clear();
        reset(mockList);
        for (InputSplit split : splits) {
            fileSplit = (CombineFileSplit) split;
            for (int i = 0; i < fileSplit.getNumPaths(); i++) {
                String name = fileSplit.getPath(i).getName();
                long length = fileSplit.getLength(i);
                long offset = fileSplit.getOffset(i);
                actual.add(new Split(name, length, offset));
            }
            mockList.add(fileSplit.getLocations()[0]);
        }
        assertEquals(9, actual.size());
        assertTrue(actual.containsAll(expected));
        // verify the splits are on all the racks
        verify(mockList, atLeastOnce()).add(hosts1[0]);
        verify(mockList, atLeastOnce()).add(hosts2[0]);
        verify(mockList, atLeastOnce()).add(hosts3[0]);
        // maximum split size is 3 blocks 
        inFormat = new DummyInputFormat();
        inFormat.setMinSplitSizeNode(BLOCKSIZE);
        inFormat.setMaxSplitSize(3 * BLOCKSIZE);
        FileInputFormat.setInputPaths(job, dir1 + "," + dir2 + "," + dir3 + "," + dir4);
        splits = inFormat.getSplits(job);
        for (InputSplit split : splits) {
            System.out.println("File split(Test5): " + split);
        }
        assertEquals(3, splits.size());
        actual.clear();
        reset(mockList);
        for (InputSplit split : splits) {
            fileSplit = (CombineFileSplit) split;
            for (int i = 0; i < fileSplit.getNumPaths(); i++) {
                String name = fileSplit.getPath(i).getName();
                long length = fileSplit.getLength(i);
                long offset = fileSplit.getOffset(i);
                actual.add(new Split(name, length, offset));
            }
            mockList.add(fileSplit.getLocations()[0]);
        }
        assertEquals(9, actual.size());
        assertTrue(actual.containsAll(expected));
        verify(mockList, atLeastOnce()).add(hosts1[0]);
        verify(mockList, atLeastOnce()).add(hosts2[0]);
        // maximum split size is 4 blocks 
        inFormat = new DummyInputFormat();
        inFormat.setMaxSplitSize(4 * BLOCKSIZE);
        FileInputFormat.setInputPaths(job, dir1 + "," + dir2 + "," + dir3 + "," + dir4);
        splits = inFormat.getSplits(job);
        for (InputSplit split : splits) {
            System.out.println("File split(Test6): " + split);
        }
        assertEquals(3, splits.size());
        actual.clear();
        reset(mockList);
        for (InputSplit split : splits) {
            fileSplit = (CombineFileSplit) split;
            for (int i = 0; i < fileSplit.getNumPaths(); i++) {
                String name = fileSplit.getPath(i).getName();
                long length = fileSplit.getLength(i);
                long offset = fileSplit.getOffset(i);
                actual.add(new Split(name, length, offset));
            }
            mockList.add(fileSplit.getLocations()[0]);
        }
        assertEquals(9, actual.size());
        assertTrue(actual.containsAll(expected));
        verify(mockList, atLeastOnce()).add(hosts1[0]);
        // maximum split size is 7 blocks and min is 3 blocks
        inFormat = new DummyInputFormat();
        inFormat.setMaxSplitSize(7 * BLOCKSIZE);
        inFormat.setMinSplitSizeNode(3 * BLOCKSIZE);
        inFormat.setMinSplitSizeRack(3 * BLOCKSIZE);
        FileInputFormat.setInputPaths(job, dir1 + "," + dir2 + "," + dir3 + "," + dir4);
        splits = inFormat.getSplits(job);
        for (InputSplit split : splits) {
            System.out.println("File split(Test7): " + split);
        }
        assertEquals(2, splits.size());
        actual.clear();
        reset(mockList);
        for (InputSplit split : splits) {
            fileSplit = (CombineFileSplit) split;
            for (int i = 0; i < fileSplit.getNumPaths(); i++) {
                String name = fileSplit.getPath(i).getName();
                long length = fileSplit.getLength(i);
                long offset = fileSplit.getOffset(i);
                actual.add(new Split(name, length, offset));
            }
            mockList.add(fileSplit.getLocations()[0]);
        }
        assertEquals(9, actual.size());
        assertTrue(actual.containsAll(expected));
        verify(mockList, atLeastOnce()).add(hosts1[0]);
        // Rack 1 has file1, file2 and file3 and file4
        // Rack 2 has file2 and file3 and file4
        // Rack 3 has file3 and file4
        // setup a filter so that only (file1 and file2) or (file3 and file4)
        // can be combined
        inFormat = new DummyInputFormat();
        FileInputFormat.addInputPath(job, inDir);
        // everything is at least rack local
        inFormat.setMinSplitSizeRack(1);
        inFormat.createPool(new TestFilter(dir1), new TestFilter(dir2));
        splits = inFormat.getSplits(job);
        for (InputSplit split : splits) {
            System.out.println("File split(Test1): " + split);
        }
        for (InputSplit split : splits) {
            fileSplit = (CombineFileSplit) split;
            if (splits.size() == 2) {
                // first split is on rack1, contains file1 and file2.
                if (split.equals(splits.get(0))) {
                    assertEquals(3, fileSplit.getNumPaths());
                    expected.clear();
                    expected.add(new Split(file1.getName(), BLOCKSIZE, 0));
                    expected.add(new Split(file2.getName(), BLOCKSIZE, 0));
                    expected.add(new Split(file2.getName(), BLOCKSIZE, BLOCKSIZE));
                    actual.clear();
                    for (int i = 0; i < 3; i++) {
                        String name = fileSplit.getPath(i).getName();
                        long length = fileSplit.getLength(i);
                        long offset = fileSplit.getOffset(i);
                        actual.add(new Split(name, length, offset));
                    }
                    assertTrue(actual.containsAll(expected));
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(hosts1[0], fileSplit.getLocations()[0]);
                }
                if (split.equals(splits.get(1))) {
                    // second split contains file3 and file4; however,
                    // its location is nondeterministic.
                    assertEquals(6, fileSplit.getNumPaths());
                    expected.clear();
                    expected.add(new Split(file3.getName(), BLOCKSIZE, 0));
                    expected.add(new Split(file3.getName(), BLOCKSIZE, BLOCKSIZE));
                    expected.add(new Split(file3.getName(), BLOCKSIZE, BLOCKSIZE * 2));
                    expected.add(new Split(file4.getName(), BLOCKSIZE, 0));
                    expected.add(new Split(file4.getName(), BLOCKSIZE, BLOCKSIZE));
                    expected.add(new Split(file4.getName(), BLOCKSIZE, BLOCKSIZE * 2));
                    actual.clear();
                    for (int i = 0; i < 6; i++) {
                        String name = fileSplit.getPath(i).getName();
                        long length = fileSplit.getLength(i);
                        long offset = fileSplit.getOffset(i);
                        actual.add(new Split(name, length, offset));
                    }
                    assertTrue(actual.containsAll(expected));
                    assertEquals(1, fileSplit.getLocations().length);
                }
            } else if (splits.size() == 3) {
                if (split.equals(splits.get(0))) {
                    // first split is on rack2, contains file2
                    assertEquals(2, fileSplit.getNumPaths());
                    expected.clear();
                    expected.add(new Split(file2.getName(), BLOCKSIZE, 0));
                    expected.add(new Split(file2.getName(), BLOCKSIZE, BLOCKSIZE));
                    actual.clear();
                    for (int i = 0; i < 2; i++) {
                        String name = fileSplit.getPath(i).getName();
                        long length = fileSplit.getLength(i);
                        long offset = fileSplit.getOffset(i);
                        actual.add(new Split(name, length, offset));
                    }
                    assertTrue(actual.containsAll(expected));
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(hosts2[0], fileSplit.getLocations()[0]);
                }
                if (split.equals(splits.get(1))) {
                    // second split is on rack1, contains file1
                    assertEquals(1, fileSplit.getNumPaths());
                    assertEquals(file1.getName(), fileSplit.getPath(0).getName());
                    assertEquals(BLOCKSIZE, fileSplit.getLength(0));
                    assertEquals(0, fileSplit.getOffset(0));
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(hosts1[0], fileSplit.getLocations()[0]);
                }
                if (split.equals(splits.get(2))) {
                    // third split contains file3 and file4; however,
                    // its location is nondeterministic.
                    assertEquals(6, fileSplit.getNumPaths());
                    expected.clear();
                    expected.add(new Split(file3.getName(), BLOCKSIZE, 0));
                    expected.add(new Split(file3.getName(), BLOCKSIZE, BLOCKSIZE));
                    expected.add(new Split(file3.getName(), BLOCKSIZE, BLOCKSIZE * 2));
                    expected.add(new Split(file4.getName(), BLOCKSIZE, 0));
                    expected.add(new Split(file4.getName(), BLOCKSIZE, BLOCKSIZE));
                    expected.add(new Split(file4.getName(), BLOCKSIZE, BLOCKSIZE * 2));
                    actual.clear();
                    for (int i = 0; i < 6; i++) {
                        String name = fileSplit.getPath(i).getName();
                        long length = fileSplit.getLength(i);
                        long offset = fileSplit.getOffset(i);
                        actual.add(new Split(name, length, offset));
                    }
                    assertTrue(actual.containsAll(expected));
                    assertEquals(1, fileSplit.getLocations().length);
                }
            } else {
                fail("Split size should be 2 or 3.");
            }
        }
        // measure performance when there are multiple pools and
        // many files in each pool.
        int numPools = 100;
        int numFiles = 1000;
        DummyInputFormat1 inFormat1 = new DummyInputFormat1();
        for (int i = 0; i < numFiles; i++) {
            FileInputFormat.setInputPaths(job, file1);
        }
        // everything is at least rack local
        inFormat1.setMinSplitSizeRack(1);
        final Path dirNoMatch1 = new Path(inDir, "/dirxx");
        final Path dirNoMatch2 = new Path(inDir, "/diryy");
        for (int i = 0; i < numPools; i++) {
            inFormat1.createPool(new TestFilter(dirNoMatch1), new TestFilter(dirNoMatch2));
        }
        long start = System.currentTimeMillis();
        splits = inFormat1.getSplits(job);
        long end = System.currentTimeMillis();
        System.out.println("Elapsed time for " + numPools + " pools " + " and " + numFiles + " files is " + ((end - start) / 1000) + " seconds.");
        // This file has three whole blocks. If the maxsplit size is
        // half the block size, then there should be six splits.
        inFormat = new DummyInputFormat();
        inFormat.setMaxSplitSize(BLOCKSIZE / 2);
        FileInputFormat.setInputPaths(job, dir3);
        splits = inFormat.getSplits(job);
        for (InputSplit split : splits) {
            System.out.println("File split(Test8): " + split);
        }
        assertEquals(6, splits.size());
    } finally {
        if (dfs != null) {
            dfs.shutdown();
        }
    }
}
Also used : Set(java.util.Set) HashSet(java.util.HashSet) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) FileSystem(org.apache.hadoop.fs.FileSystem) DistributedFileSystem(org.apache.hadoop.hdfs.DistributedFileSystem) List(java.util.List) ArrayList(java.util.ArrayList) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit) HashSet(java.util.HashSet) Path(org.apache.hadoop.fs.Path) MiniDFSCluster(org.apache.hadoop.hdfs.MiniDFSCluster) IOException(java.io.IOException) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Test(org.junit.Test)
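
The test drives CombineFileInputFormat's three sizing knobs (max split size, min split size per node, min split size per rack) through protected setters on a subclass. In a normal driver the same knobs are set through configuration; the sketch below uses Hadoop 2.x property names and illustrative sizes, and is an assumption about typical usage rather than code from this test.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CombineSplitSizing {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Upper bound on a combined split; small files are packed up to this size.
        conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 256L * 1024 * 1024);
        // Minimum bytes to combine per node / per rack before spilling to the next level.
        conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.node", 64L * 1024 * 1024);
        conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.rack", 64L * 1024 * 1024);

        Job job = Job.getInstance(conf, "combine-small-files");
        job.setJarByClass(CombineSplitSizing.class);
        job.setInputFormatClass(CombineTextInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}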

Aggregations

Job (org.apache.hadoop.mapreduce.Job)886 Path (org.apache.hadoop.fs.Path)498 Configuration (org.apache.hadoop.conf.Configuration)434 Test (org.junit.Test)259 IOException (java.io.IOException)135 FileSystem (org.apache.hadoop.fs.FileSystem)128 File (java.io.File)77 InputSplit (org.apache.hadoop.mapreduce.InputSplit)58 ArrayList (java.util.ArrayList)55 TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext)55 Scan (org.apache.hadoop.hbase.client.Scan)45 FileStatus (org.apache.hadoop.fs.FileStatus)44 NutchJob (org.apache.nutch.util.NutchJob)43 JobConf (org.apache.hadoop.mapred.JobConf)42 Text (org.apache.hadoop.io.Text)39 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)36 HBaseConfiguration (org.apache.hadoop.hbase.HBaseConfiguration)35 JobContext (org.apache.hadoop.mapreduce.JobContext)35 GenericOptionsParser (org.apache.hadoop.util.GenericOptionsParser)35 CommandLine (org.apache.commons.cli.CommandLine)33