Example 21 with Cluster

Use of org.apache.hadoop.mapreduce.Cluster in project hadoop by apache.

Taken from the class DistCp, method createMetaFolderPath.

/**
   * Create a default working folder for the job, under the
   * job staging directory
   *
   * @return Returns the working folder information
   * @throws Exception - Exception if any
   */
private Path createMetaFolderPath() throws Exception {
    Configuration configuration = getConf();
    Path stagingDir = JobSubmissionFiles.getStagingDir(new Cluster(configuration), configuration);
    Path metaFolderPath = new Path(stagingDir, PREFIX + String.valueOf(rand.nextInt()));
    if (LOG.isDebugEnabled())
        LOG.debug("Meta folder location: " + metaFolderPath);
    configuration.set(DistCpConstants.CONF_LABEL_META_FOLDER, metaFolderPath.toString());
    return metaFolderPath;
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), Cluster (org.apache.hadoop.mapreduce.Cluster)
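
Both examples on this page use Cluster only to resolve the MapReduce job staging directory. As a standalone illustration, a minimal sketch of that lookup might look like the following (the class name and the bare new Configuration() are assumptions for illustration; in practice the configuration comes from the cluster's *-site.xml files on the classpath):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Cluster;
import org.apache.hadoop.mapreduce.JobSubmissionFiles;

public class StagingDirLookup {
    public static void main(String[] args) throws Exception {
        // Assumed: configuration is picked up from core-site.xml / mapred-site.xml.
        Configuration conf = new Configuration();
        // Cluster is the client-side view of the MapReduce cluster.
        Cluster cluster = new Cluster(conf);
        try {
            // Same call DistCp uses to find the per-user job staging directory.
            Path stagingDir = JobSubmissionFiles.getStagingDir(cluster, conf);
            System.out.println("Job staging directory: " + stagingDir);
        } finally {
            cluster.close();
        }
    }
}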

Example 22 with Cluster

Use of org.apache.hadoop.mapreduce.Cluster in project hadoop by apache.

Taken from the class HadoopArchives, method archive.

/**
   * Archive the given source paths into the destination.
   * @param parentPath the parent path of all the source paths
   * @param srcPaths the source paths to be archived
   * @param archiveName the name of the archive to create
   * @param dest the destination directory that will contain the archive
   * @throws IOException if the archive job cannot be set up or run
   */
void archive(Path parentPath, List<Path> srcPaths, String archiveName, Path dest) throws IOException {
    checkPaths(conf, srcPaths);
    int numFiles = 0;
    long totalSize = 0;
    FileSystem fs = parentPath.getFileSystem(conf);
    this.blockSize = conf.getLong(HAR_BLOCKSIZE_LABEL, blockSize);
    this.partSize = conf.getLong(HAR_PARTSIZE_LABEL, partSize);
    conf.setLong(HAR_BLOCKSIZE_LABEL, blockSize);
    conf.setLong(HAR_PARTSIZE_LABEL, partSize);
    conf.set(DST_HAR_LABEL, archiveName);
    conf.set(SRC_PARENT_LABEL, parentPath.makeQualified(fs).toString());
    conf.setInt(HAR_REPLICATION_LABEL, repl);
    Path outputPath = new Path(dest, archiveName);
    FileOutputFormat.setOutputPath(conf, outputPath);
    FileSystem outFs = outputPath.getFileSystem(conf);
    if (outFs.exists(outputPath)) {
        throw new IOException("Archive path: " + outputPath.toString() + " already exists");
    }
    if (outFs.isFile(dest)) {
        throw new IOException("Destination " + dest.toString() + " should be a directory but is a file");
    }
    conf.set(DST_DIR_LABEL, outputPath.toString());
    Path stagingArea;
    try {
        stagingArea = JobSubmissionFiles.getStagingDir(new Cluster(conf), conf);
    } catch (InterruptedException ie) {
        throw new IOException(ie);
    }
    Path jobDirectory = new Path(stagingArea, NAME + "_" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE), 36));
    FsPermission mapredSysPerms = new FsPermission(JobSubmissionFiles.JOB_DIR_PERMISSION);
    FileSystem.mkdirs(jobDirectory.getFileSystem(conf), jobDirectory, mapredSysPerms);
    conf.set(JOB_DIR_LABEL, jobDirectory.toString());
    //get a tmp directory for input splits
    FileSystem jobfs = jobDirectory.getFileSystem(conf);
    Path srcFiles = new Path(jobDirectory, "_har_src_files");
    conf.set(SRC_LIST_LABEL, srcFiles.toString());
    SequenceFile.Writer srcWriter = SequenceFile.createWriter(jobfs, conf, srcFiles, LongWritable.class, HarEntry.class, SequenceFile.CompressionType.NONE);
    // create single list of files and dirs
    try {
        // write the top-level dirs first
        writeTopLevelDirs(srcWriter, srcPaths, parentPath);
        srcWriter.sync();
        // one at a time
        for (Path src : srcPaths) {
            ArrayList<FileStatusDir> allFiles = new ArrayList<FileStatusDir>();
            FileStatus fstatus = fs.getFileStatus(src);
            FileStatusDir fdir = new FileStatusDir(fstatus, null);
            recursivels(fs, fdir, allFiles);
            for (FileStatusDir statDir : allFiles) {
                FileStatus stat = statDir.getFileStatus();
                long len = stat.isDirectory() ? 0 : stat.getLen();
                final Path path = relPathToRoot(stat.getPath(), parentPath);
                final String[] children;
                if (stat.isDirectory()) {
                    //get the children 
                    FileStatus[] list = statDir.getChildren();
                    children = new String[list.length];
                    for (int i = 0; i < list.length; i++) {
                        children[i] = list[i].getPath().getName();
                    }
                } else {
                    children = null;
                }
                append(srcWriter, len, path.toString(), children);
                srcWriter.sync();
                numFiles++;
                totalSize += len;
            }
        }
    } finally {
        srcWriter.close();
    }
    conf.setInt(SRC_COUNT_LABEL, numFiles);
    conf.setLong(TOTAL_SIZE_LABEL, totalSize);
    int numMaps = (int) (totalSize / partSize);
    // run at least one map.
    conf.setNumMapTasks(numMaps == 0 ? 1 : numMaps);
    conf.setNumReduceTasks(1);
    conf.setInputFormat(HArchiveInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(HArchivesMapper.class);
    conf.setReducerClass(HArchivesReducer.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    FileInputFormat.addInputPath(conf, jobDirectory);
    //make sure no speculative execution is done
    conf.setSpeculativeExecution(false);
    JobClient.runJob(conf);
    //delete the tmp job directory
    try {
        jobfs.delete(jobDirectory, true);
    } catch (IOException ie) {
        LOG.info("Unable to clean tmp directory " + jobDirectory);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), FileStatus (org.apache.hadoop.fs.FileStatus), ArrayList (java.util.ArrayList), Cluster (org.apache.hadoop.mapreduce.Cluster), IOException (java.io.IOException), Random (java.util.Random), SequenceFile (org.apache.hadoop.io.SequenceFile), FileSystem (org.apache.hadoop.fs.FileSystem), HarFileSystem (org.apache.hadoop.fs.HarFileSystem), FsPermission (org.apache.hadoop.fs.permission.FsPermission)
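
HadoopArchives.archive() above is normally reached through the Tool interface rather than called directly, with arguments that mirror the hadoop archive command line. A minimal, hypothetical driver under that assumption (the archive name, parent path, source, and destination below are placeholders):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.tools.HadoopArchives;
import org.apache.hadoop.util.ToolRunner;

public class HarDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Equivalent to: hadoop archive -archiveName foo.har -p /user/hadoop dir1 /outputdir
        // All paths and the archive name below are placeholders.
        String[] harArgs = {
            "-archiveName", "foo.har",
            "-p", "/user/hadoop",
            "dir1",
            "/outputdir"
        };
        int exitCode = ToolRunner.run(new HadoopArchives(conf), harArgs);
        System.exit(exitCode);
    }
}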

Aggregations

Cluster (org.apache.hadoop.mapreduce.Cluster): 22 usages
Test (org.junit.Test): 17 usages
Configuration (org.apache.hadoop.conf.Configuration): 12 usages
Job (org.apache.hadoop.mapreduce.Job): 11 usages
Path (org.apache.hadoop.fs.Path): 5 usages
IOException (java.io.IOException): 4 usages
JobID (org.apache.hadoop.mapreduce.JobID): 4 usages
TaskReport (org.apache.hadoop.mapreduce.TaskReport): 4 usages
ArrayList (java.util.ArrayList): 2 usages
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 1 usage
PrintWriter (java.io.PrintWriter): 1 usage
Random (java.util.Random): 1 usage
FileStatus (org.apache.hadoop.fs.FileStatus): 1 usage
FileSystem (org.apache.hadoop.fs.FileSystem): 1 usage
HarFileSystem (org.apache.hadoop.fs.HarFileSystem): 1 usage
FsPermission (org.apache.hadoop.fs.permission.FsPermission): 1 usage
BackupCopyJob (org.apache.hadoop.hbase.backup.BackupCopyJob): 1 usage
SequenceFile (org.apache.hadoop.io.SequenceFile): 1 usage
JobStatus (org.apache.hadoop.mapreduce.JobStatus): 1 usage
TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID): 1 usage
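
The aggregation shows Cluster most often appearing alongside Job, JobID, and TaskReport, which matches the typical client-side pattern of looking a job up and reading its task reports. A minimal sketch of that combination, assuming a job id is supplied on the command line (the class name and output format are illustrative only):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Cluster;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.TaskReport;
import org.apache.hadoop.mapreduce.TaskType;

public class MapTaskReportDump {
    public static void main(String[] args) throws Exception {
        // Assumed: a job id such as job_1400000000000_0001 is passed as the first argument.
        JobID jobId = JobID.forName(args[0]);
        Cluster cluster = new Cluster(new Configuration());
        try {
            Job job = cluster.getJob(jobId); // null if the cluster no longer tracks this job
            if (job != null) {
                for (TaskReport report : job.getTaskReports(TaskType.MAP)) {
                    System.out.println(report.getTaskID() + " progress=" + report.getProgress());
                }
            }
        } finally {
            cluster.close();
        }
    }
}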