Example 6 with Counters

use of org.apache.hadoop.mapred.Counters in project Cloud9 by lintool.

the class HadoopAlign method doAlignment.

@SuppressWarnings("deprecation")
public static void doAlignment(int mapTasks, int reduceTasks, HadoopAlignConfig hac) throws IOException {
    System.out.println("Running alignment: " + hac);
    FileSystem fs = FileSystem.get(hac);
    Path cbtxt = new Path(hac.getRoot() + "/comp-bitext");
    // fs.delete(cbtxt, true);
    if (!fs.exists(cbtxt)) {
        CorpusVocabNormalizerAndNumberizer.preprocessAndNumberizeFiles(hac, hac.getBitexts(), cbtxt);
    }
    System.out.println("Finished preprocessing");
    int m1iters = hac.getModel1Iterations();
    int hmmiters = hac.getHMMIterations();
    int totalIterations = m1iters + hmmiters;
    String modelType = null;
    ArrayList<Double> perps = new ArrayList<Double>();
    ArrayList<Double> aers = new ArrayList<Double>();
    boolean hmm = false;
    boolean firstHmm = true;
    Path model1PosteriorsPath = null;
    for (int iteration = 0; iteration < totalIterations; iteration++) {
        long start = System.currentTimeMillis();
        // note: the key is spelled "posterios" here in the source, while the
        // commented-out setting near the end of the loop spells it "posteriors"
        hac.setBoolean("ha.generate.posterios", false);
        boolean lastIteration = (iteration == totalIterations - 1);
        boolean lastModel1Iteration = (iteration == m1iters - 1);
        if (iteration >= m1iters)
            hmm = true;
        if (hmm)
            modelType = "HMM";
        else
            modelType = "Model1";
        FileSystem fileSys = FileSystem.get(hac);
        String sOutputPath = modelType + ".data." + iteration;
        Path outputPath = new Path(sOutputPath);
        try {
            // no probs in first iteration!
            if (usePServer && iteration > 0)
                startPServers(hac);
            System.out.println("Starting iteration " + iteration + (iteration == 0 ? " (initialization)" : "") + ": " + modelType);
            JobConf conf = new JobConf(hac, HadoopAlign.class);
            conf.setJobName("EMTrain." + modelType + ".iter" + iteration);
            conf.setInputFormat(SequenceFileInputFormat.class);
            conf.set(KEY_TRAINER, MODEL1_TRAINER);
            conf.set(KEY_ITERATION, Integer.toString(iteration));
            conf.set("mapred.child.java.opts", "-Xmx2048m");
            if (iteration == 0)
                conf.set(KEY_TRAINER, MODEL1_UNIFORM_INIT);
            if (hmm) {
                conf.set(KEY_TRAINER, HMM_TRAINER);
                if (firstHmm) {
                    firstHmm = false;
                    System.out.println("Writing default a-table...");
                    Path pathATable = hac.getATablePath();
                    fileSys.delete(pathATable, true);
                    DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(fileSys.create(pathATable)));
                    int cond_values = 1;
                    if (!hac.isHMMHomogeneous()) {
                        cond_values = 100;
                    }
                    ATable at = new ATable(hac.isHMMHomogeneous(), cond_values, 100);
                    at.normalize();
                    at.write(dos);
                    // System.out.println(at);
                    dos.close();
                }
            }
            conf.setOutputKeyClass(IntWritable.class);
            conf.setOutputValueClass(PartialCountContainer.class);
            conf.setMapperClass(EMapper.class);
            conf.setReducerClass(EMReducer.class);
            conf.setNumMapTasks(mapTasks);
            conf.setNumReduceTasks(reduceTasks);
            System.out.println("Running job " + conf.getJobName());
            // on the first pass, input is the preprocessed bitext; afterwards it is the output of the last Model 1 iteration
            if (model1PosteriorsPath != null) {
                System.out.println("Input: " + model1PosteriorsPath);
                FileInputFormat.setInputPaths(conf, model1PosteriorsPath);
            } else {
                System.out.println("Input: " + cbtxt);
                FileInputFormat.setInputPaths(conf, cbtxt);
            }
            System.out.println("Output: " + outputPath);
            FileOutputFormat.setOutputPath(conf, new Path(hac.getRoot() + "/" + outputPath.toString()));
            fileSys.delete(new Path(hac.getRoot() + "/" + outputPath.toString()), true);
            conf.setOutputFormat(SequenceFileOutputFormat.class);
            RunningJob job = JobClient.runJob(conf);
            Counters c = job.getCounters();
            double lp = c.getCounter(CrossEntropyCounters.LOGPROB);
            double wc = c.getCounter(CrossEntropyCounters.WORDCOUNT);
            double ce = lp / wc / Math.log(2);
            double perp = Math.pow(2.0, ce);
            double aer = ComputeAER(c);
            System.out.println("Iteration " + iteration + ": (" + modelType + ")\tCROSS-ENTROPY: " + ce + "   PERPLEXITY: " + perp);
            System.out.println("Iteration " + iteration + ": " + aer + " AER");
            aers.add(aer);
            perps.add(perp);
        } finally {
            stopPServers();
        }
        JobConf conf = new JobConf(hac, ModelMergeMapper2.class);
        System.err.println("Setting " + TTABLE_ITERATION_OUTPUT + " to " + outputPath.toString());
        conf.set(TTABLE_ITERATION_OUTPUT, hac.getRoot() + "/" + outputPath.toString());
        conf.setJobName("EMTrain.ModelMerge");
        // conf.setOutputKeyClass(LongWritable.class);
        conf.setMapperClass(ModelMergeMapper2.class);
        conf.setSpeculativeExecution(false);
        conf.setNumMapTasks(1);
        conf.setNumReduceTasks(0);
        conf.setInputFormat(NullInputFormat.class);
        conf.setOutputFormat(NullOutputFormat.class);
        conf.set("mapred.map.child.java.opts", "-Xmx2048m");
        conf.set("mapred.reduce.child.java.opts", "-Xmx2048m");
        // FileInputFormat.setInputPaths(conf, root + "/dummy");
        // fileSys.delete(new Path(root + "/dummy.out"), true);
        // FileOutputFormat.setOutputPath(conf, new Path(root + "/dummy.out"));
        // conf.setOutputFormat(SequenceFileOutputFormat.class);
        System.out.println("Running job " + conf.getJobName());
        System.out.println("Input: " + hac.getRoot() + "/dummy");
        System.out.println("Output: " + hac.getRoot() + "/dummy.out");
        JobClient.runJob(conf);
        fileSys.delete(new Path(hac.getRoot() + "/" + outputPath.toString()), true);
        if (lastIteration || lastModel1Iteration) {
            // hac.setBoolean("ha.generate.posteriors", true);
            conf = new JobConf(hac, HadoopAlign.class);
            sOutputPath = modelType + ".data." + iteration;
            outputPath = new Path(sOutputPath);
            conf.setJobName(modelType + ".align");
            conf.set("mapred.map.child.java.opts", "-Xmx2048m");
            conf.set("mapred.reduce.child.java.opts", "-Xmx2048m");
            // TODO use file cache
            /*
            try {
                if (hmm || iteration > 0) {
                    URI ttable = new URI(fileSys.getHomeDirectory() + Path.SEPARATOR + hac.getTTablePath().toString());
                    DistributedCache.addCacheFile(ttable, conf);
                    System.out.println("cache<-- " + ttable);
                }
            } catch (Exception e) {
                throw new RuntimeException("Caught " + e);
            }
            */
            conf.setInputFormat(SequenceFileInputFormat.class);
            conf.setOutputFormat(SequenceFileOutputFormat.class);
            conf.set(KEY_TRAINER, MODEL1_TRAINER);
            conf.set(KEY_ITERATION, Integer.toString(iteration));
            if (hmm)
                conf.set(KEY_TRAINER, HMM_TRAINER);
            conf.setOutputKeyClass(Text.class);
            conf.setOutputValueClass(PhrasePair.class);
            conf.setMapperClass(AlignMapper.class);
            conf.setReducerClass(IdentityReducer.class);
            conf.setNumMapTasks(mapTasks);
            conf.setNumReduceTasks(reduceTasks);
            FileOutputFormat.setOutputPath(conf, new Path(hac.getRoot() + "/" + outputPath.toString()));
            // if this is the last Model 1 iteration, save the output path to be used as the input path in later iterations
            if (lastModel1Iteration) {
                FileInputFormat.setInputPaths(conf, cbtxt);
                model1PosteriorsPath = new Path(hac.getRoot() + "/" + outputPath.toString());
            } else {
                FileInputFormat.setInputPaths(conf, model1PosteriorsPath);
            }
            // note: this deletes the relative outputPath, unlike the rooted deletes elsewhere in this method
            fileSys.delete(outputPath, true);
            System.out.println("Running job " + conf.getJobName());
            RunningJob job = JobClient.runJob(conf);
            System.out.println("GENERATED: " + model1PosteriorsPath);
            Counters c = job.getCounters();
            double aer = ComputeAER(c);
            // System.out.println("Iteration " + iteration + ": (" + modelType + ")\tCROSS-ENTROPY: " + ce + "   PERPLEXITY: " + perp);
            System.out.println("Iteration " + iteration + ": " + aer + " AER");
            aers.add(aer);
            perps.add(0.0);
        }
        long end = System.currentTimeMillis();
        System.out.println(modelType + " iteration " + iteration + " took " + ((end - start) / 1000) + " seconds.");
    }
    for (int i = 0; i < perps.size(); i++) {
        System.out.print("I=" + i + "\t");
        if (aers.size() > 0) {
            System.out.print(aers.get(i) + "\t");
        }
        System.out.println(perps.get(i));
    }
}
Also used : Path(org.apache.hadoop.fs.Path) DataOutputStream(java.io.DataOutputStream) ArrayList(java.util.ArrayList) ATable(edu.umd.hooka.alignment.hmm.ATable) FileSystem(org.apache.hadoop.fs.FileSystem) RunningJob(org.apache.hadoop.mapred.RunningJob) Counters(org.apache.hadoop.mapred.Counters) JobConf(org.apache.hadoop.mapred.JobConf) BufferedOutputStream(java.io.BufferedOutputStream)
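
The loop above turns two enum-keyed counters into cross-entropy and perplexity. A minimal, self-contained sketch of that read-side pattern (the CrossEntropyCounters enum below is a stand-in with the shape the code implies; the real one lives in Cloud9 and is not shown in this excerpt):

import java.io.IOException;

import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.RunningJob;

public class PerplexitySketch {

    // Stand-in for Cloud9's CrossEntropyCounters enum (assumed shape).
    enum CrossEntropyCounters { LOGPROB, WORDCOUNT }

    // Recomputes the per-iteration statistics the same way doAlignment() does.
    static double perplexity(RunningJob job) throws IOException {
        Counters c = job.getCounters();
        // summed log-probabilities accumulated by the trainers
        double lp = c.getCounter(CrossEntropyCounters.LOGPROB);
        // total number of words processed
        double wc = c.getCounter(CrossEntropyCounters.WORDCOUNT);
        // bits per word, then perplexity = 2^cross-entropy
        double ce = lp / wc / Math.log(2);
        return Math.pow(2.0, ce);
    }
}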

Example 7 with Counters

use of org.apache.hadoop.mapred.Counters in project Cloud9 by lintool.

the class BuildAnchorTextForwardIndex method run.

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }
    JobConf conf = new JobConf(getConf());
    FileSystem fs = FileSystem.get(conf);
    String collectionPath = args[0];
    String outputPath = args[1];
    String indexFile = args[2];
    LOG.info("Tool name: BuildAnchorTextForwardIndex");
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info("Note: This tool only works on block-compressed SequenceFiles!");
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setJobName("BuildAnchorTextForwardIndex");
    conf.setNumMapTasks(100);
    conf.setNumReduceTasks(1);
    FileInputFormat.setInputPaths(conf, new Path(collectionPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);
    conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(IdentityReducer.class);
    // delete the output directory if it exists already
    fs.delete(new Path(outputPath), true);
    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int blocks = (int) counters.findCounter(Blocks.Total).getCounter();
    LOG.info("number of blocks: " + blocks);
    LOG.info("Writing index file...");
    LineReader reader = new LineReader(fs.open(new Path(outputPath + "/part-00000")));
    FSDataOutputStream out = fs.create(new Path(indexFile), true);
    out.writeUTF(IndexableAnchorTextForwardIndex.class.getName());
    out.writeUTF(collectionPath);
    out.writeInt(blocks);
    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\s+");
        int docno = Integer.parseInt(arr[0]);
        int offset = Integer.parseInt(arr[1]);
        short fileno = Short.parseShort(arr[2]);
        out.writeInt(docno);
        out.writeInt(offset);
        out.writeShort(fileno);
        cnt++;
        if (cnt % 1000 == 0) {
            LOG.info(cnt + " blocks written");
        }
    }
    reader.close();
    out.close();
    if (cnt != blocks) {
        throw new RuntimeException("Error: mismatch in block count!");
    }
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) Text(org.apache.hadoop.io.Text) AnchorText(edu.umd.cloud9.webgraph.data.AnchorText) IndexableAnchorTextForwardIndex(edu.umd.cloud9.webgraph.data.IndexableAnchorTextForwardIndex) FileSystem(org.apache.hadoop.fs.FileSystem) LineReader(org.apache.hadoop.util.LineReader) RunningJob(org.apache.hadoop.mapred.RunningJob) Counters(org.apache.hadoop.mapred.Counters) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) JobConf(org.apache.hadoop.mapred.JobConf)
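
The index file written at the end of run() has a simple binary layout: a UTF string naming the index class, a UTF collection path, an int block count, then one (docno, offset, fileno) record per block. A hedged sketch of a matching reader ("index.dat" is a placeholder for whatever was passed as indexFile):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ForwardIndexReaderSketch {
    public static void main(String[] args) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());
        FSDataInputStream in = fs.open(new Path("index.dat"));
        String indexClass = in.readUTF();  // IndexableAnchorTextForwardIndex class name
        String collection = in.readUTF();  // collection path the index was built from
        int blocks = in.readInt();         // number of records that follow
        for (int i = 0; i < blocks; i++) {
            int docno = in.readInt();      // docno recorded for block i
            int offset = in.readInt();     // byte offset of block i
            short fileno = in.readShort(); // part-file that holds block i
            // block i -> (docno, offset, fileno) is now available for lookups
        }
        in.close();
    }
}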

Example 8 with Counters

use of org.apache.hadoop.mapred.Counters in project ambrose by twitter.

the class CascadingJob method setJobStats.

@JsonIgnore
public void setJobStats(HadoopStepStats stats) {
    Counters counters = new Counters();
    for (String groupName : stats.getCounterGroups()) {
        for (String counterName : stats.getCountersFor(groupName)) {
            Long counterValue = stats.getCounterValue(groupName, counterName);
            counters.findCounter(groupName, counterName).setValue(counterValue);
        }
    }
    setCounterGroupMap(CounterGroup.counterGroupsByName(counters));
}
Also used : Counters(org.apache.hadoop.mapred.Counters) JsonIgnore(com.fasterxml.jackson.annotation.JsonIgnore)
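
setJobStats() copies Cascading's HadoopStepStats counters into a mapred Counters instance with findCounter(group, name).setValue(...). The reverse direction uses the same API; a minimal sketch that flattens a Counters object back into a plain map (class and method names here are illustrative):

import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.Counters.Counter;

public class CounterMapSketch {

    // Flattens Counters into groupName -> (counterName -> value).
    static Map<String, Map<String, Long>> toMap(Counters counters) {
        Map<String, Map<String, Long>> result = new HashMap<String, Map<String, Long>>();
        for (String groupName : counters.getGroupNames()) {
            Map<String, Long> group = new HashMap<String, Long>();
            for (Counter counter : counters.getGroup(groupName)) {
                group.put(counter.getName(), counter.getValue());
            }
            result.put(groupName, group);
        }
        return result;
    }
}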

Example 9 with Counters

use of org.apache.hadoop.mapred.Counters in project incubator-systemml by apache.

the class GenTfMtdMR method runJob.

public static long runJob(String inputPath, String txMtdPath, String specWithIDs, String smallestFile, String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numCols, int replication, String headerLine) throws IOException, ClassNotFoundException, InterruptedException {
    JobConf job = new JobConf(GenTfMtdMR.class);
    job.setJobName("GenTfMTD");
    /* Setup MapReduce Job */
    job.setJarByClass(GenTfMtdMR.class);
    // set relevant classes
    job.setMapperClass(GTFMTDMapper.class);
    job.setReducerClass(GTFMTDReducer.class);
    // set input and output properties
    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(DistinctValue.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // delete outputPath, if exists already.
    Path outPath = new Path(txMtdPath);
    FileSystem fs = IOUtilFunctions.getFileSystem(outPath, job);
    fs.delete(outPath, true);
    FileOutputFormat.setOutputPath(job, outPath);
    job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader()));
    job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim());
    if (inputDataProperties.getNAStrings() != null)
        // Adding "dummy" string to handle the case of na_strings = ""
        job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings()));
    job.set(MRJobConfiguration.TF_SPEC, specWithIDs);
    job.set(MRJobConfiguration.TF_SMALLEST_FILE, smallestFile);
    job.setLong(MRJobConfiguration.TF_NUM_COLS, numCols);
    job.set(MRJobConfiguration.TF_HEADER, headerLine);
    job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, txMtdPath);
    // offsets file to store part-file names and offsets for each input split
    job.set(MRJobConfiguration.TF_OFFSETS_FILE, partOffsetsFile);
    // turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);
    // Run the job
    RunningJob runjob = JobClient.runJob(job);
    Counters c = runjob.getCounters();
    long tx_numRows = c.findCounter(MRJobConfiguration.DataTransformCounters.TRANSFORMED_NUM_ROWS).getCounter();
    return tx_numRows;
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystem(org.apache.hadoop.fs.FileSystem) RunningJob(org.apache.hadoop.mapred.RunningJob) Counters(org.apache.hadoop.mapred.Counters) JobConf(org.apache.hadoop.mapred.JobConf)
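
runJob() only reads TRANSFORMED_NUM_ROWS after the job completes; the write side, which is not shown in this excerpt, presumably increments the counter through the old-API Reporter. A hedged sketch of what that looks like in a mapper (the enum is a stand-in mirroring MRJobConfiguration.DataTransformCounters, and the per-row work is elided):

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class RowCountingMapperSketch extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, LongWritable> {

    // Stand-in for MRJobConfiguration.DataTransformCounters (assumed shape).
    public enum DataTransformCounters { TRANSFORMED_NUM_ROWS }

    public void map(LongWritable key, Text value, OutputCollector<Text, LongWritable> output,
            Reporter reporter) throws IOException {
        // ... per-row transformation work would go here ...
        reporter.incrCounter(DataTransformCounters.TRANSFORMED_NUM_ROWS, 1);
    }
}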

Example 10 with Counters

use of org.apache.hadoop.mapred.Counters in project voldemort by voldemort.

the class AbstractHadoopJob method run.

public void run(JobConf conf) throws Exception {
    _runningJob = new JobClient(conf).submitJob(conf);
    info("See " + _runningJob.getTrackingURL() + " for details.");
    _runningJob.waitForCompletion();
    if (!_runningJob.isSuccessful()) {
        throw new Exception("Hadoop job: " + getId() + " failed!");
    }
    // dump all counters
    Counters counters = _runningJob.getCounters();
    for (String groupName : counters.getGroupNames()) {
        Counters.Group group = counters.getGroup(groupName);
        info("Group: " + group.getDisplayName());
        for (Counter counter : group) info(counter.getDisplayName() + ":\t" + counter.getValue());
    }
}
Also used : Counter(org.apache.hadoop.mapred.Counters.Counter) Counters(org.apache.hadoop.mapred.Counters) JobClient(org.apache.hadoop.mapred.JobClient) URISyntaxException(java.net.URISyntaxException) IOException(java.io.IOException)
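
Beyond logging, the old-API Counters can round-trip through a compact escaped string, which is a convenient way to persist a dump like the one above. A minimal sketch, assuming a Counters instance obtained from a finished job:

import java.text.ParseException;

import org.apache.hadoop.mapred.Counters;

public class CounterRoundTripSketch {

    // Serializes all groups and counters into a single escaped line.
    static String save(Counters counters) {
        return counters.makeEscapedCompactString();
    }

    // Rebuilds an equivalent Counters object from that line.
    static Counters load(String compact) throws ParseException {
        return Counters.fromEscapedCompactString(compact);
    }
}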

Aggregations

Counters (org.apache.hadoop.mapred.Counters): 23
RunningJob (org.apache.hadoop.mapred.RunningJob): 14
Path (org.apache.hadoop.fs.Path): 13
JobConf (org.apache.hadoop.mapred.JobConf): 12
FileSystem (org.apache.hadoop.fs.FileSystem): 11
Counter (org.apache.hadoop.mapred.Counters.Counter): 6
JobClient (org.apache.hadoop.mapred.JobClient): 6
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 5
Text (org.apache.hadoop.io.Text): 5
FileStatus (org.apache.hadoop.fs.FileStatus): 4
LineReader (org.apache.hadoop.util.LineReader): 4
BufferedReader (java.io.BufferedReader): 3
DataOutputStream (java.io.DataOutputStream): 3
IOException (java.io.IOException): 3
InputStreamReader (java.io.InputStreamReader): 3
ArrayList (java.util.ArrayList): 3
CommandLine (org.apache.commons.cli.CommandLine): 3
CommandLineParser (org.apache.commons.cli.CommandLineParser): 3
GnuParser (org.apache.commons.cli.GnuParser): 3
HelpFormatter (org.apache.commons.cli.HelpFormatter): 3