Example 36 with Counters

use of org.apache.hadoop.mapreduce.Counters in project mavuno by metzlerd.

the class ProcessStanfordNLP method run.

@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();
    // required parameters
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.CorpusClass", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.OutputPath", conf);
    // optional parameters
    String suTime = MavunoUtils.getOptionalParam("Mavuno.ProcessStanfordNLP.UseSUTime", conf);
    String textOutput = MavunoUtils.getOptionalParam("Mavuno.ProcessStanfordNLP.TextOutputFormat", conf);
    sLogger.info("Tool name: ProcessStanfordNLP");
    sLogger.info(" - Input path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Output path: " + outputPath);
    if (suTime != null && Boolean.parseBoolean(suTime)) {
        sLogger.info("- SUTime enabled");
    }
    boolean textOutputFormat = false;
    if (textOutput != null && Boolean.parseBoolean(textOutput)) {
        sLogger.info("- Text output format enabled");
        textOutputFormat = true;
    }
    Job job = new Job(conf);
    job.setJobName("ProcessStanfordNLP");
    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    // output format -- either plain text or sequencefile (default)
    if (textOutputFormat) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    }
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StanfordParsedDocument.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StanfordParsedDocument.class);
    job.setMapperClass(MyMapper.class);
    job.setJarByClass(ProcessStanfordNLP.class);
    // no reducers needed
    job.setNumReduceTasks(0);
    // run job
    boolean success = job.waitForCompletion(true);
    // print job statistics
    Counters counters = job.getCounters();
    sLogger.info(" - Total documents: " + counters.findCounter(MyCounters.TOTAL_DOCUMENTS).getValue());
    sLogger.info(" - Total sentences: " + counters.findCounter(MyCounters.TOTAL_SENTENCES).getValue());
    sLogger.info(" - Total tokens: " + counters.findCounter(MyCounters.TOTAL_TOKENS).getValue());
    return success ? 0 : 1;
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) Counters(org.apache.hadoop.mapreduce.Counters) Job(org.apache.hadoop.mapreduce.Job)
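
A note on where these counts come from: the MyCounters values printed above are incremented inside the map tasks. Below is a minimal, hypothetical sketch of such a mapper; it is not Mavuno's actual MyMapper, and the CountingMapper name and enum placement are assumptions, but the counter names match those read back via findCounter() above.

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CountingMapper extends Mapper<Text, Text, Text, Text> {

    // Assumed enum; the names mirror the counters the driver prints.
    public enum MyCounters { TOTAL_DOCUMENTS, TOTAL_SENTENCES, TOTAL_TOKENS }

    @Override
    protected void map(Text docId, Text doc, Context context) throws IOException, InterruptedException {
        // Each increment is aggregated by the framework across all map
        // tasks into the job-level Counters object read by the driver.
        context.getCounter(MyCounters.TOTAL_DOCUMENTS).increment(1);
        context.write(docId, doc);
    }
}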

Example 37 with Counters

use of org.apache.hadoop.mapreduce.Counters in project mavuno by metzlerd.

the class TratzParse method run.

@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.Parse.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.Parse.CorpusClass", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.Parse.OutputPath", conf);
    // optional parameter that allows the parsed documents to be output in text format
    String textOutput = MavunoUtils.getOptionalParam("Mavuno.Parse.TextOutputFormat", conf);
    boolean textOutputFormat = false;
    if (textOutput != null && Boolean.parseBoolean(textOutput)) {
        textOutputFormat = true;
    }
    sLogger.info("Tool name: TratzParse");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Output path: " + outputPath);
    Job job = new Job(conf);
    job.setJobName("TratzParse");
    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    // output format -- either plain text or sequencefile (default)
    if (textOutputFormat) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    }
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(TratzParsedDocument.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TratzParsedDocument.class);
    job.setMapperClass(MyMapper.class);
    job.setJarByClass(TratzParse.class);
    // no reducers needed
    job.setNumReduceTasks(0);
    // run job
    boolean success = job.waitForCompletion(true);
    // print job statistics
    Counters counters = job.getCounters();
    sLogger.info(" - Total documents: " + counters.findCounter(StatCounters.TOTAL_DOCUMENTS).getValue());
    sLogger.info(" - Total sentences: " + counters.findCounter(StatCounters.TOTAL_SENTENCES).getValue());
    sLogger.info(" - Total tokens: " + counters.findCounter(StatCounters.TOTAL_TOKENS).getValue());
    sLogger.info(" - Total dropped sentences: " + counters.findCounter(StatCounters.TOTAL_DROPPED_SENTENCES).getValue());
    sLogger.info(" - Total tokenization time (ms): " + counters.findCounter(StatCounters.TOKENIZE_TIME).getValue());
    sLogger.info(" - Total POS tagging time (ms): " + counters.findCounter(StatCounters.POSTAG_TIME).getValue());
    sLogger.info(" - Total chunking time (ms): " + counters.findCounter(StatCounters.CHUNK_TIME).getValue());
    sLogger.info(" - Total named entity tagging time (ms): " + counters.findCounter(StatCounters.NETAG_TIME).getValue());
    sLogger.info(" - Total parse time (ms): " + counters.findCounter(StatCounters.PARSE_TIME).getValue());
    return success ? 0 : 1;
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) Counters(org.apache.hadoop.mapreduce.Counters) Job(org.apache.hadoop.mapreduce.Job)
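
TratzParse additionally tracks per-stage wall-clock time. A hedged sketch of that accumulation pattern follows; the StatCounters enum shown here and the timeStage helper are assumptions, but the stage names match the counters the driver prints above.

import org.apache.hadoop.mapreduce.TaskAttemptContext;

enum StatCounters { TOKENIZE_TIME, POSTAG_TIME, CHUNK_TIME, NETAG_TIME, PARSE_TIME }

// Hypothetical helper: run one pipeline stage and add its elapsed
// milliseconds to the matching counter.
static void timeStage(TaskAttemptContext context, StatCounters stage, Runnable work) {
    long start = System.currentTimeMillis();
    work.run();
    context.getCounter(stage).increment(System.currentTimeMillis() - start);
}

// Usage inside map(), e.g.:
// timeStage(context, StatCounters.PARSE_TIME, () -> parser.parse(sentence));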

Example 38 with Counters

use of org.apache.hadoop.mapreduce.Counters in project hadoop by apache.

the class TaskAttemptImpl method initTaskAttemptStatus.

private void initTaskAttemptStatus(TaskAttemptStatus result) {
    result.progress = 0.0f;
    result.phase = Phase.STARTING;
    result.stateString = "NEW";
    result.taskState = TaskAttemptState.NEW;
    Counters counters = EMPTY_COUNTERS;
    result.counters = counters;
}
Also used : Counters(org.apache.hadoop.mapreduce.Counters)
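
For context (an assumption, since the field declaration is not shown here): EMPTY_COUNTERS is simply a shared, freshly constructed Counters instance. Reading from an empty Counters object is safe because findCounter() lazily creates a zero-valued counter instead of returning null:

// TaskCounter is org.apache.hadoop.mapreduce.TaskCounter.
Counters empty = new Counters();
// Created on demand with value 0 -- no null check needed.
long cpuMs = empty.findCounter(TaskCounter.CPU_MILLISECONDS).getValue();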

Example 39 with Counters

use of org.apache.hadoop.mapreduce.Counters in project hadoop by apache.

the class JobImpl method getAllCounters.

@Override
public Counters getAllCounters() {
    readLock.lock();
    try {
        JobStateInternal state = getInternalState();
        if (state == JobStateInternal.ERROR || state == JobStateInternal.FAILED || state == JobStateInternal.KILLED || state == JobStateInternal.SUCCEEDED) {
            this.mayBeConstructFinalFullCounters();
            return fullCounters;
        }
        Counters counters = new Counters();
        counters.incrAllCounters(jobCounters);
        return incrTaskCounters(counters, tasks.values());
    } finally {
        readLock.unlock();
    }
}
Also used : JobStateInternal(org.apache.hadoop.mapreduce.v2.app.job.JobStateInternal) Counters(org.apache.hadoop.mapreduce.Counters)
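
The incrTaskCounters() call at the end folds every task's counters into the snapshot. A sketch of the underlying merge pattern, assuming each v2 Task exposes getCounters():

Counters total = new Counters();
total.incrAllCounters(jobCounters);             // job-level counters first
for (Task task : tasks.values()) {
    // incrAllCounters() adds every counter in the argument, group by
    // group, into the receiver.
    total.incrAllCounters(task.getCounters());
}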

Example 40 with Counters

use of org.apache.hadoop.mapreduce.Counters in project hadoop by apache.

the class TaskAttemptImpl method updateProgressSplits.

private void updateProgressSplits() {
    double newProgress = reportedStatus.progress;
    newProgress = Math.max(Math.min(newProgress, 1.0D), 0.0D);
    Counters counters = reportedStatus.counters;
    if (counters == null)
        return;
    WrappedProgressSplitsBlock splitsBlock = getProgressSplitBlock();
    if (splitsBlock != null) {
        long now = clock.getTime();
        // TODO Ensure not 0
        long start = getLaunchTime();
        if (start != 0 && now - start <= Integer.MAX_VALUE) {
            splitsBlock.getProgressWallclockTime().extend(newProgress, (int) (now - start));
        }
        Counter cpuCounter = counters.findCounter(TaskCounter.CPU_MILLISECONDS);
        if (cpuCounter != null && cpuCounter.getValue() <= Integer.MAX_VALUE) {
            // long to int? TODO: FIX. Same below
            splitsBlock.getProgressCPUTime().extend(newProgress, (int) cpuCounter.getValue());
        }
        Counter virtualBytes = counters.findCounter(TaskCounter.VIRTUAL_MEMORY_BYTES);
        if (virtualBytes != null) {
            splitsBlock.getProgressVirtualMemoryKbytes().extend(newProgress, (int) (virtualBytes.getValue() / (MEMORY_SPLITS_RESOLUTION)));
        }
        Counter physicalBytes = counters.findCounter(TaskCounter.PHYSICAL_MEMORY_BYTES);
        if (physicalBytes != null) {
            splitsBlock.getProgressPhysicalMemoryKbytes().extend(newProgress, (int) (physicalBytes.getValue() / (MEMORY_SPLITS_RESOLUTION)));
        }
    }
}
Also used : JobCounter(org.apache.hadoop.mapreduce.JobCounter) Counter(org.apache.hadoop.mapreduce.Counter) TaskCounter(org.apache.hadoop.mapreduce.TaskCounter) Counters(org.apache.hadoop.mapreduce.Counters) WrappedProgressSplitsBlock(org.apache.hadoop.mapred.WrappedProgressSplitsBlock)
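
The TaskCounter values consulted above are also available to ordinary clients once a job finishes. A minimal client-side sketch using the standard mapreduce API (the job variable is assumed to be a completed org.apache.hadoop.mapreduce.Job):

Counters counters = job.getCounters();
long cpuMs     = counters.findCounter(TaskCounter.CPU_MILLISECONDS).getValue();
long physBytes = counters.findCounter(TaskCounter.PHYSICAL_MEMORY_BYTES).getValue();
long virtBytes = counters.findCounter(TaskCounter.VIRTUAL_MEMORY_BYTES).getValue();
System.out.println("cpu=" + cpuMs + "ms physical=" + physBytes + "B virtual=" + virtBytes + "B");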

Aggregations

Counters (org.apache.hadoop.mapreduce.Counters): 72
Test (org.junit.Test): 24
Job (org.apache.hadoop.mapreduce.Job): 21
Path (org.apache.hadoop.fs.Path): 14
Configuration (org.apache.hadoop.conf.Configuration): 13
Counter (org.apache.hadoop.mapreduce.Counter): 11
Task (org.apache.hadoop.mapreduce.v2.app.job.Task): 8
TaskId (org.apache.hadoop.mapreduce.v2.api.records.TaskId): 7
PhoenixScrutinyJobCounters (org.apache.phoenix.mapreduce.index.PhoenixScrutinyJobCounters): 7
BaseTest (org.apache.phoenix.query.BaseTest): 7
IOException (java.io.IOException): 6
TaskAttemptId (org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId): 6
TaskAttempt (org.apache.hadoop.mapreduce.v2.app.job.TaskAttempt): 6
YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration): 6
HdfsConfiguration (org.apache.hadoop.hdfs.HdfsConfiguration): 5
TableName (org.apache.hadoop.hbase.TableName): 4
JobId (org.apache.hadoop.mapreduce.v2.api.records.JobId): 4
File (java.io.File): 3
URI (java.net.URI): 3
FileSystem (org.apache.hadoop.fs.FileSystem): 3