Use of org.apache.hadoop.mapreduce.Counters in project mavuno by metzlerd.
The ProcessStanfordNLP class, method run.
@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
Configuration conf = getConf();
// required parameters
String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.CorpusPath", conf);
String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.CorpusClass", conf);
String outputPath = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.OutputPath", conf);
// optional parameters
String suTime = MavunoUtils.getOptionalParam("Mavuno.ProcessStanfordNLP.UseSUTime", conf);
String textOutput = MavunoUtils.getOptionalParam("Mavuno.ProcessStanfordNLP.TextOutputFormat", conf);
sLogger.info("Tool name: ProcessStanfordNLP");
sLogger.info(" - Input path: " + corpusPath);
sLogger.info(" - Corpus class: " + corpusClass);
sLogger.info(" - Output path: " + outputPath);
if (suTime != null && Boolean.parseBoolean(suTime)) {
sLogger.info("- SUTime enabled");
}
boolean textOutputFormat = false;
if (textOutput != null && Boolean.parseBoolean(textOutput)) {
sLogger.info("- Text output format enabled");
textOutputFormat = true;
}
Job job = new Job(conf);
job.setJobName("ProcessStanfordNLP");
MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
FileOutputFormat.setOutputPath(job, new Path(outputPath));
job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
// output format -- either plain text or sequencefile (default)
if (textOutputFormat) {
job.setOutputFormatClass(TextOutputFormat.class);
} else {
job.setOutputFormatClass(SequenceFileOutputFormat.class);
FileOutputFormat.setCompressOutput(job, true);
SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
}
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(StanfordParsedDocument.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(StanfordParsedDocument.class);
job.setMapperClass(MyMapper.class);
job.setJarByClass(ProcessStanfordNLP.class);
// no reducers needed
job.setNumReduceTasks(0);
// run job
job.waitForCompletion(true);
// print job statistics
Counters counters = job.getCounters();
sLogger.info(" - Total documents: " + counters.findCounter(MyCounters.TOTAL_DOCUMENTS).getValue());
sLogger.info(" - Total sentences: " + counters.findCounter(MyCounters.TOTAL_SENTENCES).getValue());
sLogger.info(" - Total tokens: " + counters.findCounter(MyCounters.TOTAL_TOKENS).getValue());
return 0;
}
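The counter values read back by findCounter() here are presumably incremented inside MyMapper as each document is processed. A minimal sketch of what that counter enum and the map-side updates might look like; MyCounters' declaration, the mapper's input/output types, and the sentence/token counts are assumptions based on the names above, not the actual mavuno source:

    // Hypothetical sketch; mirrors the names used in run() above, not the real mavuno code.
    import java.io.IOException;

    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    public class CounterSketch {

        // enum-based counters are reported under a group named after the enum class
        public enum MyCounters {
            TOTAL_DOCUMENTS, TOTAL_SENTENCES, TOTAL_TOKENS
        }

        // input/output types are placeholders; the real mapper consumes whatever the
        // configured corpus InputFormat produces and emits StanfordParsedDocument values
        public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
            @Override
            protected void map(LongWritable key, Text doc, Context context)
                    throws IOException, InterruptedException {
                int numSentences = 0; // would come from the Stanford pipeline output
                int numTokens = 0;
                // per-task counter updates; the framework aggregates them across all tasks,
                // which is what job.getCounters() returns to run() after completion
                context.getCounter(MyCounters.TOTAL_DOCUMENTS).increment(1);
                context.getCounter(MyCounters.TOTAL_SENTENCES).increment(numSentences);
                context.getCounter(MyCounters.TOTAL_TOKENS).increment(numTokens);
                context.write(new Text("doc-id"), doc);
            }
        }
    }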
Use of org.apache.hadoop.mapreduce.Counters in project mavuno by metzlerd.
The TratzParse class, method run.
@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
Configuration conf = getConf();
String corpusPath = MavunoUtils.getRequiredParam("Mavuno.Parse.CorpusPath", conf);
String corpusClass = MavunoUtils.getRequiredParam("Mavuno.Parse.CorpusClass", conf);
String outputPath = MavunoUtils.getRequiredParam("Mavuno.Parse.OutputPath", conf);
// optional parameter that allows the parsed documents to be output in text format
String textOutput = MavunoUtils.getOptionalParam("Mavuno.Parse.TextOutputFormat", conf);
boolean textOutputFormat = false;
if (textOutput != null && Boolean.parseBoolean(textOutput)) {
textOutputFormat = true;
}
sLogger.info("Tool name: TratzParse");
sLogger.info(" - Corpus path: " + corpusPath);
sLogger.info(" - Corpus class: " + corpusClass);
sLogger.info(" - Output path: " + outputPath);
Job job = new Job(conf);
job.setJobName("TratzParse");
MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
FileOutputFormat.setOutputPath(job, new Path(outputPath));
job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
// output format -- either plain text or sequencefile (default)
if (textOutputFormat) {
job.setOutputFormatClass(TextOutputFormat.class);
} else {
job.setOutputFormatClass(SequenceFileOutputFormat.class);
FileOutputFormat.setCompressOutput(job, true);
SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
}
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(TratzParsedDocument.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(TratzParsedDocument.class);
job.setMapperClass(MyMapper.class);
job.setJarByClass(TratzParse.class);
// no reducers needed
job.setNumReduceTasks(0);
// run job
job.waitForCompletion(true);
// print job statistics
Counters counters = job.getCounters();
sLogger.info(" - Total documents: " + counters.findCounter(StatCounters.TOTAL_DOCUMENTS).getValue());
sLogger.info(" - Total sentences: " + counters.findCounter(StatCounters.TOTAL_SENTENCES).getValue());
sLogger.info(" - Total tokens: " + counters.findCounter(StatCounters.TOTAL_TOKENS).getValue());
sLogger.info(" - Total dropped sentences: " + counters.findCounter(StatCounters.TOTAL_DROPPED_SENTENCES).getValue());
sLogger.info(" - Total tokenization time (ms): " + counters.findCounter(StatCounters.TOKENIZE_TIME).getValue());
sLogger.info(" - Total POS tagging time (ms): " + counters.findCounter(StatCounters.POSTAG_TIME).getValue());
sLogger.info(" - Total chunking time (ms): " + counters.findCounter(StatCounters.CHUNK_TIME).getValue());
sLogger.info(" - Total named entity tagging time (ms): " + counters.findCounter(StatCounters.NETAG_TIME).getValue());
sLogger.info(" - Total parse time (ms): " + counters.findCounter(StatCounters.PARSE_TIME).getValue());
return 0;
}
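Both drivers read their parameters out of the Hadoop Configuration, so a typical launch sets the Mavuno.Parse.* keys before running the tool. A hedged sketch of such a driver, assuming TratzParse implements org.apache.hadoop.util.Tool with a no-arg constructor (as the getConf()/run() pattern above suggests); the paths and the input-format class name are placeholders:

    // Hypothetical driver-side usage; TratzParse is assumed to be on the classpath and
    // to implement Tool. Parameter values below are placeholders, not defaults.
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.util.ToolRunner;

    public class RunTratzParse {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            conf.set("Mavuno.Parse.CorpusPath", "/data/corpus");      // required
            conf.set("Mavuno.Parse.CorpusClass",
                     "org.apache.hadoop.mapreduce.lib.input.TextInputFormat"); // required
            conf.set("Mavuno.Parse.OutputPath", "/data/parsed");      // required
            conf.setBoolean("Mavuno.Parse.TextOutputFormat", true);   // optional
            int exitCode = ToolRunner.run(conf, new TratzParse(), args);
            System.exit(exitCode);
        }
    }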
Use of org.apache.hadoop.mapreduce.Counters in project hadoop by apache.
The TaskAttemptImpl class, method initTaskAttemptStatus.
private void initTaskAttemptStatus(TaskAttemptStatus result) {
    result.progress = 0.0f;
    result.phase = Phase.STARTING;
    result.stateString = "NEW";
    result.taskState = TaskAttemptState.NEW;
    Counters counters = EMPTY_COUNTERS;
    result.counters = counters;
}
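EMPTY_COUNTERS is used here so that a freshly created attempt reports a non-null counter set before any real counters arrive. A minimal sketch of such a shared sentinel; whether TaskAttemptImpl declares the field exactly this way is an assumption:

    // Hypothetical sketch of a shared empty-counters sentinel.
    import org.apache.hadoop.mapreduce.Counters;

    public class EmptyCountersSketch {
        // a single shared, effectively read-only instance avoids allocating a
        // Counters object for every new task attempt
        static final Counters EMPTY_COUNTERS = new Counters();
    }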
Use of org.apache.hadoop.mapreduce.Counters in project hadoop by apache.
The JobImpl class, method getAllCounters.
@Override
public Counters getAllCounters() {
    readLock.lock();
    try {
        JobStateInternal state = getInternalState();
        if (state == JobStateInternal.ERROR || state == JobStateInternal.FAILED
                || state == JobStateInternal.KILLED || state == JobStateInternal.SUCCEEDED) {
            this.mayBeConstructFinalFullCounters();
            return fullCounters;
        }
        Counters counters = new Counters();
        counters.incrAllCounters(jobCounters);
        return incrTaskCounters(counters, tasks.values());
    } finally {
        readLock.unlock();
    }
}
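For a job that is still running, incrTaskCounters folds the live counters of every task into the snapshot seeded from the job-level counters. A sketch of what that helper does, inferred from the call above; the exact signature in JobImpl is an assumption:

    // Hypothetical sketch of the aggregation helper; Task here is the application
    // master's org.apache.hadoop.mapreduce.v2.app.job.Task interface.
    import java.util.Collection;

    import org.apache.hadoop.mapreduce.Counters;
    import org.apache.hadoop.mapreduce.v2.app.job.Task;

    public class IncrTaskCountersSketch {
        public static Counters incrTaskCounters(Counters counters, Collection<Task> tasks) {
            // add each task's current counters into the snapshot that getAllCounters()
            // built from the job-level counters
            for (Task task : tasks) {
                counters.incrAllCounters(task.getCounters());
            }
            return counters;
        }
    }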
Use of org.apache.hadoop.mapreduce.Counters in project hadoop by apache.
The TaskAttemptImpl class, method updateProgressSplits.
private void updateProgressSplits() {
    double newProgress = reportedStatus.progress;
    newProgress = Math.max(Math.min(newProgress, 1.0D), 0.0D);
    Counters counters = reportedStatus.counters;
    if (counters == null)
        return;

    WrappedProgressSplitsBlock splitsBlock = getProgressSplitBlock();
    if (splitsBlock != null) {
        long now = clock.getTime();
        // TODO Ensure not 0
        long start = getLaunchTime();
        if (start != 0 && now - start <= Integer.MAX_VALUE) {
            splitsBlock.getProgressWallclockTime().extend(newProgress, (int) (now - start));
        }

        Counter cpuCounter = counters.findCounter(TaskCounter.CPU_MILLISECONDS);
        if (cpuCounter != null && cpuCounter.getValue() <= Integer.MAX_VALUE) {
            splitsBlock.getProgressCPUTime().extend(newProgress, // long to int? TODO: FIX. Same below
                (int) cpuCounter.getValue());
        }

        Counter virtualBytes = counters.findCounter(TaskCounter.VIRTUAL_MEMORY_BYTES);
        if (virtualBytes != null) {
            splitsBlock.getProgressVirtualMemoryKbytes().extend(newProgress,
                (int) (virtualBytes.getValue() / (MEMORY_SPLITS_RESOLUTION)));
        }

        Counter physicalBytes = counters.findCounter(TaskCounter.PHYSICAL_MEMORY_BYTES);
        if (physicalBytes != null) {
            splitsBlock.getProgressPhysicalMemoryKbytes().extend(newProgress,
                (int) (physicalBytes.getValue() / (MEMORY_SPLITS_RESOLUTION)));
        }
    }
}
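The long-to-int narrowing flagged by the TODO above can silently wrap once a counter exceeds Integer.MAX_VALUE (about 24.8 days of CPU milliseconds). A hedged sketch of a saturating conversion the splits code could use instead; this is an illustration only, not part of Hadoop and not a proposed patch:

    // Hypothetical helper illustrating a saturating long-to-int conversion for
    // counter values like the ones narrowed above.
    public class SaturatingCast {
        static int toIntSaturated(long value) {
            if (value > Integer.MAX_VALUE) {
                return Integer.MAX_VALUE;
            }
            if (value < Integer.MIN_VALUE) {
                return Integer.MIN_VALUE;
            }
            return (int) value;
        }

        public static void main(String[] args) {
            // a plain (int) cast of this value would wrap to a negative number
            System.out.println(toIntSaturated(3_000_000_000L)); // prints 2147483647
        }
    }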