use of org.apache.hadoop.mapred.JobConf in project nutch-elasticsearch-indexer by ctjmorgan.
the class ElasticsearchIndexer method indexElasticsearch.
public void indexElasticsearch(String elasticsearchUrl, String elasticsearchPort, Path crawlDb, Path linkDb, List<Path> segments) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("ElasticsearchIndexer: starting at " + sdf.format(start));
    final JobConf job = new NutchJob(getConf());
    job.setJobName("index-elasticsearch " + elasticsearchUrl);
    IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);
    job.set(ElasticsearchConstants.SERVER_URL, elasticsearchUrl);
    job.set(ElasticsearchConstants.SERVER_PORT, elasticsearchPort);
    NutchIndexWriterFactory.addClassToConf(job, ElasticsearchWriter.class);
    job.setReduceSpeculativeExecution(false);
    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());
    FileOutputFormat.setOutputPath(job, tmp);
    try {
        // run the job; the records are written to Elasticsearch via the REST API
        JobClient.runJob(job);
        long end = System.currentTimeMillis();
        LOG.info("ElasticsearchIndexer: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
    } catch (Exception e) {
        LOG.error(e);
    } finally {
        FileSystem.get(job).delete(tmp, true);
    }
}
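A minimal sketch of how this method might be invoked (the NutchConfiguration bootstrap, the paths, and the host/port values are assumptions, not shown in the source, and the snippet assumes ElasticsearchIndexer extends Configured):

// Hypothetical driver code, not part of the original project excerpt
ElasticsearchIndexer indexer = new ElasticsearchIndexer();
indexer.setConf(NutchConfiguration.create());
Path crawlDb = new Path("crawl/crawldb");
Path linkDb = new Path("crawl/linkdb");
List<Path> segments = Collections.singletonList(new Path("crawl/segments/20130101000000"));
indexer.indexElasticsearch("localhost", "9200", crawlDb, linkDb, segments);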
use of org.apache.hadoop.mapred.JobConf in project hadoop-book by elephantscale.
the class MultiFileWordCount method run.
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        printUsage();
        return 1;
    }
    JobConf job = new JobConf(getConf(), MultiFileWordCount.class);
    job.setJobName("MultiFileWordCount");
    // set the InputFormat of the job to our InputFormat
    job.setInputFormat(MyInputFormat.class);
    // the keys are words (strings)
    job.setOutputKeyClass(Text.class);
    // the values are counts (longs)
    job.setOutputValueClass(LongWritable.class);
    // use the defined mapper
    job.setMapperClass(MapClass.class);
    // use LongSumReducer as both combiner and reducer
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);
    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    JobClient.runJob(job);
    return 0;
}
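The snippet above is only the Tool.run implementation; a conventional entry point (a sketch, assuming MultiFileWordCount implements Tool, which the JobConf(getConf(), ...) pattern suggests) would look like:

// Hypothetical entry point, not shown in the source excerpt
public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new MultiFileWordCount(), args);
    System.exit(res);
}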
use of org.apache.hadoop.mapred.JobConf in project hadoop-book by elephantscale.
the class AggregateWordCount method main.
/**
 * The main driver for the word count map/reduce program. Invoke this method to
 * submit the map/reduce job.
 *
 * @throws IOException when there are communication problems with the
 * JobTracker.
 */
@SuppressWarnings("unchecked")
public static void main(String[] args) throws IOException {
    JobConf conf = ValueAggregatorJob.createValueAggregatorJob(args, new Class[] { WordCountPlugInClass.class });
    JobClient.runJob(conf);
}
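ValueAggregatorJob parses the command-line arguments itself, so the driver stays this short. A sketch of an invocation, assuming the argument layout used by the mapred aggregate framework (inputDirs outDir [numOfReducer [textinputformat|seq [specfile [jobName]]]]; worth verifying against the Hadoop version in use):

// Hypothetical invocation with illustrative paths
String[] args = { "in-dir", "out-dir", "2", "textinputformat" };
AggregateWordCount.main(args);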
use of org.apache.hadoop.mapred.JobConf in project hadoop-book by elephantscale.
the class TeraGen method run.
/**
 * @param args the cli arguments: the number of rows to generate and the
 * output directory
 */
public int run(String[] args) throws IOException {
    JobConf job = (JobConf) getConf();
    setNumberOfRows(job, Long.parseLong(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraGen");
    job.setJarByClass(TeraGen.class);
    job.setMapperClass(SortGenMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormat(RangeInputFormat.class);
    job.setOutputFormat(TeraOutputFormat.class);
    JobClient.runJob(job);
    return 0;
}
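TeraGen is likewise driven through ToolRunner; a sketch of the entry point, mirroring the TeraValidate main below (an assumption, since it is not shown in this excerpt):

// Hypothetical entry point, paralleling TeraValidate
public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new JobConf(), new TeraGen(), args);
    System.exit(res);
}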
use of org.apache.hadoop.mapred.JobConf in project hadoop-book by elephantscale.
the class TeraValidate method main.
/**
 * @param args the cli arguments: the TeraSort output directory to validate
 * and a directory for the validation report
 */
public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new JobConf(), new TeraValidate(), args);
    System.exit(res);
}
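Note that ToolRunner.run hands the JobConf to the Tool through Configurable.setConf before invoking run(), which is why TeraGen's run() above can recover it with (JobConf) getConf().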