Use of org.apache.nutch.util.NutchJob in project nutch-elasticsearch-indexer by ctjmorgan.
Class ElasticsearchIndexer, method indexElasticsearch.
/**
 * Runs the indexing MapReduce job that sends documents to Elasticsearch.
 *
 * <p>The job is initialised through {@code IndexerMapReduce.initMRJob} with the
 * given crawl db, link db and segments, and {@code ElasticsearchWriter} is
 * registered as the index writer. A uniquely named temporary output path is set
 * on the job and is always deleted when the method finishes, whether the job
 * succeeded or not.
 *
 * @param elasticsearchUrl  value stored under {@code ElasticsearchConstants.SERVER_URL}
 * @param elasticsearchPort value stored under {@code ElasticsearchConstants.SERVER_PORT}
 * @param crawlDb           crawl db path passed to the indexer job
 * @param linkDb            link db path passed to the indexer job
 * @param segments          segment paths passed to the indexer job
 * @throws IOException if the job fails or the temporary output cannot be deleted
 */
public void indexElasticsearch(String elasticsearchUrl, String elasticsearchPort, Path crawlDb, Path linkDb, List<Path> segments) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("ElasticsearchIndexer: starting at " + sdf.format(start));

    final JobConf job = new NutchJob(getConf());
    job.setJobName("index-elasticsearch " + elasticsearchUrl);
    IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);
    job.set(ElasticsearchConstants.SERVER_URL, elasticsearchUrl);
    job.set(ElasticsearchConstants.SERVER_PORT, elasticsearchPort);
    NutchIndexWriterFactory.addClassToConf(job, ElasticsearchWriter.class);
    // NOTE(review): presumably disabled so that speculative duplicate reducers
    // do not index the same documents twice - confirm.
    job.setReduceSpeculativeExecution(false);

    // The job requires an output path; it is only a scratch location here and
    // is removed in the finally block below.
    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());
    FileOutputFormat.setOutputPath(job, tmp);
    try {
        // Run the job; the records are handed to the registered ElasticsearchWriter.
        JobClient.runJob(job);
        long end = System.currentTimeMillis();
        LOG.info("ElasticsearchIndexer: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
    } catch (IOException e) {
        // Log with the stack trace, then propagate so callers can detect the
        // failure instead of seeing a normal return.
        LOG.error("ElasticsearchIndexer: " + e, e);
        throw e;
    } finally {
        FileSystem.get(job).delete(tmp, true);
    }
}
Aggregations