use of us.codecraft.webmagic.pipeline.Pipeline in project webmagic by code4craft.
the class ScriptConsole method startSpider.
private static void startSpider(Params params) {
ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(params.getLanguage()).scriptFromFile(params.getScriptFileName()).thread(params.getThread()).build();
pageProcessor.getSite().setSleepTime(params.getSleepTime());
pageProcessor.getSite().setRetryTimes(3);
pageProcessor.getSite().setAcceptStatCode(WMCollections.<Integer>newHashSet(200, 404, 403, 500, 502));
Spider spider = Spider.create(pageProcessor).thread(params.getThread());
spider.clearPipeline().addPipeline(new Pipeline() {
@Override
public void process(ResultItems resultItems, Task task) {
}
});
if (params.getUrls() == null || params.getUrls().size() == 0) {
System.err.println("Need at least one argument");
System.out.println("Usage: java -jar webmagic.jar [-l language] -f script file [-t threadnum] [-s sleep time] url1 [url2 url3]");
System.exit(-1);
}
for (String url : params.getUrls()) {
spider.addUrl(url);
}
spider.run();
}
use of us.codecraft.webmagic.pipeline.Pipeline in project webmagic by code4craft.
the class Spider method processRequest.
protected void processRequest(Request request) {
Page page = downloader.download(request, this);
if (page == null) {
sleep(site.getSleepTime());
onError(request);
return;
}
// for cycle retry
if (page.isNeedCycleRetry()) {
extractAndAddRequests(page, true);
sleep(site.getRetrySleepTime());
return;
}
pageProcessor.process(page);
extractAndAddRequests(page, spawnUrl);
if (!page.getResultItems().isSkip()) {
for (Pipeline pipeline : pipelines) {
pipeline.process(page.getResultItems(), this);
}
}
//for proxy status management
request.putExtra(Request.STATUS_CODE, page.getStatusCode());
sleep(site.getSleepTime());
}
use of us.codecraft.webmagic.pipeline.Pipeline in project webmagic by code4craft.
the class Spider method onDownloadSuccess.
private void onDownloadSuccess(Request request, Page page) {
if (site.getAcceptStatCode().contains(page.getStatusCode())) {
pageProcessor.process(page);
extractAndAddRequests(page, spawnUrl);
if (!page.getResultItems().isSkip()) {
for (Pipeline pipeline : pipelines) {
pipeline.process(page.getResultItems(), this);
}
}
} else {
logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
}
sleep(site.getSleepTime());
return;
}
use of us.codecraft.webmagic.pipeline.Pipeline in project webmagic by code4craft.
the class SpiderTest method testStartAndStop.
@Ignore("long time")
@Test
public void testStartAndStop() throws InterruptedException {
Spider spider = Spider.create(new SimplePageProcessor("http://www.oschina.net/*")).addPipeline(new Pipeline() {
@Override
public void process(ResultItems resultItems, Task task) {
System.out.println(1);
}
}).thread(1).addUrl("http://www.oschina.net/");
spider.start();
Thread.sleep(10000);
spider.stop();
Thread.sleep(10000);
spider.start();
Thread.sleep(10000);
}
use of us.codecraft.webmagic.pipeline.Pipeline in project webmagic by code4craft.
the class Spider method close.
public void close() {
destroyEach(downloader);
destroyEach(pageProcessor);
destroyEach(scheduler);
for (Pipeline pipeline : pipelines) {
destroyEach(pipeline);
}
threadPool.shutdown();
}
Aggregations