Use of us.codecraft.webmagic.Task in project webmagic by code4craft.
Class ScriptConsole, method startSpider.
/**
 * Builds a script-driven spider from the command-line parameters and runs it.
 *
 * <p>The script processor is configured with the language, script file and thread
 * count from {@code params}; its site gets the requested sleep time, three retries
 * and a fixed set of accepted HTTP status codes. The spider's pipelines are replaced
 * by a single pipeline that does nothing. If no URL was supplied, a usage message is
 * printed and the JVM exits with status {@code -1}.
 *
 * @param params parsed command-line parameters
 */
private static void startSpider(Params params) {
    ScriptProcessor scriptProcessor = ScriptProcessorBuilder.custom()
            .language(params.getLanguage())
            .scriptFromFile(params.getScriptFileName())
            .thread(params.getThread())
            .build();
    scriptProcessor.getSite().setSleepTime(params.getSleepTime());
    scriptProcessor.getSite().setRetryTimes(3);
    scriptProcessor.getSite().setAcceptStatCode(WMCollections.<Integer>newHashSet(200, 404, 403, 500, 502));

    Spider crawler = Spider.create(scriptProcessor).thread(params.getThread());

    // Swap whatever pipelines are installed for one that discards every result.
    Pipeline discardingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
        }
    };
    crawler.clearPipeline().addPipeline(discardingPipeline);

    boolean noUrlsGiven = params.getUrls() == null || params.getUrls().size() == 0;
    if (noUrlsGiven) {
        System.err.println("Need at least one argument");
        System.out.println("Usage: java -jar webmagic.jar [-l language] -f script file [-t threadnum] [-s sleep time] url1 [url2 url3]");
        System.exit(-1);
    }

    for (String startUrl : params.getUrls()) {
        crawler.addUrl(startUrl);
    }
    crawler.run();
}
Use of us.codecraft.webmagic.Task in project webmagic by code4craft.
Class Kr36NewsModel, method main.
/**
 * Benchmark entry point: starts a 20-thread spider on http://www.36kr.com/ with no
 * sleep between requests, discards every extracted model, and registers the spider
 * with the {@link SpiderMonitor}.
 *
 * @param args unused
 * @throws IOException declared by the original signature
 * @throws JMException declared by the original signature
 */
public static void main(String[] args) throws IOException, JMException {
    // Just for benchmark
    Site site = Site.me().addStartUrl("http://www.36kr.com/").setSleepTime(0);

    // Extracted objects are intentionally thrown away.
    PageModelPipeline discardingPipeline = new PageModelPipeline() {
        @Override
        public void process(Object o, Task task) {
        }
    };

    Spider spider = OOSpider.create(site, discardingPipeline, Kr36NewsModel.class).thread(20);
    spider.start();

    SpiderMonitor.instance().register(spider);
}
Use of us.codecraft.webmagic.Task in project webmagic by code4craft.
Class SeleniumDownloaderTest, method test.
/**
 * Downloads http://huaban.com/ 100 times through a {@link SeleniumDownloader},
 * printing the links under {@code #waterfall} that match {@code .*pins.*} after
 * each download, then prints the total elapsed time in milliseconds.
 */
@Ignore("need chrome driver")
@Test
public void test() {
    SeleniumDownloader downloader = new SeleniumDownloader(chromeDriverPath);
    long startMillis = System.currentTimeMillis();
    int remaining = 100;
    while (remaining-- > 0) {
        Request request = new Request("http://huaban.com/");
        // Minimal task: fixed UUID and a default site.
        Task task = new Task() {
            @Override
            public String getUUID() {
                return "huaban.com";
            }

            @Override
            public Site getSite() {
                return Site.me();
            }
        };
        Page page = downloader.download(request, task);
        System.out.println(page.getHtml().$("#waterfall").links().regex(".*pins.*").all());
    }
    long elapsedMillis = System.currentTimeMillis() - startMillis;
    System.out.println(elapsedMillis);
}
Use of us.codecraft.webmagic.Task in project webmagic by code4craft.
Class SeleniumDownloaderTest, method testBaiduWenku.
/**
 * Downloads one Baidu Wenku document page through a {@link SeleniumDownloader}
 * (with a 10000 sleep time set on the downloader) and prints the content of every
 * {@code div.inner} element with markup tags and {@code &nbsp;} entities removed.
 */
@Ignore
@Test
public void testBaiduWenku() {
    SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
    seleniumDownloader.setSleepTime(10000);
    Page page = seleniumDownloader.download(new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"), new Task() {
        @Override
        public String getUUID() {
            // Identify the task by the site actually being fetched.
            return "wenku.baidu.com";
        }

        @Override
        public Site getSite() {
            return Site.me();
        }
    });
    // First strip tags, then the non-breaking-space entity. The original pattern was
    // misspelled "&nsbp;" and therefore never matched.
    System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>", "").replace("&nbsp;", "").all());
}
Use of us.codecraft.webmagic.Task in project webmagic by code4craft.
Class RedisPrioritySchedulerTest, method test.
/**
 * Pushes three requests with priorities 1, 0 and -1 and verifies that the scheduler
 * hands them back in that order, each with its own URL and its own "name" extra.
 */
@Ignore("environment depended")
@Test
public void test() {
    Task task = new Task() {
        @Override
        public String getUUID() {
            return "TestTask";
        }

        @Override
        public Site getSite() {
            return null;
        }
    };
    // Start from a clean duplicate-check state for this task.
    scheduler.resetDuplicateCheck(task);

    Request googleRequest = new Request("https://www.google.com");
    Request facebookRequest = new Request("https://www.facebook.com/");
    Request twitterRequest = new Request("https://twitter.com");
    googleRequest.setPriority(1).putExtra("name", "google");
    facebookRequest.setPriority(0).putExtra("name", "facebook");
    twitterRequest.setPriority(-1).putExtra("name", "twitter");

    scheduler.push(googleRequest, task);
    scheduler.push(facebookRequest, task);
    scheduler.push(twitterRequest, task);

    Request firstPolled = scheduler.poll(task);
    Request secondPolled = scheduler.poll(task);
    Request thirdPolled = scheduler.poll(task);

    // Each polled request is checked against the request it should correspond to.
    // The original compared every "name" extra with the google request's.
    Assert.assertEquals(googleRequest.getUrl(), firstPolled.getUrl());
    Assert.assertEquals(googleRequest.getExtra("name"), firstPolled.getExtra("name"));
    Assert.assertEquals(facebookRequest.getUrl(), secondPolled.getUrl());
    Assert.assertEquals(facebookRequest.getExtra("name"), secondPolled.getExtra("name"));
    Assert.assertEquals(twitterRequest.getUrl(), thirdPolled.getUrl());
    Assert.assertEquals(twitterRequest.getExtra("name"), thirdPolled.getExtra("name"));
}
Aggregations