Use of us.codecraft.webmagic.processor.PageProcessor in the project webmagic by code4craft.
From the class SpiderTest, method testRound.
/**
 * Runs a {@link Spider} on ten threads against stubbed collaborators.
 *
 * <p>The downloader builds an empty page without touching the network, the processor flags
 * every page as skipped, and the scheduler ignores pushed requests while handing out a
 * request for "test" on most polls. The scheduler returns {@code null} on roughly 9% of polls
 * at random, and on every poll once its counter has exceeded 1000.
 */
private void testRound() {
    Spider spider = Spider.create(new PageProcessor() {
        @Override
        public void process(Page page) {
            // Flag the page as skipped; presumably this keeps it away from the pipelines
            // (confirm against Page#setSkip).
            page.setSkip(true);
        }

        @Override
        public Site getSite() {
            return Site.me().setSleepTime(0);
        }
    }).setDownloader(new Downloader() {
        @Override
        public Page download(Request request, Task task) {
            // No real download: always answer with a page whose raw text is empty.
            return new Page().setRawText("");
        }

        @Override
        public void setThread(int threadNum) {
            // Nothing to configure for the stub downloader.
        }
    }).setScheduler(new Scheduler() {
        // Number of poll() calls made so far.
        private final AtomicInteger count = new AtomicInteger();
        private final Random random = new Random();

        @Override
        public void push(Request request, Task task) {
            // Pushed requests are dropped; poll() fabricates its own.
        }

        @Override
        public synchronized Request poll(Task task) {
            // Once the counter passes 1000, every poll reports "no request".
            if (count.incrementAndGet() > 1000) {
                return null;
            }
            // nextInt(100) yields 0..99, so values 91..99 (9 of 100) also report "no request".
            if (random.nextInt(100) > 90) {
                return null;
            }
            return new Request("test");
        }
    }).thread(10);
    spider.run();
}
Use of us.codecraft.webmagic.processor.PageProcessor in the project yyl_example by Relucent.
From the class SpiderTest, method main.
/**
 * Fetches the Baidu Baike search URL for a fixed list of material names and prints the
 * fields extracted from each page.
 *
 * <p>For every page the processor stores a "name" field (text of the
 * {@code dl.lemmaWgt-lemmaTitle h1} element) and a "description" field (all text under the
 * {@code div} with class {@code lemma-summary}).
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
    final Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setUseGzip(true);
    Spider spider = Spider.create(new PageProcessor() {
        @Override
        public void process(Page page) {
            page.putField("name", page.getHtml().css("dl.lemmaWgt-lemmaTitle h1", "text").toString());
            page.putField("description", page.getHtml().xpath("//div[@class='lemma-summary']/allText()"));
        }

        @Override
        public Site getSite() {
            return site;
        }
    }).thread(2);

    String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
    String[] words = {"石墨烯", "气凝胶", "液态金属", "生物塑料", "形状记忆合金", "纳米纤维"};
    List<String> urls = new ArrayList<String>();
    for (String word : words) {
        urls.add(String.format(urlTemplate, word));
    }

    try {
        List<ResultItems> results = spider.<ResultItems>getAll(urls);
        for (ResultItems result : results) {
            System.out.println(result.getAll());
        }
    } finally {
        // Close the spider even when crawling or printing throws.
        spider.close();
    }
}
Aggregations