Example use of com.virjar.vscrawler.core.processor.SeedProcessor in the project vscrawler by virjar.
From the class BeautyCrawler, method main.
/**
 * Demo entry point: builds a crawler named "beautyCrawler" whose processor
 * either downloads a picture (seed ends with ".jpg", any letter case) or
 * fetches a page and emits new seeds for the "next page" link and the
 * picture links found in it.
 *
 * @param args unused
 * @throws IOException declared by the signature; presumably raised by one of
 *         the crawler setup calls — confirm against the VSCrawler API
 */
public static void main(String[] args) throws IOException {
    VSCrawler vsCrawler = VSCrawlerBuilder.create().setCrawlerName("beautyCrawler").setProcessor(new SeedProcessor() {

        /**
         * Downloads the picture the seed points at and writes it to disk.
         * The seed is scheduled for retry when the download yields no data.
         */
        private void handlePic(Seed seed, CrawlerSession crawlerSession) {
            // The referer is the page the picture link was extracted from (stored under "refer" in process()).
            Header[] headers = HeaderBuilder.create().withRefer(seed.getExt().get("refer")).defaultCommonHeader().buildArray();
            byte[] entity = crawlerSession.getCrawlerHttpClient().getEntity(seed.getData(), headers);
            if (entity == null) {
                seed.retry();
                return;
            }
            try {
                // The target file is computed from the base directory and the picture URL.
                Files.write(entity, new File(PathResolver.sourceToUnderLine("~/Desktop/testpic", seed.getData())));
            } catch (IOException e) {
                // Best effort: report which picture could not be saved and carry on with the crawl.
                System.err.println("failed to save picture: " + seed.getData());
                e.printStackTrace();
            }
        }

        /**
         * Routes picture seeds to {@link #handlePic}; for every other seed, fetches the
         * page and adds the extracted links to {@code crawlResult} as new seeds.
         */
        @Override
        public void process(final Seed seed, CrawlerSession crawlerSession, GrabResult crawlResult) {
            if (StringUtils.endsWithIgnoreCase(seed.getData(), ".jpg")) {
                handlePic(seed, crawlerSession);
            } else {
                String s = crawlerSession.getCrawlerHttpClient().get(seed.getData());
                if (s == null) {
                    seed.retry();
                    return;
                }
                // Extract the "next page" link and the picture links.
                crawlResult.addSeeds(Lists.newArrayList(Iterables.transform(XpathParser.compileNoError("/css('#pages a')::self()[contains(text(),'下一页')]/absUrl('href') | /css('.content')::center/img/@src").evaluateToString(Jsoup.parse(s, seed.getData())), new Function<String, Seed>() {
                    @Override
                    public Seed apply(String input) {
                        Seed ret = new Seed(input);
                        // Must use the same case-insensitive test as the dispatch above; otherwise a
                        // ".JPG" link would reach handlePic without its "refer" entry.
                        if (StringUtils.endsWithIgnoreCase(input, ".jpg")) {
                            ret.getExt().put("refer", seed.getData());
                        }
                        return ret;
                    }
                })));
            }
        }
    }).setWorkerThreadNumber(15).setSessionPoolCoreSize(20).setSessionPoolMaxSize(25).build();
    // Clear previously crawled task data; otherwise the crawl would resume from where it stopped.
    vsCrawler.clearTask();
    vsCrawler.pushSeed("https://www.meitulu.com/item/2125.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/6892.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/2124.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/2120.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/2086.html");
    // Start the crawler.
    vsCrawler.start();
}
Example use of com.virjar.vscrawler.core.processor.SeedProcessor in the project vscrawler by virjar.
From the class FutureCrawler, method main.
/**
 * Demo entry point: builds a crawler whose processor re-emits a copy of every
 * seed it receives, with the copy's active time set one minute in the future,
 * and whose segment resolver groups active times by minute.
 *
 * @param args unused
 */
public static void main(String[] args) {
    VSCrawler vsCrawler = VSCrawlerBuilder.create().setStopWhileTaskEmptyDuration(2000).setSegmentResolver(new SegmentResolver() {
        @Override
        public long resolveSegmentKey(long activeTime) {
            // Segment by minute so links are re-crawled every minute. This is only for testing;
            // such a short interval should not be used in practice — segmenting by day is recommended.
            // Both the second and millisecond fields are cleared so that every timestamp within
            // the same minute maps to the same key.
            return new DateTime(activeTime).withSecondOfMinute(0).withMillisOfSecond(0).getMillis();
        }
    }).setProcessor(new SeedProcessor() {
        @Override
        public void process(Seed seed, CrawlerSession crawlerSession, GrabResult crawlResult) {
            // Create a copy of the seed.
            Seed copy = seed.copy();
            // Make the copy become active one minute from now.
            copy.setActiveTimeStamp(DateTime.now().plusMinutes(1).getMillis());
            // Hand the new seed back to the crawler.
            crawlResult.addSeed(copy);
        }
    }).build();
    // All demos currently clear the task data first; otherwise data from different crawlers could get mixed up.
    vsCrawler.clearTask();
    vsCrawler.pushSeed("https://www.meitulu.com/item/6892.htm");
    vsCrawler.start();
}
Aggregations