use of com.virjar.vscrawler.core.VSCrawler in project vscrawler by virjar.
the class BeautyCrawler method main.
/**
 * Demo entry point: builds a crawler named "beautyCrawler" that walks gallery pages,
 * follows the "next page" link, and saves every discovered picture to disk.
 *
 * @param args command-line arguments (unused)
 * @throws IOException declared by the original signature; kept for compatibility
 */
public static void main(String[] args) throws IOException {
    VSCrawler vsCrawler = VSCrawlerBuilder.create()
            .setCrawlerName("beautyCrawler")
            .setProcessor(new SeedProcessor() {

                /**
                 * Downloads the picture the seed points at and stores it on disk.
                 * The seed is scheduled for retry when the download yields no body.
                 */
                private void handlePic(Seed seed, CrawlerSession crawlerSession) {
                    // Send the page the picture was found on as the referer.
                    Header[] headers = HeaderBuilder.create()
                            .withRefer(seed.getExt().get("refer"))
                            .defaultCommonHeader()
                            .buildArray();
                    byte[] entity = crawlerSession.getCrawlerHttpClient().getEntity(seed.getData(), headers);
                    if (entity == null) {
                        seed.retry();
                        return;
                    }
                    try {
                        // The target file is derived automatically from the site, the path and the base directory.
                        Files.write(entity, new File(PathResolver.sourceToUnderLine("~/Desktop/testpic", seed.getData())));
                    } catch (IOException e) {
                        // Best effort: a failed write must not abort the crawl, but report which picture was lost.
                        System.err.println("failed to save picture: " + seed.getData());
                        e.printStackTrace();
                    }
                }

                /**
                 * Routes picture URLs to {@code handlePic}; for every other URL the page is
                 * fetched and the next-page link plus the picture links become new seeds.
                 */
                @Override
                public void process(final Seed seed, CrawlerSession crawlerSession, GrabResult crawlResult) {
                    if (StringUtils.endsWithIgnoreCase(seed.getData(), ".jpg")) {
                        handlePic(seed, crawlerSession);
                        return;
                    }
                    String s = crawlerSession.getCrawlerHttpClient().get(seed.getData());
                    if (s == null) {
                        seed.retry();
                        return;
                    }
                    // Extract the "next page" link and the picture links.
                    crawlResult.addSeeds(Lists.newArrayList(Iterables.transform(
                            XpathParser.compileNoError("/css('#pages a')::self()[contains(text(),'下一页')]/absUrl('href') | /css('.content')::center/img/@src")
                                    .evaluateToString(Jsoup.parse(s, seed.getData())),
                            new Function<String, Seed>() {
                                @Override
                                public Seed apply(String input) {
                                    Seed ret = new Seed(input);
                                    // Use the same case-insensitive test as process(), so every seed that is
                                    // later treated as a picture also carries the referer of its source page.
                                    if (StringUtils.endsWithIgnoreCase(input, ".jpg")) {
                                        ret.getExt().put("refer", seed.getData());
                                    }
                                    return ret;
                                }
                            })));
                }
            })
            .setWorkerThreadNumber(15)
            .setSessionPoolCoreSize(20)
            .setSessionPoolMaxSize(25)
            .build();
    // Clear data from previous runs; otherwise the crawler resumes from where it stopped.
    vsCrawler.clearTask();
    vsCrawler.pushSeed("https://www.meitulu.com/item/2125.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/6892.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/2124.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/2120.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/2086.html");
    // Start crawling.
    vsCrawler.start();
}
use of com.virjar.vscrawler.core.VSCrawler in project vscrawler by virjar.
the class SimpleCrawler method main.
/**
 * Minimal demo: starts a crawler built with the default configuration and
 * injects a single seed after it is running.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final String seedUrl =
            "https://www.hapag-lloyd.cn/zh/online-business/tracing/tracing-by-booking.html?booking=45119286";

    // Build the crawler, drop any previously stored tasks, then launch it.
    final VSCrawler crawler = VSCrawlerBuilder.create().build();
    crawler.clearTask();
    crawler.start();

    // Inject a seed into the already running crawler.
    System.out.println("注入一个种子任务");
    crawler.pushSeed(seedUrl);
    // (An optional CommonUtil.sleep(100000) used to be kept here, disabled.)
}
use of com.virjar.vscrawler.core.VSCrawler in project vscrawler by virjar.
the class MeituCrawler method main.
/**
 * Demo entry point for the annotation-driven crawler: processors are discovered from
 * the {@code com.virjar.vscrawler.samples.processor.meitu} package.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    VSCrawler vsCrawler = VSCrawlerBuilder.create()
            .setWorkerThreadNumber(10)
            .setCrawlerName("beautyCrawler_Annotation")
            .setProcessor(AnnotationProcessorBuilder.create()
                    .addBeanPackage("com.virjar.vscrawler.samples.processor.meitu")
                    .build())
            .build();
    // Clear leftover tasks BEFORE starting, as the other demos do; clearing after start()
    // would let workers pick up stale tasks from a previous run first.
    vsCrawler.clearTask();
    vsCrawler.start();
    vsCrawler.pushSeed("https://www.meitulu.com/item/2125.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/6892.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/2124.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/2120.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/2086.html");
}
use of com.virjar.vscrawler.core.VSCrawler in project vscrawler by virjar.
the class CrawlerController method stop.
/**
 * Stops the crawler registered under the given name.
 *
 * @param crawlerName name used to look the crawler up in {@code crawlerManager}
 * @return a success response once the stop was requested, or a failure response
 *         when no crawler is registered under that name
 */
@RequestMapping("/stopCrawler")
@ResponseBody
public WebJsonResponse<String> stop(@RequestParam("crawlerName") String crawlerName) {
    final VSCrawler target = crawlerManager.get(crawlerName);
    if (target != null) {
        target.stopCrawler();
        return ReturnUtil.success("success");
    }
    // Unknown crawler name: nothing to stop.
    return ReturnUtil.failed("not crawler defined");
}
use of com.virjar.vscrawler.core.VSCrawler in project vscrawler by virjar.
the class FutureCrawler method main.
/**
 * Demo of delayed seeds: every processed seed is re-submitted as a copy whose
 * activation time lies one minute in the future.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // Segment by minute so links are re-crawled every minute. This is for testing only;
    // such a short segment is not suitable in practice - segmenting by day is recommended.
    final SegmentResolver minuteSegments = new SegmentResolver() {
        @Override
        public long resolveSegmentKey(long activeTime) {
            DateTime startOfMinute = new DateTime(activeTime).withSecondOfMinute(0);
            return startOfMinute.getMillis();
        }
    };

    final SeedProcessor rescheduler = new SeedProcessor() {
        @Override
        public void process(Seed seed, CrawlerSession crawlerSession, GrabResult crawlResult) {
            // Work on a copy of the seed and make it active one minute from now.
            Seed delayed = seed.copy();
            long activateAt = DateTime.now().plusMinutes(1).getMillis();
            delayed.setActiveTimeStamp(activateAt);
            // Hand the new seed back to the crawler.
            crawlResult.addSeed(delayed);
        }
    };

    final VSCrawler crawler = VSCrawlerBuilder.create()
            .setStopWhileTaskEmptyDuration(2000)
            .setSegmentResolver(minuteSegments)
            .setProcessor(rescheduler)
            .build();

    // Every demo clears the task store first; otherwise data of different crawlers may get mixed up.
    crawler.clearTask();
    crawler.pushSeed("https://www.meitulu.com/item/6892.htm");
    crawler.start();
}
Aggregations