Use of com.virjar.vscrawler.core.processor.GrabResult in project vscrawler by virjar.
Class VSCrawler, method grabSync.
/**
 * Runs a grab task synchronously on the calling thread and returns its result.
 * <p>
 * Per the original author's note this is meant for "booking"-style request/response use:
 * the task is not persisted, the result is not pushed into the pipeline, and session
 * creation does not wait.
 * <p>
 * Exceptions thrown by the seed processor are logged and rethrown to the caller.
 *
 * @param seed the task seed to process
 * @return the grab result; if no session could be borrowed, a result flagged as failed
 *         that carries an error message
 */
public GrabResult grabSync(Seed seed) {
    GrabResult crawlResult = new GrabResult();
    try {
        MDC.put("grabID", LogIdGenerator.genGrabTransactionID(vsCrawlerContext.getCrawlerName()));
        VSCrawlerCommonUtil.setGrabStartTimeStampThreadLocal(System.currentTimeMillis());
        // start components on first use
        if (!hasComponentInit) {
            initComponentWithOutMainThread();
        }
        // put vsCrawlerContext into a ThreadLocal, to support the event loop
        VSCrawlerCommonUtil.setVSCrawlerContext(vsCrawlerContext);
        // Bounded wait for a session so the calling thread is not blocked indefinitely.
        // NOTE(review): the original comment cites a 30 s timeout; the real value comes from
        // grabTaskLessTime() — confirm.
        CrawlerSession session = crawlerSessionPool.borrowOne(VSCrawlerCommonUtil.grabTaskLessTime(), true);
        if (session == null) {
            VSCrawlerMonitor.recordOne(vsCrawlerContext.getCrawlerName() + "_borrowSession_failed");
            crawlResult.setGrabSuccess(false);
            crawlResult.setErrorMessage("can not allocate session resource from session pop");
            return crawlResult;
        }
        try {
            seed.setStatus(Seed.STATUS_RUNNING);
            VSCrawlerCommonUtil.setCrawlerSession(session);
            seedProcessor.process(seed, session, crawlResult);
            return crawlResult;
        } catch (Exception e) {
            log.error("error when grab seed:{}", JSONViewWrapper.wrap(seed), e);
            throw e;
        } finally {
            // Hand the session back: sessions are concurrency-controlled and can only be
            // reused by other tasks once they have been returned.
            VSCrawlerCommonUtil.clearCrawlerSession();
            crawlerSessionPool.recycle(session);
        }
    } finally {
        // Remove the MDC entry on EVERY exit path (including the "no session" early return and
        // failures before/while borrowing or recycling), so the grab id cannot leak into
        // unrelated log lines emitted later by this thread.
        try {
            MDC.remove("grabID");
        } catch (Exception e) {
            // this exception is unimportant
            log.error("failed to remove MDC variable", e);
        }
        Long grabStartTimeStampThreadLocal = VSCrawlerCommonUtil.getGrabStartTimeStampThreadLocal();
        VSCrawlerMonitor.recordOne(vsCrawlerContext.getCrawlerName() + "_grab", grabStartTimeStampThreadLocal);
        VSCrawlerMonitor.recordOne(vsCrawlerContext.getCrawlerName() + "_grab_result_" + (crawlResult.isGrabSuccess() ? "success" : "failed"));
        VSCrawlerCommonUtil.clearGrabTimeOutControl();
    }
}
Use of com.virjar.vscrawler.core.processor.GrabResult in project vscrawler by virjar.
Class BeautyCrawler, method main.
/**
 * Demo entry point: builds a crawler named "beautyCrawler" (15 worker threads, session pool
 * core size 20 / max size 25), clears previously stored tasks, pushes five start pages and
 * starts crawling.
 * <p>
 * Seeds whose data ends with ".jpg" (case-insensitive) are downloaded as pictures; all other
 * seeds are fetched as pages from which the "next page" link and picture links are extracted
 * as new seeds.
 *
 * @param args unused
 * @throws IOException declared by the original signature
 */
public static void main(String[] args) throws IOException {
    VSCrawler vsCrawler = VSCrawlerBuilder.create().setCrawlerName("beautyCrawler").setProcessor(new SeedProcessor() {

        /** Downloads the picture referenced by the seed; asks for a retry when nothing was fetched. */
        private void handlePic(Seed seed, CrawlerSession crawlerSession) {
            // send the page the picture was found on as the referer
            Header[] headers = HeaderBuilder.create().withRefer(seed.getExt().get("refer")).defaultCommonHeader().buildArray();
            byte[] entity = crawlerSession.getCrawlerHttpClient().getEntity(seed.getData(), headers);
            if (entity == null) {
                seed.retry();
                return;
            }
            try {
                // the target file is derived automatically from the site, the path and the base directory
                Files.write(entity, new File(PathResolver.sourceToUnderLine("~/Desktop/testpic", seed.getData())));
            } catch (IOException e) {
                // best effort for a demo: report the failure and carry on with the next seed
                e.printStackTrace();
            }
        }

        @Override
        public void process(final Seed seed, CrawlerSession crawlerSession, GrabResult crawlResult) {
            if (StringUtils.endsWithIgnoreCase(seed.getData(), ".jpg")) {
                handlePic(seed, crawlerSession);
            } else {
                String s = crawlerSession.getCrawlerHttpClient().get(seed.getData());
                if (s == null) {
                    seed.retry();
                    return;
                }
                // extract the "next page" link and the picture links
                crawlResult.addSeeds(Lists.newArrayList(Iterables.transform(
                        XpathParser.compileNoError("/css('#pages a')::self()[contains(text(),'下一页')]/absUrl('href') | /css('.content')::center/img/@src").evaluateToString(Jsoup.parse(s, seed.getData())),
                        new Function<String, Seed>() {
                            @Override
                            public Seed apply(String input) {
                                Seed ret = new Seed(input);
                                // Use the same case-insensitive check as process(), so that every seed
                                // later routed to handlePic() also carries its referer.
                                if (StringUtils.endsWithIgnoreCase(input, ".jpg")) {
                                    ret.getExt().put("refer", seed.getData());
                                }
                                return ret;
                            }
                        })));
            }
        }
    }).setWorkerThreadNumber(15).setSessionPoolCoreSize(20).setSessionPoolMaxSize(25).build();
    // clear previously crawled data; otherwise the crawl resumes from where it stopped
    vsCrawler.clearTask();
    vsCrawler.pushSeed("https://www.meitulu.com/item/2125.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/6892.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/2124.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/2120.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/2086.html");
    // start the crawler
    vsCrawler.start();
}
Use of com.virjar.vscrawler.core.processor.GrabResult in project vscrawler by virjar.
Class FutureCrawler, method main.
/**
 * Demo entry point for delayed ("future") seeds: every processed seed is copied and
 * re-submitted with an activation time one minute ahead, and seeds are grouped into
 * per-minute segments.
 *
 * @param args unused
 */
public static void main(String[] args) {
    VSCrawler vsCrawler = VSCrawlerBuilder.create().setStopWhileTaskEmptyDuration(2000).setSegmentResolver(new SegmentResolver() {
        @Override
        public long resolveSegmentKey(long activeTime) {
            // Segment by minute so links are re-grabbed every minute. This is only for testing;
            // such a short interval should not be used in practice — segmenting by day is advised.
            // Both the second and the millisecond fields are zeroed so that all timestamps within
            // the same minute resolve to the same key.
            return new DateTime(activeTime).withSecondOfMinute(0).withMillisOfSecond(0).getMillis();
        }
    }).setProcessor(new SeedProcessor() {
        @Override
        public void process(Seed seed, CrawlerSession crawlerSession, GrabResult crawlResult) {
            // create a copy of the seed
            Seed copy = seed.copy();
            // make the copy become active one minute from now
            copy.setActiveTimeStamp(DateTime.now().plusMinutes(1).getMillis());
            // hand the new seed back
            crawlResult.addSeed(copy);
        }
    }).build();
    // all demos currently clear the task store, otherwise data of different crawlers may get mixed up
    vsCrawler.clearTask();
    vsCrawler.pushSeed("https://www.meitulu.com/item/6892.htm");
    vsCrawler.start();
}
Use of com.virjar.vscrawler.core.processor.GrabResult in project vscrawler by virjar.
Class GrabController, method grab.
/**
 * Handles a synchronous grab request: looks up the crawler named in the request, wraps the
 * request (serialized as JSON) in a seed, runs it through {@code grabSync} and returns the
 * collected result objects.
 *
 * @param grabRequestBean the request body; its crawler name selects the crawler to use
 * @return a failure response when the crawler is unknown, when an exception occurs, or when
 *         the grab produced nothing and the seed's retry count is positive (reported as
 *         "timeOut"); otherwise a success response carrying the result objects
 */
@RequestMapping("/grab")
@ResponseBody
public WebJsonResponse<?> grab(@RequestBody GrabRequest grabRequestBean) {
    try {
        VSCrawler vsCrawler = crawlerManager.get(grabRequestBean.getCrawlerName());
        if (vsCrawler == null) {
            return ReturnUtil.failed("no crawler defined :" + grabRequestBean.getCrawlerName());
        }
        Seed seed = new Seed(JSONObject.toJSONString(grabRequestBean));
        GrabResult crawlResult = vsCrawler.grabSync(seed);
        List<Object> results = crawlResult.allObjectResult();
        // NOTE(review): an empty result with a positive retry count is reported as a timeout —
        // presumably the processor asked for a retry that a synchronous grab cannot perform; confirm.
        if (results.isEmpty() && seed.getRetry() > 0) {
            return ReturnUtil.failed("timeOut", ReturnUtil.status_timeout);
        }
        return ReturnUtil.success(results);
    } catch (Exception e) {
        // Many exceptions (e.g. NullPointerException) carry no message; fall back to toString()
        // so the caller still learns which kind of failure occurred.
        String message = e.getMessage();
        return ReturnUtil.failed(message != null ? message : e.toString());
    }
}
Aggregations