use of com.virjar.vscrawler.core.seed.Seed in project vscrawler by virjar.
the class VSCrawler method run.
/**
 * Main dispatch loop of the crawler.
 *
 * <p>Polls seeds from the seed manager and submits one {@code SeedProcessTask} per seed to the
 * worker pool for as long as the current thread is not interrupted and the crawler state is
 * {@code STAT_RUNNING}. Once the loop ends, the pool is shut down (waiting up to 10 minutes for
 * in-flight tasks), {@code stopCrawler()} is invoked and the state is reset to {@code STAT_INIT}.
 *
 * <p>If the wait for pool termination is interrupted, the interrupt status of the current thread
 * is restored after the cleanup has finished, so the caller can still observe the interruption.
 */
@Override
public void run() {
    checkRunningStat();
    initComponent();
    log.info("Spider started!");
    while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
        Seed seed = berkeleyDBSeedManager.poll();
        // No seed is available right now.
        if (seed == null) {
            if (stat.get() == STAT_STOPPED) {
                break;
            }
            // Notify the seed-empty listeners, then park the dispatch thread.
            vsCrawlerContext.getAutoEventRegistry().findEventDeclaring(SeedEmptyEvent.class).onSeedEmpty(vsCrawlerContext);
            if (!waitDispatchThread()) {
                log.warn("爬虫线程休眠被打断");
                break;
            }
            continue;
        }
        lastActiveTime = System.currentTimeMillis();
        // Run the fetch task.
        threadPool.execute(new SeedProcessTask(seed));
        // When all workers are busy, block the producing thread until a worker becomes free.
        if (activeTasks.get() >= threadPool.getMaximumPoolSize()) {
            if (!waitDispatchThread()) {
                log.warn("爬虫线程休眠被打断");
                break;
            }
        }
        // Slow-start control: only active during the first slowStartDuration ms after startTime.
        if (slowStart && startTime.getTime() + slowStartDuration > System.currentTimeMillis()) {
            slowStartThreadNumber++;
            log.info("慢启动:{}", slowStartThreadNumber);
            if (threadPool.getActiveCount() >= slowStartThreadNumber) {
                // The number of active threads has reached the slow-start limit, so pause dispatching.
                CommonUtil.sleep(slowStartDuration / threadNumber);
            }
        }
    }

    boolean interruptedWhileWaiting = false;
    if (!threadPool.isShutdown()) {
        threadPool.shutdown();
        try {
            // When the crawler is stopped deliberately, wait up to 10 minutes for running tasks to finish.
            threadPool.awaitTermination(10, TimeUnit.MINUTES);
        } catch (InterruptedException e) {
            // Remember the interruption instead of swallowing it, and keep the cause in the log.
            interruptedWhileWaiting = true;
            log.error("crawler shop wait failed", e);
        }
    }
    // If the crawler was terminated from outside, stopCrawler() may end up being called twice.
    stopCrawler();
    log.info("爬虫结束");
    crawlerMainThread = null;
    stat.set(STAT_INIT);
    if (interruptedWhileWaiting) {
        // Restored only after cleanup, so stopCrawler() still runs with the interrupt flag cleared
        // exactly as it did before this flag was propagated.
        Thread.currentThread().interrupt();
    }
}
use of com.virjar.vscrawler.core.seed.Seed in project vscrawler by virjar.
the class FetchTaskProcessor method injectField.
/**
 * Extracts a value for every configured {@code FetchTaskBean} and writes it into the
 * corresponding field of {@code model}.
 *
 * <p>Three cases are handled per field:
 * <ul>
 * <li>the field type is itself an auto-process model with a registered extractor: a sub model is
 * extracted and assigned directly;</li>
 * <li>the field is a collection whose help class has a registered extractor: one sub model is
 * extracted per selected element;</li>
 * <li>otherwise the selector result is unpacked and converted to the field type.</li>
 * </ul>
 *
 * @param model              the model whose fields are populated
 * @param abstractSelectable the selectable the field selectors are applied to
 * @param crawlResult        the grab result handed on to sub-model extraction
 * @return the seeds produced by fields flagged as new-seed fields; never {@code null}
 * @throws RuntimeException wrapping any failure, naming the model class and the field
 */
@SuppressWarnings("unchecked")
List<Seed> injectField(final AbstractAutoProcessModel model, AbstractSelectable abstractSelectable, final GrabResult crawlResult) {
    List<Seed> newSeeds = Lists.newLinkedList();
    for (final FetchTaskBean fetchTaskBean : fetchTaskBeanList) {
        try {
            Field field = fetchTaskBean.getField();
            final Class<?> type = field.getType();
            // Nested model: supports extraction of sub structures.
            if (AbstractAutoProcessModel.class.isAssignableFrom(type) && annotationProcessorBuilder.findExtractor((Class<? extends AbstractAutoProcessModel>) type) != null) {
                AbstractSelectable subSelectable = fetchTaskBean.getModelSelector().select(abstractSelectable);
                field.set(model, fetchSubModel(type, subSelectable, model, crawlResult));
                continue;
            }
            Object data = null;
            if (Collection.class.isAssignableFrom(type) && fetchTaskBean.getHelpClazz() != Object.class && annotationProcessorBuilder.findExtractor(fetchTaskBean.getHelpClazz()) != null) {
                List<AbstractSelectable> abstractSelectables = fetchTaskBean.getModelSelector().select(abstractSelectable).toMultiSelectable();
                // Lists.transform only returns a lazy view that re-applies the function on every
                // access. Copy it once so that each sub model is extracted exactly once, inside
                // this try block, and the stored elements keep a stable identity.
                data = Lists.newArrayList(Lists.transform(abstractSelectables, new Function<AbstractSelectable, AbstractAutoProcessModel>() {
                    @Override
                    public AbstractAutoProcessModel apply(AbstractSelectable input) {
                        return fetchSubModel(fetchTaskBean.getHelpClazz(), input, model, crawlResult);
                    }
                }));
            }
            // Not a sub-model extraction: take the result directly, which ends the selector
            // chain; the result type is then inspected and converted below.
            if (data == null) {
                data = fetchTaskBean.getModelSelector().select(abstractSelectable).createOrGetModel();
            }
            // Special case: a SipNode holds either a string or a DOM object, so it has to be unboxed.
            data = unPackSipNode(data);
            // If the target type is not a collection or array but the source data is a collection, unbox it.
            data = unpackCollection(type, data);
            if (data == null) {
                continue;
            }
            Object transformedObject = TypeCastUtils.cast(data, type, fetchTaskBean.getHelpClazz());
            if (transformedObject == null) {
                // Fall back to the generic cast when the project-specific cast yields nothing.
                transformedObject = TypeUtils.cast(data, type, ParserConfig.getGlobalInstance());
            }
            field.set(model, transformedObject);
            if (fetchTaskBean.isNewSeed()) {
                newSeeds.addAll(handleNewSeed(transformedObject, model.getBaseUrl()));
            }
        } catch (Exception e) {
            throw new RuntimeException("can not inject data for model:" + model.getClass().getName() + " for field: " + fetchTaskBean.getField().getName(), e);
        }
    }
    return newSeeds;
}
use of com.virjar.vscrawler.core.seed.Seed in project vscrawler by virjar.
the class CovertUtil method covertRequest.
/**
 * Converts a WebMagic {@code Request} into a vscrawler {@code Seed}.
 *
 * <p>Only GET requests (or requests without an explicit method) are converted. The request
 * extras are copied into the seed's ext map: string values are kept as they are, every other
 * value is serialized to JSON.
 *
 * @param request the WebMagic request to convert
 * @return the converted seed, or {@code null} when the request uses a method other than GET
 */
public static Seed covertRequest(Request request) {
    if (StringUtils.isNotEmpty(request.getMethod()) && !StringUtils.equalsIgnoreCase(request.getMethod(), "get")) {
        // The previous wording claimed that GET was unsupported, which is the opposite of this check.
        log.warn("vscrawler only supports webmagic GET requests, this request {} will be ignored", request.getUrl());
        return null;
    }
    Seed seed = new Seed(request.getUrl());
    // NOTE(review): WebMagic appears to create the extras map lazily, so getExtras() can be null
    // for a request that never had an extra attached - confirm for the version in use. In that
    // case the seed keeps whatever ext map its constructor set up.
    if (request.getExtras() != null) {
        // Maps.transformEntries returns a lazy view that does not support put(); copy it into a
        // real map so the seed owns an independent, mutable snapshot of the converted values.
        seed.setExt(Maps.newHashMap(Maps.transformEntries(request.getExtras(), new Maps.EntryTransformer<String, Object, String>() {
            @Override
            public String transformEntry(String key, Object value) {
                if (value instanceof String) {
                    return (String) value;
                }
                return JSONObject.toJSONString(value);
            }
        })));
    }
    return seed;
}
use of com.virjar.vscrawler.core.seed.Seed in project vscrawler by virjar.
the class BeautyCrawler method main.
/**
 * Demo crawler that walks picture galleries: HTML pages are parsed for the "next page" link and
 * the picture links, and every seed ending in ".jpg" is downloaded to disk.
 *
 * @param args unused
 * @throws IOException declared by the original signature
 */
public static void main(String[] args) throws IOException {
    VSCrawler vsCrawler = VSCrawlerBuilder.create().setCrawlerName("beautyCrawler").setProcessor(new SeedProcessor() {
        /** Downloads the picture behind the seed, sending the stored "refer" ext as the referer header. */
        private void handlePic(Seed seed, CrawlerSession crawlerSession) {
            Header[] headers = HeaderBuilder.create().withRefer(seed.getExt().get("refer")).defaultCommonHeader().buildArray();
            byte[] entity = crawlerSession.getCrawlerHttpClient().getEntity(seed.getData(), headers);
            if (entity == null) {
                seed.retry();
                return;
            }
            try {
                // The target file is derived from the base directory and the seed URL.
                Files.write(entity, new File(PathResolver.sourceToUnderLine("~/Desktop/testpic", seed.getData())));
            } catch (IOException e) {
                // Best effort for a demo: report the failure and carry on with the next seed.
                e.printStackTrace();
            }
        }

        @Override
        public void process(final Seed seed, CrawlerSession crawlerSession, GrabResult crawlResult) {
            if (StringUtils.endsWithIgnoreCase(seed.getData(), ".jpg")) {
                handlePic(seed, crawlerSession);
            } else {
                String s = crawlerSession.getCrawlerHttpClient().get(seed.getData());
                if (s == null) {
                    seed.retry();
                    return;
                }
                // Extract the link to the next page and the picture links.
                crawlResult.addSeeds(Lists.newArrayList(Iterables.transform(XpathParser.compileNoError("/css('#pages a')::self()[contains(text(),'下一页')]/absUrl('href') | /css('.content')::center/img/@src").evaluateToString(Jsoup.parse(s, seed.getData())), new Function<String, Seed>() {
                    @Override
                    public Seed apply(String input) {
                        Seed ret = new Seed(input);
                        // Same case-insensitive test that process() uses to route a seed to
                        // handlePic, so every seed treated as a picture also carries its referer.
                        if (StringUtils.endsWithIgnoreCase(input, ".jpg")) {
                            ret.getExt().put("refer", seed.getData());
                        }
                        return ret;
                    }
                })));
            }
        }
    }).setWorkerThreadNumber(15).setSessionPoolCoreSize(20).setSessionPoolMaxSize(25).build();
    // Clear the task data of earlier runs; otherwise the crawl would resume from the last breakpoint.
    vsCrawler.clearTask();
    vsCrawler.pushSeed("https://www.meitulu.com/item/2125.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/6892.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/2124.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/2120.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/2086.html");
    // Start crawling.
    vsCrawler.start();
}
use of com.virjar.vscrawler.core.seed.Seed in project vscrawler by virjar.
the class FutureCrawler method main.
/**
 * Demo crawler showing delayed seeds: every processed seed is re-submitted as a copy that
 * becomes active one minute later, while seeds are segmented by the minute of their active time.
 *
 * @param args unused
 */
public static void main(String[] args) {
    // Segment by minute so links are re-crawled every minute. This is for testing only; a real
    // crawler should not use such a short segment - segmenting by day is recommended.
    SegmentResolver minuteResolver = new SegmentResolver() {
        @Override
        public long resolveSegmentKey(long activeTime) {
            DateTime startOfMinute = new DateTime(activeTime).withSecondOfMinute(0);
            return startOfMinute.getMillis();
        }
    };

    SeedProcessor reschedulingProcessor = new SeedProcessor() {
        @Override
        public void process(Seed seed, CrawlerSession crawlerSession, GrabResult crawlResult) {
            // Work on a copy of the incoming seed.
            Seed delayedSeed = seed.copy();
            // The copy becomes active one minute from now.
            long activeAt = DateTime.now().plusMinutes(1).getMillis();
            delayedSeed.setActiveTimeStamp(activeAt);
            // Hand the copy back as a new seed.
            crawlResult.addSeed(delayedSeed);
        }
    };

    VSCrawler vsCrawler = VSCrawlerBuilder.create()
            .setStopWhileTaskEmptyDuration(2000)
            .setSegmentResolver(minuteResolver)
            .setProcessor(reschedulingProcessor)
            .build();

    // Every demo clears the task data first; otherwise data of different crawlers could get mixed up.
    vsCrawler.clearTask();
    vsCrawler.pushSeed("https://www.meitulu.com/item/6892.htm");
    vsCrawler.start();
}
Aggregations