use of com.virjar.vscrawler.core.seed.Seed in project vscrawler by virjar.
the class GrabController method grab.
/**
 * Runs a single synchronous grab for the crawler named in the request.
 *
 * <p>The whole request bean is serialised to JSON and used as the seed payload.
 * A failed response is returned when no crawler is registered under the requested
 * name, when the grab produced no results although the seed was retried (reported
 * as a timeout), or when any exception is thrown along the way.
 *
 * @param grabRequestBean the request body; supplies the crawler name and the seed payload
 * @return a success response wrapping the grabbed objects, or a failed response
 */
@RequestMapping("/grab")
@ResponseBody
public WebJsonResponse<?> grab(@RequestBody GrabRequest grabRequestBean) {
    try {
        String crawlerName = grabRequestBean.getCrawlerName();
        VSCrawler crawler = crawlerManager.get(crawlerName);
        if (crawler == null) {
            return ReturnUtil.failed("no crawler defined :" + grabRequestBean.getCrawlerName());
        }

        Seed seed = new Seed(JSONObject.toJSONString(grabRequestBean));
        List<Object> results = crawler.grabSync(seed).allObjectResult();

        // Nothing came back and the seed was retried at least once: report it as a timeout.
        boolean timedOut = results.isEmpty() && seed.getRetry() > 0;
        if (timedOut) {
            return ReturnUtil.failed("timeOut", ReturnUtil.status_timeout);
        }
        return ReturnUtil.success(results);
    } catch (Exception e) {
        // Controller boundary: turn any failure into a failed response instead of propagating it.
        return ReturnUtil.failed(e.getMessage());
    }
}
use of com.virjar.vscrawler.core.seed.Seed in project vscrawler by virjar.
the class AnnotationProcessorBuilder method judgeMatchStrategy.
/**
 * Determines how seeds are matched against the given auto-process model class.
 *
 * <p>Resolution order:
 * <ol>
 * <li>no {@code AutoProcessor} annotation on the class: {@code null};</li>
 * <li>a non-blank {@code seedPattern} on the annotation: the seed data must fully match that regex;</li>
 * <li>otherwise the first public method annotated with {@code MatchSeed} is invoked statically
 * with the seed and its boolean result is used;</li>
 * <li>none of the above: {@code null}.</li>
 * </ol>
 *
 * @param aClass the model class to inspect
 * @return the strategy to use, or {@code null} when the class declares none
 * @throws IllegalStateException    if {@code seedPattern} is not a valid regular expression
 * @throws IllegalArgumentException if the {@code MatchSeed} method is not static, does not return
 *                                  {@code boolean}/{@code Boolean}, or does not take exactly one
 *                                  parameter that can receive a {@code Seed}
 */
private AnnotationSeedProcessor.MatchStrategy judgeMatchStrategy(Class<? extends AbstractAutoProcessModel> aClass) {
    final AutoProcessor autoProcessor = aClass.getAnnotation(AutoProcessor.class);
    if (autoProcessor == null) {
        return null;
    }

    String seedPattern = autoProcessor.seedPattern();
    if (StringUtils.isNotBlank(seedPattern)) {
        try {
            final Pattern pattern = Pattern.compile(seedPattern);
            return new AnnotationSeedProcessor.MatchStrategy() {
                @Override
                public boolean matchSeed(Seed seed) {
                    return pattern.matcher(seed.getData()).matches();
                }

                @Override
                public int priority() {
                    return autoProcessor.priority();
                }
            };
        } catch (PatternSyntaxException e) {
            throw new IllegalStateException("error when register processor for class" + aClass.getName() + " regex error for seedPattern", e);
        }
    }

    Method[] methods = aClass.getMethods();
    for (final Method method : methods) {
        if (method.getAnnotation(MatchSeed.class) == null) {
            continue;
        }

        // getReturnType() yields boolean.class for a primitive return, which
        // Boolean.class.isAssignableFrom(...) rejects, so the primitive is accepted explicitly.
        Class<?> returnType = method.getReturnType();
        Preconditions.checkArgument(boolean.class == returnType || Boolean.class.isAssignableFrom(returnType),
                "match seed method must return boolean: " + method.toGenericString());
        Preconditions.checkArgument(Modifier.isStatic(method.getModifiers()),
                "match seed method must be static: " + method.toGenericString());

        // The method is invoked below with exactly one Seed argument; reject other signatures
        // at registration time rather than failing on every seed.
        Class<?>[] parameterTypes = method.getParameterTypes();
        Preconditions.checkArgument(parameterTypes.length == 1 && parameterTypes[0].isAssignableFrom(Seed.class),
                "match seed method must accept a single Seed parameter: " + method.toGenericString());

        return new AnnotationSeedProcessor.MatchStrategy() {
            @Override
            public boolean matchSeed(Seed seed) {
                try {
                    return (Boolean) method.invoke(null, seed);
                } catch (Exception e) {
                    throw new IllegalStateException("can not jude seed match method", e);
                }
            }

            @Override
            public int priority() {
                return autoProcessor.priority();
            }
        };
    }
    return null;
}
use of com.virjar.vscrawler.core.seed.Seed in project vscrawler by virjar.
the class FetchTaskProcessor method handleNewSeed.
/**
 * Converts a value produced by field extraction into the list of new seeds it represents.
 *
 * <p>Accepted inputs are a {@code String}, a {@code Seed}, or a {@code Collection} whose
 * elements are each a {@code String} or a {@code Seed}. Every resulting seed gets
 * {@code baseUrl} recorded under the {@code "fromUrl"} key of its ext map. Elements are
 * converted individually, so collections mixing strings and seeds are supported.
 *
 * @param transformedObject the extracted value to convert
 * @param baseUrl           the URL the value was extracted from
 * @return a new mutable list of seeds; empty when an empty collection is passed
 * @throws IllegalStateException if the value, or any collection element, is {@code null}
 *                               or of an unsupported type
 */
private List<Seed> handleNewSeed(Object transformedObject, final String baseUrl) {
    // New-seed injection handling.
    if (transformedObject instanceof Collection) {
        Collection<?> candidates = (Collection<?>) transformedObject;
        // Always hand back a mutable list, including for an empty collection, so callers
        // can treat every result the same way.
        List<Seed> seeds = Lists.newArrayListWithCapacity(candidates.size());
        for (Object candidate : candidates) {
            seeds.add(toSeedWithOrigin(candidate, baseUrl));
        }
        return seeds;
    }
    return Lists.newArrayList(toSeedWithOrigin(transformedObject, baseUrl));
}

/** Turns one String or Seed into a Seed tagged with the URL it came from. */
private Seed toSeedWithOrigin(Object candidate, String baseUrl) {
    Seed seed;
    if (candidate instanceof String) {
        seed = new Seed((String) candidate);
    } else if (candidate instanceof Seed) {
        seed = (Seed) candidate;
    } else {
        String typeName = candidate == null ? "null" : candidate.getClass().getName();
        throw new IllegalStateException("unknown type for " + typeName + " to transfer to new Seed");
    }
    seed.getExt().put("fromUrl", baseUrl);
    return seed;
}
use of com.virjar.vscrawler.core.seed.Seed in project vscrawler by virjar.
the class AnnotationSeedProcessor method judgeDownloader.
/**
 * Chooses the downloader used for the given model class and stores it in {@code downloader}.
 *
 * <p>If the class exposes a public method annotated with {@code DownLoadMethod}, the first such
 * method found is validated and wrapped: it is invoked on the model instance with the seed and
 * the crawler session. Otherwise a default downloader is installed that performs a GET on the
 * seed data through the session's HTTP client.
 *
 * @param aClass the model class to inspect
 * @throws IllegalArgumentException if the annotated method does not return {@code String} or does
 *                                  not take exactly {@code (Seed, CrawlerSession)}-compatible
 *                                  parameters
 */
private void judgeDownloader(Class<? extends AbstractAutoProcessModel> aClass) {
    Method[] methods = aClass.getMethods();
    for (final Method method : methods) {
        if (method.getAnnotation(DownLoadMethod.class) == null) {
            continue;
        }

        Preconditions.checkArgument(String.class.isAssignableFrom(method.getReturnType()),
                "download method must return String: " + method.toGenericString());

        // The method is always invoked with exactly (seed, crawlerSession); a longer parameter
        // list would pass a ">= 2" check and then fail on every invocation.
        Class<?>[] parameterTypes = method.getParameterTypes();
        Preconditions.checkArgument(parameterTypes.length == 2,
                "download method must take exactly two parameters: " + method.toGenericString());
        Preconditions.checkArgument(parameterTypes[0].isAssignableFrom(Seed.class),
                "first parameter of download method must accept a Seed: " + method.toGenericString());
        Preconditions.checkArgument(parameterTypes[1].isAssignableFrom(CrawlerSession.class),
                "second parameter of download method must accept a CrawlerSession: " + method.toGenericString());

        downloader = new Downloader() {
            @Override
            public String download(Seed seed, AbstractAutoProcessModel model, CrawlerSession crawlerSession) {
                try {
                    return (String) method.invoke(model, seed, crawlerSession);
                } catch (Exception e) {
                    throw new RuntimeException("invoke download method :" + method.toGenericString() + " failed", e);
                }
            }
        };
        return;
    }

    // No custom download method declared: fetch the seed data with the session's HTTP client.
    downloader = new Downloader() {
        @Override
        public String download(Seed seed, AbstractAutoProcessModel model, CrawlerSession crawlerSession) {
            return crawlerSession.getCrawlerHttpClient().get(seed.getData());
        }
    };
}
use of com.virjar.vscrawler.core.seed.Seed in project vscrawler by virjar.
the class ModelExtractor method process.
/**
 * Extracts one model per node selected by {@code rootSelector} from the downloaded content.
 *
 * <p>For every selected node the current model receives the raw text, the node, the seed and
 * the base URL. If the model then reports that the grab did not succeed, the seed is marked for
 * retry (at most once per call) and the node is skipped. Otherwise the model's fields are
 * injected, the seeds discovered during injection plus {@code model.newSeeds()} are added to
 * {@code crawlResult}, and the model itself is added as a result when {@code save} is true.
 * A fresh model instance is created for each further node after a successful extraction.
 *
 * @param seed               the seed being processed
 * @param content            the downloaded raw content
 * @param crawlResult        collector for new seeds and extracted models
 * @param model              the model instance to fill for the first selected node
 * @param abstractSelectable pre-built selectable to extract from; when {@code null} one is
 *                           created from the base URL and {@code content}
 * @param save               whether successfully extracted models are added to {@code crawlResult}
 */
@SuppressWarnings("unchecked")
public void process(Seed seed, String content, GrabResult crawlResult, AbstractAutoProcessModel model, AbstractSelectable abstractSelectable, boolean save) {
    String url = model.getBaseUrl();
    if (StringUtils.isBlank(url)) {
        try {
            // Only adopt the seed data as base URL when it parses as a URI.
            new URI(seed.getData());
            url = seed.getData();
            model.setBaseUrl(url);
        } catch (Exception ignored) {
            // Seed data is not usable as a URI; carry on without a base URL.
        }
    }

    AbstractSelectable baseSelectable = abstractSelectable;
    if (baseSelectable == null) {
        baseSelectable = AbstractSelectable.createModel(url, content);
    }

    Iterator<AbstractSelectable> iterator = rootSelector.select(baseSelectable).toMultiSelectable().iterator();
    boolean hasRetry = false;

    // A single page may yield several models. The loop is driven by hasNext() so that a
    // failed grab on the last (or only) node ends the loop instead of calling next() on an
    // exhausted iterator.
    while (iterator.hasNext()) {
        AbstractSelectable next = iterator.next();
        model.setRawText(content);
        model.setOriginSelectable(next);
        model.setSeed(seed);
        model.setBaseUrl(url);

        if (!model.hasGrabSuccess()) {
            if (!hasRetry) {
                seed.retry();
                hasRetry = true;
            }
            // Skip this node; the same model instance is reused for the next one.
            continue;
        }

        model.beforeAutoFetch();
        List<Seed> newSeeds = fetchTaskProcessor.injectField(model, next, crawlResult);
        model.afterAutoFetch();
        newSeeds.addAll(model.newSeeds());
        crawlResult.addSeeds(newSeeds);
        if (save) {
            crawlResult.addResult(model);
        }

        if (iterator.hasNext()) {
            model = ObjectFactory.newInstance(aClass);
        }
    }
}
Aggregations