Search in sources :

Example 6 with Seed

use of com.virjar.vscrawler.core.seed.Seed in project vscrawler by virjar.

the class GrabController method grab.

/**
 * Handles a synchronous grab request posted to {@code /grab}.
 *
 * <p>The crawler is looked up by the name carried in the request; the whole request
 * bean is serialized to JSON and used as the seed payload for a synchronous grab.
 *
 * @param grabRequestBean the request body; supplies the crawler name and the seed content
 * @return a failed response when no crawler is registered under the requested name,
 *         a timeout failure when the grab produced no results and the seed's retry
 *         counter is above zero, a failed response carrying the exception message
 *         when anything throws, and otherwise a success response wrapping all results
 */
@RequestMapping("/grab")
@ResponseBody
public WebJsonResponse<?> grab(@RequestBody GrabRequest grabRequestBean) {
    try {
        String crawlerName = grabRequestBean.getCrawlerName();
        VSCrawler crawler = crawlerManager.get(crawlerName);
        if (crawler == null) {
            return ReturnUtil.failed("no crawler defined :" + crawlerName);
        }

        // The complete request is passed on as the seed's data, in JSON form.
        Seed seed = new Seed(JSONObject.toJSONString(grabRequestBean));
        List<Object> results = crawler.grabSync(seed).allObjectResult();

        // An empty result combined with a positive retry count is reported as a timeout.
        if (results.isEmpty() && seed.getRetry() > 0) {
            return ReturnUtil.failed("timeOut", ReturnUtil.status_timeout);
        }
        return ReturnUtil.success(results);
    } catch (Exception e) {
        return ReturnUtil.failed(e.getMessage());
    }
}
Also used : VSCrawler(com.virjar.vscrawler.core.VSCrawler) Seed(com.virjar.vscrawler.core.seed.Seed) GrabResult(com.virjar.vscrawler.core.processor.GrabResult) JSONObject(com.alibaba.fastjson.JSONObject) RequestMapping(org.springframework.web.bind.annotation.RequestMapping) ResponseBody(org.springframework.web.bind.annotation.ResponseBody)

Example 7 with Seed

use of com.virjar.vscrawler.core.seed.Seed in project vscrawler by virjar.

the class AnnotationProcessorBuilder method judgeMatchStrategy.

/**
 * Derives the seed-matching strategy for an annotated model class.
 *
 * <p>Resolution order:
 * <ol>
 *   <li>If the class has no {@link AutoProcessor} annotation, no strategy is produced.</li>
 *   <li>If the annotation declares a non-blank {@code seedPattern}, the strategy matches
 *       seeds whose data fully matches that regular expression.</li>
 *   <li>Otherwise the first public method annotated with {@link MatchSeed} is used; it must
 *       be static and return {@code boolean} or {@code Boolean}, and is invoked with the
 *       seed as its only argument.</li>
 * </ol>
 * In both cases the strategy's priority is the annotation's {@code priority()}.
 *
 * @param aClass the model class to inspect
 * @return the matching strategy, or {@code null} when the class is not annotated with
 *         {@link AutoProcessor} or declares neither a seed pattern nor a {@link MatchSeed} method
 * @throws IllegalStateException    if {@code seedPattern} is not a valid regular expression
 * @throws IllegalArgumentException if the {@link MatchSeed} method is not static or does not
 *                                  return a boolean
 */
private AnnotationSeedProcessor.MatchStrategy judgeMatchStrategy(Class<? extends AbstractAutoProcessModel> aClass) {
    final AutoProcessor autoProcessor = aClass.getAnnotation(AutoProcessor.class);
    if (autoProcessor == null) {
        return null;
    }
    String seedPattern = autoProcessor.seedPattern();
    if (StringUtils.isNotBlank(seedPattern)) {
        try {
            final Pattern pattern = Pattern.compile(seedPattern);
            return new AnnotationSeedProcessor.MatchStrategy() {

                @Override
                public boolean matchSeed(Seed seed) {
                    return pattern.matcher(seed.getData()).matches();
                }

                @Override
                public int priority() {
                    return autoProcessor.priority();
                }
            };
        } catch (PatternSyntaxException e) {
            throw new IllegalStateException("error when register processor for class" + aClass.getName() + " regex error for seedPattern", e);
        }
    }
    Method[] methods = aClass.getMethods();
    for (final Method method : methods) {
        if (method.getAnnotation(MatchSeed.class) == null) {
            continue;
        }
        // Class.isAssignableFrom is an exact-identity test where primitives are involved, so
        // Boolean.class alone would reject a method declared to return primitive boolean.
        // Reflection boxes the primitive result, so the cast below is valid for both.
        Class<?> returnType = method.getReturnType();
        Preconditions.checkArgument(returnType == boolean.class || Boolean.class.isAssignableFrom(returnType));
        // The method is invoked with a null receiver, which only works for static methods.
        Preconditions.checkArgument(Modifier.isStatic(method.getModifiers()));
        return new AnnotationSeedProcessor.MatchStrategy() {

            @Override
            public boolean matchSeed(Seed seed) {
                try {
                    return (Boolean) method.invoke(null, seed);
                } catch (Exception e) {
                    throw new IllegalStateException("can not jude seed match method", e);
                }
            }

            @Override
            public int priority() {
                return autoProcessor.priority();
            }
        };
    }
    return null;
}
Also used : Pattern(java.util.regex.Pattern) MatchSeed(com.virjar.vscrawler.core.processor.configurableprocessor.annotiondriven.annotation.MatchSeed) Seed(com.virjar.vscrawler.core.seed.Seed) Method(java.lang.reflect.Method) AutoProcessor(com.virjar.vscrawler.core.processor.configurableprocessor.annotiondriven.annotation.AutoProcessor) PatternSyntaxException(java.util.regex.PatternSyntaxException)

Example 8 with Seed

use of com.virjar.vscrawler.core.seed.Seed in project vscrawler by virjar.

the class FetchTaskProcessor method handleNewSeed.

/**
 * Converts an extracted value into the list of new seeds it represents, tagging every
 * seed's extension map with the page it came from under the key {@code "fromUrl"}.
 *
 * <p>Accepted inputs are a {@code String}, a {@link Seed}, or a {@link Collection} of
 * either. For a collection, only the first element is inspected to decide how the
 * remaining elements are converted. An empty collection yields an empty list.
 *
 * @param transformedObject the extracted value to turn into seeds
 * @param baseUrl           the URL recorded as each seed's {@code "fromUrl"}
 * @return the seeds built from (or contained in) {@code transformedObject}
 * @throws IllegalStateException if the value, or the first element of a collection,
 *                               is of an unsupported type
 */
@SuppressWarnings("unchecked")
private List<Seed> handleNewSeed(Object transformedObject, final String baseUrl) {
    // Handle injection of new seeds.
    if (transformedObject instanceof String) {
        Seed fromText = new Seed(transformedObject.toString());
        fromText.getExt().put("fromUrl", baseUrl);
        return Lists.newArrayList(fromText);
    }
    if (transformedObject instanceof Seed) {
        Seed existing = (Seed) transformedObject;
        existing.getExt().put("fromUrl", baseUrl);
        return Lists.newArrayList(existing);
    }
    if (!(transformedObject instanceof Collection)) {
        throw new IllegalStateException("unknown type for " + transformedObject.getClass().getName() + " to transfer to new Seed");
    }

    Collection<Object> candidates = (Collection<Object>) transformedObject;
    if (candidates.size() <= 0) {
        return Collections.emptyList();
    }

    // The first element alone decides how every element is converted.
    Object first = candidates.iterator().next();
    boolean elementsAreText = first instanceof String;
    if (!elementsAreText && !(first instanceof Seed)) {
        throw new IllegalStateException("unknown type for " + first.getClass().getName() + " to transfer to new Seed");
    }

    List<Seed> seeds = Lists.newArrayList();
    for (Object element : candidates) {
        Seed seed;
        if (elementsAreText) {
            seed = new Seed(element.toString());
        } else {
            seed = (Seed) element;
        }
        seed.getExt().put("fromUrl", baseUrl);
        seeds.add(seed);
    }
    return seeds;
}
Also used : Function(com.google.common.base.Function) Seed(com.virjar.vscrawler.core.seed.Seed) Collection(java.util.Collection)

Example 9 with Seed

use of com.virjar.vscrawler.core.seed.Seed in project vscrawler by virjar.

the class AnnotationSeedProcessor method judgeDownloader.

/**
 * Chooses the downloader used for the given model class and stores it in {@code downloader}.
 *
 * <p>The first public method annotated with {@link DownLoadMethod} is wrapped in a
 * {@link Downloader} that invokes it reflectively on the model with the seed and the
 * crawler session. That method must return {@code String} and declare exactly two
 * parameters able to receive a {@link Seed} and a {@link CrawlerSession}, in that order.
 * When no such method exists, the downloader falls back to an HTTP GET of the seed's
 * data through the session's HTTP client.
 *
 * @param aClass the model class to inspect
 * @throws IllegalArgumentException if the annotated method's signature does not meet the
 *                                  requirements above
 */
private void judgeDownloader(Class<? extends AbstractAutoProcessModel> aClass) {
    for (final Method method : aClass.getMethods()) {
        if (method.getAnnotation(DownLoadMethod.class) == null) {
            continue;
        }
        Class<?>[] parameterTypes = method.getParameterTypes();
        Preconditions.checkArgument(String.class.isAssignableFrom(method.getReturnType()),
                "download method %s must return String", method);
        // The reflective call below always passes exactly (seed, crawlerSession). A method
        // with more parameters would pass a ">= 2" check and then fail on every download,
        // so reject it here instead.
        Preconditions.checkArgument(parameterTypes.length == 2,
                "download method %s must declare exactly two parameters", method);
        Preconditions.checkArgument(parameterTypes[0].isAssignableFrom(Seed.class),
                "first parameter of download method %s must accept Seed", method);
        Preconditions.checkArgument(parameterTypes[1].isAssignableFrom(CrawlerSession.class),
                "second parameter of download method %s must accept CrawlerSession", method);
        downloader = new Downloader() {

            @Override
            public String download(Seed seed, AbstractAutoProcessModel model, CrawlerSession crawlerSession) {
                try {
                    return (String) method.invoke(model, seed, crawlerSession);
                } catch (Exception e) {
                    throw new RuntimeException("invoke download method :" + method.toGenericString() + " failed", e);
                }
            }
        };
        return;
    }
    // No annotated method: default to fetching the seed's data as a URL.
    downloader = new Downloader() {

        @Override
        public String download(Seed seed, AbstractAutoProcessModel model, CrawlerSession crawlerSession) {
            return crawlerSession.getCrawlerHttpClient().get(seed.getData());
        }
    };
}
Also used : Seed(com.virjar.vscrawler.core.seed.Seed) DownLoadMethod(com.virjar.vscrawler.core.processor.configurableprocessor.annotiondriven.annotation.DownLoadMethod) Method(java.lang.reflect.Method) CrawlerSession(com.virjar.vscrawler.core.net.session.CrawlerSession)

Example 10 with Seed

use of com.virjar.vscrawler.core.seed.Seed in project vscrawler by virjar.

the class ModelExtractor method process.

/**
 * Extracts one model per node selected by {@code rootSelector} from a page and records
 * the outcome in {@code crawlResult}.
 *
 * <p>The supplied {@code model} is populated from the first selected node. After a node
 * has been extracted successfully and more nodes remain, a fresh instance is created via
 * {@code ObjectFactory.newInstance(aClass)} for the next one. A node whose model reports
 * {@code hasGrabSuccess() == false} is skipped; the seed is marked for retry at most once
 * per call.
 *
 * @param seed               the seed being processed
 * @param content            the raw page text
 * @param crawlResult        receives new seeds and, when {@code save} is set, the models
 * @param model              the model instance used for the first selected node
 * @param abstractSelectable the selectable to extract from; when {@code null} one is
 *                           built from the base URL and {@code content}
 * @param save               whether successfully extracted models are added to
 *                           {@code crawlResult}
 */
@SuppressWarnings("unchecked")
public void process(Seed seed, String content, GrabResult crawlResult, AbstractAutoProcessModel model, AbstractSelectable abstractSelectable, boolean save) {
    String url = model.getBaseUrl();
    if (StringUtils.isBlank(url)) {
        try {
            // Parsed only to validate: the seed data becomes the base URL if it is a URI.
            new URI(seed.getData());
            url = seed.getData();
            model.setBaseUrl(url);
        } catch (Exception ignored) {
            // Seed data could not be parsed as a URI; continue without a base URL.
        }
    }
    AbstractSelectable baseSelectable = abstractSelectable;
    if (baseSelectable == null) {
        baseSelectable = AbstractSelectable.createModel(url, content);
    }
    Iterator<AbstractSelectable> iterator = rootSelector.select(baseSelectable).toMultiSelectable().iterator();
    boolean hasRetry = false;
    // Supports extracting multiple models from a single page.
    // The loop is guarded by hasNext() so that skipping a failed last node ends the loop
    // instead of calling next() on an exhausted iterator.
    while (iterator.hasNext()) {
        AbstractSelectable next = iterator.next();
        model.setRawText(content);
        model.setOriginSelectable(next);
        model.setSeed(seed);
        model.setBaseUrl(url);
        if (!model.hasGrabSuccess()) {
            if (!hasRetry) {
                seed.retry();
                hasRetry = true;
            }
            // The same model instance is reused for the next node.
            continue;
        }
        model.beforeAutoFetch();
        List<Seed> newSeeds = fetchTaskProcessor.injectField(model, next, crawlResult);
        model.afterAutoFetch();
        newSeeds.addAll(model.newSeeds());
        crawlResult.addSeeds(newSeeds);
        if (save) {
            crawlResult.addResult(model);
        }
        if (iterator.hasNext()) {
            // The current model may have been stored in crawlResult; use a new one next.
            model = ObjectFactory.newInstance(aClass);
        }
    }
}
Also used : Seed(com.virjar.vscrawler.core.seed.Seed) URI(java.net.URI) AbstractSelectable(com.virjar.vscrawler.core.selector.combine.AbstractSelectable)

Aggregations

Seed (com.virjar.vscrawler.core.seed.Seed)10 Function (com.google.common.base.Function)3 VSCrawler (com.virjar.vscrawler.core.VSCrawler)3 CrawlerSession (com.virjar.vscrawler.core.net.session.CrawlerSession)3 GrabResult (com.virjar.vscrawler.core.processor.GrabResult)3 JSONObject (com.alibaba.fastjson.JSONObject)2 SeedProcessor (com.virjar.vscrawler.core.processor.SeedProcessor)2 AbstractSelectable (com.virjar.vscrawler.core.selector.combine.AbstractSelectable)2 Method (java.lang.reflect.Method)2 AutoProcessor (com.virjar.vscrawler.core.processor.configurableprocessor.annotiondriven.annotation.AutoProcessor)1 DownLoadMethod (com.virjar.vscrawler.core.processor.configurableprocessor.annotiondriven.annotation.DownLoadMethod)1 MatchSeed (com.virjar.vscrawler.core.processor.configurableprocessor.annotiondriven.annotation.MatchSeed)1 SegmentResolver (com.virjar.vscrawler.core.seed.SegmentResolver)1 File (java.io.File)1 IOException (java.io.IOException)1 Field (java.lang.reflect.Field)1 URI (java.net.URI)1 Collection (java.util.Collection)1 Pattern (java.util.regex.Pattern)1 PatternSyntaxException (java.util.regex.PatternSyntaxException)1