Search in sources:

Example 1 with ResultItems

use of us.codecraft.webmagic.ResultItems in project webmagic by code4craft.

the class ConfigurablePageProcessorTest method test.

/**
 * Runs a {@link ConfigurablePageProcessor} configured with two XPath rules against
 * the webmagic GitHub URL (served through {@link MockGithubDownloader}) and checks
 * that both the "title" and "star" fields are extracted with the expected values.
 */
@Test
public void test() throws Exception {
    // Rule 1: the whole <title> element.
    ExtractRule titleRule = new ExtractRule();
    titleRule.setExpressionType(ExpressionType.XPath);
    titleRule.setExpressionValue("//title");
    titleRule.setFieldName("title");

    // Rule 2: text of the social-count link in the first item of the page-head actions list.
    ExtractRule starRule = new ExtractRule();
    starRule.setExpressionType(ExpressionType.XPath);
    starRule.setExpressionValue("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()");
    starRule.setFieldName("star");

    List<ExtractRule> rules = new ArrayList<ExtractRule>();
    rules.add(titleRule);
    rules.add(starRule);

    ConfigurablePageProcessor processor = new ConfigurablePageProcessor(Site.me(), rules);
    Spider spider = Spider.create(processor).setDownloader(new MockGithubDownloader());
    ResultItems extracted = spider.get("https://github.com/code4craft/webmagic");

    assertThat(extracted.getAll()).containsEntry("title", "<title>code4craft/webmagic · GitHub</title>");
    assertThat(extracted.getAll()).containsEntry("star", " 86 ");
}
Also used : ResultItems(us.codecraft.webmagic.ResultItems) ArrayList(java.util.ArrayList) MockGithubDownloader(us.codecraft.webmagic.downloader.MockGithubDownloader) Test(org.junit.Test)

Example 2 with ResultItems

use of us.codecraft.webmagic.ResultItems in project webmagic by code4craft.

the class ScriptConsole method startSpider.

/**
 * Builds a script-driven spider from the given command-line parameters, seeds it
 * with the requested URLs and runs it.
 *
 * <p>If no URL was supplied, an error and a usage line are printed and the JVM
 * exits with status -1.
 *
 * @param params parsed command-line options (language, script file, thread count,
 *               sleep time and start URLs)
 */
private static void startSpider(Params params) {
    ScriptProcessor scriptProcessor = ScriptProcessorBuilder.custom()
            .language(params.getLanguage())
            .scriptFromFile(params.getScriptFileName())
            .thread(params.getThread())
            .build();

    // Site-level settings: politeness delay, retry count and the status codes to accept.
    scriptProcessor.getSite().setSleepTime(params.getSleepTime());
    scriptProcessor.getSite().setRetryTimes(3);
    scriptProcessor.getSite().setAcceptStatCode(WMCollections.<Integer>newHashSet(200, 404, 403, 500, 502));

    Spider crawler = Spider.create(scriptProcessor).thread(params.getThread());

    // Replace whatever pipelines are registered with one that does nothing with the results.
    Pipeline discardingPipeline = new Pipeline() {

        @Override
        public void process(ResultItems resultItems, Task task) {
        }
    };
    crawler.clearPipeline().addPipeline(discardingPipeline);

    boolean noUrls = params.getUrls() == null || params.getUrls().size() == 0;
    if (noUrls) {
        System.err.println("Need at least one argument");
        System.out.println("Usage: java -jar webmagic.jar [-l language] -f script file [-t threadnum] [-s sleep time] url1 [url2 url3]");
        System.exit(-1);
    }

    for (String startUrl : params.getUrls()) {
        crawler.addUrl(startUrl);
    }
    crawler.run();
}
Also used : Task(us.codecraft.webmagic.Task) ResultItems(us.codecraft.webmagic.ResultItems) Spider(us.codecraft.webmagic.Spider) Pipeline(us.codecraft.webmagic.pipeline.Pipeline)

Example 3 with ResultItems

use of us.codecraft.webmagic.ResultItems in project vscrawler by virjar.

the class WebMagicProcessorDelegator method parse.

/**
 * Feeds a downloaded response through the wrapped WebMagic page processor and
 * copies its output into the crawl result.
 *
 * <p>A {@code null} response triggers a retry of the seed. Otherwise every target
 * request the processor discovered is added as a new seed, and, unless the
 * processor marked the result items as skipped, the extracted fields are added
 * to the crawl result as a JSON string.
 *
 * @param seed        the seed that was fetched
 * @param result      raw response text, or {@code null} when nothing was fetched
 * @param crawlResult collector for newly discovered seeds and extracted results
 */
@Override
protected void parse(Seed seed, String result, GrabResult crawlResult) {
    if (result == null) {
        seed.retry();
        return;
    }

    // Wrap the raw response in a page object the WebMagic processor understands.
    SipSoupPage page = new SipSoupPage();
    page.setRawText(result);
    page.setUrl(new PlainText(seed.getData()));
    page.setRequest(CovertUtil.convertSeed(seed));
    page.setStatusCode(200);
    pageProcessor.process(page);

    // Newly discovered URLs become seeds.
    for (Request discovered : page.getTargetRequests()) {
        crawlResult.addSeed(CovertUtil.covertRequest(discovered));
    }

    ResultItems extracted = page.getResultItems();
    if (extracted.isSkip()) {
        return;
    }
    crawlResult.addResult(JSONObject.toJSONString(extracted.getAll()));
}
Also used : ResultItems(us.codecraft.webmagic.ResultItems) PlainText(us.codecraft.webmagic.selector.PlainText) Request(us.codecraft.webmagic.Request)

Example 4 with ResultItems

use of us.codecraft.webmagic.ResultItems in project yyl_example by Relucent.

the class SpiderTest method main.

/**
 * Crawls a fixed set of Baidu Baike search pages with a two-thread spider and
 * prints the fields extracted from each page to standard output.
 *
 * <p>The spider is closed in a {@code finally} block so that it is shut down even
 * when crawling or printing throws.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setUseGzip(true);
    Spider spider = Spider.create(new PageProcessor() {

        @Override
        public void process(Page page) {
            // "name": text of the entry's title heading; "description": all text of the summary block.
            page.putField("name", page.getHtml().css("dl.lemmaWgt-lemmaTitle h1", "text").toString());
            page.putField("description", page.getHtml().xpath("//div[@class='lemma-summary']/allText()"));
        }

        @Override
        public Site getSite() {
            return site;
        }
    }).thread(2);

    String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
    String[] keywords = { "石墨烯", "气凝胶", "液态金属", "生物塑料", "形状记忆合金", "纳米纤维" };
    List<String> urls = new ArrayList<String>(keywords.length);
    for (String keyword : keywords) {
        urls.add(String.format(urlTemplate, keyword));
    }

    try {
        List<ResultItems> results = spider.<ResultItems>getAll(urls);
        for (ResultItems result : results) {
            System.out.println(result.getAll());
        }
    } finally {
        // Previously close() was skipped whenever getAll(...) or printing threw.
        spider.close();
    }
}
Also used : Site(us.codecraft.webmagic.Site) PageProcessor(us.codecraft.webmagic.processor.PageProcessor) BaiduBaikePageProcessor(us.codecraft.webmagic.processor.example.BaiduBaikePageProcessor) ResultItems(us.codecraft.webmagic.ResultItems) Spider(us.codecraft.webmagic.Spider) ArrayList(java.util.ArrayList) Page(us.codecraft.webmagic.Page)

Example 5 with ResultItems

use of us.codecraft.webmagic.ResultItems in project webmagic by code4craft.

the class FilePipelineTest method before.

/**
 * Prepares the shared fixtures for the test class: a {@link ResultItems} holding a
 * single "content" field bound to a Baidu request, and a minimal {@link Task}
 * whose UUID is freshly generated on every call and whose site is {@code null}.
 */
@BeforeClass
public static void before() {
    ResultItems items = new ResultItems();
    items.put("content", "webmagic 爬虫工具");
    items.setRequest(new Request("http://www.baidu.com"));
    resultItems = items;

    task = new Task() {

        @Override
        public Site getSite() {
            // No site configuration is provided by this stub.
            return null;
        }

        @Override
        public String getUUID() {
            return UUID.randomUUID().toString();
        }
    };
}
Also used : Site(us.codecraft.webmagic.Site) Task(us.codecraft.webmagic.Task) ResultItems(us.codecraft.webmagic.ResultItems) Request(us.codecraft.webmagic.Request) BeforeClass(org.junit.BeforeClass)

Aggregations

ResultItems (us.codecraft.webmagic.ResultItems): 8, ArrayList (java.util.ArrayList): 3, Spider (us.codecraft.webmagic.Spider): 3, Request (us.codecraft.webmagic.Request): 2, Site (us.codecraft.webmagic.Site): 2, Task (us.codecraft.webmagic.Task): 2, JSONObject (com.alibaba.fastjson.JSONObject): 1, BeforeClass (org.junit.BeforeClass): 1, Test (org.junit.Test): 1, Page (us.codecraft.webmagic.Page): 1, MockGithubDownloader (us.codecraft.webmagic.downloader.MockGithubDownloader): 1, PhantomJSDownloader (us.codecraft.webmagic.downloader.PhantomJSDownloader): 1, Pipeline (us.codecraft.webmagic.pipeline.Pipeline): 1, ResultItemsCollectorPipeline (us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline): 1, PageProcessor (us.codecraft.webmagic.processor.PageProcessor): 1, BaiduBaikePageProcessor (us.codecraft.webmagic.processor.example.BaiduBaikePageProcessor): 1, PlainText (us.codecraft.webmagic.selector.PlainText): 1