Search in sources :

Example 1 with Spider

use of us.codecraft.webmagic.Spider in project webmagic by code4craft.

the class ScriptConsole method startSpider.

/**
 * Builds a script-driven spider from the parsed command-line parameters, seeds it with the
 * requested URLs and runs it.
 *
 * <p>If no URL was supplied, an error plus a usage line is printed and the JVM exits with
 * status -1. The check is performed before anything else so that a missing URL is reported
 * as a usage error instead of being masked by a failure while loading the script.
 *
 * @param params parsed command-line options (language, script file, thread count, sleep time, URLs)
 */
private static void startSpider(Params params) {
    // Fail fast: nothing below is useful without at least one seed URL.
    if (params.getUrls() == null || params.getUrls().isEmpty()) {
        System.err.println("Need at least one argument");
        System.out.println("Usage: java -jar webmagic.jar [-l language] -f script file [-t threadnum] [-s sleep time] url1 [url2 url3]");
        System.exit(-1);
    }

    ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom()
            .language(params.getLanguage())
            .scriptFromFile(params.getScriptFileName())
            .thread(params.getThread())
            .build();

    // Crawl settings: caller-defined sleep time, three retries, and a set of status codes
    // (including several error codes) that are accepted rather than rejected.
    pageProcessor.getSite().setSleepTime(params.getSleepTime());
    pageProcessor.getSite().setRetryTimes(3);
    pageProcessor.getSite().setAcceptStatCode(WMCollections.<Integer>newHashSet(200, 404, 403, 500, 502));

    Spider spider = Spider.create(pageProcessor).thread(params.getThread());

    // Replace whatever pipelines are configured with a single no-op pipeline: the extracted
    // results are not consumed here.
    spider.clearPipeline().addPipeline(new Pipeline() {

        @Override
        public void process(ResultItems resultItems, Task task) {
            // intentionally empty
        }
    });

    for (String url : params.getUrls()) {
        spider.addUrl(url);
    }
    spider.run();
}
Also used : Task(us.codecraft.webmagic.Task) ResultItems(us.codecraft.webmagic.ResultItems) Spider(us.codecraft.webmagic.Spider) Pipeline(us.codecraft.webmagic.pipeline.Pipeline)

Example 2 with Spider

use of us.codecraft.webmagic.Spider in project webmagic by code4craft.

the class Kr36NewsModel method main.

/**
 * Benchmark entry point: crawls the 36kr start page with an object-oriented spider whose
 * pipeline discards every extracted model, then registers the spider with the shared monitor.
 *
 * @param args unused
 * @throws IOException declared by the original signature
 * @throws JMException declared by the original signature; presumably raised by monitor registration
 */
public static void main(String[] args) throws IOException, JMException {
    // Just for benchmark: no sleep between requests.
    Site benchmarkSite = Site.me().addStartUrl("http://www.36kr.com/").setSleepTime(0);

    // Extracted models are dropped on purpose; nothing is persisted.
    PageModelPipeline discardingPipeline = new PageModelPipeline() {

        @Override
        public void process(Object o, Task task) {
            // intentionally empty
        }
    };

    Spider spider = OOSpider.create(benchmarkSite, discardingPipeline, Kr36NewsModel.class).thread(20);
    spider.start();
    SpiderMonitor.instance().register(spider);
}
Also used : PageModelPipeline(us.codecraft.webmagic.pipeline.PageModelPipeline) Task(us.codecraft.webmagic.Task) SpiderMonitor(us.codecraft.webmagic.monitor.SpiderMonitor) Spider(us.codecraft.webmagic.Spider) OOSpider(us.codecraft.webmagic.model.OOSpider)

Example 3 with Spider

use of us.codecraft.webmagic.Spider in project webmagic by code4craft.

the class TempProcessor method testSeedUrlWithPort.

/**
 * Runs a spider whose seed URL carries an explicit port, with the spider registered on the
 * shared {@code SpiderMonitor} before it starts.
 *
 * @throws JMException declared by the signature; presumably raised by monitor registration
 */
@Test
public void testSeedUrlWithPort() throws JMException {
    Spider portSpider = Spider.create(new TempProcessor());
    portSpider.addUrl("http://www.hndpf.org:8889/");

    SpiderMonitor monitor = SpiderMonitor.instance();
    monitor.register(portSpider);

    portSpider.run();
}
Also used : Spider(us.codecraft.webmagic.Spider) Test(org.junit.Test)

Example 4 with Spider

use of us.codecraft.webmagic.Spider in project webmagic by code4craft.

the class SpiderMonitorTest method testInherit.

/**
 * Checks that {@code SpiderMonitor} can be subclassed to supply a custom status bean and
 * that several spiders can be registered with such a monitor in one call.
 *
 * @throws Exception if creating or registering the spiders fails
 */
@Test
public void testInherit() throws Exception {
    // Monitor subclass that swaps in CustomSpiderStatus as the status bean.
    SpiderMonitor customMonitor = new SpiderMonitor() {

        @Override
        protected SpiderStatusMXBean getSpiderStatusMBean(Spider spider, MonitorSpiderListener monitorSpiderListener) {
            return new CustomSpiderStatus(spider, monitorSpiderListener);
        }
    };

    Spider zhihu = Spider.create(new ZhihuPageProcessor())
            .addUrl("http://my.oschina.net/flashsword/blog")
            .thread(2);
    Spider github = Spider.create(new GithubRepoPageProcessor())
            .addUrl("https://github.com/code4craft");

    customMonitor.register(zhihu, github);
}
Also used : GithubRepoPageProcessor(us.codecraft.webmagic.processor.example.GithubRepoPageProcessor) Spider(us.codecraft.webmagic.Spider) ZhihuPageProcessor(us.codecraft.webmagic.processor.example.ZhihuPageProcessor) Test(org.junit.Test)

Example 5 with Spider

use of us.codecraft.webmagic.Spider in project yyl_example by Relucent.

the class SpiderTest method main.

/**
 * Fetches a fixed set of Baidu Baike search pages synchronously and prints the fields
 * extracted from each one.
 *
 * <p>The spider is closed in a {@code finally} block so that it is released even when
 * fetching or printing fails.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setUseGzip(true);
    Spider spider = Spider.create(new PageProcessor() {

        @Override
        public void process(Page page) {
            // Entry title and summary text, located by CSS selector and XPath respectively.
            page.putField("name", page.getHtml().css("dl.lemmaWgt-lemmaTitle h1", "text").toString());
            page.putField("description", page.getHtml().xpath("//div[@class='lemma-summary']/allText()"));
        }

        @Override
        public Site getSite() {
            return site;
        }
    }).thread(2);

    try {
        String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
        String[] keywords = { "石墨烯", "气凝胶", "液态金属", "生物塑料", "形状记忆合金", "纳米纤维" };

        // One search URL per keyword, in the order listed above.
        List<String> urls = new ArrayList<String>(keywords.length);
        for (String keyword : keywords) {
            urls.add(String.format(urlTemplate, keyword));
        }

        List<ResultItems> results = spider.<ResultItems>getAll(urls);
        for (ResultItems result : results) {
            System.out.println(result.getAll());
        }
    } finally {
        // Always release the spider, even if fetching or printing threw.
        spider.close();
    }
}
Also used : Site(us.codecraft.webmagic.Site) PageProcessor(us.codecraft.webmagic.processor.PageProcessor) BaiduBaikePageProcessor(us.codecraft.webmagic.processor.example.BaiduBaikePageProcessor) ResultItems(us.codecraft.webmagic.ResultItems) Spider(us.codecraft.webmagic.Spider) ArrayList(java.util.ArrayList) Page(us.codecraft.webmagic.Page)

Aggregations

Spider (us.codecraft.webmagic.Spider)10 ArrayList (java.util.ArrayList)3 ResultItems (us.codecraft.webmagic.ResultItems)3 Test (org.junit.Test)2 Task (us.codecraft.webmagic.Task)2 GithubRepoPageProcessor (us.codecraft.webmagic.processor.example.GithubRepoPageProcessor)2 ZhihuPageProcessor (us.codecraft.webmagic.processor.example.ZhihuPageProcessor)2 Page (us.codecraft.webmagic.Page)1 Site (us.codecraft.webmagic.Site)1 SpiderListener (us.codecraft.webmagic.SpiderListener)1 OOSpider (us.codecraft.webmagic.model.OOSpider)1 SpiderMonitor (us.codecraft.webmagic.monitor.SpiderMonitor)1 PageModelPipeline (us.codecraft.webmagic.pipeline.PageModelPipeline)1 Pipeline (us.codecraft.webmagic.pipeline.Pipeline)1 PageProcessor (us.codecraft.webmagic.processor.PageProcessor)1 BaiduBaikePageProcessor (us.codecraft.webmagic.processor.example.BaiduBaikePageProcessor)1 BloomFilterDuplicateRemover (us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover)1 PriorityScheduler (us.codecraft.webmagic.scheduler.PriorityScheduler)1 QueueScheduler (us.codecraft.webmagic.scheduler.QueueScheduler)1