Use of us.codecraft.webmagic.ResultItems in project webmagic by code4craft.
Class ConfigurablePageProcessorTest, method test.
/**
 * Builds a {@code ConfigurablePageProcessor} from two XPath extract rules, fetches the
 * webmagic GitHub URL through {@code MockGithubDownloader}, and checks that the resulting
 * items contain the expected "title" and "star" entries.
 *
 * @throws Exception propagated from the spider run
 */
@Test
public void test() throws Exception {
    // Rule for the page's <title> element.
    ExtractRule titleRule = new ExtractRule();
    titleRule.setExpressionType(ExpressionType.XPath);
    titleRule.setExpressionValue("//title");
    titleRule.setFieldName("title");

    // Rule for the text of the first social-count link in the page-head action list.
    ExtractRule starRule = new ExtractRule();
    starRule.setExpressionType(ExpressionType.XPath);
    starRule.setExpressionValue("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()");
    starRule.setFieldName("star");

    List<ExtractRule> rules = new ArrayList<ExtractRule>();
    rules.add(titleRule);
    rules.add(starRule);

    ConfigurablePageProcessor processor = new ConfigurablePageProcessor(Site.me(), rules);
    ResultItems resultItems = Spider.create(processor)
            .setDownloader(new MockGithubDownloader())
            .get("https://github.com/code4craft/webmagic");

    assertThat(resultItems.getAll()).containsEntry("title", "<title>code4craft/webmagic · GitHub</title>");
    assertThat(resultItems.getAll()).containsEntry("star", " 86 ");
}
Use of us.codecraft.webmagic.ResultItems in project webmagic by code4craft.
Class ScriptConsole, method startSpider.
/**
 * Builds a script-backed page processor from {@code params}, wires it into a spider whose
 * only pipeline discards results, and runs the spider over the URLs in {@code params}.
 *
 * <p>If {@code params} carries no URLs, an error and a usage line are printed and the JVM
 * exits with status -1.
 *
 * @param params parsed command-line parameters (language, script file, thread count,
 *               sleep time, URLs)
 */
private static void startSpider(Params params) {
    ScriptProcessor scriptProcessor = ScriptProcessorBuilder.custom()
            .language(params.getLanguage())
            .scriptFromFile(params.getScriptFileName())
            .thread(params.getThread())
            .build();
    scriptProcessor.getSite().setSleepTime(params.getSleepTime());
    scriptProcessor.getSite().setRetryTimes(3);
    scriptProcessor.getSite().setAcceptStatCode(WMCollections.<Integer>newHashSet(200, 404, 403, 500, 502));

    Spider crawler = Spider.create(scriptProcessor).thread(params.getThread());

    // Replace whatever pipelines are configured with one that does nothing with the results.
    Pipeline discardingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
        }
    };
    crawler.clearPipeline();
    crawler.addPipeline(discardingPipeline);

    boolean noUrlsGiven = params.getUrls() == null || params.getUrls().isEmpty();
    if (noUrlsGiven) {
        System.err.println("Need at least one argument");
        System.out.println("Usage: java -jar webmagic.jar [-l language] -f script file [-t threadnum] [-s sleep time] url1 [url2 url3]");
        System.exit(-1);
    }

    for (String startUrl : params.getUrls()) {
        crawler.addUrl(startUrl);
    }
    crawler.run();
}
Use of us.codecraft.webmagic.ResultItems in project vscrawler by virjar.
Class WebMagicProcessorDelegator, method parse.
/**
 * Wraps the downloaded text in a {@code SipSoupPage}, runs the delegated page processor on
 * it, and copies the outcome into {@code crawlResult}: every target request becomes a new
 * seed, and the extracted fields are added as one JSON string unless the result items are
 * marked as skipped.
 *
 * @param seed        the seed being processed; retried when {@code result} is null
 * @param result      the downloaded text, or null when nothing was obtained
 * @param crawlResult collector for new seeds and extracted results
 */
@Override
protected void parse(Seed seed, String result, GrabResult crawlResult) {
    // Nothing to parse: ask the seed to retry and stop here.
    if (result == null) {
        seed.retry();
        return;
    }

    SipSoupPage page = new SipSoupPage();
    page.setRawText(result);
    page.setUrl(new PlainText(seed.getData()));
    page.setRequest(CovertUtil.convertSeed(seed));
    // The status code is fixed at 200 here.
    page.setStatusCode(200);

    pageProcessor.process(page);

    // Hand every request discovered by the processor back to the crawler as a new seed.
    for (Request discovered : page.getTargetRequests()) {
        crawlResult.addSeed(CovertUtil.covertRequest(discovered));
    }

    ResultItems extracted = page.getResultItems();
    if (extracted.isSkip()) {
        return;
    }
    crawlResult.addResult(JSONObject.toJSONString(extracted.getAll()));
}
Use of us.codecraft.webmagic.ResultItems in project yyl_example by Relucent.
Class SpiderTest, method main.
/**
 * Crawls a fixed list of Baidu Baike search URLs with a two-thread spider, extracting a
 * "name" and a "description" field from each page, and prints the extracted fields of
 * every result to standard output.
 *
 * <p>The spider is closed in a {@code finally} block so that it is released even when the
 * crawl or the printing throws.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setUseGzip(true);
    Spider spider = Spider.create(new PageProcessor() {
        @Override
        public void process(Page page) {
            page.putField("name", page.getHtml().css("dl.lemmaWgt-lemmaTitle h1", "text").toString());
            page.putField("description", page.getHtml().xpath("//div[@class='lemma-summary']/allText()"));
        }

        @Override
        public Site getSite() {
            return site;
        }
    }).thread(2);
    try {
        String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
        List<String> list = new ArrayList<String>();
        list.add(String.format(urlTemplate, "石墨烯"));
        list.add(String.format(urlTemplate, "气凝胶"));
        list.add(String.format(urlTemplate, "液态金属"));
        list.add(String.format(urlTemplate, "生物塑料"));
        list.add(String.format(urlTemplate, "形状记忆合金"));
        list.add(String.format(urlTemplate, "纳米纤维"));
        List<ResultItems> allResults = spider.<ResultItems>getAll(list);
        for (ResultItems items : allResults) {
            System.out.println(items.getAll());
        }
    } finally {
        // Previously close() was skipped whenever getAll() or the printing threw.
        spider.close();
    }
}
Use of us.codecraft.webmagic.ResultItems in project webmagic by code4craft.
Class FilePipelineTest, method before.
/**
 * Initialises the shared {@code resultItems} and {@code task} fixtures once for the test
 * class: a result with a single "content" field tied to a Baidu request, and a task with
 * no site whose UUID is freshly generated on every call.
 */
@BeforeClass
public static void before() {
    resultItems = new ResultItems();
    resultItems.put("content", "webmagic 爬虫工具");
    resultItems.setRequest(new Request("http://www.baidu.com"));

    task = new Task() {
        @Override
        public Site getSite() {
            // This fixture task has no site configuration.
            return null;
        }

        @Override
        public String getUUID() {
            // A new random identifier is produced on each invocation.
            UUID generated = UUID.randomUUID();
            return generated.toString();
        }
    };
}
Aggregations