use of com.geccocrawler.gecco.request.HttpGetRequest in project dq-easy-cloud by dq-open-cloud.
the class EcReptileUtils method intoScheduler.
/**
 * Puts the URLs that should be crawled into the engine's scheduler.
 *
 * <p>For every match URL of {@code dynamicBeanDTO}, each key/value pair supplied by
 * {@code dataDTO} is applied through {@code UrlMatcher.replaceParams} (presumably filling
 * the {@code {key}} placeholders of the URL template — confirm against Gecco), and the
 * resulting URL is enqueued as an {@link HttpGetRequest}. When {@code dataDTO} carries no
 * key/value pairs, the match URL is enqueued unchanged.
 *
 * @param geccoEngine    crawler engine whose scheduler receives the requests
 * @param dynamicBeanDTO crawler dynamic-bean DTO providing the list of match URLs
 * @param dataDTO        crawler data DTO passed in by the business system; provides the
 *                       URL key/value pairs
 * @throws EcBaseBusinessException if {@code EcCollectionsUtils.isEmpty} reports the match
 *                                 URL list as empty
 * @author daiqi
 * Created: 2018-06-08 15:07:08
 */
public static void intoScheduler(GeccoEngine geccoEngine, EcReptileDynamicBeanDTO dynamicBeanDTO, EcReptileDataDTO dataDTO) {
    List<String> urlTemplates = dynamicBeanDTO.getMatchUrlList();
    if (EcCollectionsUtils.isEmpty(urlTemplates)) {
        throw new EcBaseBusinessException("设置异常", "待设置异常");
    }
    for (String urlTemplate : urlTemplates) {
        String resolvedUrl = urlTemplate;
        // Only substitute when key/value pairs were supplied; otherwise the template is used as-is.
        if (!EcCollectionsUtils.isEmpty(dataDTO.getUrlKeyValueDTOs())) {
            for (EcReptileKeyValueDTO pair : dataDTO.getUrlKeyValueDTOs()) {
                resolvedUrl = UrlMatcher.replaceParams(resolvedUrl, pair.getKey(), pair.getValue());
            }
        }
        geccoEngine.getScheduler().into(new HttpGetRequest(resolvedUrl));
    }
}
use of com.geccocrawler.gecco.request.HttpGetRequest in project dq-easy-cloud by dq-open-cloud.
the class DynamicGeccoTest method testPageAndDetail.
/**
 * Registers three dynamic Gecco beans (detail page, list item, paged list) and then runs a
 * one-off crawl that starts from page 1 of the paged list URL.
 */
@Test
public void testPageAndDetail() {
    // Example detail URL:
    // http://tl.cyg.changyou.com/goods/char_detail?serial_num=201806121144334550
    String detailUrlTemplate = "http://tl.cyg.changyou.com/goods/char_detail?serial_num={serialNum}";
    String pageUrlTemplate = "http://tl.cyg.changyou.com/goods/public?world_id=0&order_by=remaintime-desc&have_chosen=&page_num={pageNum}";

    // Detail page bean (original note: "corresponds to the JDPrice class").
    // NOTE(review): "menpai" is declared as a float field but is filled from element text — confirm the type.
    DynamicGecco.html()
            .gecco(detailUrlTemplate, "redisJsonPipeline")
            .stringField("serialNum")
                .requestParameter()
                .build()
            .stringField("service")
                .csspath("div.goods-info ul.info-list li p.server-info.J-message")
                .text()
                .build()
            .floatField("menpai")
                .csspath("div#goods-detail.goods-detail div.tab-cont-item.role div.left.w323 div.role-show span.fn-other-menpai")
                .text()
                .build()
            .register();

    // Bean describing a single entry of the goods list (itemInfo).
    Class<?> listItemBean = DynamicGecco.html()
            .gecco("*")
            .stringField("url")
                .csspath("dt.title a")
                .href(false)
                .build()
            .stringField("roleName")
                .csspath("dt.title a span.name")
                .text()
                .build()
            .stringField("service")
                .csspath("dd.server-and-time span.server-info")
                .text()
                .build()
            .stringField("price")
                .csspath("div.item-opr p.price")
                .text()
                .build()
            .register();

    // Bean for the paged list; its "list" field is a list of the item bean registered above.
    DynamicGecco.html()
            .gecco(pageUrlTemplate, "jdbcJsonPipeline")
            .stringField("pageNum")
                .requestParameter()
                .build()
            .requestField("request")
                .request()
                .build()
            .listField("list", listItemBean)
                .csspath("div.jGoodsList ul#J_good_list.pg-goods-list li.role-item")
                .build()
            .register();

    // Start from the first list page. Alternative start point (a single detail page):
    // new HttpGetRequest("http://tl.cyg.changyou.com/goods/char_detail?serial_num=201806121144334550")
    HttpGetRequest firstPage = new HttpGetRequest("http://tl.cyg.changyou.com/goods/public?world_id=0&order_by=remaintime-desc&have_chosen=&page_num=1");

    GeccoEngine.create()
            .classpath("com.easy.cloud.core.reptile.common.pipeline")
            .start(firstPage)
            .interval(2000)
            .loop(false)
            .run();
}
use of com.geccocrawler.gecco.request.HttpGetRequest in project dq-easy-cloud by dq-open-cloud.
the class EcReptileController method addReptileUrl.
/**
 * Builds the detail-page URL for the given serial number and puts it into the crawler
 * engine's scheduler as an {@link HttpGetRequest}.
 *
 * <p>The URL is derived from {@code EcReptileConstant.MATCH_URL_DETAIL} by substituting the
 * {@code {code}} placeholder with {@code serialNum}.
 *
 * @param serialNum value of the {@code serialNum} request parameter, inserted into the URL
 * @return the success result produced by {@code EcBaseServiceResult.newInstanceOfSuccess()}
 */
@RequestMapping(value = "addReptileUrl")
public EcBaseServiceResult addReptileUrl(@RequestParam(name = "serialNum") String serialNum) {
    EcReptileKeyValueDTO reptileKeyValueDTO = new EcReptileKeyValueDTO("code", serialNum);

    // Substitute only the exact "{code}" token. Replacing the bare key would also rewrite
    // any other occurrence of "code" inside the URL template (host, path, parameter names).
    String placeholder = "{" + reptileKeyValueDTO.getKey() + "}";
    String filledUrl = EcReptileConstant.MATCH_URL_DETAIL.replace(placeholder, reptileKeyValueDTO.getValue());

    // Kept from the previous implementation: any braces still present are dropped so that
    // the scheduled URL never contains '{' or '}'.
    String fullUrl = filledUrl.replace("{", "").replace("}", "");

    // NOTE(review): serialNum comes straight from the request and is not URL-encoded here —
    // confirm whether callers can pass characters such as '&' or '#'.
    geccoEngine.getScheduler().into(new HttpGetRequest(fullUrl));
    return EcBaseServiceResult.newInstanceOfSuccess();
}
Aggregations