use of com.virjar.vscrawler.core.selector.combine.AbstractSelectable in project vscrawler by virjar.
the class FetchTaskProcessor method injectField.
/**
 * Fills the fields described by {@code fetchTaskBeanList} on {@code model} with values
 * extracted from {@code abstractSelectable}.
 *
 * <p>Each field is handled in one of three ways:
 * <ul>
 * <li>the field type is itself an auto-process model with a registered extractor: the
 * sub-model is built by {@code fetchSubModel} and assigned directly;</li>
 * <li>the field is a collection whose help class has a registered extractor: every selected
 * element is mapped to a sub-model through {@code fetchSubModel};</li>
 * <li>anything else: the selector result is taken as-is, unpacked, and cast to the field
 * type.</li>
 * </ul>
 *
 * @param model the model instance whose fields are assigned
 * @param abstractSelectable the selectable that every field selector is applied to
 * @param crawlResult passed through to {@code fetchSubModel} for sub-model extraction
 * @return the seeds produced by {@code handleNewSeed} for fields flagged as new seeds
 * @throws RuntimeException wrapping any exception raised while processing a field; the message
 *         names the model class and the field
 */
@SuppressWarnings("unchecked")
List<Seed> injectField(final AbstractAutoProcessModel model, AbstractSelectable abstractSelectable, final GrabResult crawlResult) {
    List<Seed> discoveredSeeds = Lists.newLinkedList();
    for (final FetchTaskBean task : fetchTaskBeanList) {
        try {
            Field targetField = task.getField();
            final Class<?> fieldType = targetField.getType();

            // Nested model: supports extraction of sub-structures.
            boolean isNestedModel = AbstractAutoProcessModel.class.isAssignableFrom(fieldType)
                    && annotationProcessorBuilder.findExtractor((Class<? extends AbstractAutoProcessModel>) fieldType) != null;
            if (isNestedModel) {
                AbstractSelectable nestedSelectable = task.getModelSelector().select(abstractSelectable);
                targetField.set(model, fetchSubModel(fieldType, nestedSelectable, model, crawlResult));
                continue;
            }

            boolean isModelCollection = Collection.class.isAssignableFrom(fieldType)
                    && task.getHelpClazz() != Object.class
                    && annotationProcessorBuilder.findExtractor(task.getHelpClazz()) != null;
            Object extracted;
            if (isModelCollection) {
                // Collection of sub-models: map each selected element to its own sub-model.
                List<AbstractSelectable> elementSelectables = task.getModelSelector().select(abstractSelectable).toMultiSelectable();
                extracted = Lists.transform(elementSelectables, new Function<AbstractSelectable, AbstractAutoProcessModel>() {
                    @Override
                    public AbstractAutoProcessModel apply(AbstractSelectable input) {
                        return fetchSubModel(task.getHelpClazz(), input, model, crawlResult);
                    }
                });
            } else {
                // Not a sub-model extraction: take the result directly and end the extraction
                // chain; its type is then inspected and converted below.
                extracted = task.getModelSelector().select(abstractSelectable).createOrGetModel();
            }

            // Special case: a SipNode holds either a string or a DOM object, so it is unboxed first.
            extracted = unPackSipNode(extracted);
            // If the target type is not a collection or array but the source data is a
            // collection, the collection is unpacked.
            extracted = unpackCollection(fieldType, extracted);
            if (extracted == null) {
                continue;
            }

            // Project cast first; fall back to the fastjson cast when it yields nothing.
            Object converted = TypeCastUtils.cast(extracted, fieldType, task.getHelpClazz());
            if (converted == null) {
                converted = TypeUtils.cast(extracted, fieldType, ParserConfig.getGlobalInstance());
            }
            targetField.set(model, converted);

            if (task.isNewSeed()) {
                discoveredSeeds.addAll(handleNewSeed(converted, model.getBaseUrl()));
            }
        } catch (Exception e) {
            throw new RuntimeException("can not inject data for model:" + model.getClass().getName() + " for field: " + task.getField().getName(), e);
        }
    }
    return discoveredSeeds;
}
use of com.virjar.vscrawler.core.selector.combine.AbstractSelectable in project vscrawler by virjar.
the class ChainSelector method main.
/**
 * Demo entry point: loads the classpath resource {@code /select.html}, selects the text of the
 * {@code option} elements under {@code #nationality}, and prints each value on its own line.
 *
 * @param args unused
 * @throws IOException if the resource is missing or cannot be read
 */
public static void main(String[] args) throws IOException {
    java.io.InputStream htmlStream = HtmlJsonSelectorTest.class.getResourceAsStream("/select.html");
    // getResourceAsStream reports a missing resource by returning null; fail with a clear message.
    if (htmlStream == null) {
        throw new java.io.FileNotFoundException("classpath resource not found: /select.html");
    }
    String html;
    try {
        html = IOUtils.toString(htmlStream, Charsets.UTF_8);
    } finally {
        // IOUtils.toString leaves the stream open, so release it here.
        htmlStream.close();
    }
    AbstractSelectable selectable = AbstractSelectable.createModel("http://www.virjar.com", html);
    List<String> model = selectable.xpath("/css('#nationality')::option/text()").stringRule("self()").createOrGetModel();
    for (String str : model) {
        System.out.println(str);
    }
}
use of com.virjar.vscrawler.core.selector.combine.AbstractSelectable in project vscrawler by virjar.
the class HtmlJsonSelectorTest method main.
/**
 * Demo entry point: loads the classpath resource {@code /htmljson.html}, takes the text of
 * {@code #testid pre}, evaluates a JSONPath for every departure date in it, strips whitespace
 * from each value, and prints the values joined by commas.
 *
 * @param args unused
 * @throws IOException if the resource is missing or cannot be read
 */
public static void main(String[] args) throws IOException {
    java.io.InputStream htmlStream = HtmlJsonSelectorTest.class.getResourceAsStream("/htmljson.html");
    // getResourceAsStream reports a missing resource by returning null; fail with a clear message.
    if (htmlStream == null) {
        throw new java.io.FileNotFoundException("classpath resource not found: /htmljson.html");
    }
    String html;
    try {
        html = IOUtils.toString(htmlStream, Charsets.UTF_8);
    } finally {
        // IOUtils.toString leaves the stream open, so release it here.
        htmlStream.close();
    }
    AbstractSelectable selectable = AbstractSelectable.createModel("http://www.virjar.com", html);
    List<String> allDepartureDate = selectable.css("#testid pre").xpath("/text()").jsonPath("$.bookingVoyages[0:].bookingFlights[0:].departureDate").stringRule("deleteWhitespace(self())").createOrGetModel();
    System.out.println(Joiner.on(",").join(allDepartureDate));
}
use of com.virjar.vscrawler.core.selector.combine.AbstractSelectable in project vscrawler by virjar.
the class JsonNode method toMultiSelectable.
/**
 * Splits this node into one {@link JsonNode} per element of its model.
 *
 * <p>Each returned node shares this node's base URL and has a single-element list containing
 * its JSON value set as its model.
 *
 * @return a new list with one selectable per JSON element, in model order
 */
@Override
public List<AbstractSelectable> toMultiSelectable() {
    List<JSON> elements = createOrGetModel();
    List<AbstractSelectable> perElementNodes = Lists.newLinkedList();
    for (final JSON element : elements) {
        // Raw text goes through a factory, so toJSONString() only runs when rawText() is called.
        RawTextStringFactory elementText = new RawTextStringFactory() {
            @Override
            public String rawText() {
                return element.toJSONString();
            }
        };
        JsonNode child = new JsonNode(getBaseUrl(), elementText);
        child.setModel(Lists.newArrayList(element));
        perElementNodes.add(child);
    }
    return perElementNodes;
}
use of com.virjar.vscrawler.core.selector.combine.AbstractSelectable in project vscrawler by virjar.
the class XpathNode method toMultiSelectable.
/**
 * Splits this node into one {@link XpathNode} per {@code SIPNode} of its model.
 *
 * <p>A text node is wrapped using its text value directly. Any other node gets a factory that
 * produces its {@code toString()} form when {@code rawText()} is called. Each returned node
 * shares this node's base URL and has a {@code SipNodes} built from its single node set as
 * its model.
 *
 * @return a new list with one selectable per node, in model order
 */
@Override
public List<AbstractSelectable> toMultiSelectable() {
    SipNodes selectedNodes = createOrGetModel();
    List<AbstractSelectable> perNodeSelectables = Lists.newLinkedList();
    for (final SIPNode current : selectedNodes) {
        XpathNode child;
        if (!current.isText()) {
            // Non-text node: the string form is produced only when rawText() is called.
            child = new XpathNode(getBaseUrl(), new RawTextStringFactory() {
                @Override
                public String rawText() {
                    return current.toString();
                }
            });
        } else {
            child = new XpathNode(getBaseUrl(), current.getTextVal());
        }
        child.setModel(new SipNodes(current));
        perNodeSelectables.add(child);
    }
    return perNodeSelectables;
}
Aggregations