Use of org.codelibs.fess.crawler.entity.ResultData in the fess project by codelibs.
From the class FessXpathTransformerTest, method test_transform.
/**
 * Repeatedly transforms a small HTML page with {@code FessXpathTransformer} and
 * checks that {@code MemoryUtil.getUsedMemory()} stays below 100000000 afterwards.
 *
 * The transformer is invoked 10000 times; memory usage is logged (and a GC requested)
 * every 1000 iterations so a steadily growing footprint is visible in the log.
 *
 * @throws Exception if component setup, a transformation, or the final sleep fails
 */
public void test_transform() throws Exception {
    String data = "<html><head><title>Test</title></head><body><h1>Header1</h1><p>This is a pen.</p></body></html>";
    // Encode with the same charset that is declared on the response below instead of the
    // platform default, and derive the content length from the encoded bytes rather than
    // from the number of UTF-16 code units in the string.
    final byte[] body = data.getBytes(java.nio.charset.StandardCharsets.UTF_8);

    final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer();
    fessXpathTransformer.init();

    // Register the helper components that are looked up through ComponentUtil below.
    SingletonLaContainerFactory.getContainer().register(CrawlingInfoHelper.class, "crawlingInfoHelper");
    SingletonLaContainerFactory.getContainer().register(PathMappingHelper.class, "pathMappingHelper");
    SingletonLaContainerFactory.getContainer().register(CrawlingConfigHelper.class, "crawlingConfigHelper");
    SingletonLaContainerFactory.getContainer().register(SystemHelper.class, "systemHelper");
    SingletonLaContainerFactory.getContainer().register(FileTypeHelper.class, "fileTypeHelper");
    SingletonLaContainerFactory.getContainer().register(DocumentHelper.class, "documentHelper");
    SingletonLaContainerFactory.getContainer().register(LabelTypeHelper.class, "labelTypeHelper");

    // Store a web config with an empty label list under the id "test".
    WebConfig webConfig = new WebConfig();
    setValueToObject(webConfig, "labelTypeList", new ArrayList<LabelType>());
    ComponentUtil.getCrawlingConfigHelper().store("test", webConfig);
    setValueToObject(ComponentUtil.getLabelTypeHelper(), "labelTypePatternList", new ArrayList<LabelTypePattern>());

    for (int i = 0; i < 10000; i++) {
        if (i % 1000 == 0) {
            logger.info(MemoryUtil.getMemoryUsageLog() + ":" + i);
            System.gc();
        }
        ResponseData responseData = new ResponseData();
        responseData.setCharSet("UTF-8");
        responseData.setContentLength(body.length);
        responseData.setExecutionTime(1000L);
        responseData.setHttpStatusCode(200);
        responseData.setLastModified(new Date());
        responseData.setMethod("GET");
        responseData.setMimeType("text/html");
        responseData.setParentUrl("http://fess.codelibs.org/");
        responseData.setResponseBody(body);
        responseData.setSessionId("test-1");
        responseData.setStatus(0);
        responseData.setUrl("http://fess.codelibs.org/test.html");
        // The result is intentionally discarded: only the memory footprint is under test.
        fessXpathTransformer.transform(responseData);
    }

    // Request a final collection and wait before sampling the used memory.
    System.gc();
    Thread.sleep(1000L);
    logger.info(MemoryUtil.getMemoryUsageLog());
    assertTrue(MemoryUtil.getUsedMemory() < 100000000L);
}
Use of org.codelibs.fess.crawler.entity.ResultData in the fess project by codelibs.
From the class FessXpathTransformerTest, method test_processMetaRobots_nofollow.
/**
 * Checks that processing a robots meta tag whose content is "nofollow"
 * leaves the response data flagged as no-follow.
 *
 * @throws Exception if the document cannot be built or processing fails
 */
public void test_processMetaRobots_nofollow() throws Exception {
    // Document containing only the robots meta tag under test.
    final Document robotsDoc = getDocument("<meta name=\"robots\" content=\"nofollow\" />");
    final FessXpathTransformer xpathTransformer = new FessXpathTransformer();

    final ResponseData response = new ResponseData();
    response.setUrl("http://example.com/");

    final ResultData emptyResult = new ResultData();
    xpathTransformer.processMetaRobots(response, emptyResult, robotsDoc);

    final boolean noFollow = response.isNoFollow();
    assertTrue(noFollow);
}
Use of org.codelibs.fess.crawler.entity.ResultData in the fess project by codelibs.
From the class FessXpathTransformerTest, method test_processMetaRobots_none.
/**
 * Checks that processing a robots meta tag whose content is "none" raises a
 * {@code ChildUrlsException} carrying an empty child URL list.
 *
 * @throws Exception if the document cannot be built, or if processing raises anything
 *         other than the expected {@code ChildUrlsException}
 */
public void test_processMetaRobots_none() throws Exception {
    final String data = "<meta name=\"robots\" content=\"none\" />";
    final Document document = getDocument(data);
    final FessXpathTransformer transformer = new FessXpathTransformer();
    final ResponseData responseData = new ResponseData();
    responseData.setUrl("http://example.com/");
    try {
        transformer.processMetaRobots(responseData, new ResultData(), document);
        // Reaching this line means the expected exception was not thrown.
        fail();
    } catch (ChildUrlsException e) {
        assertTrue(e.getChildUrlList().isEmpty());
    }
    // No catch-all here: any other exception propagates through "throws Exception" so the
    // test runner reports the real cause and stack trace instead of an anonymous fail().
}
Use of org.codelibs.fess.crawler.entity.ResultData in the fess project by codelibs.
From the class DocumentHelper, method processRequest.
/**
 * Fetches {@code url} with the crawler client selected for it, runs the matching rule's
 * transformer over the response, and deserializes the transformer output into a map.
 *
 * Everything that happens after the request is issued runs inside a single try block whose
 * trailing handler rethrows any {@code Exception} as a {@code CrawlingAccessException}
 * ("Failed to parse ..."); this includes the exceptions thrown deliberately for a redirect,
 * a missing rule, or an unsupported response processor.
 *
 * @param crawlingConfig configuration used to initialize the crawler client factory
 * @param crawlingInfoId id stored on the response as its session id; must not be blank
 * @param url            address to fetch
 * @return the deserialized result map, or {@code null} when the transformer produced no data
 * @throws CrawlingAccessException if {@code crawlingInfoId} is blank, no client is available
 *         for {@code url}, or fetching/transforming fails for any reason
 */
public Map<String, Object> processRequest(final CrawlingConfig crawlingConfig, final String crawlingInfoId, final String url) {
    if (StringUtil.isBlank(crawlingInfoId)) {
        throw new CrawlingAccessException("sessionId is null.");
    }

    final CrawlerClientFactory clientFactory = ComponentUtil.getCrawlerClientFactory();
    crawlingConfig.initializeClientFactory(clientFactory);
    final CrawlerClient crawlerClient = clientFactory.getClient(url);
    if (crawlerClient == null) {
        throw new CrawlingAccessException("CrawlerClient is null for " + url);
    }

    final long begunAt = System.currentTimeMillis();
    try (final ResponseData response = crawlerClient.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        // A redirect is reported as a single child URL instead of being followed here.
        if (response.getRedirectLocation() != null) {
            final Set<RequestData> redirectTargets = new HashSet<>();
            redirectTargets.add(RequestDataBuilder.newRequestData().get().url(response.getRedirectLocation()).build());
            throw new ChildUrlsException(redirectTargets, "Redirected from " + url);
        }

        response.setExecutionTime(System.currentTimeMillis() - begunAt);
        response.setSessionId(crawlingInfoId);

        final RuleManager rules = SingletonLaContainer.getComponent(RuleManager.class);
        final Rule matchedRule = rules.getRule(response);
        if (matchedRule == null) {
            throw new CrawlingAccessException("No url rule for " + url);
        }
        response.setRuleId(matchedRule.getRuleId());

        final ResponseProcessor processor = matchedRule.getResponseProcessor();
        if (!(processor instanceof DefaultResponseProcessor)) {
            throw new CrawlingAccessException("The response processor is not DefaultResponseProcessor. responseProcessor: " + processor + ", url: " + url);
        }

        final Transformer transformer = ((DefaultResponseProcessor) processor).getTransformer();
        final ResultData transformed = transformer.transform(response);
        final byte[] serialized = transformed.getData();
        if (serialized == null) {
            return null;
        }

        try {
            @SuppressWarnings("unchecked")
            final Map<String, Object> documentMap = (Map<String, Object>) SerializeUtil.fromBinaryToObject(serialized);
            return documentMap;
        } catch (final Exception e) {
            throw new CrawlerSystemException("Could not create an instance from bytes.", e);
        }
    } catch (final Exception e) {
        throw new CrawlingAccessException("Failed to parse " + url, e);
    }
}
Aggregations