Search in sources :

Example 6 with ResultData

use of org.codelibs.fess.crawler.entity.ResultData in project fess by codelibs.

the class FessXpathTransformerTest method test_transform.

/**
 * Repeatedly transforms a small HTML response (10000 iterations) and then
 * asserts that {@code MemoryUtil.getUsedMemory()} is below {@code 100000000L}.
 * Memory usage is logged every 1000 iterations, right before an explicit GC request.
 *
 * @throws Exception if component setup, the transformation, or the final sleep fails
 */
public void test_transform() throws Exception {
    String data = "<html><head><title>Test</title></head><body><h1>Header1</h1><p>This is a pen.</p></body></html>";
    // The response is declared as UTF-8 below, so encode with that charset explicitly
    // instead of relying on the platform default; the array is loop-invariant.
    final byte[] body = data.getBytes(java.nio.charset.StandardCharsets.UTF_8);
    final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer();
    fessXpathTransformer.init();
    SingletonLaContainerFactory.getContainer().register(CrawlingInfoHelper.class, "crawlingInfoHelper");
    SingletonLaContainerFactory.getContainer().register(PathMappingHelper.class, "pathMappingHelper");
    SingletonLaContainerFactory.getContainer().register(CrawlingConfigHelper.class, "crawlingConfigHelper");
    SingletonLaContainerFactory.getContainer().register(SystemHelper.class, "systemHelper");
    SingletonLaContainerFactory.getContainer().register(FileTypeHelper.class, "fileTypeHelper");
    SingletonLaContainerFactory.getContainer().register(DocumentHelper.class, "documentHelper");
    SingletonLaContainerFactory.getContainer().register(LabelTypeHelper.class, "labelTypeHelper");
    WebConfig webConfig = new WebConfig();
    setValueToObject(webConfig, "labelTypeList", new ArrayList<LabelType>());
    ComponentUtil.getCrawlingConfigHelper().store("test", webConfig);
    setValueToObject(ComponentUtil.getLabelTypeHelper(), "labelTypePatternList", new ArrayList<LabelTypePattern>());
    for (int i = 0; i < 10000; i++) {
        if (i % 1000 == 0) {
            logger.info(MemoryUtil.getMemoryUsageLog() + ":" + i);
            System.gc();
        }
        ResponseData responseData = new ResponseData();
        responseData.setCharSet("UTF-8");
        // Content length is a byte count, so take it from the encoded body.
        responseData.setContentLength(body.length);
        responseData.setExecutionTime(1000L);
        responseData.setHttpStatusCode(200);
        responseData.setLastModified(new Date());
        responseData.setMethod("GET");
        responseData.setMimeType("text/html");
        responseData.setParentUrl("http://fess.codelibs.org/");
        responseData.setResponseBody(body);
        responseData.setSessionId("test-1");
        responseData.setStatus(0);
        responseData.setUrl("http://fess.codelibs.org/test.html");
        // The result is intentionally discarded; only memory behaviour is checked.
        fessXpathTransformer.transform(responseData);
    }
    System.gc();
    // Pause after the GC request before sampling memory.
    Thread.sleep(1000L);
    logger.info(MemoryUtil.getMemoryUsageLog());
    assertTrue(MemoryUtil.getUsedMemory() < 100000000L);
}
Also used : ResultData(org.codelibs.fess.crawler.entity.ResultData) LabelType(org.codelibs.fess.es.config.exentity.LabelType) LabelTypePattern(org.codelibs.fess.helper.LabelTypeHelper.LabelTypePattern) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) WebConfig(org.codelibs.fess.es.config.exentity.WebConfig) Date(java.util.Date)

Example 7 with ResultData

use of org.codelibs.fess.crawler.entity.ResultData in project fess by codelibs.

the class FessXpathTransformerTest method test_processMetaRobots_nofollow.

/**
 * Checks that processing a robots meta tag whose content is "nofollow"
 * marks the response data as no-follow.
 *
 * @throws Exception if building the document or processing the meta tag fails
 */
public void test_processMetaRobots_nofollow() throws Exception {
    final Document metaDocument = getDocument("<meta name=\"robots\" content=\"nofollow\" />");
    final FessXpathTransformer xpathTransformer = new FessXpathTransformer();

    final ResponseData response = new ResponseData();
    response.setUrl("http://example.com/");

    final ResultData emptyResult = new ResultData();
    xpathTransformer.processMetaRobots(response, emptyResult, metaDocument);

    final boolean noFollow = response.isNoFollow();
    assertTrue(noFollow);
}
Also used : ResultData(org.codelibs.fess.crawler.entity.ResultData) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) Document(org.w3c.dom.Document)

Example 8 with ResultData

use of org.codelibs.fess.crawler.entity.ResultData in project fess by codelibs.

the class FessXpathTransformerTest method test_processMetaRobots_none.

/**
 * Checks that processing a robots meta tag whose content is "none" throws a
 * {@code ChildUrlsException} carrying an empty child URL list.
 *
 * @throws Exception any unexpected exception, propagated so the test report keeps its cause
 */
public void test_processMetaRobots_none() throws Exception {
    final String data = "<meta name=\"robots\" content=\"none\" />";
    final Document document = getDocument(data);
    final FessXpathTransformer transformer = new FessXpathTransformer();
    final ResponseData responseData = new ResponseData();
    responseData.setUrl("http://example.com/");
    try {
        transformer.processMetaRobots(responseData, new ResultData(), document);
        // Reaching this line means the expected exception was not thrown.
        fail();
    } catch (ChildUrlsException e) {
        assertTrue(e.getChildUrlList().isEmpty());
    }
    // No catch-all here: other exceptions propagate through "throws Exception"
    // instead of being replaced by a cause-less fail().
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) ResultData(org.codelibs.fess.crawler.entity.ResultData) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) Document(org.w3c.dom.Document) ComponentNotFoundException(org.lastaflute.di.core.exception.ComponentNotFoundException) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException)

Example 9 with ResultData

use of org.codelibs.fess.crawler.entity.ResultData in project fess by codelibs.

the class DocumentHelper method processRequest.

/**
 * Fetches {@code url} with the crawler client selected for it, runs the matching
 * rule's transformer over the response, and deserializes the transformed bytes
 * into a map.
 *
 * <p>Every exception raised while the response is open (including the
 * redirect-signalling {@code ChildUrlsException} and the inner
 * {@code CrawlerSystemException}) is caught by the trailing
 * {@code catch (Exception)} and rethrown as a {@code CrawlingAccessException}
 * with the original as its cause.
 *
 * @param crawlingConfig configuration used to initialize the crawler client factory
 * @param crawlingInfoId identifier stored on the response as its session id; must not be blank
 * @param url            the URL to fetch
 * @return the deserialized map, or {@code null} when the transformer yields no data
 * @throws CrawlingAccessException if {@code crawlingInfoId} is blank, no client exists
 *         for {@code url}, or fetching/processing fails
 */
public Map<String, Object> processRequest(final CrawlingConfig crawlingConfig, final String crawlingInfoId, final String url) {
    if (StringUtil.isBlank(crawlingInfoId)) {
        throw new CrawlingAccessException("sessionId is null.");
    }

    final CrawlerClientFactory clientFactory = ComponentUtil.getCrawlerClientFactory();
    crawlingConfig.initializeClientFactory(clientFactory);
    final CrawlerClient crawlerClient = clientFactory.getClient(url);
    if (crawlerClient == null) {
        throw new CrawlingAccessException("CrawlerClient is null for " + url);
    }

    final long startedAt = System.currentTimeMillis();
    try (final ResponseData response = crawlerClient.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        // A redirect is reported to the caller as a child URL rather than followed here.
        if (response.getRedirectLocation() != null) {
            final Set<RequestData> redirectTargets = new HashSet<>();
            redirectTargets.add(RequestDataBuilder.newRequestData().get().url(response.getRedirectLocation()).build());
            throw new ChildUrlsException(redirectTargets, "Redirected from " + url);
        }

        response.setExecutionTime(System.currentTimeMillis() - startedAt);
        response.setSessionId(crawlingInfoId);

        final RuleManager rules = SingletonLaContainer.getComponent(RuleManager.class);
        final Rule matchedRule = rules.getRule(response);
        if (matchedRule == null) {
            throw new CrawlingAccessException("No url rule for " + url);
        }
        response.setRuleId(matchedRule.getRuleId());

        final ResponseProcessor processor = matchedRule.getResponseProcessor();
        if (!(processor instanceof DefaultResponseProcessor)) {
            throw new CrawlingAccessException("The response processor is not DefaultResponseProcessor. responseProcessor: " + processor + ", url: " + url);
        }

        final Transformer responseTransformer = ((DefaultResponseProcessor) processor).getTransformer();
        final ResultData transformed = responseTransformer.transform(response);
        final byte[] serialized = transformed.getData();
        if (serialized == null) {
            return null;
        }

        try {
            @SuppressWarnings("unchecked") final Map<String, Object> document = (Map<String, Object>) SerializeUtil.fromBinaryToObject(serialized);
            return document;
        } catch (final Exception e) {
            throw new CrawlerSystemException("Could not create an instance from bytes.", e);
        }
    } catch (final Exception e) {
        throw new CrawlingAccessException("Failed to parse " + url, e);
    }
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) Transformer(org.codelibs.fess.crawler.transformer.Transformer) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) ResponseProcessor(org.codelibs.fess.crawler.processor.ResponseProcessor) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) IOException(java.io.IOException) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) ResultData(org.codelibs.fess.crawler.entity.ResultData) RequestData(org.codelibs.fess.crawler.entity.RequestData) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) RuleManager(org.codelibs.fess.crawler.rule.RuleManager) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) Rule(org.codelibs.fess.crawler.rule.Rule) Map(java.util.Map) HashSet(java.util.HashSet)

Aggregations

ResultData (org.codelibs.fess.crawler.entity.ResultData): 9, ResponseData (org.codelibs.fess.crawler.entity.ResponseData): 8, ChildUrlsException (org.codelibs.fess.crawler.exception.ChildUrlsException): 5, Document (org.w3c.dom.Document): 5, CrawlerSystemException (org.codelibs.fess.crawler.exception.CrawlerSystemException): 3, ComponentNotFoundException (org.lastaflute.di.core.exception.ComponentNotFoundException): 3, Map (java.util.Map): 2, CrawlerClient (org.codelibs.fess.crawler.client.CrawlerClient): 2, CrawlerClientFactory (org.codelibs.fess.crawler.client.CrawlerClientFactory): 2, CrawlingAccessException (org.codelibs.fess.crawler.exception.CrawlingAccessException): 2, ResponseProcessor (org.codelibs.fess.crawler.processor.ResponseProcessor): 2, DefaultResponseProcessor (org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor): 2, Rule (org.codelibs.fess.crawler.rule.Rule): 2, RuleManager (org.codelibs.fess.crawler.rule.RuleManager): 2, Transformer (org.codelibs.fess.crawler.transformer.Transformer): 2, IOException (java.io.IOException): 1, ArrayList (java.util.ArrayList): 1, Date (java.util.Date): 1, HashSet (java.util.HashSet): 1, List (java.util.List): 1