
Example 1 with TikaExtractor

Use of org.codelibs.fess.crawler.extractor.impl.TikaExtractor in the fess-crawler project by codelibs.

From the class ExtractorFactoryTest, method setUp:

@Override
protected void setUp() throws Exception {
    super.setUp();
    StandardCrawlerContainer container = new StandardCrawlerContainer()
            .singleton("tikaExtractor", TikaExtractor.class)
            .singleton("pdfExtractor", PdfExtractor.class)
            .singleton("lhaExtractor", LhaExtractor.class)
            .singleton("extractorFactory", ExtractorFactory.class);
    extractorFactory = container.getComponent("extractorFactory");
    TikaExtractor tikaExtractor = container.getComponent("tikaExtractor");
    LhaExtractor lhaExtractor = container.getComponent("lhaExtractor");
    PasswordBasedExtractor pdfExtractor = container.getComponent("pdfExtractor");
    extractorFactory.addExtractor("application/msword", tikaExtractor);
    extractorFactory.addExtractor("application/vnd.ms-excel", tikaExtractor);
    extractorFactory.addExtractor("application/vnd.ms-powerpoint", tikaExtractor);
    extractorFactory.addExtractor("application/vnd.visio", tikaExtractor);
    extractorFactory.addExtractor("application/pdf", pdfExtractor);
    extractorFactory.addExtractor("application/x-lha", lhaExtractor);
    extractorFactory.addExtractor("application/x-lharc", lhaExtractor);
}
Also used: PasswordBasedExtractor(org.codelibs.fess.crawler.extractor.impl.PasswordBasedExtractor) LhaExtractor(org.codelibs.fess.crawler.extractor.impl.LhaExtractor) StandardCrawlerContainer(org.codelibs.fess.crawler.container.StandardCrawlerContainer) TikaExtractor(org.codelibs.fess.crawler.extractor.impl.TikaExtractor)
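With the factory wired as above, a test can resolve the registered extractor by MIME type and run it against a document stream. A minimal sketch in the same test class, assuming fess-crawler's ExtractorFactory#getExtractor(String) and Extractor#getText(InputStream, Map) signatures; the sample file path is hypothetical:

public void test_getExtractor() throws Exception {
    // The factory returns the extractor registered for this MIME type above.
    final Extractor extractor = extractorFactory.getExtractor("application/msword");
    assertTrue(extractor instanceof TikaExtractor);

    // Run the extractor against a sample document (path is hypothetical).
    try (final InputStream in = new FileInputStream("src/test/resources/extractor/test.doc")) {
        final ExtractData extractData = extractor.getText(in, null);
        assertNotNull(extractData.getContent());
    }
}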

Example 2 with TikaExtractor

Use of org.codelibs.fess.crawler.extractor.impl.TikaExtractor in the fess project by codelibs.

From the class DocumentHelper, method init:

@PostConstruct
public void init() {
    if (logger.isDebugEnabled()) {
        logger.debug("Initialize {}", this.getClass().getSimpleName());
    }
    try {
        final TikaExtractor tikaExtractor = ComponentUtil.getComponent("tikaExtractor");
        if (tikaExtractor != null) {
            tikaExtractor.setMaxAlphanumTermSize(getMaxAlphanumTermSize());
            tikaExtractor.setMaxSymbolTermSize(getMaxSymbolTermSize());
            tikaExtractor.setReplaceDuplication(isDuplicateTermRemoved());
            tikaExtractor.setSpaceChars(getSpaceChars());
        }
    } catch (final ComponentNotFoundException e) {
        if (logger.isDebugEnabled()) {
            logger.debug("tikaExtractor is not found: {}", e.getMessage().replace('\n', ' '));
        }
    } catch (final Exception e) {
        logger.warn("Failed to initialize TikaExtractor.", e);
    }
}
Also used: ComponentNotFoundException(org.lastaflute.di.core.exception.ComponentNotFoundException) TikaExtractor(org.codelibs.fess.crawler.extractor.impl.TikaExtractor) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) IOException(java.io.IOException) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) PostConstruct(javax.annotation.PostConstruct)
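The same limits can be applied to a TikaExtractor constructed outside the DI container; the setters are exactly those invoked by init() above. A minimal sketch with illustrative values (not fess defaults); note that any @PostConstruct initialization the container would normally trigger is skipped here:

// Configure a standalone TikaExtractor, mirroring init() above.
final TikaExtractor tikaExtractor = new TikaExtractor();
tikaExtractor.setMaxAlphanumTermSize(20); // illustrative value
tikaExtractor.setMaxSymbolTermSize(10); // illustrative value
tikaExtractor.setReplaceDuplication(true);
tikaExtractor.setSpaceChars(new int[] { ' ', '\u00a0' });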

Example 3 with TikaExtractor

Use of org.codelibs.fess.crawler.extractor.impl.TikaExtractor in the fess-crawler project by codelibs.

From the class CrawlerTest, method setUp:

@Override
protected void setUp() throws Exception {
    super.setUp();
    final Map<String, String> featureMap = newHashMap();
    featureMap.put("http://xml.org/sax/features/namespaces", "false");
    final Map<String, String> propertyMap = newHashMap();
    final Map<String, String> childUrlRuleMap = newHashMap();
    childUrlRuleMap.put("//A", "href");
    childUrlRuleMap.put("//AREA", "href");
    childUrlRuleMap.put("//FRAME", "src");
    childUrlRuleMap.put("//IFRAME", "src");
    childUrlRuleMap.put("//IMG", "src");
    childUrlRuleMap.put("//LINK", "href");
    childUrlRuleMap.put("//SCRIPT", "src");
    container = new StandardCrawlerContainer();
    container.<HcHttpClient> prototype("internalHttpClient", HcHttpClient.class, client -> {
        client.setCookieSpec(CookieSpecs.BEST_MATCH);
        client.setClientConnectionManager(container.getComponent("clientConnectionManager"));
    }).prototype("httpClient", FaultTolerantClient.class, client -> {
        client.setCrawlerClient(container.getComponent("internalHttpClient"));
        client.setMaxRetryCount(5);
        client.setRetryInterval(500);
    }).prototype("fsClient", FileSystemClient.class)
            .prototype("ruleManager", RuleManagerImpl.class, manager -> {
                manager.addRule(container.getComponent("sitemapsRule"));
                manager.addRule(container.getComponent("fileRule"));
            })
            .prototype("accessResult", AccessResultImpl.class)
            .prototype("urlQueue", UrlQueueImpl.class)
            .prototype("crawlerThread", CrawlerThread.class)
            .prototype("crawler", Crawler.class)
            .prototype("urlFilterService", UrlFilterServiceImpl.class)
            .prototype("urlQueueService", UrlQueueServiceImpl.class)
            .prototype("dataService", DataServiceImpl.class)
            .prototype("urlFilter", UrlFilterImpl.class)
            .singleton("urlConvertHelper", UrlConvertHelper.class)
            .singleton("intervalController", DefaultIntervalController.class)
            .singleton("sitemapsHelper", SitemapsHelper.class)
            .singleton("logHelper", LogHelperImpl.class)
            .singleton("encodingHelper", EncodingHelper.class)
            .singleton("contentLengthHelper", ContentLengthHelper.class)
            .singleton("mimeTypeHelper", MimeTypeHelperImpl.class)
            .<FileTransformer> singleton("fileTransformer", FileTransformer.class, transformer -> {
                transformer.setName("fileTransformer");
                transformer.setFeatureMap(featureMap);
                transformer.setPropertyMap(propertyMap);
                transformer.setChildUrlRuleMap(childUrlRuleMap);
            })
            .singleton("dataHelper", MemoryDataHelper.class)
            .singleton("robotsTxtHelper", RobotsTxtHelper.class)
            .<CrawlerClientFactory> singleton("clientFactory", CrawlerClientFactory.class, factory -> {
                factory.addClient("http:.*", container.getComponent("httpClient"));
                factory.addClient("file:.*", container.getComponent("fsClient"));
            })
            .singleton("tikaExtractor", TikaExtractor.class)
            .<ExtractorFactory> singleton("extractorFactory", ExtractorFactory.class, factory -> {
                TikaExtractor tikaExtractor = container.getComponent("tikaExtractor");
                factory.addExtractor("text/plain", tikaExtractor);
                factory.addExtractor("text/html", tikaExtractor);
            })
            // Note: re-registers "httpClient" as a plain HcHttpClient singleton,
            // replacing the FaultTolerantClient prototype registered above.
            .singleton("httpClient", HcHttpClient.class)
            .singleton("sitemapsResponseProcessor", SitemapsResponseProcessor.class)
            .<SitemapsRule> singleton("sitemapsRule", SitemapsRule.class, rule -> {
                rule.setResponseProcessor(container.getComponent("sitemapsResponseProcessor"));
                rule.setRuleId("sitemapsRule");
                rule.addRule("url", ".*sitemap.*");
            })
            .<DefaultResponseProcessor> singleton("defaultResponseProcessor", DefaultResponseProcessor.class, processor -> {
                processor.setTransformer(container.getComponent("fileTransformer"));
                processor.setSuccessfulHttpCodes(new int[] { 200 });
                processor.setNotModifiedHttpCodes(new int[] { 304 });
            })
            .<RegexRule> singleton("fileRule", RegexRule.class, rule -> {
                rule.setRuleId("fileRule");
                rule.setDefaultRule(true);
                rule.setResponseProcessor(container.getComponent("defaultResponseProcessor"));
            })
            .<PoolingHttpClientConnectionManager> singleton("clientConnectionManager",
                    new PoolingHttpClientConnectionManager(5, TimeUnit.MINUTES), manager -> {
                        manager.setMaxTotal(200);
                        manager.setDefaultMaxPerRoute(20);
                    });
    crawler = container.getComponent("crawler");
    dataService = container.getComponent("dataService");
    urlQueueService = container.getComponent("urlQueueService");
    fileTransformer = container.getComponent("fileTransformer");
}
Also used: StandardCrawlerContainer(org.codelibs.fess.crawler.container.StandardCrawlerContainer) MimeTypeHelperImpl(org.codelibs.fess.crawler.helper.impl.MimeTypeHelperImpl) HcHttpClient(org.codelibs.fess.crawler.client.http.HcHttpClient) UrlQueueImpl(org.codelibs.fess.crawler.entity.UrlQueueImpl) PlainTestCase(org.dbflute.utflute.core.PlainTestCase) ExtractorFactory(org.codelibs.fess.crawler.extractor.ExtractorFactory) FileSystemClient(org.codelibs.fess.crawler.client.fs.FileSystemClient) DataService(org.codelibs.fess.crawler.service.DataService) CookieSpecs(org.apache.http.client.config.CookieSpecs) SitemapsHelper(org.codelibs.fess.crawler.helper.SitemapsHelper) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) RuleManagerImpl(org.codelibs.fess.crawler.rule.impl.RuleManagerImpl) DataServiceImpl(org.codelibs.fess.crawler.service.impl.DataServiceImpl) RegexRule(org.codelibs.fess.crawler.rule.impl.RegexRule) FaultTolerantClient(org.codelibs.fess.crawler.client.FaultTolerantClient) LogHelperImpl(org.codelibs.fess.crawler.helper.impl.LogHelperImpl) SitemapsRule(org.codelibs.fess.crawler.rule.impl.SitemapsRule) Map(java.util.Map) UrlFilterServiceImpl(org.codelibs.fess.crawler.service.impl.UrlFilterServiceImpl) PoolingHttpClientConnectionManager(org.apache.http.impl.conn.PoolingHttpClientConnectionManager) CrawlerWebServer(org.codelibs.fess.crawler.util.CrawlerWebServer) ContentLengthHelper(org.codelibs.fess.crawler.helper.ContentLengthHelper) EncodingHelper(org.codelibs.fess.crawler.helper.EncodingHelper) RobotsTxtHelper(org.codelibs.fess.crawler.helper.RobotsTxtHelper) AccessResultImpl(org.codelibs.fess.crawler.entity.AccessResultImpl) SitemapsResponseProcessor(org.codelibs.fess.crawler.processor.impl.SitemapsResponseProcessor) UrlFilterImpl(org.codelibs.fess.crawler.filter.impl.UrlFilterImpl) MemoryDataHelper(org.codelibs.fess.crawler.helper.MemoryDataHelper) DefaultIntervalController(org.codelibs.fess.crawler.interval.impl.DefaultIntervalController) ResourceUtil(org.codelibs.core.io.ResourceUtil) TikaExtractor(org.codelibs.fess.crawler.extractor.impl.TikaExtractor) File(java.io.File) FileTransformer(org.codelibs.fess.crawler.transformer.impl.FileTransformer) TimeUnit(java.util.concurrent.TimeUnit) UrlQueueService(org.codelibs.fess.crawler.service.UrlQueueService) UrlConvertHelper(org.codelibs.fess.crawler.helper.UrlConvertHelper) UrlQueueServiceImpl(org.codelibs.fess.crawler.service.impl.UrlQueueServiceImpl) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue)
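Once setUp() completes, a test can point the configured crawler at a URL and inspect the stored results through the resolved services. A hedged sketch; the target URL is hypothetical, and Crawler#addUrl, CrawlerContext#setMaxAccessCount, Crawler#execute, and DataService#iterate are the fess-crawler APIs it assumes:

public void test_crawl() throws Exception {
    // Seed the queue and bound the crawl (URL is hypothetical).
    crawler.addUrl("http://localhost:7070/");
    crawler.getCrawlerContext().setMaxAccessCount(10);

    // execute() runs the crawl and returns its session id.
    final String sessionId = crawler.execute();

    // Walk the access results stored for that session.
    dataService.iterate(sessionId, accessResult -> {
        assertEquals(200, accessResult.getHttpStatusCode());
    });
}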

Example 4 with TikaExtractor

Use of org.codelibs.fess.crawler.extractor.impl.TikaExtractor in the fess-crawler project by codelibs.

From the class TextTransformerTest, method setUp:

@Override
protected void setUp() throws Exception {
    super.setUp();
    StandardCrawlerContainer container = new StandardCrawlerContainer()
            .singleton("extractorFactory", ExtractorFactory.class)
            .singleton("textTransformer", TextTransformer.class)
            .singleton("tikaExtractor", TikaExtractor.class);
    textTransformer = container.getComponent("textTransformer");
    textTransformer.setName("textTransformer");
    ExtractorFactory extractorFactory = container.getComponent("extractorFactory");
    TikaExtractor tikaExtractor = container.getComponent("tikaExtractor");
    extractorFactory.addExtractor("text/plain", tikaExtractor);
    extractorFactory.addExtractor("text/html", tikaExtractor);
}
Also used: ExtractorFactory(org.codelibs.fess.crawler.extractor.ExtractorFactory) StandardCrawlerContainer(org.codelibs.fess.crawler.container.StandardCrawlerContainer) TikaExtractor(org.codelibs.fess.crawler.extractor.impl.TikaExtractor)
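With the transformer wired, a test can feed it a synthetic response and inspect the extracted text. A minimal sketch, assuming fess-crawler's ResponseData and ResultData accessors as shown (setResponseBody(byte[]) matches recent releases; older ones take an InputStream):

public void test_transform() throws Exception {
    // Build a synthetic response (URL and body are illustrative).
    final ResponseData responseData = new ResponseData();
    responseData.setUrl("http://example.com/test.txt");
    responseData.setCharSet("UTF-8");
    responseData.setMimeType("text/plain");
    responseData.setResponseBody("Hello, Fess!".getBytes(StandardCharsets.UTF_8));

    // The transformer delegates to the tikaExtractor registered above.
    final ResultData resultData = textTransformer.transform(responseData);
    assertTrue(new String(resultData.getData(), StandardCharsets.UTF_8).contains("Hello"));
}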

Example 5 with TikaExtractor

Use of org.codelibs.fess.crawler.extractor.impl.TikaExtractor in the fess project by codelibs.

From the class DocumentHelper, method getContent:

public String getContent(final CrawlingConfig crawlingConfig, final ResponseData responseData, final String content,
        final Map<String, Object> dataMap) {
    if (content == null) {
        // No content: return an empty string.
        return StringUtil.EMPTY;
    }
    if (crawlingConfig != null) {
        final Map<String, String> configParam = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG);
        if (configParam != null && Constants.TRUE.equalsIgnoreCase(configParam.get(Param.Config.KEEP_ORIGINAL_BODY))) {
            return content;
        }
    }
    // Content extracted by TikaExtractor has already been normalized by the
    // extractor itself (configured in init() above), so return it as-is.
    if (responseData.getMetaDataMap().get(Extractor.class.getSimpleName()) instanceof TikaExtractor) {
        return content;
    }
    final int maxAlphanumTermSize = getMaxAlphanumTermSize();
    final int maxSymbolTermSize = getMaxSymbolTermSize();
    final boolean duplicateTermRemoved = isDuplicateTermRemoved();
    final int[] spaceChars = getSpaceChars();
    try (final Reader reader = new StringReader(content)) {
        return TextUtil.normalizeText(reader)
                .initialCapacity(content.length())
                .maxAlphanumTermSize(maxAlphanumTermSize)
                .maxSymbolTermSize(maxSymbolTermSize)
                .duplicateTermRemoved(duplicateTermRemoved)
                .spaceChars(spaceChars)
                .execute();
    } catch (final IOException e) {
        // Normalization failed: fall back to an empty string.
        return StringUtil.EMPTY;
    }
}
Also used: StringReader(java.io.StringReader) Reader(java.io.Reader) InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) IOException(java.io.IOException) TikaExtractor(org.codelibs.fess.crawler.extractor.impl.TikaExtractor)
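For content that did not come through TikaExtractor, the normalization pass above can be reproduced standalone with the same TextUtil builder. A minimal sketch; parameter values are illustrative, not fess defaults:

// Normalize a string the way getContent() does for non-Tika content.
final String raw = "foo   foo   abcdefghijklmnopqrstuvwxyz0123456789";
try (final Reader reader = new StringReader(raw)) {
    final String normalized = TextUtil.normalizeText(reader)
            .initialCapacity(raw.length())
            .maxAlphanumTermSize(20) // cap the length of alphanumeric terms
            .maxSymbolTermSize(10) // cap the length of symbol runs
            .duplicateTermRemoved(true) // drop duplicated terms
            .spaceChars(new int[] { ' ', '\u00a0' })
            .execute();
    System.out.println(normalized);
} catch (final IOException e) {
    // Ignored for this sketch; getContent() returns an empty string here.
}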

Aggregations

TikaExtractor (org.codelibs.fess.crawler.extractor.impl.TikaExtractor): 5
StandardCrawlerContainer (org.codelibs.fess.crawler.container.StandardCrawlerContainer): 3
IOException (java.io.IOException): 2
ExtractorFactory (org.codelibs.fess.crawler.extractor.ExtractorFactory): 2
BufferedReader (java.io.BufferedReader): 1
File (java.io.File): 1
InputStreamReader (java.io.InputStreamReader): 1
Reader (java.io.Reader): 1
StringReader (java.io.StringReader): 1
Map (java.util.Map): 1
TimeUnit (java.util.concurrent.TimeUnit): 1
PostConstruct (javax.annotation.PostConstruct): 1
CookieSpecs (org.apache.http.client.config.CookieSpecs): 1
PoolingHttpClientConnectionManager (org.apache.http.impl.conn.PoolingHttpClientConnectionManager): 1
ResourceUtil (org.codelibs.core.io.ResourceUtil): 1
CrawlerClientFactory (org.codelibs.fess.crawler.client.CrawlerClientFactory): 1
FaultTolerantClient (org.codelibs.fess.crawler.client.FaultTolerantClient): 1
FileSystemClient (org.codelibs.fess.crawler.client.fs.FileSystemClient): 1
HcHttpClient (org.codelibs.fess.crawler.client.http.HcHttpClient): 1
AccessResultImpl (org.codelibs.fess.crawler.entity.AccessResultImpl): 1