Use of org.codelibs.fess.crawler.extractor.impl.TikaExtractor in the fess-crawler project by codelibs — from the ExtractorFactoryTest class, setUp method:
@Override
protected void setUp() throws Exception {
    super.setUp();
    // Register every extractor implementation plus the factory under test.
    final StandardCrawlerContainer container = new StandardCrawlerContainer()
            .singleton("tikaExtractor", TikaExtractor.class)
            .singleton("pdfExtractor", PdfExtractor.class)
            .singleton("lhaExtractor", LhaExtractor.class)
            .singleton("extractorFactory", ExtractorFactory.class);
    extractorFactory = container.getComponent("extractorFactory");

    final TikaExtractor tika = container.getComponent("tikaExtractor");
    final LhaExtractor lha = container.getComponent("lhaExtractor");
    final PasswordBasedExtractor pdf = container.getComponent("pdfExtractor");

    // Office formats all route to the Tika-based extractor.
    for (final String mimeType : new String[] { "application/msword", "application/vnd.ms-excel",
            "application/vnd.ms-powerpoint", "application/vnd.visio" }) {
        extractorFactory.addExtractor(mimeType, tika);
    }
    extractorFactory.addExtractor("application/pdf", pdf);
    extractorFactory.addExtractor("application/x-lha", lha);
    extractorFactory.addExtractor("application/x-lharc", lha);
}
Use of org.codelibs.fess.crawler.extractor.impl.TikaExtractor in the fess project by codelibs — from the DocumentHelper class, init method:
/**
 * Post-construction hook: pushes this helper's text-normalization settings
 * (term-size limits, duplicate removal, space characters) onto the shared
 * "tikaExtractor" component, when one is registered.
 */
@PostConstruct
public void init() {
    if (logger.isDebugEnabled()) {
        logger.debug("Initialize {}", this.getClass().getSimpleName());
    }
    try {
        final TikaExtractor tikaExtractor = ComponentUtil.getComponent("tikaExtractor");
        if (tikaExtractor != null) {
            tikaExtractor.setMaxAlphanumTermSize(getMaxAlphanumTermSize());
            tikaExtractor.setMaxSymbolTermSize(getMaxSymbolTermSize());
            tikaExtractor.setReplaceDuplication(isDuplicateTermRemoved());
            tikaExtractor.setSpaceChars(getSpaceChars());
        }
    } catch (final ComponentNotFoundException e) {
        // The extractor is optional; its absence is not an error here.
        if (logger.isDebugEnabled()) {
            // Guard against a null exception message before flattening newlines.
            final String message = e.getMessage();
            logger.debug("tikaExtractor is not found: {}", message == null ? e.toString() : message.replace('\n', ' '));
        }
    } catch (final Exception e) {
        // Fixed typo in the log message: "initiaize" -> "initialize".
        logger.warn("Failed to initialize TikaExtractor.", e);
    }
}
Use of org.codelibs.fess.crawler.extractor.impl.TikaExtractor in the fess-crawler project by codelibs — from the CrawlerTest class, setUp method:
// Wires an end-to-end crawler fixture in one container: HTTP/file clients,
// crawl rules, response processors, transformers, queue/data services, and
// helper singletons. "prototype" registrations produce a new instance per
// lookup; "singleton" registrations share one instance.
@Override
protected void setUp() throws Exception {
super.setUp();
// SAX parser feature flags handed to the file transformer.
final Map<String, String> featureMap = newHashMap();
featureMap.put("http://xml.org/sax/features/namespaces", "false");
final Map<String, String> propertyMap = newHashMap();
// XPath expression -> attribute name pairs used to harvest child URLs
// from fetched HTML documents.
final Map<String, String> childUrlRuleMap = newHashMap();
childUrlRuleMap.put("//A", "href");
childUrlRuleMap.put("//AREA", "href");
childUrlRuleMap.put("//FRAME", "src");
childUrlRuleMap.put("//IFRAME", "src");
childUrlRuleMap.put("//IMG", "src");
childUrlRuleMap.put("//LINK", "href");
childUrlRuleMap.put("//SCRIPT", "src");
container = new StandardCrawlerContainer();
container.<HcHttpClient>prototype("internalHttpClient", HcHttpClient.class, client -> {
client.setCookieSpec(CookieSpecs.BEST_MATCH);
client.setClientConnectionManager(container.getComponent("clientConnectionManager"));
}).prototype("httpClient", FaultTolerantClient.class, client -> {
// Fault-tolerant wrapper: retries the inner client up to 5 times, 500 ms apart.
client.setCrawlerClient(container.getComponent("internalHttpClient"));
client.setMaxRetryCount(5);
client.setRetryInterval(500);
}).prototype("fsClient", FileSystemClient.class).prototype("ruleManager", RuleManagerImpl.class, manager -> {
// The sitemaps rule is consulted before the default file rule.
manager.addRule(container.getComponent("sitemapsRule"));
manager.addRule(container.getComponent("fileRule"));
}).prototype("accessResult", AccessResultImpl.class).prototype("urlQueue", UrlQueueImpl.class).prototype("crawlerThread", CrawlerThread.class).prototype("crawler", Crawler.class).prototype("urlFilterService", UrlFilterServiceImpl.class).prototype("urlQueueService", UrlQueueServiceImpl.class).prototype("dataService", DataServiceImpl.class).prototype("urlFilter", UrlFilterImpl.class).singleton("urlConvertHelper", UrlConvertHelper.class).singleton("intervalController", DefaultIntervalController.class).singleton("sitemapsHelper", SitemapsHelper.class).singleton("logHelper", LogHelperImpl.class).singleton("encodingHelper", EncodingHelper.class).singleton("contentLengthHelper", ContentLengthHelper.class).singleton("mimeTypeHelper", MimeTypeHelperImpl.class).<FileTransformer>singleton("fileTransformer", FileTransformer.class, transformer -> {
transformer.setName("fileTransformer");
transformer.setFeatureMap(featureMap);
transformer.setPropertyMap(propertyMap);
transformer.setChildUrlRuleMap(childUrlRuleMap);
}).singleton("dataHelper", MemoryDataHelper.class).singleton("robotsTxtHelper", RobotsTxtHelper.class).<CrawlerClientFactory>singleton("clientFactory", CrawlerClientFactory.class, factory -> {
// Route URLs to clients by scheme prefix.
factory.addClient("http:.*", container.getComponent("httpClient"));
factory.addClient("file:.*", container.getComponent("fsClient"));
}).singleton("tikaExtractor", TikaExtractor.class).<ExtractorFactory>singleton("extractorFactory", ExtractorFactory.class, factory -> {
TikaExtractor tikaExtractor = container.getComponent("tikaExtractor");
factory.addExtractor("text/plain", tikaExtractor);
factory.addExtractor("text/html", tikaExtractor);
// NOTE(review): "httpClient" is registered again just below as a plain
// HcHttpClient singleton, after the FaultTolerantClient prototype above —
// confirm which registration StandardCrawlerContainer resolves.
}).singleton("httpClient", //
HcHttpClient.class).singleton("sitemapsResponseProcessor", //
SitemapsResponseProcessor.class).<SitemapsRule>singleton("sitemapsRule", SitemapsRule.class, rule -> {
// Applies only to URLs whose "url" field contains "sitemap".
rule.setResponseProcessor(container.getComponent("sitemapsResponseProcessor"));
rule.setRuleId("sitemapsRule");
rule.addRule("url", ".*sitemap.*");
}).<//
DefaultResponseProcessor>singleton("defaultResponseProcessor", DefaultResponseProcessor.class, processor -> {
processor.setTransformer(container.getComponent("fileTransformer"));
processor.setSuccessfulHttpCodes(new int[] { 200 });
processor.setNotModifiedHttpCodes(new int[] { 304 });
}).<//
RegexRule>singleton("fileRule", RegexRule.class, rule -> {
// Fallback rule applied when no other rule matches.
rule.setRuleId("fileRule");
rule.setDefaultRule(true);
rule.setResponseProcessor(container.getComponent("defaultResponseProcessor"));
}).<//
PoolingHttpClientConnectionManager>singleton("clientConnectionManager", new PoolingHttpClientConnectionManager(5, TimeUnit.MINUTES), manager -> {
manager.setMaxTotal(200);
manager.setDefaultMaxPerRoute(20);
});
// Cache the components the test methods use directly.
crawler = container.getComponent("crawler");
dataService = container.getComponent("dataService");
urlQueueService = container.getComponent("urlQueueService");
fileTransformer = container.getComponent("fileTransformer");
}
Use of org.codelibs.fess.crawler.extractor.impl.TikaExtractor in the fess-crawler project by codelibs — from the TextTransformerTest class, setUp method:
@Override
protected void setUp() throws Exception {
    super.setUp();
    // Assemble a text transformer backed by a Tika-based extractor factory.
    final StandardCrawlerContainer container = new StandardCrawlerContainer()
            .singleton("extractorFactory", ExtractorFactory.class)
            .singleton("textTransformer", TextTransformer.class)
            .singleton("tikaExtractor", TikaExtractor.class);
    textTransformer = container.getComponent("textTransformer");
    textTransformer.setName("textTransformer");

    final ExtractorFactory factory = container.getComponent("extractorFactory");
    final TikaExtractor extractor = container.getComponent("tikaExtractor");
    // Both plain text and HTML are handled by the Tika extractor.
    factory.addExtractor("text/plain", extractor);
    factory.addExtractor("text/html", extractor);
}
Use of org.codelibs.fess.crawler.extractor.impl.TikaExtractor in the fess project by codelibs — from the DocumentHelper class, getContent method:
/**
 * Returns the document body to index: either the raw extracted content
 * (when configured to keep it, or when it was already produced by
 * TikaExtractor) or a normalized version of it.
 */
public String getContent(final CrawlingConfig crawlingConfig, final ResponseData responseData, final String content, final Map<String, Object> dataMap) {
    if (content == null) {
        // No body at all — normalize to the empty string.
        return StringUtil.EMPTY;
    }
    if (crawlingConfig != null) {
        // Honor a per-config request to keep the original body untouched.
        final Map<String, String> params = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG);
        if (params != null && Constants.TRUE.equalsIgnoreCase(params.get(Param.Config.KEEP_ORIGINAL_BODY))) {
            return content;
        }
    }
    // TikaExtractor output is already normalized upstream; pass it through.
    if (responseData.getMetaDataMap().get(Extractor.class.getSimpleName()) instanceof TikaExtractor) {
        return content;
    }
    // Snapshot the normalization settings once, then run the normalizer.
    final int alphanumLimit = getMaxAlphanumTermSize();
    final int symbolLimit = getMaxSymbolTermSize();
    final boolean removeDuplicates = isDuplicateTermRemoved();
    final int[] spaceCharCodes = getSpaceChars();
    try (final Reader reader = new StringReader(content)) {
        return TextUtil.normalizeText(reader)
                .initialCapacity(content.length())
                .maxAlphanumTermSize(alphanumLimit)
                .maxSymbolTermSize(symbolLimit)
                .duplicateTermRemoved(removeDuplicates)
                .spaceChars(spaceCharCodes)
                .execute();
    } catch (final IOException e) {
        // StringReader should not throw, but fall back to an empty body.
        return StringUtil.EMPTY;
    }
}
Aggregations