Search in sources :

Example 6 with Extractor

use of org.codelibs.fess.crawler.extractor.Extractor in project fess by codelibs.

the class FessFileTransformer method getExtractor.

/**
 * Looks up the extractor registered for the MIME type of the given response.
 *
 * @param responseData the crawled response whose MIME type selects the extractor
 * @return whatever the extractor factory returns for that MIME type
 * @throws FessSystemException if no extractor factory component is available
 */
@Override
protected Extractor getExtractor(final ResponseData responseData) {
    final ExtractorFactory factory = ComponentUtil.getExtractorFactory();
    if (factory != null) {
        final Extractor resolved = factory.getExtractor(responseData.getMimeType());
        if (logger.isDebugEnabled()) {
            logger.debug("url={}, extractor={}", responseData.getUrl(), resolved);
        }
        return resolved;
    }
    // Without a factory there is no way to pick an extractor at all.
    throw new FessSystemException("Could not find extractorFactory.");
}
Also used : ExtractorFactory(org.codelibs.fess.crawler.extractor.ExtractorFactory) Extractor(org.codelibs.fess.crawler.extractor.Extractor) FessSystemException(org.codelibs.fess.exception.FessSystemException)

Example 7 with Extractor

use of org.codelibs.fess.crawler.extractor.Extractor in project fess by codelibs.

the class AbstractFessFileTransformer method generateData.

/**
 * Builds the map of index fields for a crawled file response.
 * <p>
 * The response body is run through the extractor chosen by
 * {@code getExtractor(responseData)}; the extracted text and metadata are then
 * combined with crawling-configuration values (config id, expiry, labels,
 * roles, boost, ...) and written into the returned map.
 *
 * @param responseData the crawled response to convert
 * @return the populated field map, or {@code null} when the extracted content
 *         is blank and {@code isCrawlerDocumentFileIgnoreEmptyContent()} is true
 * @throws CrawlingAccessException (log level WARN) if anything fails while
 *         reading the body or processing the extracted metadata
 */
protected Map<String, Object> generateData(final ResponseData responseData) {
    final Extractor extractor = getExtractor(responseData);
    // Hints handed to the extractor: resource name, MIME type and charset.
    final Map<String, String> params = new HashMap<>();
    params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData));
    final String mimeType = responseData.getMimeType();
    params.put(HttpHeaders.CONTENT_TYPE, mimeType);
    params.put(HttpHeaders.CONTENT_ENCODING, responseData.getCharSet());
    // Accumulates metadata values that are configured to be part of the content.
    final StringBuilder contentMetaBuf = new StringBuilder(1000);
    // dataMap is the result; metaDataMap keeps the raw metadata for the META config lookup at the end.
    final Map<String, Object> dataMap = new HashMap<>();
    final Map<String, Object> metaDataMap = new HashMap<>();
    String content;
    // The response body stream is closed automatically when the try block exits.
    try (final InputStream in = responseData.getResponseBody()) {
        final ExtractData extractData = getExtractData(extractor, in, params);
        content = extractData.getContent();
        // Optionally skip documents without extractable text; the caller receives null.
        if (fessConfig.isCrawlerDocumentFileIgnoreEmptyContent() && StringUtil.isBlank(content)) {
            return null;
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("ExtractData: " + extractData);
        }
        // meta: for every metadata key with non-null values, record the raw values,
        // optionally append them to the content buffer, and apply the name mapping.
        //
        extractData.getKeySet().stream().filter(//
        k -> extractData.getValues(k) != null).forEach(key -> {
            final String[] values = extractData.getValues(key);
            metaDataMap.put(key, values);
            if (fessConfig.isCrawlerMetadataContentIncluded(key)) {
                // Space-join the values and append them, separated from earlier entries by one space.
                final String joinedValue = StringUtils.join(values, ' ');
                if (StringUtil.isNotBlank(joinedValue)) {
                    if (contentMetaBuf.length() > 0) {
                        contentMetaBuf.append(' ');
                    }
                    contentMetaBuf.append(joinedValue.trim());
                }
            }
            // mapping: first = target field name, second = mapping type (compared case-insensitively).
            final Pair<String, String> mapping = fessConfig.getCrawlerMetadataNameMapping(key);
            if (mapping != null) {
                if (Constants.MAPPING_TYPE_ARRAY.equalsIgnoreCase(mapping.getSecond())) {
                    dataMap.put(mapping.getFirst(), values);
                } else if (Constants.MAPPING_TYPE_STRING.equalsIgnoreCase(mapping.getSecond())) {
                    final String joinedValue = StringUtils.join(values, ' ');
                    dataMap.put(mapping.getFirst(), joinedValue.trim());
                } else if (values.length == 1) {
                    // Numeric mappings are applied only to single-valued metadata;
                    // multi-valued entries with a non-array/string type are silently skipped.
                    try {
                        if (Constants.MAPPING_TYPE_LONG.equalsIgnoreCase(mapping.getSecond())) {
                            dataMap.put(mapping.getFirst(), Long.parseLong(values[0]));
                        } else if (Constants.MAPPING_TYPE_DOUBLE.equalsIgnoreCase(mapping.getSecond())) {
                            dataMap.put(mapping.getFirst(), Double.parseDouble(values[0]));
                        } else {
                            logger.warn("Unknown mapping type: {}={}", key, mapping);
                        }
                    } catch (final NumberFormatException e) {
                        // An unparsable number is logged and the field is left unset.
                        logger.warn("Failed to parse " + values[0], e);
                    }
                }
            }
        });
    } catch (final Exception e) {
        // Any failure above (including inside the metadata lambda) is reported as a
        // WARN-level crawling access exception carrying the original cause.
        final CrawlingAccessException rcae = new CrawlingAccessException("Could not get a text from " + responseData.getUrl(), e);
        rcae.setLogLevel(CrawlingAccessException.WARN);
        throw rcae;
    }
    if (content == null) {
        content = StringUtil.EMPTY;
    }
    final String contentMeta = contentMetaBuf.toString().trim();
    // NOTE: from here on this local hides the fessConfig member referenced earlier in the method.
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
    final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
    final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
    final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
    // The crawling config is looked up with the raw session id, not the canonical one.
    final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
    final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
    final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
    final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
    final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
    String url = responseData.getUrl();
    // indexingTarget is resolved from the original URL, before path mapping rewrites it.
    final String indexingTarget = crawlingConfig.getIndexingTarget(url);
    url = pathMappingHelper.replaceUrl(sessionId, url);
    final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
    // Prefer the encoding recorded on the current URL queue entry; otherwise use the response charset.
    String urlEncoding;
    final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
    if (urlQueue != null && urlQueue.getEncoding() != null) {
        urlEncoding = urlQueue.getEncoding();
    } else {
        urlEncoding = responseData.getCharSet();
    }
    // cid
    final String configId = crawlingConfig.getConfigId();
    if (configId != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldConfigId(), configId);
    }
    //  expires
    if (documentExpires != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
    }
    // segment
    putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
    // content: body text and/or metadata text, depending on configuration, separated by one space
    final StringBuilder buf = new StringBuilder(content.length() + 1000);
    if (fessConfig.isCrawlerDocumentFileAppendBodyContent()) {
        buf.append(content);
    }
    if (fessConfig.isCrawlerDocumentFileAppendMetaContent()) {
        if (buf.length() > 0) {
            buf.append(' ');
        }
        buf.append(contentMeta);
    }
    final String bodyBase = buf.toString().trim();
    final String body = documentHelper.getContent(responseData, bodyBase, dataMap);
    putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body);
    // cache: enabled per crawling config (FIELD parameter) or globally, and only for supported MIME types
    if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
        // Only non-empty responses up to the configured maximum size are cached.
        if (responseData.getContentLength() > 0 && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
            // Collapse runs of spaces, tabs, vertical tabs and form feeds into one space; newlines are kept.
            final String cache = content.trim().replaceAll("[ \\t\\x0B\\f]+", " ");
            // text cache
            putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), cache);
            putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
        }
    }
    // digest
    putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), documentHelper.getDigest(responseData, bodyBase, dataMap, fessConfig.getCrawlerDocumentFileMaxDigestLengthAsInteger()));
    // title: only filled in when dataMap does not already contain a title (e.g. from a metadata mapping)
    final String fileName = getFileName(url, urlEncoding);
    if (!dataMap.containsKey(fessConfig.getIndexFieldTitle())) {
        if (url.endsWith("/")) {
            // URL ends with a slash: use a digest of the body as title, or the "no title" label if content is blank.
            if (StringUtil.isNotBlank(content)) {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentFileMaxTitleLengthAsInteger()));
            } else {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fessConfig.getCrawlerDocumentFileNoTitleLabel());
            }
        } else {
            // Otherwise use the file name, falling back to a name decoded from the URL.
            if (StringUtil.isBlank(fileName)) {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), decodeUrlAsName(url, url.startsWith("file:")));
            } else {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fileName);
            }
        }
    }
    // host
    putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHostOnFile(url));
    // site
    putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSiteOnFile(url, urlEncoding));
    // filename
    if (StringUtil.isNotBlank(fileName)) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
    }
    // url
    putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    // created
    final Date now = systemHelper.getCurrentTime();
    putResultDataBody(dataMap, fessConfig.getIndexFieldCreated(), now);
    // TODO anchor
    putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), StringUtil.EMPTY);
    // mimetype
    putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType);
    if (fileTypeHelper != null) {
        // filetype
        putResultDataBody(dataMap, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType));
    }
    // content_length
    putResultDataBody(dataMap, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
    // last_modified (only when the response reports one)
    final Date lastModified = responseData.getLastModified();
    if (lastModified != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLastModified(), lastModified);
        // timestamp
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), lastModified);
    } else {
        // timestamp: fall back to the creation time captured above
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), now);
    }
    // indexingTarget
    putResultDataBody(dataMap, Constants.INDEXING_TARGET, indexingTarget);
    //  boost
    putResultDataBody(dataMap, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost());
    // label: labelType — labels from the crawling config plus those matched against the mapped URL
    final Set<String> labelTypeSet = new HashSet<>();
    for (final String labelType : crawlingConfig.getLabelTypeValues()) {
        labelTypeSet.add(labelType);
    }
    final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
    labelTypeSet.addAll(labelTypeHelper.getMatchedLabelValueSet(url));
    putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeSet);
    // role: roleType — roles derived from the response plus the permissions of the crawling config
    final List<String> roleTypeList = getRoleTypes(responseData);
    stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
    putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList);
    // lang
    if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentFileDefaultLang())) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), fessConfig.getCrawlerDocumentFileDefaultLang());
    }
    // id: computed from the fields collected in dataMap so far
    putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));
    // parentId: temporarily store the (path-mapped) parent URL in the url field so that
    // generateId(dataMap) runs against it — presumably the id is derived from that field; verify in CrawlingInfoHelper.
    String parentUrl = responseData.getParentUrl();
    if (StringUtil.isNotBlank(parentUrl)) {
        parentUrl = pathMappingHelper.replaceUrl(sessionId, parentUrl);
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), parentUrl);
        putResultDataBody(dataMap, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(dataMap));
        // set again: restore this document's own URL
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    }
    // from config
    final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT);
    // META config: key = target field, value = comma-separated names looked up in the extracted metadata.
    final Map<String, String> metaConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.META);
    for (final Map.Entry<String, String> entry : metaConfigMap.entrySet()) {
        final String key = entry.getKey();
        final String[] values = entry.getValue().split(",");
        for (final String value : values) {
            // metaDataMap.get(value) is passed through as-is, including null when the metadata name is absent.
            putResultDataWithTemplate(dataMap, key, metaDataMap.get(value), scriptConfigMap.get(key));
        }
    }
    // VALUE config: key = target field, value = the value to store, with the SCRIPT entry for the same key as template.
    final Map<String, String> valueConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.VALUE);
    for (final Map.Entry<String, String> entry : valueConfigMap.entrySet()) {
        final String key = entry.getKey();
        putResultDataWithTemplate(dataMap, key, entry.getValue(), scriptConfigMap.get(key));
    }
    return dataMap;
}
Also used : CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) Constants(org.codelibs.fess.Constants) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) URLDecoder(java.net.URLDecoder) Date(java.util.Date) LoggerFactory(org.slf4j.LoggerFactory) Pair(org.codelibs.core.misc.Pair) HashMap(java.util.HashMap) TikaMetadataKeys(org.apache.tika.metadata.TikaMetadataKeys) SerializeUtil(org.codelibs.core.io.SerializeUtil) CrawlingParameterUtil(org.codelibs.fess.crawler.util.CrawlingParameterUtil) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) StringUtils(org.apache.commons.lang3.StringUtils) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ConfigName(org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) ExtractData(org.codelibs.fess.crawler.entity.ExtractData) Map(java.util.Map) AccessResultData(org.codelibs.fess.crawler.entity.AccessResultData) AbstractTransformer(org.codelibs.fess.crawler.transformer.impl.AbstractTransformer) SambaHelper(org.codelibs.fess.helper.SambaHelper) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) Logger(org.slf4j.Logger) ResultData(org.codelibs.fess.crawler.entity.ResultData) Extractor(org.codelibs.fess.crawler.extractor.Extractor) StringUtil(org.codelibs.core.lang.StringUtil) SID(jcifs.smb.SID) Set(java.util.Set) List(java.util.List) ACE(jcifs.smb.ACE) FileTypeHelper(org.codelibs.fess.helper.FileTypeHelper) ComponentUtil(org.codelibs.fess.util.ComponentUtil) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) SystemHelper(org.codelibs.fess.helper.SystemHelper) LabelTypeHelper(org.codelibs.fess.helper.LabelTypeHelper) PathMappingHelper(org.codelibs.fess.helper.PathMappingHelper) DocumentHelper(org.codelibs.fess.helper.DocumentHelper) HttpHeaders(org.apache.tika.metadata.HttpHeaders) 
SmbClient(org.codelibs.fess.crawler.client.smb.SmbClient) UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue) InputStream(java.io.InputStream) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) HashMap(java.util.HashMap) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) PathMappingHelper(org.codelibs.fess.helper.PathMappingHelper) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) Extractor(org.codelibs.fess.crawler.extractor.Extractor) HashSet(java.util.HashSet) ExtractData(org.codelibs.fess.crawler.entity.ExtractData) DocumentHelper(org.codelibs.fess.helper.DocumentHelper) InputStream(java.io.InputStream) LabelTypeHelper(org.codelibs.fess.helper.LabelTypeHelper) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) Date(java.util.Date) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) SystemHelper(org.codelibs.fess.helper.SystemHelper) FileTypeHelper(org.codelibs.fess.helper.FileTypeHelper) HashMap(java.util.HashMap) Map(java.util.Map)

Example 8 with Extractor

use of org.codelibs.fess.crawler.extractor.Extractor in project fess-crawler by codelibs.

the class TikaExtractor method getText.

/**
 * Extracts text and metadata from the given stream using {@code TikaDetectParser}.
 * <p>
 * If the first parse yields blank content, extraction is retried without the
 * resource name, then without the content type, and finally (when
 * {@code readAsTextIfFailed} is set) by reading the data line by line as plain
 * text. A {@link TikaException} caused by a {@link SAXException} is handed to
 * the {@code "xmlExtractor"} component when one is registered.
 * <p>
 * {@code System.out} and {@code System.err} are redirected to in-memory buffers
 * while parsing and restored afterwards; anything captured is logged.
 *
 * @param inputStream the data to extract; a {@link ByteArrayInputStream} is re-read
 *                    via mark/reset, any other stream is first copied to a temp file
 * @param params      optional extraction hints (resource name, content type,
 *                    content encoding, ...); may be {@code null}
 * @return the extracted content together with the metadata of the first parse
 * @throws CrawlerSystemException if {@code inputStream} is {@code null}
 * @throws ExtractException if the temp file cannot be created or extraction fails
 */
@Override
public ExtractData getText(final InputStream inputStream, final Map<String, String> params) {
    if (inputStream == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    final File tempFile;
    final boolean isByteStream = inputStream instanceof ByteArrayInputStream;
    if (isByteStream) {
        // Remember the current position so the retries below can reset() and re-read.
        inputStream.mark(0);
        tempFile = null;
    } else {
        try {
            tempFile = File.createTempFile("tikaExtractor-", ".out");
        } catch (final IOException e) {
            throw new ExtractException("Could not create a temp file.", e);
        }
    }
    try {
        // Capture whatever the parsers print to stdout/stderr; restored in the finally block below.
        final PrintStream originalOutStream = System.out;
        final ByteArrayOutputStream outStream = new ByteArrayOutputStream();
        System.setOut(new PrintStream(outStream, true));
        final PrintStream originalErrStream = System.err;
        final ByteArrayOutputStream errStream = new ByteArrayOutputStream();
        System.setErr(new PrintStream(errStream, true));
        try {
            final String resourceName = params == null ? null : params.get(TikaMetadataKeys.RESOURCE_NAME_KEY);
            final String contentType = params == null ? null : params.get(HttpHeaders.CONTENT_TYPE);
            String contentEncoding = params == null ? null : params.get(HttpHeaders.CONTENT_ENCODING);
            String pdfPassword = getPassword(params);
            final Metadata metadata = createMetadata(resourceName, contentType, contentEncoding, pdfPassword);
            final Parser parser = new TikaDetectParser();
            final ParseContext parseContext = createParseContext(parser, params);
            // First attempt: all hints supplied. Non-byte streams are spooled to the temp file here
            // so that later attempts can re-open it.
            String content = getContent(writer -> {
                InputStream in = null;
                try {
                    if (!isByteStream) {
                        try (OutputStream out = new FileOutputStream(tempFile)) {
                            CopyUtil.copy(inputStream, out);
                        }
                        in = new FileInputStream(tempFile);
                    } else {
                        in = inputStream;
                    }
                    parser.parse(in, new BodyContentHandler(writer), metadata, parseContext);
                } finally {
                    CloseableUtil.closeQuietly(in);
                }
            }, contentEncoding);
            if (StringUtil.isBlank(content)) {
                if (resourceName != null) {
                    // Second attempt: drop the resource name hint.
                    if (logger.isDebugEnabled()) {
                        logger.debug("retry without a resource name: {}", resourceName);
                    }
                    final Metadata metadata2 = createMetadata(null, contentType, contentEncoding, pdfPassword);
                    content = getContent(writer -> {
                        InputStream in = null;
                        try {
                            if (isByteStream) {
                                inputStream.reset();
                                in = inputStream;
                            } else {
                                in = new FileInputStream(tempFile);
                            }
                            parser.parse(in, new BodyContentHandler(writer), metadata2, parseContext);
                        } finally {
                            CloseableUtil.closeQuietly(in);
                        }
                    }, contentEncoding);
                }
                if (StringUtil.isBlank(content) && contentType != null) {
                    // Third attempt: drop the content type hint as well.
                    if (logger.isDebugEnabled()) {
                        logger.debug("retry without a content type: {}", contentType);
                    }
                    final Metadata metadata3 = createMetadata(null, null, contentEncoding, pdfPassword);
                    content = getContent(writer -> {
                        InputStream in = null;
                        try {
                            if (isByteStream) {
                                inputStream.reset();
                                in = inputStream;
                            } else {
                                in = new FileInputStream(tempFile);
                            }
                            parser.parse(in, new BodyContentHandler(writer), metadata3, parseContext);
                        } finally {
                            CloseableUtil.closeQuietly(in);
                        }
                    }, contentEncoding);
                }
                if (readAsTextIfFailed && StringUtil.isBlank(content)) {
                    // Last resort: treat the data as plain text in the given (or UTF-8) encoding.
                    if (logger.isDebugEnabled()) {
                        logger.debug("read the content as a text.");
                    }
                    if (contentEncoding == null) {
                        contentEncoding = Constants.UTF_8;
                    }
                    final String enc = contentEncoding;
                    content = getContent(writer -> {
                        BufferedReader br = null;
                        try {
                            if (isByteStream) {
                                inputStream.reset();
                                br = new BufferedReader(new InputStreamReader(inputStream, enc));
                            } else {
                                br = new BufferedReader(new InputStreamReader(new FileInputStream(tempFile), enc));
                            }
                            // NOTE(review): readLine() strips line terminators and none are written back,
                            // so adjacent lines are concatenated — confirm this is intended.
                            String line;
                            while ((line = br.readLine()) != null) {
                                writer.write(line);
                            }
                        } catch (final Exception e) {
                            logger.warn("Could not read " + (tempFile != null ? tempFile.getAbsolutePath() : "a byte stream"), e);
                        } finally {
                            CloseableUtil.closeQuietly(br);
                        }
                    }, contentEncoding);
                }
            }
            final ExtractData extractData = new ExtractData(content);
            // Only the metadata object of the first attempt is exported; names are sorted before being copied.
            final String[] names = metadata.names();
            Arrays.sort(names);
            for (final String name : names) {
                extractData.putValues(name, metadata.getValues(name));
            }
            if (logger.isDebugEnabled()) {
                logger.debug("Result: metadata: {}", metadata);
            }
            return extractData;
        } catch (final TikaException e) {
            // getMessage() may legitimately be null; guard it so a message-less exception still
            // reaches the XML fallback instead of being replaced by a NullPointerException.
            final String message = e.getMessage();
            // Exceptions mentioning "bomb" are rethrown without trying the XML fallback
            // (presumably Tika's zip-bomb protection — confirm against the Tika version in use).
            if (message != null && message.contains("bomb")) {
                throw e;
            }
            final Throwable cause = e.getCause();
            if (cause instanceof SAXException) {
                // XML parse failure: let the dedicated XML extractor try, if one is registered.
                final Extractor xmlExtractor = crawlerContainer.getComponent("xmlExtractor");
                if (xmlExtractor != null) {
                    InputStream in = null;
                    try {
                        if (isByteStream) {
                            inputStream.reset();
                            in = inputStream;
                        } else {
                            in = new FileInputStream(tempFile);
                        }
                        return xmlExtractor.getText(in, params);
                    } finally {
                        CloseableUtil.closeQuietly(in);
                    }
                }
            }
            throw e;
        } finally {
            // Always restore the real stdout/stderr before logging what was captured.
            if (originalOutStream != null) {
                System.setOut(originalOutStream);
            }
            if (originalErrStream != null) {
                System.setErr(originalErrStream);
            }
            try {
                if (logger.isInfoEnabled()) {
                    final byte[] bs = outStream.toByteArray();
                    if (bs.length != 0) {
                        logger.info(new String(bs, outputEncoding));
                    }
                }
                if (logger.isWarnEnabled()) {
                    final byte[] bs = errStream.toByteArray();
                    if (bs.length != 0) {
                        logger.warn(new String(bs, outputEncoding));
                    }
                }
            } catch (final Exception e) {
                // NOP: logging the captured output is best-effort and must not mask the extraction outcome.
            }
        }
    } catch (final Exception e) {
        throw new ExtractException("Could not extract a content.", e);
    } finally {
        // tempFile is null for byte-array input; otherwise clean up the spooled copy.
        if (tempFile != null && !tempFile.delete()) {
            logger.warn("Failed to delete " + tempFile.getAbsolutePath());
        }
    }
}
Also used : Arrays(java.util.Arrays) BufferedInputStream(java.io.BufferedInputStream) Parser(org.apache.tika.parser.Parser) TesseractOCRConfig(org.apache.tika.parser.ocr.TesseractOCRConfig) LoggerFactory(org.slf4j.LoggerFactory) TikaMetadataKeys(org.apache.tika.metadata.TikaMetadataKeys) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) Metadata(org.apache.tika.metadata.Metadata) ByteArrayInputStream(java.io.ByteArrayInputStream) Map(java.util.Map) CopyUtil(org.codelibs.core.io.CopyUtil) ParsingEmbeddedDocumentExtractor(org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor) TemporaryResources(org.apache.tika.io.TemporaryResources) Extractor(org.codelibs.fess.crawler.extractor.Extractor) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) ByteArrayOutputStream(org.apache.commons.io.output.ByteArrayOutputStream) CompositeParser(org.apache.tika.parser.CompositeParser) ExtractException(org.codelibs.fess.crawler.exception.ExtractException) Reader(java.io.Reader) SecureContentHandler(org.apache.tika.sax.SecureContentHandler) ParseContext(org.apache.tika.parser.ParseContext) SAXException(org.xml.sax.SAXException) Writer(java.io.Writer) PostConstruct(javax.annotation.PostConstruct) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaConfig(org.apache.tika.config.TikaConfig) EmbeddedDocumentExtractor(org.apache.tika.extractor.EmbeddedDocumentExtractor) MediaType(org.apache.tika.mime.MediaType) TikaException(org.apache.tika.exception.TikaException) ExtractData(org.codelibs.fess.crawler.entity.ExtractData) PasswordProvider(org.apache.tika.parser.PasswordProvider) OutputStreamWriter(java.io.OutputStreamWriter) TikaInputStream(org.apache.tika.io.TikaInputStream) ContentHandler(org.xml.sax.ContentHandler) OutputStream(java.io.OutputStream) PrintStream(java.io.PrintStream) Logger(org.slf4j.Logger) BufferedWriter(java.io.BufferedWriter) DeferredFileOutputStream(org.apache.commons.io.output.DeferredFileOutputStream) 
PDFParserConfig(org.apache.tika.parser.pdf.PDFParserConfig) StringUtil(org.codelibs.core.lang.StringUtil) FileOutputStream(java.io.FileOutputStream) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) Detector(org.apache.tika.detect.Detector) InputStreamReader(java.io.InputStreamReader) File(java.io.File) CloseableUtil(org.codelibs.core.io.CloseableUtil) TextUtil(org.codelibs.fess.crawler.util.TextUtil) Constants(org.codelibs.fess.crawler.Constants) BufferedReader(java.io.BufferedReader) HttpHeaders(org.apache.tika.metadata.HttpHeaders) InputStream(java.io.InputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayOutputStream(org.apache.commons.io.output.ByteArrayOutputStream) OutputStream(java.io.OutputStream) DeferredFileOutputStream(org.apache.commons.io.output.DeferredFileOutputStream) FileOutputStream(java.io.FileOutputStream) Metadata(org.apache.tika.metadata.Metadata) SAXException(org.xml.sax.SAXException) ParsingEmbeddedDocumentExtractor(org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor) Extractor(org.codelibs.fess.crawler.extractor.Extractor) EmbeddedDocumentExtractor(org.apache.tika.extractor.EmbeddedDocumentExtractor) PrintStream(java.io.PrintStream) ExtractException(org.codelibs.fess.crawler.exception.ExtractException) ExtractData(org.codelibs.fess.crawler.entity.ExtractData) TikaException(org.apache.tika.exception.TikaException) InputStreamReader(java.io.InputStreamReader) BufferedInputStream(java.io.BufferedInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) IOException(java.io.IOException) ByteArrayOutputStream(org.apache.commons.io.output.ByteArrayOutputStream) FileInputStream(java.io.FileInputStream) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) ExtractException(org.codelibs.fess.crawler.exception.ExtractException) 
SAXException(org.xml.sax.SAXException) TikaException(org.apache.tika.exception.TikaException) IOException(java.io.IOException) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) ByteArrayInputStream(java.io.ByteArrayInputStream) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) DeferredFileOutputStream(org.apache.commons.io.output.DeferredFileOutputStream) FileOutputStream(java.io.FileOutputStream) ParseContext(org.apache.tika.parser.ParseContext) BufferedReader(java.io.BufferedReader) File(java.io.File)

Example 9 with Extractor

use of org.codelibs.fess.crawler.extractor.Extractor in project fess-crawler by codelibs.

the class JodExtractor method getOutputContent.

/**
 * Returns the textual content of a converted output file.
 * <p>
 * When an extractor is available for the output extension, the file is passed
 * through it (with the file name supplied as the resource name). Otherwise the
 * raw bytes are decoded with {@code outputEncoding}, falling back to UTF-8 if
 * that encoding name is not supported.
 *
 * @param outputFile the converted file to read
 * @param outExt     the output extension used to select an extractor
 * @return the extracted or decoded content
 * @throws ExtractException if the file cannot be opened for extraction
 */
protected String getOutputContent(final File outputFile, final String outExt) {
    final Extractor delegate = getExtractor(outExt);
    if (delegate == null) {
        // No extractor for this extension: decode the raw bytes directly.
        try {
            return new String(FileUtil.readBytes(outputFile), outputEncoding);
        } catch (final UnsupportedEncodingException e) {
            return new String(FileUtil.readBytes(outputFile), Constants.UTF_8_CHARSET);
        }
    }
    final Map<String, String> extractParams = new HashMap<>();
    extractParams.put(TikaMetadataKeys.RESOURCE_NAME_KEY, outputFile.getName());
    FileInputStream stream = null;
    try {
        stream = new FileInputStream(outputFile);
        return delegate.getText(stream, extractParams).getContent();
    } catch (final FileNotFoundException e) {
        throw new ExtractException("Could not open " + outputFile.getAbsolutePath(), e);
    } finally {
        CloseableUtil.closeQuietly(stream);
    }
}
Also used : ExtractException(org.codelibs.fess.crawler.exception.ExtractException) ExtractData(org.codelibs.fess.crawler.entity.ExtractData) HashMap(java.util.HashMap) FileNotFoundException(java.io.FileNotFoundException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) Extractor(org.codelibs.fess.crawler.extractor.Extractor) FileInputStream(java.io.FileInputStream)

Example 10 with Extractor

use of org.codelibs.fess.crawler.extractor.Extractor in project fess-crawler by codelibs.

the class LhaExtractor method getText.

/**
 * Extracts text from an LHA archive supplied as a stream.
 *
 * <p>The stream is first copied to a temporary {@code .lzh} file, which is
 * then opened as an {@code LhaFile}. For each entry, a MIME type is looked
 * up from the entry path and, if an extractor exists for it, the entry's
 * text is appended to the result followed by a newline. Entries without a
 * MIME type or extractor are skipped, and a failure while extracting a
 * single entry is only logged at debug level. The temporary file is
 * deleted before returning.</p>
 *
 * @param in the archive content; must not be {@code null}
 * @param params extraction parameters (not read by this method)
 * @return the trimmed concatenation of the text of all extractable entries
 * @throws CrawlerSystemException if {@code in} is {@code null}
 * @throws MaxLengthExceededException if {@code maxContentSize} is not
 *         {@code -1} and the accumulated original size of the entries
 *         exceeds it
 * @throws ExtractException if the archive cannot be copied or read
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
    final ExtractorFactory extractorFactory = getExtractorFactory();
    final StringBuilder text = new StringBuilder(1000);
    File archiveFile = null;
    LhaFile archive = null;
    try {
        // LhaFile is constructed from a File, so spool the stream to disk first.
        archiveFile = File.createTempFile("crawler-", ".lzh");
        try (FileOutputStream archiveOut = new FileOutputStream(archiveFile)) {
            CopyUtil.copy(in, archiveOut);
        }
        archive = new LhaFile(archiveFile);
        @SuppressWarnings("unchecked") final Enumeration<LhaHeader> headers = archive.entries();
        long totalSize = 0;
        while (headers.hasMoreElements()) {
            final LhaHeader header = headers.nextElement();
            // The size limit is checked against the running total of original entry sizes.
            totalSize += header.getOriginalSize();
            if (maxContentSize != -1 && totalSize > maxContentSize) {
                throw new MaxLengthExceededException("Extracted size is " + totalSize + " > " + maxContentSize);
            }
            final String entryPath = header.getPath();
            final String entryMimeType = mimeTypeHelper.getContentType(null, entryPath);
            if (entryMimeType == null) {
                continue;
            }
            final Extractor entryExtractor = extractorFactory.getExtractor(entryMimeType);
            if (entryExtractor == null) {
                continue;
            }
            InputStream entryStream = null;
            try {
                entryStream = archive.getInputStream(header);
                final Map<String, String> entryParams = new HashMap<>();
                entryParams.put(TikaMetadataKeys.RESOURCE_NAME_KEY, entryPath);
                // The entry stream is wrapped in IgnoreCloseInputStream and closed here in the finally block.
                final ExtractData entryData = entryExtractor.getText(new IgnoreCloseInputStream(entryStream), entryParams);
                text.append(entryData.getContent());
                text.append('\n');
            } catch (final Exception e) {
                // Best effort: a failing entry must not abort the whole archive.
                if (logger.isDebugEnabled()) {
                    logger.debug("Exception in an internal extractor.", e);
                }
            } finally {
                CloseableUtil.closeQuietly(entryStream);
            }
        }
    } catch (final MaxLengthExceededException e) {
        // Propagate the size-limit violation unchanged instead of wrapping it.
        throw e;
    } catch (final Exception e) {
        throw new ExtractException("Could not extract a content.", e);
    } finally {
        if (archive != null) {
            try {
                archive.close();
            } catch (final IOException e) {
            // ignore
            }
        }
        if (archiveFile != null && !archiveFile.delete()) {
            logger.warn("Failed to delete " + archiveFile.getAbsolutePath());
        }
    }
    final String result = text.toString().trim();
    return new ExtractData(result);
}
Also used : ExtractException(org.codelibs.fess.crawler.exception.ExtractException) ExtractData(org.codelibs.fess.crawler.entity.ExtractData) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) HashMap(java.util.HashMap) MimeTypeHelper(org.codelibs.fess.crawler.helper.MimeTypeHelper) ExtractorFactory(org.codelibs.fess.crawler.extractor.ExtractorFactory) IgnoreCloseInputStream(org.codelibs.fess.crawler.util.IgnoreCloseInputStream) InputStream(java.io.InputStream) LhaFile(jp.gr.java_conf.dangan.util.lha.LhaFile) IOException(java.io.IOException) IOException(java.io.IOException) ExtractException(org.codelibs.fess.crawler.exception.ExtractException) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) LhaHeader(jp.gr.java_conf.dangan.util.lha.LhaHeader) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) FileOutputStream(java.io.FileOutputStream) Extractor(org.codelibs.fess.crawler.extractor.Extractor) LhaFile(jp.gr.java_conf.dangan.util.lha.LhaFile) File(java.io.File) IgnoreCloseInputStream(org.codelibs.fess.crawler.util.IgnoreCloseInputStream)

Aggregations

Extractor (org.codelibs.fess.crawler.extractor.Extractor)10 HashMap (java.util.HashMap)7 CrawlerSystemException (org.codelibs.fess.crawler.exception.CrawlerSystemException)6 ExtractException (org.codelibs.fess.crawler.exception.ExtractException)6 InputStream (java.io.InputStream)5 ExtractData (org.codelibs.fess.crawler.entity.ExtractData)5 ExtractorFactory (org.codelibs.fess.crawler.extractor.ExtractorFactory)5 IOException (java.io.IOException)3 UnsupportedEncodingException (java.io.UnsupportedEncodingException)3 BufferedInputStream (java.io.BufferedInputStream)2 File (java.io.File)2 FileInputStream (java.io.FileInputStream)2 FileOutputStream (java.io.FileOutputStream)2 Map (java.util.Map)2 ArchiveInputStream (org.apache.commons.compress.archivers.ArchiveInputStream)2 HttpHeaders (org.apache.tika.metadata.HttpHeaders)2 TikaMetadataKeys (org.apache.tika.metadata.TikaMetadataKeys)2 StringUtil (org.codelibs.core.lang.StringUtil)2 MaxLengthExceededException (org.codelibs.fess.crawler.exception.MaxLengthExceededException)2 MimeTypeHelper (org.codelibs.fess.crawler.helper.MimeTypeHelper)2