Search in sources :

Example 11 with ExtractException

use of org.codelibs.fess.crawler.exception.ExtractException in project fess-crawler by codelibs.

The following example is taken from the class CommandExtractor, method getText.

/*
 * (non-Javadoc)
 *
 * @see org.codelibs.fess.crawler.extractor.Extractor#getText(java.io.InputStream,
 * java.util.Map)
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    final String resourceName = params == null ? null : params.get(TikaMetadataKeys.RESOURCE_NAME_KEY);
    // Derive a temp-file prefix and extension from the resource name, if one was supplied.
    String filePrefix = "none";
    String extension = "";
    if (StringUtil.isNotBlank(resourceName)) {
        final String fileName = getFileName(resourceName);
        final String[] parts = fileName.split("\\.");
        if (parts.length > 1) {
            // Rejoin everything before the last dot; an empty leading segment
            // (e.g. ".profile") contributes nothing and adds no separator.
            final StringBuilder prefixBuf = new StringBuilder(100);
            for (int i = 0; i < parts.length - 1; i++) {
                if (prefixBuf.length() != 0) {
                    prefixBuf.append('.');
                }
                prefixBuf.append(parts[i]);
            }
            filePrefix = prefixBuf.toString();
            extension = parts[parts.length - 1];
        } else {
            filePrefix = fileName;
        }
    }
    File inputFile = null;
    File outputFile = null;
    try {
        inputFile = File.createTempFile("cmdextin_" + filePrefix + "_",
                StringUtil.isNotBlank(extension) ? "." + extension : extension, tempDir);
        // The output suffix is the configured outputExtension when set,
        // otherwise it mirrors the input extension.
        final String outputSuffix;
        if (outputExtension != null) {
            outputSuffix = outputExtension;
        } else {
            outputSuffix = StringUtil.isNotBlank(extension) ? "." + extension : extension;
        }
        outputFile = File.createTempFile("cmdextout_" + filePrefix + "_", outputSuffix, tempDir);
        // Spool the stream to disk so the external command can read it as a file.
        CopyUtil.copy(in, inputFile);
        executeCommand(inputFile, outputFile);
        final ExtractData extractData = new ExtractData(new String(FileUtil.readBytes(outputFile), outputEncoding));
        if (StringUtil.isNotBlank(resourceName)) {
            extractData.putValues("resourceName", new String[] { resourceName });
        }
        return extractData;
    } catch (final IOException e) {
        throw new ExtractException("Could not extract a content.", e);
    } finally {
        // Best-effort cleanup of both temp files; deletion failure is only logged.
        if (inputFile != null && !inputFile.delete()) {
            logger.info("Failed to delete " + inputFile.getAbsolutePath());
        }
        if (outputFile != null && !outputFile.delete()) {
            logger.info("Failed to delete " + outputFile.getAbsolutePath());
        }
    }
}
Also used : ExtractException(org.codelibs.fess.crawler.exception.ExtractException) ExtractData(org.codelibs.fess.crawler.entity.ExtractData) IOException(java.io.IOException) File(java.io.File)

Example 12 with ExtractException

use of org.codelibs.fess.crawler.exception.ExtractException in project fess-crawler by codelibs.

The following example is taken from the class EmlExtractor, method getText.

/* (non-Javadoc)
 * @see org.codelibs.robot.extractor.Extractor#getText(java.io.InputStream, java.util.Map)
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    // Merge per-request params over the configured mail defaults.
    final Properties props = new Properties(mailProperties);
    if (params != null) {
        for (final Map.Entry<String, String> entry : params.entrySet()) {
            props.put(entry.getKey(), entry.getValue());
        }
    }
    try {
        // Session.getInstance honors the given properties on every call.
        // getDefaultInstance would cache the first session created in the JVM
        // and silently ignore the per-request properties merged above.
        final Session mailSession = Session.getInstance(props, null);
        final MimeMessage message = new MimeMessage(mailSession, in);
        final String content = getBodyText(message);
        final ExtractData data = new ExtractData(content != null ? content : StringUtil.EMPTY);
        // Copy every raw header, then well-known MIME attributes under stable keys.
        final Enumeration<Header> headers = message.getAllHeaders();
        while (headers.hasMoreElements()) {
            final Header header = headers.nextElement();
            data.putValue(header.getName(), header.getValue());
        }
        putValue(data, "Content-ID", message.getContentID());
        putValue(data, "Content-Language", message.getContentLanguage());
        putValue(data, "Content-MD5", message.getContentMD5());
        putValue(data, "Description", message.getDescription());
        putValue(data, "Disposition", message.getDisposition());
        putValue(data, "Encoding", message.getEncoding());
        putValue(data, "File-Name", message.getFileName());
        putValue(data, "From", message.getFrom());
        putValue(data, "Line-Count", message.getLineCount());
        putValue(data, "Message-ID", message.getMessageID());
        putValue(data, "Message-Number", message.getMessageNumber());
        putValue(data, "Received-Date", getReceivedDate(message));
        putValue(data, "Reply-To", message.getReplyTo());
        putValue(data, "Sender", message.getSender());
        putValue(data, "Sent-Date", message.getSentDate());
        putValue(data, "Size", message.getSize());
        putValue(data, "Subject", message.getSubject());
        // NOTE: "Receipients" is misspelled but kept for backward compatibility
        // with existing consumers of this key.
        putValue(data, "Receipients", message.getAllRecipients());
        putValue(data, "To", message.getRecipients(Message.RecipientType.TO));
        putValue(data, "Cc", message.getRecipients(Message.RecipientType.CC));
        putValue(data, "Bcc", message.getRecipients(Message.RecipientType.BCC));
        return data;
    } catch (final MessagingException e) {
        throw new ExtractException(e);
    }
}
Also used : ExtractException(org.codelibs.fess.crawler.exception.ExtractException) ExtractData(org.codelibs.fess.crawler.entity.ExtractData) Header(javax.mail.Header) MimeMessage(javax.mail.internet.MimeMessage) MessagingException(javax.mail.MessagingException) Properties(java.util.Properties) HashMap(java.util.HashMap) Map(java.util.Map) Session(javax.mail.Session)

Example 13 with ExtractException

use of org.codelibs.fess.crawler.exception.ExtractException in project fess-crawler by codelibs.

The following example is taken from the class JodExtractor, method getOutputContent.

/**
 * Returns the textual content of a converted output file.
 * <p>
 * If an extractor is registered for {@code outExt}, the file is run through
 * it; otherwise the raw bytes are decoded with {@code outputEncoding},
 * falling back to UTF-8 when that encoding is unsupported.
 *
 * @param outputFile the converted file to read
 * @param outExt the output file extension used to look up an extractor
 * @return the extracted or decoded content
 * @throws ExtractException if the output file cannot be opened
 */
protected String getOutputContent(final File outputFile, final String outExt) {
    final Extractor extractor = getExtractor(outExt);
    if (extractor != null) {
        final Map<String, String> params = new HashMap<>();
        params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, outputFile.getName());
        FileInputStream in = null;
        try {
            in = new FileInputStream(outputFile);
            final ExtractData extractData = extractor.getText(in, params);
            return extractData.getContent();
        } catch (final FileNotFoundException e) {
            throw new ExtractException("Could not open " + outputFile.getAbsolutePath(), e);
        } finally {
            CloseableUtil.closeQuietly(in);
        }
    }
    // No extractor for this extension: read the bytes once, then decode.
    // (Previously the file was re-read from disk on the fallback path.)
    final byte[] content = FileUtil.readBytes(outputFile);
    try {
        return new String(content, outputEncoding);
    } catch (final UnsupportedEncodingException e) {
        return new String(content, Constants.UTF_8_CHARSET);
    }
}
Also used : ExtractException(org.codelibs.fess.crawler.exception.ExtractException) ExtractData(org.codelibs.fess.crawler.entity.ExtractData) HashMap(java.util.HashMap) FileNotFoundException(java.io.FileNotFoundException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) Extractor(org.codelibs.fess.crawler.extractor.Extractor) FileInputStream(java.io.FileInputStream)

Example 14 with ExtractException

use of org.codelibs.fess.crawler.exception.ExtractException in project fess-crawler by codelibs.

The following example is taken from the class JodExtractor, method getText.

/*
 * (non-Javadoc)
 *
 * @see org.codelibs.fess.crawler.extractor.Extractor#getText(java.io.InputStream,
 * java.util.Map)
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("in is null.");
    }
    final String resourceName = params == null ? null : params.get(TikaMetadataKeys.RESOURCE_NAME_KEY);
    // Derive a temp-file prefix and extension from the resource name, if one was supplied.
    String extension;
    String filePrefix;
    if (StringUtil.isNotBlank(resourceName)) {
        final String name = getFileName(resourceName);
        final String[] strings = name.split("\\.");
        final StringBuilder buf = new StringBuilder(100);
        if (strings.length > 1) {
            for (int i = 0; i < strings.length - 1; i++) {
                if (buf.length() != 0) {
                    buf.append('.');
                }
                buf.append(strings[i]);
            }
            filePrefix = buf.toString();
            extension = strings[strings.length - 1];
        } else {
            filePrefix = name;
            extension = "";
        }
    } else {
        filePrefix = "none";
        extension = "";
    }
    File inputFile = null;
    File outputFile = null;
    try {
        inputFile = File.createTempFile("jodextin_" + filePrefix + "_",
                StringUtil.isNotBlank(extension) ? "." + extension : extension, tempDir);
        final String outExt = getOutputExtension(extension);
        // Fixed copy-paste: prefix was "cmdextout_" (from CommandExtractor);
        // use "jodextout_" to match this extractor's "jodextin_" input prefix.
        outputFile = File.createTempFile("jodextout_" + filePrefix + "_", "." + outExt, tempDir);
        // Spool the stream to disk so the office converter can read it as a file.
        CopyUtil.copy(in, inputFile);
        final OfficeDocumentConverter converter = new OfficeDocumentConverter(officeManager);
        converter.convert(inputFile, outputFile);
        final ExtractData extractData = new ExtractData(getOutputContent(outputFile, outExt));
        if (StringUtil.isNotBlank(resourceName)) {
            extractData.putValues("resourceName", new String[] { resourceName });
        }
        return extractData;
    } catch (final IOException e) {
        throw new ExtractException("Could not extract a content.", e);
    } finally {
        // Best-effort cleanup of both temp files; deletion failure is only logged.
        if (inputFile != null && !inputFile.delete()) {
            logger.info("Failed to delete " + inputFile.getAbsolutePath());
        }
        if (outputFile != null && !outputFile.delete()) {
            logger.info("Failed to delete " + outputFile.getAbsolutePath());
        }
    }
}
Also used : ExtractException(org.codelibs.fess.crawler.exception.ExtractException) ExtractData(org.codelibs.fess.crawler.entity.ExtractData) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) IOException(java.io.IOException) File(java.io.File) OfficeDocumentConverter(org.artofsolving.jodconverter.OfficeDocumentConverter)

Example 15 with ExtractException

use of org.codelibs.fess.crawler.exception.ExtractException in project fess-crawler by codelibs.

The following example is taken from the class LhaExtractor, method getText.

/**
 * Extracts text from an LHA archive by spooling it to a temp file, then
 * running each entry through the extractor matching its MIME type.
 * Per-entry extraction failures are logged at debug level and skipped;
 * exceeding the size limit aborts the whole extraction.
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
    final ExtractorFactory extractorFactory = getExtractorFactory();
    final StringBuilder content = new StringBuilder(1000);
    File archiveFile = null;
    LhaFile lhaFile = null;
    try {
        // The LHA library needs random access, so spool the stream to a file.
        archiveFile = File.createTempFile("crawler-", ".lzh");
        try (FileOutputStream fos = new FileOutputStream(archiveFile)) {
            CopyUtil.copy(in, fos);
        }
        lhaFile = new LhaFile(archiveFile);
        @SuppressWarnings("unchecked")
        final Enumeration<LhaHeader> entries = lhaFile.entries();
        long extractedSize = 0;
        while (entries.hasMoreElements()) {
            final LhaHeader header = entries.nextElement();
            // Enforce the cumulative uncompressed-size limit before extracting.
            extractedSize += header.getOriginalSize();
            if (maxContentSize != -1 && extractedSize > maxContentSize) {
                throw new MaxLengthExceededException("Extracted size is " + extractedSize + " > " + maxContentSize);
            }
            final String entryName = header.getPath();
            final String mimeType = mimeTypeHelper.getContentType(null, entryName);
            if (mimeType == null) {
                continue;
            }
            final Extractor entryExtractor = extractorFactory.getExtractor(mimeType);
            if (entryExtractor == null) {
                continue;
            }
            InputStream entryIn = null;
            try {
                entryIn = lhaFile.getInputStream(header);
                final Map<String, String> entryParams = new HashMap<>();
                entryParams.put(TikaMetadataKeys.RESOURCE_NAME_KEY, entryName);
                // Wrap so the delegate extractor cannot close the archive stream.
                content.append(entryExtractor.getText(new IgnoreCloseInputStream(entryIn), entryParams).getContent()).append('\n');
            } catch (final Exception e) {
                // A single bad entry must not abort the whole archive.
                if (logger.isDebugEnabled()) {
                    logger.debug("Exception in an internal extractor.", e);
                }
            } finally {
                CloseableUtil.closeQuietly(entryIn);
            }
        }
    } catch (final MaxLengthExceededException e) {
        throw e;
    } catch (final Exception e) {
        throw new ExtractException("Could not extract a content.", e);
    } finally {
        if (lhaFile != null) {
            try {
                lhaFile.close();
            } catch (final IOException e) {
            // ignore
            }
        }
        if (archiveFile != null && !archiveFile.delete()) {
            logger.warn("Failed to delete " + archiveFile.getAbsolutePath());
        }
    }
    return new ExtractData(content.toString().trim());
}
Also used : ExtractException(org.codelibs.fess.crawler.exception.ExtractException) ExtractData(org.codelibs.fess.crawler.entity.ExtractData) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) HashMap(java.util.HashMap) MimeTypeHelper(org.codelibs.fess.crawler.helper.MimeTypeHelper) ExtractorFactory(org.codelibs.fess.crawler.extractor.ExtractorFactory) IgnoreCloseInputStream(org.codelibs.fess.crawler.util.IgnoreCloseInputStream) InputStream(java.io.InputStream) LhaFile(jp.gr.java_conf.dangan.util.lha.LhaFile) IOException(java.io.IOException) IOException(java.io.IOException) ExtractException(org.codelibs.fess.crawler.exception.ExtractException) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) LhaHeader(jp.gr.java_conf.dangan.util.lha.LhaHeader) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) FileOutputStream(java.io.FileOutputStream) Extractor(org.codelibs.fess.crawler.extractor.Extractor) LhaFile(jp.gr.java_conf.dangan.util.lha.LhaFile) File(java.io.File) IgnoreCloseInputStream(org.codelibs.fess.crawler.util.IgnoreCloseInputStream)

Aggregations

ExtractException (org.codelibs.fess.crawler.exception.ExtractException)16 ExtractData (org.codelibs.fess.crawler.entity.ExtractData)11 CrawlerSystemException (org.codelibs.fess.crawler.exception.CrawlerSystemException)10 IOException (java.io.IOException)9 File (java.io.File)5 HashMap (java.util.HashMap)5 Extractor (org.codelibs.fess.crawler.extractor.Extractor)5 BufferedInputStream (java.io.BufferedInputStream)4 InputStream (java.io.InputStream)3 BufferedReader (java.io.BufferedReader)2 BufferedWriter (java.io.BufferedWriter)2 ByteArrayInputStream (java.io.ByteArrayInputStream)2 FileInputStream (java.io.FileInputStream)2 FileOutputStream (java.io.FileOutputStream)2 InputStreamReader (java.io.InputStreamReader)2 OutputStreamWriter (java.io.OutputStreamWriter)2 Reader (java.io.Reader)2 Map (java.util.Map)2 MessagingException (javax.mail.MessagingException)2 ArchiveInputStream (org.apache.commons.compress.archivers.ArchiveInputStream)2