Use of org.codelibs.fess.crawler.exception.ExtractException in the fess-crawler project by codelibs: the getText method of the CommandExtractor class.
/*
 * (non-Javadoc)
 *
 * @see org.codelibs.fess.crawler.extractor.Extractor#getText(java.io.InputStream,
 * java.util.Map)
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    final String resourceName = params == null ? null : params.get(TikaMetadataKeys.RESOURCE_NAME_KEY);

    // Derive a file-name prefix and extension from the resource name so the
    // temporary files keep a recognizable name and suffix for the external
    // command. Falls back to "none" / no extension when the name is absent.
    String extension; // fixed spelling of the former local "extention"
    String filePrefix;
    if (StringUtil.isNotBlank(resourceName)) {
        final String name = getFileName(resourceName);
        final String[] strings = name.split("\\.");
        final StringBuilder buf = new StringBuilder(100);
        if (strings.length > 1) {
            // Everything before the last dot is the prefix; the last
            // segment is the extension.
            for (int i = 0; i < strings.length - 1; i++) {
                if (buf.length() != 0) {
                    buf.append('.');
                }
                buf.append(strings[i]);
            }
            filePrefix = buf.toString();
            extension = strings[strings.length - 1];
        } else {
            filePrefix = name;
            extension = "";
        }
    } else {
        filePrefix = "none";
        extension = "";
    }

    File inputFile = null;
    File outputFile = null;
    try {
        // Compute the dot-prefixed suffix once instead of repeating the
        // "blank -> no dot" conditional for both temp files.
        final String suffix = StringUtil.isNotBlank(extension) ? "." + extension : extension;
        inputFile = File.createTempFile("cmdextin_" + filePrefix + "_", suffix, tempDir);
        // The output file reuses the input suffix unless an explicit
        // outputExtension was configured on this extractor.
        final String ext = outputExtension == null ? suffix : outputExtension;
        outputFile = File.createTempFile("cmdextout_" + filePrefix + "_", ext, tempDir);
        // Spool the stream to disk, run the configured command, then read
        // the command's output back using the configured encoding.
        CopyUtil.copy(in, inputFile);
        executeCommand(inputFile, outputFile);
        final ExtractData extractData = new ExtractData(new String(FileUtil.readBytes(outputFile), outputEncoding));
        if (StringUtil.isNotBlank(resourceName)) {
            extractData.putValues("resourceName", new String[] { resourceName });
        }
        return extractData;
    } catch (final IOException e) {
        throw new ExtractException("Could not extract a content.", e);
    } finally {
        // Best-effort cleanup; a failed delete is logged, not fatal.
        if (inputFile != null && !inputFile.delete()) {
            logger.info("Failed to delete " + inputFile.getAbsolutePath());
        }
        if (outputFile != null && !outputFile.delete()) {
            logger.info("Failed to delete " + outputFile.getAbsolutePath());
        }
    }
}
Use of org.codelibs.fess.crawler.exception.ExtractException in the fess-crawler project by codelibs: the getText method of the EmlExtractor class.
/* (non-Javadoc)
 * @see org.codelibs.robot.extractor.Extractor#getText(java.io.InputStream, java.util.Map)
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    // Caller-supplied params override the configured mail properties
    // (Properties(defaults) keeps mailProperties as the fallback layer).
    final Properties props = new Properties(mailProperties);
    if (params != null) {
        for (final Map.Entry<String, String> entry : params.entrySet()) {
            props.put(entry.getKey(), entry.getValue());
        }
    }
    try {
        // Session.getInstance creates a session honoring THIS call's
        // properties. The previous Session.getDefaultInstance caches the
        // first session created in the JVM and silently ignores any
        // properties passed on later calls.
        final Session mailSession = Session.getInstance(props, null);
        final MimeMessage message = new MimeMessage(mailSession, in);
        final String content = getBodyText(message);
        final ExtractData data = new ExtractData(content != null ? content : StringUtil.EMPTY);
        // Copy every raw header, then add the parsed/derived fields below.
        final Enumeration<Header> headers = message.getAllHeaders();
        while (headers.hasMoreElements()) {
            final Header header = headers.nextElement();
            data.putValue(header.getName(), header.getValue());
        }
        putValue(data, "Content-ID", message.getContentID());
        putValue(data, "Content-Language", message.getContentLanguage());
        putValue(data, "Content-MD5", message.getContentMD5());
        putValue(data, "Description", message.getDescription());
        putValue(data, "Disposition", message.getDisposition());
        putValue(data, "Encoding", message.getEncoding());
        putValue(data, "File-Name", message.getFileName());
        putValue(data, "From", message.getFrom());
        putValue(data, "Line-Count", message.getLineCount());
        putValue(data, "Message-ID", message.getMessageID());
        putValue(data, "Message-Number", message.getMessageNumber());
        putValue(data, "Received-Date", getReceivedDate(message));
        putValue(data, "Reply-To", message.getReplyTo());
        putValue(data, "Sender", message.getSender());
        putValue(data, "Sent-Date", message.getSentDate());
        putValue(data, "Size", message.getSize());
        putValue(data, "Subject", message.getSubject());
        // NOTE(review): "Receipients" is misspelled but kept as-is —
        // downstream consumers may depend on this exact metadata key.
        putValue(data, "Receipients", message.getAllRecipients());
        putValue(data, "To", message.getRecipients(Message.RecipientType.TO));
        putValue(data, "Cc", message.getRecipients(Message.RecipientType.CC));
        putValue(data, "Bcc", message.getRecipients(Message.RecipientType.BCC));
        return data;
    } catch (final MessagingException e) {
        throw new ExtractException(e);
    }
}
Use of org.codelibs.fess.crawler.exception.ExtractException in the fess-crawler project by codelibs: the getOutputContent method of the JodExtractor class.
/**
 * Reads the text content of the converted output file. If a dedicated
 * extractor is registered for the output extension, it is used; otherwise
 * the raw file bytes are decoded with the configured output encoding
 * (falling back to UTF-8 when that encoding is unsupported).
 *
 * @param outputFile the converted file produced by the office converter
 * @param outExt the output file extension used to look up an extractor
 * @return the extracted text content
 * @throws ExtractException if the output file cannot be opened or read
 */
protected String getOutputContent(final File outputFile, final String outExt) {
    final Extractor extractor = getExtractor(outExt);
    if (extractor != null) {
        final Map<String, String> params = new HashMap<>();
        params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, outputFile.getName());
        // try-with-resources replaces the manual null/closeQuietly dance;
        // IOException also covers a failure from close().
        try (FileInputStream in = new FileInputStream(outputFile)) {
            return extractor.getText(in, params).getContent();
        } catch (final IOException e) {
            throw new ExtractException("Could not open " + outputFile.getAbsolutePath(), e);
        }
    }
    // Read the bytes once, then retry the decode with UTF-8 if the
    // configured encoding name is not supported on this JVM.
    final byte[] content = FileUtil.readBytes(outputFile);
    try {
        return new String(content, outputEncoding);
    } catch (final UnsupportedEncodingException e) {
        return new String(content, Constants.UTF_8_CHARSET);
    }
}
Use of org.codelibs.fess.crawler.exception.ExtractException in the fess-crawler project by codelibs: the getText method of the JodExtractor class.
/*
 * (non-Javadoc)
 *
 * @see org.codelibs.fess.crawler.extractor.Extractor#getText(java.io.InputStream,
 * java.util.Map)
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("in is null.");
    }
    final String resourceName = params == null ? null : params.get(TikaMetadataKeys.RESOURCE_NAME_KEY);

    // Derive a file-name prefix and extension from the resource name so the
    // temporary files keep a recognizable name and suffix for the office
    // converter. Falls back to "none" / no extension when the name is absent.
    String extension;
    String filePrefix;
    if (StringUtil.isNotBlank(resourceName)) {
        final String name = getFileName(resourceName);
        final String[] strings = name.split("\\.");
        final StringBuilder buf = new StringBuilder(100);
        if (strings.length > 1) {
            // Everything before the last dot is the prefix; the last
            // segment is the extension.
            for (int i = 0; i < strings.length - 1; i++) {
                if (buf.length() != 0) {
                    buf.append('.');
                }
                buf.append(strings[i]);
            }
            filePrefix = buf.toString();
            extension = strings[strings.length - 1];
        } else {
            filePrefix = name;
            extension = "";
        }
    } else {
        filePrefix = "none";
        extension = "";
    }

    File inputFile = null;
    File outputFile = null;
    try {
        inputFile = File.createTempFile("jodextin_" + filePrefix + "_",
                StringUtil.isNotBlank(extension) ? "." + extension : extension, tempDir);
        final String outExt = getOutputExtension(extension);
        // Fixed copy-paste prefix: this is the JOD extractor's output file,
        // not the command extractor's ("cmdextout_" previously).
        outputFile = File.createTempFile("jodextout_" + filePrefix + "_", "." + outExt, tempDir);
        // Spool the stream to disk, convert it with the office manager, and
        // extract text from the converted result.
        CopyUtil.copy(in, inputFile);
        final OfficeDocumentConverter converter = new OfficeDocumentConverter(officeManager);
        converter.convert(inputFile, outputFile);
        final ExtractData extractData = new ExtractData(getOutputContent(outputFile, outExt));
        if (StringUtil.isNotBlank(resourceName)) {
            extractData.putValues("resourceName", new String[] { resourceName });
        }
        return extractData;
    } catch (final IOException e) {
        throw new ExtractException("Could not extract a content.", e);
    } finally {
        // Best-effort cleanup; a failed delete is logged, not fatal.
        if (inputFile != null && !inputFile.delete()) {
            logger.info("Failed to delete " + inputFile.getAbsolutePath());
        }
        if (outputFile != null && !outputFile.delete()) {
            logger.info("Failed to delete " + outputFile.getAbsolutePath());
        }
    }
}
Use of org.codelibs.fess.crawler.exception.ExtractException in the fess-crawler project by codelibs: the getText method of the LhaExtractor class.
// Extracts text from an LHA/LZH archive: the stream is spooled to a temp
// file (the LhaFile API requires a File), then every entry is dispatched to
// the extractor registered for its MIME type and the results concatenated.
// Enforces maxContentSize against the CUMULATIVE uncompressed size of all
// entries, failing fast before extracting an oversized entry.
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
if (in == null) {
throw new CrawlerSystemException("The inputstream is null.");
}
final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
final ExtractorFactory extractorFactory = getExtractorFactory();
final StringBuilder buf = new StringBuilder(1000);
File tempFile = null;
LhaFile lhaFile = null;
try {
// LhaFile reads from a File, so the stream must be copied to disk first.
tempFile = File.createTempFile("crawler-", ".lzh");
try (FileOutputStream fos = new FileOutputStream(tempFile)) {
CopyUtil.copy(in, fos);
}
lhaFile = new LhaFile(tempFile);
@SuppressWarnings("unchecked") final Enumeration<LhaHeader> entries = lhaFile.entries();
long contentSize = 0;
while (entries.hasMoreElements()) {
final LhaHeader head = entries.nextElement();
// Check the running total BEFORE extracting this entry; -1 disables the limit.
contentSize += head.getOriginalSize();
if (maxContentSize != -1 && contentSize > maxContentSize) {
throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize);
}
final String filename = head.getPath();
final String mimeType = mimeTypeHelper.getContentType(null, filename);
if (mimeType != null) {
final Extractor extractor = extractorFactory.getExtractor(mimeType);
if (extractor != null) {
InputStream is = null;
try {
is = lhaFile.getInputStream(head);
final Map<String, String> map = new HashMap<>();
map.put(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
// IgnoreCloseInputStream keeps the nested extractor from closing
// the archive's underlying stream.
buf.append(extractor.getText(new IgnoreCloseInputStream(is), map).getContent());
buf.append('\n');
} catch (final Exception e) {
// Best-effort: a single unreadable entry must not abort the
// whole archive; the failure is only logged at debug level.
if (logger.isDebugEnabled()) {
logger.debug("Exception in an internal extractor.", e);
}
} finally {
CloseableUtil.closeQuietly(is);
}
}
}
}
} catch (final MaxLengthExceededException e) {
// Re-thrown unchanged so the size limit is not masked by the generic
// ExtractException below.
throw e;
} catch (final Exception e) {
throw new ExtractException("Could not extract a content.", e);
} finally {
// LhaFile is closed manually — not shown to be AutoCloseable, so it
// cannot go in the try-with-resources above. Close errors are ignored.
if (lhaFile != null) {
try {
lhaFile.close();
} catch (final IOException e) {
// ignore
}
}
if (tempFile != null && !tempFile.delete()) {
logger.warn("Failed to delete " + tempFile.getAbsolutePath());
}
}
return new ExtractData(buf.toString().trim());
}
Aggregations