Example use of org.opensextant.xtext.ConvertedDocument in the project Xponents by OpenSextant.
Class WebArchiveConverter, method conversionImplementation:
/**
 * Converts an MHT or .webarchive file to plain text.
 *
 * The parent conversion is delegated to the superclass; the resulting document is
 * flagged as a web archive. When the parent carries raw children, each child is
 * examined by MIME type and its text (or an image placeholder) is collected and
 * appended to the parent's text:
 * <ul>
 * <li>no MIME type: child is skipped</li>
 * <li>application/octet-stream: converted with a DefaultConverter</li>
 * <li>text/html...: converted with a TikaHTMLConverter</li>
 * <li>image...: a one-line "[Image: ...]" marker is emitted</li>
 * <li>anything else: ignored</li>
 * </ul>
 * Converted child text is dropped when isWebScript() reports it as script content.
 *
 * Alternatively, an archive can be exploded on disk and its children converted
 * individually; see the MessageConverter base and ArchiveNavigator for that approach.
 *
 * @param in stream
 * @param doc original file
 * @return the parent document, with child text appended when children exist
 * @throws IOException if the parent or a child conversion fails
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream in, File doc) throws IOException {
    TikaHTMLConverter markupConverter = new TikaHTMLConverter(false);
    DefaultConverter binaryConverter = new DefaultConverter();

    ConvertedDocument archive = super.conversionImplementation(in, doc);
    archive.is_webArchive = true;
    if (!archive.hasRawChildren()) {
        return archive;
    }

    final String childSeparator = "\n==================\n";
    StringBuilder childText = new StringBuilder();

    for (Content part : archive.getRawChildren()) {
        String mime = part.mimeType;
        logger.info("{} {} {}", archive.id, part.id, mime);
        if (mime == null) {
            continue;
        }

        if ("application/octet-stream".equalsIgnoreCase(mime)) {
            // Generic binary payload: keep whatever text comes out, unless it is web script.
            ConvertedDocument converted = binaryConverter.convert(TikaInputStream.get(part.content));
            if (converted == null || !converted.hasText()) {
                continue;
            }
            String text = converted.getText();
            if (!isWebScript(text)) {
                childText.append(text).append(childSeparator);
            }
        } else if (mime.startsWith("text/html")) {
            // HTML payload: same filtering as above, but parsed with the HTML converter.
            ConvertedDocument converted = markupConverter.convert(TikaInputStream.get(part.content));
            if (converted == null || !converted.hasText()) {
                continue;
            }
            String text = converted.getText();
            if (!isWebScript(text)) {
                childText.append(text).append(childSeparator);
            }
        } else if (mime.startsWith("image")) {
            // Images are not converted; only a marker noting their presence is written.
            childText.append(String.format("\n[Image: %s type='%s'] ", part.id, mime));
        }
    }

    String collected = childText.toString();
    String merged = archive.hasText()
            ? archive.getText() + "\n\n==================\n\n" + collected
            : collected;
    archive.setText(merged);
    return archive;
}
Example use of org.opensextant.xtext.ConvertedDocument in the project Xponents by OpenSextant.
Class Decomposer, method main:
/**
 * Command-line entry point: converts a single input file with an
 * EmbeddedContentConverter and prints the path of the resulting document.
 *
 * Options (getopt string "hei:o:"):
 * <ul>
 * <li>-i FILE : input file to convert (required)</li>
 * <li>-o DIR  : accepted, but the value is not used by this method</li>
 * <li>-e      : prints a notice that conversions are saved to the input folder;
 *               no other effect in this method</li>
 * <li>-h or any unrecognized option: prints usage and exits with status 1</li>
 * </ul>
 *
 * @param args command-line arguments
 */
public static void main(String[] args) {
    gnu.getopt.Getopt opts = new gnu.getopt.Getopt("Decomposer", args, "hei:o:");
    String input = null;
    try {
        int c;
        while ((c = opts.getopt()) != -1) {
            switch (c) {
                case 'i':
                    input = opts.getOptarg();
                    break;
                case 'o':
                    // Output folder is accepted for command-line compatibility; nothing here reads it.
                    break;
                case 'e':
                    System.out.println("Saving conversions to Input folder. Output folder will be ignored.");
                    break;
                default:
                    Decomposer.usage();
                    System.exit(1);
            }
        }
    } catch (Exception err) {
        Decomposer.usage();
        System.exit(1);
    }

    // Without -i there is nothing to convert; new File(null) would throw a NullPointerException.
    if (input == null) {
        Decomposer.usage();
        System.exit(1);
        return;
    }

    EmbeddedContentConverter conv = new EmbeddedContentConverter(0x200000);
    try {
        ConvertedDocument d = conv.convert(new File(input));
        if (d == null) {
            // Other callers of convert() in this code base null-check the result, so do the same here.
            System.err.println("No document produced for input: " + input);
            return;
        }
        System.out.println("Found Doc:" + d.getFilepath());
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Example use of org.opensextant.xtext.ConvertedDocument in the project Xponents by OpenSextant.
Class TextTranscodingConverter, method conversionImplementation:
/**
 * A converter that tries to determine a decent encoding (ASCII, UTF-8 or other)
 * and decides whether the buffer needs conversion.
 *
 * IF ASCII or the configured output encoding: accept the file as is, do not convert.
 * ELSE the file content is decoded using the detected charset.
 *
 * CAVEAT: If the payload is short and encoding detection has low confidence, it is
 * ALSO marked as not needing conversion and is treated as a plain text file.
 *
 * Bytes are read from {@code doc} when it is given, otherwise from {@code in}.
 * The stream, when non-null, is always closed before returning.
 *
 * @param in  input stream; may be null when {@code doc} is given
 * @param doc source file; may be null when {@code in} is given
 * @return a plain-text document carrying the decoded text and the chosen encoding name
 * @throws IOException if neither source is given, reading fails, no charset can be
 *                     detected, or the detected charset is not supported by the JVM
 */
@Override
protected ConvertedDocument conversionImplementation(java.io.InputStream in, java.io.File doc) throws IOException {
    ConvertedDocument textdoc = new ConvertedDocument(doc);

    byte[] data;
    // try-with-resources tolerates a null resource and guarantees the stream is
    // closed even when reading fails.
    try (java.io.InputStream stream = in) {
        if (doc != null) {
            // The file is preferred over the stream whenever it is available.
            data = FileUtility.readBytesFrom(doc);
        } else if (stream != null) {
            data = IOUtils.toByteArray(stream);
        } else {
            throw new IOException("No input stream or file given; nothing to convert");
        }
    }

    // Encoding heuristics here.....
    //
    // Objective: mark small plain text payloads with unknown character set
    // as not worthy of conversion. Leave them as plain/text;
    // indeed they might even be straight Unicode.
    //
    // Test for ASCII only first, otherwise try to detect the best charset for the text.
    //
    textdoc.is_plaintext = true;
    if (TextUtils.isASCII(data)) {
        textdoc.do_convert = false;
        textdoc.setEncoding("ASCII");
        // Decode explicitly as ASCII rather than relying on the platform default charset.
        textdoc.setText(new String(data, java.nio.charset.StandardCharsets.US_ASCII));
        return textdoc;
    }

    chardet.setText(data);
    CharsetMatch cs = chardet.detect();
    if (cs == null) {
        // NOTE(review): assumes detect() may return null when nothing matches; confirm against the detector's API.
        throw new IOException("Unable to detect a character set for the given text");
    }

    String charsetName = cs.getName();
    boolean alreadyOutputEncoding = ConvertedDocument.OUTPUT_ENCODING.equalsIgnoreCase(charsetName);
    boolean smallAndUncertain = data.length < IGNORE_THRESHOLD_SIZE && cs.getConfidence() < IGNORE_THRESHOLD_CONF;
    if (alreadyOutputEncoding || smallAndUncertain) {
        textdoc.do_convert = false;
    }
    textdoc.setEncoding(charsetName);
    textdoc.setText(new String(data, charsetName));
    return textdoc;
}
Example use of org.opensextant.xtext.ConvertedDocument in the project Xponents by OpenSextant.
Class MessageConverter, method convertMimeMessage:
/**
 * Converts a MIME message into a parent document, with or without a backing file.
 * <ul>
 * <li>live email capture from a mailbox: only the MimeMessage exists; {@code doc} is null</li>
 * <li>email capture from a filesystem: the MimeMessage was read from {@code doc}</li>
 * </ul>
 * The prefix handed to parseMessage() is the base name of {@code doc} when a file is
 * given, otherwise the id of the newly created parent document.
 *
 * @param msg javamail Message obj
 * @param doc file the message came from, or null when there is none
 * @return doc conversion, likely a parent document with 1 or more child attachments
 * @throws MessagingException on err
 * @throws IOException on err
 */
public ConvertedDocument convertMimeMessage(Message msg, File doc) throws MessagingException, IOException {
    ConvertedDocument emailDoc = new ConvertedDocument(doc);
    emailDoc.is_RFC822_attachment = true;
    setMailAttributes(emailDoc, msg);

    // Prefer the file's base name; fall back to the document id when there is no file.
    String namePrefix;
    if (doc == null) {
        namePrefix = emailDoc.id;
    } else {
        namePrefix = FilenameUtils.getBaseName(doc.getName());
    }

    // Walk the message: plain text accumulates in the buffer, attachments attach to emailDoc.
    StringBuilder bodyText = new StringBuilder();
    parseMessage(msg, emailDoc, bodyText, namePrefix);

    emailDoc.setText(bodyText.toString());
    return emailDoc;
}
Example use of org.opensextant.xtext.ConvertedDocument in the project Xponents by OpenSextant.
Class EmbeddedContentConverter, method conversionImplementation:
/**
 * Converts embedded documents in the supported types to a set of embedded items
 * attached to the parent document.
 * Trivial embedded icons and other components will not be extracted.
 *
 * The parent is first converted by the superclass. If no file is given, or the
 * file's extension is not supported, that result is returned unchanged. Otherwise
 * the file is re-read for embedded object extraction; when raw children are found,
 * the parent's text is extended with a rendering of those children.
 *
 * @param in  input stream, passed to the superclass conversion
 * @param doc source file; extraction needs the file itself, not just the stream
 * @return the parent document, marked converted when extraction ran
 * @throws IOException if the superclass conversion fails, or wrapping any error
 *                     raised while opening or extracting from the file
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream in, File doc) throws IOException {
    ConvertedDocument compoundDoc = super.conversionImplementation(in, doc);
    if (doc == null) {
        // Extraction below works from the file path; with a stream only there is nothing more to do.
        return compoundDoc;
    }

    String ext = FilenameUtils.getExtension(doc.getName());
    if (!isSupported(ext)) {
        // Not really compound by our standards here.
        return compoundDoc;
    }

    ParserContainerExtractor extractor = new ParserContainerExtractor();
    EmbeddedObjectExtractor objExtractor = new EmbeddedObjectExtractor(compoundDoc, true);

    // try-with-resources: the stream is closed on every path, and a failure while
    // opening it no longer triggers a NullPointerException in cleanup.
    try (TikaInputStream tikaStream = TikaInputStream.get(doc.toPath())) {
        extractor.extract(tikaStream, extractor, objExtractor);
        compoundDoc.is_converted = true;

        if (compoundDoc.hasRawChildren()) {
            // Create text buffer for this compound document here.
            // If raw children should be post-processed by some other means, that is up to caller.
            // This parent document at least contains a complete text representation of the
            // content in the original doc.
            StringBuilder completeText = new StringBuilder();
            if (compoundDoc.hasText()) {
                // Guard so an absent parent text is not rendered as the literal "null".
                completeText.append(compoundDoc.getText());
            }
            completeText.append("\n==Embedded Objects==\n");
            completeText.append(renderText(compoundDoc.getRawChildren()));
            compoundDoc.setText(completeText.toString());
        }
        // With no children, the simple (superclass) conversion stands as the result.
        return compoundDoc;
    } catch (Exception e) {
        throw new IOException("Stream parsing problem", e);
    }
}
Aggregations