Use of org.opensextant.xtext.ConvertedDocument in project Xponents by OpenSextant.
The class DefaultConverter, method conversionImplementation.
/**
 * Common implementation -- take an input stream and return a ConvertedDocument.
 *
 * @param input stream for the raw file
 * @param doc raw file
 * @return converted doc
 * @throws IOException if the underlying Tika parser/writer had an IO problem, a parser
 *                     problem, or MAX_TEXT_SIZE was reached.
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream input, java.io.File doc) throws IOException {
    Metadata metadata = new Metadata();
    BodyContentHandler handler = new BodyContentHandler(maxBuffer);
    try {
        parser.parse(input, handler, metadata, ctx);
    } catch (NoClassDefFoundError classErr) {
        throw new IOException("Unable to parse content due to Tika misconfiguration", classErr);
    } catch (Exception xerr) {
        throw new IOException("Unable to parse content", xerr);
    } finally {
        input.close();
    }
    ConvertedDocument textdoc = new ConvertedDocument(doc);
    textdoc.addTitle(metadata.get(TikaCoreProperties.TITLE));
    textdoc.setEncoding(metadata.get(Metadata.CONTENT_ENCODING));
    textdoc.addCreateDate(metadata.getDate(TikaCoreProperties.CREATED));
    textdoc.addAuthor(metadata.get(TikaCoreProperties.CREATOR));
    // v1.5: until this version, a blank-line reducer was applied here.
    // Under Java 6 it caused a StackOverflow when it encountered a document with hundreds of \n in a row.
    // E.g., a spreadsheet converted to text may have thousands of empty lines after the last data row.
    // TextUtils.reduce_line_breaks(txt)
    String t = handler.toString();
    if (t != null) {
        if (textdoc.filename != null && FileUtility.isSpreadsheet(textdoc.filename)) {
            textdoc.setText(t.trim());
        } else {
            textdoc.setText(TextUtils.reduce_line_breaks(t));
        }
    }
    return textdoc;
}
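For orientation, here is a minimal calling sketch. conversionImplementation() is protected, so this assumes the public convert(File) entry point that XText converter classes inherit from their base class; the file path is illustrative and getEncoding() is assumed as the read-side counterpart of the setEncoding() call above.

import java.io.File;
import java.io.IOException;

import org.opensextant.xtext.ConvertedDocument;
import org.opensextant.xtext.converters.DefaultConverter;

public class DefaultConverterDemo {
    public static void main(String[] args) throws IOException {
        // Hypothetical input; any format Tika handles should work here.
        File raw = new File("/tmp/report.docx");

        // Callers are assumed to go through the public convert(File)
        // entry point rather than the protected method shown above.
        DefaultConverter converter = new DefaultConverter();
        ConvertedDocument textdoc = converter.convert(raw);

        if (textdoc != null) {
            System.out.println("Encoding: " + textdoc.getEncoding());
            System.out.println(textdoc.getText());
        }
    }
}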
Use of org.opensextant.xtext.ConvertedDocument in project Xponents by OpenSextant.
The class EmbeddedContentConverter, method renderText.
/**
 * Render embedded child objects as text.
 *
 * @param childObjects children
 * @return text assembled from children
 */
private String renderText(List<Content> childObjects) {
    StringBuilder buf = new StringBuilder();
    for (Content c : childObjects) {
        buf.append(String.format("\n[Embedded: %s; %s]\n", c.id, c.tikaMediatype.toString()));
        try {
            // NOTE: To do this well, you may have to write the bytes to disk under a
            // valid file name and let Tika convert the item in full.
            ConvertedDocument text = conv.conversionImplementation(TikaInputStream.get(c.content, c.tikaMetadata), null);
            buf.append(text.getText());
        } catch (IOException ioe) {
            buf.append("Unconvertable content");
        }
        buf.append("\n");
    }
    return buf.toString();
}
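The NOTE in this method points at the more robust path: spill the embedded bytes to disk so Tika can run full type detection on a real file. A rough sketch of that alternative, assuming the public convert(File) entry point used elsewhere in XText; the temp-file naming and the ".bin" fallback extension are illustrative.

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;

import org.opensextant.xtext.ConvertedDocument;
import org.opensextant.xtext.converters.DefaultConverter;

public class EmbeddedObjectDemo {
    static String convertEmbedded(byte[] content, String suggestedName) {
        File tmp = null;
        try {
            // Preserve the original extension so type detection has a hint.
            int dot = suggestedName.lastIndexOf('.');
            String ext = dot >= 0 ? suggestedName.substring(dot) : ".bin";
            tmp = File.createTempFile("embedded-", ext);
            Files.write(tmp.toPath(), content);

            // Let Tika see a named file on disk instead of a bare stream.
            ConvertedDocument converted = new DefaultConverter().convert(tmp);
            return converted != null ? converted.getText() : "";
        } catch (IOException err) {
            return "Unconvertable content";
        } finally {
            if (tmp != null) {
                tmp.delete();
            }
        }
    }
}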
Use of org.opensextant.xtext.ConvertedDocument in project Xponents by OpenSextant.
The class ImageMetadataConverter, method conversionImplementation.
@Override
protected ConvertedDocument conversionImplementation(InputStream in, File doc) throws IOException {
    ConvertedDocument imgDoc = new ConvertedDocument(doc);
    imgDoc.setEncoding(ConvertedDocument.OUTPUT_ENCODING);
    imgDoc.is_plaintext = false;
    Metadata metadata = new Metadata();
    StringBuilder buf = new StringBuilder();
    BodyContentHandler handler = new BodyContentHandler();
    String type = "Image";
    String objName = null;
    if (doc != null) {
        objName = doc.getName();
        String ext = FilenameUtils.getExtension(doc.getName().toLowerCase());
        if ("jpg".equals(ext) || "jpeg".equals(ext)) {
            type = "Photo";
        }
    }
    try {
        parser.parse(in, handler, metadata, ctx);
        if (objName == null) {
            objName = metadata.get(Metadata.RESOURCE_NAME_KEY);
        }
        // What is the signal to generate any text buffer at all?
        // Is it worth putting out a full EXIF dump for a JPEG?
        //
        int mdCount = metadata.names().length;
        if (mdCount == 0) {
            // No meaningful text or other metadata.
            return null;
        }
        buf.append("Image Specifications\n===================\n");
        List<String> metaKeys = Arrays.asList(metadata.names());
        Collections.sort(metaKeys);
        for (String key : metaKeys) {
            if (this.emitMinimalText && !isUseful(key)) {
                continue;
            }
            String val = metadata.get(key);
            if (StringUtils.isBlank(val)) {
                val = "(N/A)";
            }
            buf.append(String.format("%s:\t%s\n", key, val));
        }
        // Title
        imgDoc.addTitle(String.format("%s: %s", type, objName));
        // Author
        imgDoc.addAuthor(metadata.get(TikaCoreProperties.CREATOR));
        // Date
        imgDoc.addCreateDate(metadata.getDate(TikaCoreProperties.CREATED));
        // Geographic coordinates, if available.
        String lat = metadata.get(TikaCoreProperties.LATITUDE);
        String lon = metadata.get(TikaCoreProperties.LONGITUDE);
        if (lat != null && lon != null) {
            logger.info("Found a location LAT={} LON={}", lat, lon);
            imgDoc.addUserProperty("location", String.format("%s, %s", lat, lon));
            try {
                LatLon yx = GeodeticUtility.parseLatLon(lat, lon);
                buf.append("Location:\t" + formatCoord(yx) + "\n");
            } catch (ParseException parseErr) {
                // Unparseable coordinates are simply omitted from the text dump.
            }
        }
        // EXIF and other text content
        imgDoc.setText(buf.toString());
        return imgDoc;
    } catch (Exception xerr) {
        throw new IOException("Unable to parse content", xerr);
    } finally {
        in.close();
    }
}
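A usage sketch for this converter follows. The convert(File) entry point and the getUserProperty() accessor are assumptions: the former is the usual public counterpart of the protected method above, the latter the read side of the addUserProperty() call; the photo path is illustrative.

import java.io.File;
import java.io.IOException;

import org.opensextant.xtext.ConvertedDocument;
import org.opensextant.xtext.converters.ImageMetadataConverter;

public class ImageMetadataDemo {
    public static void main(String[] args) throws IOException {
        // Hypothetical geotagged photo.
        File photo = new File("/tmp/vacation.jpg");

        ImageMetadataConverter imc = new ImageMetadataConverter();
        ConvertedDocument imgDoc = imc.convert(photo);

        // conversionImplementation() returns null when the image carries no metadata.
        if (imgDoc == null) {
            System.out.println("No EXIF or other metadata found.");
            return;
        }
        // "location" is the user property set above when EXIF lat/lon were present.
        System.out.println("Location: " + imgDoc.getUserProperty("location"));
        System.out.println(imgDoc.getText());
    }
}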
Use of org.opensextant.xtext.ConvertedDocument in project Xponents by OpenSextant.
The class DefaultMailCrawl, method handleConversion.
/**
 * Email parser, converter, recorder. This routine handles one message that
 * may have a number of attachments (children).
 *
 * IOException is logged if handling of child documents/conversions fails.
 * TODO: handleConversion should throw IOException or use a listener to report errors for this document.
 *
 * @param doc the doc
 * @param filepath the filepath
 */
@Override
public void handleConversion(ConvertedDocument doc, String filepath) {
    if (listener == null) {
        // Nothing to do.
        return;
    }
    if (doc == null) {
        log.debug("Item was not converted, FILE={}", filepath);
        return;
    }
    try {
        // A converted document is discovered, then enters this interface method.
        //
        // Parent doc will be ./A.eml
        // Child attachments will be ./A_eml/b.doc
        //
        listener.collected(doc, filepath);
        if (doc.hasChildren()) {
            for (ConvertedDocument child : doc.getChildren()) {
                // Create a new ID out of the parent doc ID and the attachment filename.
                String uniqueValue = String.format("%s,%s", doc.id, child.filename);
                String _id = uniqueValue;
                try {
                    _id = TextUtils.text_id(uniqueValue);
                } catch (Exception err) {
                    log.error("Hashing error", err);
                }
                child.setId(_id);
                // Record the child attachment.
                listener.collected(child, child.filepath);
            }
        }
    } catch (IOException err) {
        log.error("Failed to record or manage the email message and/or its attachments, FILE={}", filepath, err);
    }
}
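The child-ID derivation is the interesting detail here: the parent message ID and the attachment filename are composed into one key, then hashed so each attachment gets a stable identifier. A standalone sketch, assuming TextUtils resolves to org.opensextant.util.TextUtils and using made-up IDs:

import org.opensextant.util.TextUtils;

public class ChildIdDemo {
    public static void main(String[] args) throws Exception {
        String parentId = "a1b2c3d4";          // hypothetical parent message ID
        String childFilename = "invoice.pdf";  // hypothetical attachment name

        // Same composition used in handleConversion() above.
        String uniqueValue = String.format("%s,%s", parentId, childFilename);

        // text_id() hashes the composite key into a stable child document ID,
        // so re-crawling the same mailbox yields the same IDs.
        String childId = TextUtils.text_id(uniqueValue);
        System.out.println(childId);
    }
}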
Use of org.opensextant.xtext.ConvertedDocument in project Xponents by OpenSextant.
The class TikaHTMLConverter, method conversionImplementation.
/**
 * A barebones HTML parser.
 *
 * <pre>
 * TODO: mis-encoded HTML entities are not decoded properly. E.g., finding "–"
 * (the 82xx range covers dashes and quotes) does not decode correctly unless
 * the page encoding is declared as UTF-8.
 * </pre>
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream input, File doc) throws IOException {
    Metadata metadata = new Metadata();
    HashMap<String, String> moreMetadata = new HashMap<>();
    // HTML conversion here does not reset its internal buffers;
    // it just accumulates and errors out when it reaches MAX.
    ContentHandler handler = new BodyContentHandler(maxHTMLDocumentSize);
    BoilerpipeContentHandler scrubbingHandler = null;
    if (scrubHTMLArticle) {
        scrubbingHandler = new BoilerpipeContentHandler(handler);
    }
    try {
        parser.parse(input, (scrubHTMLArticle ? scrubbingHandler : handler), metadata, new ParseContext());
        if (doc != null) {
            parseHTMLMetadata(doc, moreMetadata);
        }
    } catch (Exception xerr) {
        throw new IOException("Unable to parse content", xerr);
    } finally {
        input.close();
    }
    ConvertedDocument textdoc = new ConvertedDocument(doc);
    textdoc.addTitle(metadata.get(TikaCoreProperties.TITLE));
    String text = null;
    if (scrubHTMLArticle) {
        text = scrubbingHandler.getTextDocument().getText(true, false);
    } else {
        text = handler.toString();
    }
    textdoc.setText(TextUtils.reduce_line_breaks(text));
    // Improve the CHARSET encoding answer.
    byte[] data = textdoc.buffer.getBytes();
    if (TextUtils.isASCII(data)) {
        textdoc.setEncoding("ASCII");
    } else {
        // Okay, okay... let Tika name whatever encoding it found or guessed at.
        textdoc.setEncoding(metadata.get(Metadata.CONTENT_ENCODING));
    }
    // Indicate whether we tried to filter the article at all.
    textdoc.addProperty("filtered", scrubHTMLArticle);
    textdoc.addProperty("converter", TikaHTMLConverter.class.getName());
    if (!moreMetadata.isEmpty()) {
        for (String k : moreMetadata.keySet()) {
            textdoc.addUserProperty(k, moreMetadata.get(k));
        }
    }
    return textdoc;
}
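Finally, a sketch of driving this converter with article scrubbing enabled. The boolean constructor argument is an assumption based on the scrubHTMLArticle flag, and getProperty() is assumed as the read side of the addProperty() calls above; the page path is illustrative.

import java.io.File;
import java.io.IOException;

import org.opensextant.xtext.ConvertedDocument;
import org.opensextant.xtext.converters.TikaHTMLConverter;

public class HtmlConversionDemo {
    public static void main(String[] args) throws IOException {
        // true = scrub down to the main article body via Boilerpipe
        // (constructor signature assumed, not confirmed by the snippet above).
        TikaHTMLConverter html = new TikaHTMLConverter(true);

        ConvertedDocument textdoc = html.convert(new File("/tmp/page.html"));
        // "filtered" records whether article scrubbing was attempted.
        System.out.println("Filtered: " + textdoc.getProperty("filtered"));
        System.out.println(textdoc.getText());
    }
}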