use of org.opensextant.xtext.ConvertedDocument in project Xponents by OpenSextant.
the class MSDocConverter method conversionImplementation.
/**
 * Extracts the paragraph text of a Word document into a {@link ConvertedDocument}.
 * Each paragraph has its field codes stripped, is trimmed, and is terminated by a
 * single newline.
 *
 * @param input stream over the document content; always closed by this method,
 *              whether extraction succeeds or fails
 * @param doc   file the stream belongs to, used to create the resulting document
 * @return the converted document carrying the extracted text
 * @throws IOException if the content cannot be read or a resource fails to close
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream input, java.io.File doc) throws IOException {
    org.apache.poi.hwpf.extractor.WordExtractor ex = null;
    try {
        ex = new WordExtractor(input);
        String[] paragraphs = ex.getParagraphText();

        StringBuilder sb = new StringBuilder();
        for (String paragraph : paragraphs) {
            sb.append(WordExtractor.stripFields(paragraph).trim());
            sb.append('\n');
        }

        ConvertedDocument textdoc = new ConvertedDocument(doc);
        textdoc.setText(sb.toString());
        return textdoc;
    } finally {
        // Release both resources on every path; previously an exception during
        // extraction left the stream and the extractor open.
        try {
            if (ex != null) {
                ex.close();
            }
        } finally {
            input.close();
        }
    }
}
use of org.opensextant.xtext.ConvertedDocument in project Xponents by OpenSextant.
the class DefaultWebCrawl method convertContent.
/**
 * Convert and record a downloaded item, given the item and its source URL.
 *
 * <p>When no converter is configured the raw file is reported to the listener
 * (if any) and nothing else happens. Otherwise the item is converted, tagged with
 * the link's ID and source URL, saved to its text path, and reported to the
 * listener (if any). An item that does not exist on disk is silently ignored.
 *
 * @param item
 *            item to convert
 * @param link
 *            link representing the original/source
 * @throws IOException
 *             if either argument is null, or on conversion/save error
 * @throws ConfigException
 *             on err
 * @throws NoSuchAlgorithmException
 *             an error that never happens
 */
protected void convertContent(File item, HyperLink link) throws IOException, ConfigException, NoSuchAlgorithmException {
    if (item == null || link == null) {
        throw new IOException("Bad data - null values for file and link...");
    }

    if (converter == null) {
        // Nothing can be converted without a converter. Guarding on the converter
        // alone avoids a NullPointerException below when the listener is also null.
        log.debug("Link {} was saved to {}", link.getAbsoluteURL(), item.getAbsolutePath());
        if (listener != null) {
            listener.collected(item);
        }
        return;
    }

    if (!item.exists()) {
        return;
    }

    // Convert the item.
    ConvertedDocument doc = converter.convert(item);
    if (doc == null) {
        log.error("Document was not converted, FILE={}", item);
        return;
    }
    if (doc.textpath == null) {
        log.error("Expecting the content to be non-null for {}", doc.getFilepath());
        return;
    }

    doc.setId(link.getId());
    doc.addSourceURL(link.getAbsoluteURL(), link.getReferrer());
    // This path must already exist
    doc.saveBuffer(new File(doc.textpath));

    if (listener != null) {
        listener.collected(doc, item.getAbsolutePath());
    }
}
use of org.opensextant.xtext.ConvertedDocument in project Xponents by OpenSextant.
the class DefaultSharepointCrawl method convertContent.
/**
 * TODO: redesign so both Web crawl and Sharepoint crawl share this common routine:
 * copy copy copy -- see DefaultWebCrawl
 *
 * Convert and record a downloaded item, given the item and its source URL.
 *
 * <p>When no converter is configured the raw file is reported to the listener
 * (if any) and nothing else happens. Otherwise the item is converted, given its
 * default ID and source URL, saved to its text path, and reported to the listener
 * (if any). An item that does not exist on disk is silently ignored.
 *
 * @param item item
 * @param link original URL where item was found
 * @throws IOException if either argument is null, or on conversion/save error
 * @throws ConfigException on err
 * @throws NoSuchAlgorithmException on err
 */
protected void convertContent(File item, HyperLink link) throws IOException, ConfigException, NoSuchAlgorithmException {
    if (item == null || link == null) {
        throw new IOException("Bad data - null values for file and link...");
    }

    if (converter == null) {
        // Nothing can be converted without a converter. Guarding on the converter
        // alone avoids a NullPointerException below when the listener is also null.
        log.debug("Link {} was saved to {}", link.getAbsoluteURL(), item.getAbsolutePath());
        if (listener != null) {
            listener.collected(item);
        }
        return;
    }

    if (!item.exists()) {
        return;
    }

    // Convert the item.
    ConvertedDocument doc = converter.convert(item);
    if (doc == null) {
        log.error("Document was not converted, FILE={}", item);
        return;
    }
    if (doc.textpath == null) {
        // Without a text path there is nowhere to save the buffer;
        // new File(null) would throw a NullPointerException.
        log.error("Expecting the content to be non-null for {}", doc.getFilepath());
        return;
    }

    doc.setDefaultID();
    doc.addSourceURL(link.getAbsoluteURL(), link.getReferrer());
    // This path must already exist
    doc.saveBuffer(new File(doc.textpath));

    if (listener != null) {
        listener.collected(doc, item.getAbsolutePath());
    }
}
use of org.opensextant.xtext.ConvertedDocument in project Xponents by OpenSextant.
the class OLEMessageConverter method conversionImplementation.
/**
 * Converts a MAPI mail message into a {@link ConvertedDocument}: the text body
 * becomes the document text, attachments with binary data are added as raw
 * children, and subject, message date and sender are recorded as title, create
 * date and author.
 *
 * @param in  stream over the message; always closed by this method
 * @param doc file the stream belongs to, used to create the resulting document
 * @return the converted message
 * @throws IOException if the message cannot be parsed; the underlying failure is
 *                     attached as the cause
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream in, File doc) throws IOException {
    ConvertedDocument msgDoc = new ConvertedDocument(doc);
    try {
        MAPIMessage msg = new MAPIMessage(in);

        // If your message is Latin-1 text... there is no real easy way to get bytes of raw message text
        // to ensure it is UTF-8
        // By default this may be UTF-8 text.
        msgDoc.setText(msg.getTextBody());
        /* Would prefer not to set encoding here without knowing or attempting to derive it properly */
        msgDoc.setEncoding(ConvertedDocument.OUTPUT_ENCODING);

        AttachmentChunks[] chunks = msg.getAttachmentFiles();
        for (AttachmentChunks c : chunks) {
            if (c.attachData == null) {
                // No byte payload on this attachment (e.g. an embedded message).
                // Skip it rather than failing the entire message with a
                // NullPointerException.
                continue;
            }
            Content child = new Content();
            child.id = getAttachmentName(c.attachLongFileName, c.attachFileName);
            child.content = c.attachData.getValue();
            msgDoc.addRawChild(child);
        }

        // Get a subject line.
        try {
            msgDoc.addTitle(msg.getSubject());
        } catch (ChunkNotFoundException err) {
            msgDoc.addTitle("(MIME error: unable to get subject)");
        }

        // Get a date line.
        try {
            msgDoc.addCreateDate(msg.getMessageDate());
        } catch (ChunkNotFoundException ignored) {
            // Best effort: a message without a date chunk simply gets no create date.
        }

        // Get author.
        try {
            msgDoc.addAuthor(msg.getDisplayFrom());
        } catch (ChunkNotFoundException err) {
            msgDoc.addAuthor("(MIME error: unable to get sender)");
        }

        return msgDoc;
    } catch (Exception xerr) {
        throw new IOException("Unable to parse content", xerr);
    } finally {
        in.close();
    }
}
use of org.opensextant.xtext.ConvertedDocument in project Xponents by OpenSextant.
the class PDFConverter method convert.
/**
 * Extracts the text and document-information metadata of a PDF file.
 * Implementation is informed by PDFBox authors.
 *
 * <p>The text is produced by this converter's {@code stripper}, which is reset
 * before each document. An encrypted document is flagged with the property
 * {@code encrypted=YES} and extraction is attempted anyway.
 *
 * @param doc the PDF file to convert
 * @return the converted document with text and any available metadata
 * @throws IOException if the PDF cannot be loaded or its text cannot be written out
 */
@Override
public synchronized ConvertedDocument convert(java.io.File doc) throws IOException {
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     * http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */
    /*
     * Adapted from LucenePDFDocument.java from the PDFBox lucene project
     * (author: Ben Litchfield, ben@benlitchfield.com; $Revision: 1.23 $).
     * Of the document-information entries, this adaptation copies author,
     * creation date, creator, keywords, subject and title.
     */
    ConvertedDocument textdoc = new ConvertedDocument(doc);
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(doc);

        // Just try using the default password and move on. Even if the doc is
        // encrypted, apparently you can try; an exception is thrown if it fails.
        if (pdf.isEncrypted()) {
            textdoc.addProperty("encrypted", "YES");
        }

        // Collect the stripped text in memory.
        StringWriter extracted = new StringWriter();
        stripper.resetEngine();
        stripper.writeText(pdf, extracted);

        PDDocumentInformation meta = pdf.getDocumentInformation();
        if (meta != null) {
            copyDocumentInformation(meta, textdoc);
        }

        textdoc.setText(extracted.toString());
        return textdoc;
    } finally {
        if (pdf != null) {
            pdf.close();
        }
    }
}

/**
 * Copies author, creation date, creator tool, keywords, subject and title from
 * the PDF document information onto the converted document, then marks the
 * document encoding as UTF-8. ModificationDate, Producer and Trapped are not copied.
 */
private void copyDocumentInformation(PDDocumentInformation meta, ConvertedDocument textdoc) throws IOException {
    textdoc.addAuthor(meta.getAuthor());
    try {
        textdoc.addCreateDate(meta.getCreationDate());
    } catch (IOException ignored) {
        // ignore, bad date but continue with indexing
    }
    textdoc.addProperty("creator_tool", meta.getCreator());
    textdoc.addProperty("keywords", meta.getKeywords());
    textdoc.addProperty("subject", meta.getSubject());

    // A missing title, or the placeholder "untitled", falls back to the file name.
    String pdfTitle = meta.getTitle();
    boolean hasRealTitle = pdfTitle != null && !"untitled".equalsIgnoreCase(pdfTitle);
    textdoc.addTitle(hasRealTitle ? pdfTitle : textdoc.filename);

    // TODO: Character set is what?
    textdoc.setEncoding("UTF-8");
}
Aggregations