Search in sources :

Example 6 with ConvertedDocument

use of org.opensextant.xtext.ConvertedDocument in project Xponents by OpenSextant.

the class MSDocConverter method conversionImplementation.

/**
     * Extracts the paragraph text of a Word document read from {@code input} and
     * returns it as a {@link ConvertedDocument} created for {@code doc}.
     * Each paragraph has its fields stripped, is trimmed, and is followed by a
     * single newline in the resulting text.
     *
     * The stream and the extractor are always closed, including when extraction
     * fails part-way.
     *
     * @param input stream over the Word document; closed by this method
     * @param doc   file the stream was opened from; passed to the ConvertedDocument constructor
     * @return the converted document carrying the extracted text
     * @throws IOException if the extractor cannot read the stream or closing fails
     */
@Override
protected ConvertedDocument conversionImplementation(InputStream input, java.io.File doc) throws IOException {
    org.apache.poi.hwpf.extractor.WordExtractor ex = null;
    try {
        ex = new WordExtractor(input);
        String[] ps = ex.getParagraphText();
        StringBuilder sb = new StringBuilder();
        for (String paragraph : ps) {
            sb.append(WordExtractor.stripFields(paragraph).trim());
            sb.append('\n');
        }
        ConvertedDocument textdoc = new ConvertedDocument(doc);
        textdoc.setText(sb.toString());
        return textdoc;
    } finally {
        // Release both resources on every path; the nested finally guarantees the
        // stream is closed even if closing the extractor throws.
        try {
            if (ex != null) {
                ex.close();
            }
        } finally {
            input.close();
        }
    }
}
Also used : ConvertedDocument(org.opensextant.xtext.ConvertedDocument) WordExtractor(org.apache.poi.hwpf.extractor.WordExtractor) WordExtractor(org.apache.poi.hwpf.extractor.WordExtractor)

Example 7 with ConvertedDocument

use of org.opensextant.xtext.ConvertedDocument in project Xponents by OpenSextant.

the class DefaultWebCrawl method convertContent.

/**
     * Convert and record a downloaded item, given the item and its source URL.
     *
     * When no converter is configured the raw file is handed to the listener
     * (if there is one) and nothing else happens. Otherwise, if the file exists,
     * it is converted, tagged with the link's ID and source URL, saved to its
     * text path, and reported to the listener. A file that does not exist is
     * silently skipped.
     *
     * @param item
     *            item to convert
     * @param link
     *            link representing the original/source
     * @throws IOException
     *             if either argument is null, or on conversion/save error
     * @throws ConfigException
     *             on err
     * @throws NoSuchAlgorithmException
     *             an error that never happens
     */
protected void convertContent(File item, HyperLink link) throws IOException, ConfigException, NoSuchAlgorithmException {
    if (item == null || link == null) {
        throw new IOException("Bad data - null values for file and link...");
    }
    if (converter == null) {
        // No converter: the download itself is the result. Return here regardless of
        // whether a listener exists, so a null converter is never dereferenced below.
        log.debug("Link {} was saved to {}", link.getAbsoluteURL(), item.getAbsolutePath());
        if (listener != null) {
            listener.collected(item);
        }
        return;
    }
    if (!item.exists()) {
        return;
    }
    // Convert the item.
    ConvertedDocument doc = converter.convert(item);
    if (doc == null) {
        log.error("Document was not converted, FILE={}", item);
        return;
    }
    if (doc.textpath == null) {
        log.error("Expecting the content to be non-null for {}", doc.getFilepath());
        return;
    }
    // The document ID comes from the link rather than from doc.setDefaultID().
    doc.setId(link.getId());
    doc.addSourceURL(link.getAbsoluteURL(), link.getReferrer());
    // This path must already exist
    doc.saveBuffer(new File(doc.textpath));
    if (listener != null) {
        listener.collected(doc, item.getAbsolutePath());
    }
}
Also used : IOException(java.io.IOException) ConvertedDocument(org.opensextant.xtext.ConvertedDocument) File(java.io.File)

Example 8 with ConvertedDocument

use of org.opensextant.xtext.ConvertedDocument in project Xponents by OpenSextant.

the class DefaultSharepointCrawl method convertContent.

/**
     * TODO: redesign so both Web crawl and Sharepoint crawl share this common routine:
     * copy copy copy -- see DefaultWebCrawl
     *
     * Convert and record a downloaded item, given the item and its source URL.
     *
     * When no converter is configured the raw file is handed to the listener
     * (if there is one) and nothing else happens. Otherwise, if the file exists,
     * it is converted, given its default ID and the source URL, saved to its
     * text path, and reported to the listener. A file that does not exist is
     * silently skipped.
     *
     * @param item item
     * @param link original URL where item was found
     * @throws IOException if either argument is null, or on conversion/save error
     * @throws ConfigException on err
     * @throws NoSuchAlgorithmException on err
     */
protected void convertContent(File item, HyperLink link) throws IOException, ConfigException, NoSuchAlgorithmException {
    if (item == null || link == null) {
        throw new IOException("Bad data - null values for file and link...");
    }
    if (converter == null) {
        // No converter: the download itself is the result. Return here regardless of
        // whether a listener exists, so a null converter is never dereferenced below.
        log.debug("Link {} was saved to {}", link.getAbsoluteURL(), item.getAbsolutePath());
        if (listener != null) {
            listener.collected(item);
        }
        return;
    }
    if (!item.exists()) {
        return;
    }
    // Convert the item.
    ConvertedDocument doc = converter.convert(item);
    if (doc == null) {
        log.error("Document was not converted, FILE={}", item);
        return;
    }
    if (doc.textpath == null) {
        // Without a text path there is nowhere to save the buffer; report it instead of
        // failing inside the File constructor.
        log.error("Expecting the content to be non-null for {}", doc.getFilepath());
        return;
    }
    doc.setDefaultID();
    doc.addSourceURL(link.getAbsoluteURL(), link.getReferrer());
    // This path must already exist
    doc.saveBuffer(new File(doc.textpath));
    if (listener != null) {
        listener.collected(doc, item.getAbsolutePath());
    }
}
Also used : IOException(java.io.IOException) ConvertedDocument(org.opensextant.xtext.ConvertedDocument) File(java.io.File)

Example 9 with ConvertedDocument

use of org.opensextant.xtext.ConvertedDocument in project Xponents by OpenSextant.

the class OLEMessageConverter method conversionImplementation.

/**
 * Converts an Outlook message read from {@code in} into a {@link ConvertedDocument}:
 * the text body becomes the document text, each attachment that carries raw data is
 * added as a raw child, and subject, message date and sender are recorded as title,
 * create date and author.
 *
 * A missing subject or sender is replaced by a placeholder string; a missing date is
 * simply left unset. The input stream is always closed.
 *
 * @param in  stream over the message; closed by this method
 * @param doc file the stream was opened from; passed to the ConvertedDocument constructor
 * @return the converted message
 * @throws IOException wrapping any failure raised while parsing the message
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream in, File doc) throws IOException {
    ConvertedDocument msgDoc = new ConvertedDocument(doc);
    try {
        MAPIMessage msg = new MAPIMessage(in);
        // If your message is Latin-1 text... there is no real easy way to get bytes of raw message text
        // to ensure it is UTF-8
        // TextTranscodingConverter.setTextAndEncoding(doc, msg.getM);
        // By default this may be UTF-8 text.
        msgDoc.setText(msg.getTextBody());
        /* Would prefer not to set encoding here without knowing  or attempting to derive it properly */
        msgDoc.setEncoding(ConvertedDocument.OUTPUT_ENCODING);
        AttachmentChunks[] chunks = msg.getAttachmentFiles();
        for (AttachmentChunks c : chunks) {
            if (c.attachData == null) {
                // No raw data chunk for this attachment (presumably an embedded
                // message/object -- confirm against POI docs). Skip it rather than
                // letting a NullPointerException abort the whole conversion.
                continue;
            }
            Content child = new Content();
            child.id = getAttachmentName(c.attachLongFileName, c.attachFileName);
            child.content = c.attachData.getValue();
            msgDoc.addRawChild(child);
        }
        // Get a subject line.
        try {
            msgDoc.addTitle(msg.getSubject());
        } catch (ChunkNotFoundException err) {
            msgDoc.addTitle("(MIME error: unable to get subject)");
        }
        // Get a date line.
        try {
            msgDoc.addCreateDate(msg.getMessageDate());
        } catch (ChunkNotFoundException ignored) {
            // No date chunk in the message: leave the create date unset.
        }
        // Get author.
        try {
            msgDoc.addAuthor(msg.getDisplayFrom());
        } catch (ChunkNotFoundException err) {
            msgDoc.addAuthor("(MIME error: unable to get sender)");
        }
        return msgDoc;
    } catch (Exception xerr) {
        throw new IOException("Unable to parse content", xerr);
    } finally {
        in.close();
    }
}
Also used : ChunkNotFoundException(org.apache.poi.hsmf.exceptions.ChunkNotFoundException) MAPIMessage(org.apache.poi.hsmf.MAPIMessage) Content(org.opensextant.xtext.Content) IOException(java.io.IOException) ConvertedDocument(org.opensextant.xtext.ConvertedDocument) AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks) IOException(java.io.IOException) ChunkNotFoundException(org.apache.poi.hsmf.exceptions.ChunkNotFoundException)

Example 10 with ConvertedDocument

use of org.opensextant.xtext.ConvertedDocument in project Xponents by OpenSextant.

the class PDFConverter method convert.

/**
     * Implementation is informed by PDFBox authors.
     *
     * Loads the PDF, writes its text through the shared {@code stripper}, and copies
     * selected document-information fields onto the result. The method is
     * synchronized; the stripper is reset at the start of every extraction.
     *
     * @param doc the PDF file to convert
     * @return the converted document holding the extracted text and, when the PDF
     *         exposes document information, its metadata
     * @throws IOException if the PDF cannot be loaded or its text cannot be written
     */
@Override
public synchronized ConvertedDocument convert(java.io.File doc) throws IOException {
    /*
         * Licensed to the Apache Software Foundation (ASF) under one or more
         * contributor license agreements.  See the NOTICE file distributed with
         * this work for additional information regarding copyright ownership.
         * The ASF licenses this file to You under the Apache License, Version 2.0
         * (the "License"); you may not use this file except in compliance with
         * the License.  You may obtain a copy of the License at
         *
         *      http://www.apache.org/licenses/LICENSE-2.0
         *
         * Unless required by applicable law or agreed to in writing, software
         * distributed under the License is distributed on an "AS IS" BASIS,
         * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
         * See the License for the specific language governing permissions and
         * limitations under the License.
         */
    /**
         * Adapted from LucenePDFDocument.java from PDFBox lucene project
         *
         * This class is used to create a document for the lucene search engine.
         * This should easily plug into the IndexHTML or IndexFiles that comes
         * with the lucene project. This class will populate the following
         * fields.
         * <table> <tr> <th>Lucene Field Name</th> <th>Description</th> </tr>
         * <tr>
         * <td>path</td> <td>File system path if loaded from a file</td> </tr>
         * <tr>
         * <td>url</td> <td>URL to PDF document</td> </tr> <tr>
         * <td>contents</td>
         * <td>Entire contents of PDF document, indexed but not stored</td>
         * </tr>
         * <tr> <td>summary</td> <td>First 500 characters of content</td> </tr>
         * <tr>
         * <td>modified</td> <td>The modified date/time according to the url or
         * path</td> </tr> <tr> <td>uid</td> <td>A unique identifier for the
         * Lucene document.</td> </tr> <tr> <td>CreationDate</td> <td>From PDF
         * meta-data if available</td> </tr> <tr> <td>Creator</td> <td>From PDF
         * meta-data if available</td> </tr> <tr> <td>Keywords</td> <td>From PDF
         * meta-data if available</td> </tr> <tr> <td>ModificationDate</td>
         * <td>From PDF meta-data if available</td> </tr> <tr> <td>Producer</td>
         * <td>From PDF meta-data if available</td> </tr> <tr> <td>Subject</td>
         * <td>From PDF meta-data if available</td> </tr> <tr> <td>Trapped</td>
         * <td>From PDF meta-data if available</td> </tr> <tr>
         * <td>Encrypted</td> <td>From PDF meta-data if available</td> </tr>
         * </table>
         *
         * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
         * @version $Revision: 1.23 $
         *
         * @throws IOException If there is an error parsing the document.
         */
    final ConvertedDocument converted = new ConvertedDocument(doc);
    // If loading fails there is nothing to close, so the try starts after the load.
    final PDDocument pdf = PDDocument.load(doc);
    try {
        // Encrypted documents are only flagged; extraction is still attempted and
        // any failure surfaces as an exception from the stripper.
        if (pdf.isEncrypted()) {
            converted.addProperty("encrypted", "YES");
        }

        // Collect the text content in memory.
        final StringWriter extracted = new StringWriter();
        stripper.resetEngine();
        stripper.writeText(pdf, extracted);

        final PDDocumentInformation meta = pdf.getDocumentInformation();
        if (meta != null) {
            converted.addAuthor(meta.getAuthor());
            try {
                converted.addCreateDate(meta.getCreationDate());
            } catch (IOException badDate) {
                // Unreadable creation date: carry on without it.
            }
            converted.addProperty("creator_tool", meta.getCreator());
            converted.addProperty("keywords", meta.getKeywords());
            // ModificationDate, Producer and Trapped are not recorded.
            converted.addProperty("subject", meta.getSubject());

            // Fall back to the file name when the PDF has no usable title.
            final String pdfTitle = meta.getTitle();
            final boolean untitled = pdfTitle == null || "untitled".equalsIgnoreCase(pdfTitle);
            converted.addTitle(untitled ? converted.filename : pdfTitle);

            // TODO: Character set is what?
            converted.setEncoding("UTF-8");
        }

        converted.setText(extracted.toString());
        return converted;
    } finally {
        pdf.close();
    }
}
Also used : StringWriter(java.io.StringWriter) PDDocument(org.apache.pdfbox.pdmodel.PDDocument) IOException(java.io.IOException) ConvertedDocument(org.opensextant.xtext.ConvertedDocument) PDDocumentInformation(org.apache.pdfbox.pdmodel.PDDocumentInformation)

Aggregations

ConvertedDocument (org.opensextant.xtext.ConvertedDocument)17 IOException (java.io.IOException)11 Content (org.opensextant.xtext.Content)4 File (java.io.File)3 Metadata (org.apache.tika.metadata.Metadata)3 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)3 HashMap (java.util.HashMap)2 Test (org.junit.Test)2 CharsetMatch (com.ibm.icu.text.CharsetMatch)1 StringWriter (java.io.StringWriter)1 URL (java.net.URL)1 ParseException (java.text.ParseException)1 MimeType (javax.activation.MimeType)1 MessagingException (javax.mail.MessagingException)1 PDDocument (org.apache.pdfbox.pdmodel.PDDocument)1 PDDocumentInformation (org.apache.pdfbox.pdmodel.PDDocumentInformation)1 MAPIMessage (org.apache.poi.hsmf.MAPIMessage)1 AttachmentChunks (org.apache.poi.hsmf.datatypes.AttachmentChunks)1 ChunkNotFoundException (org.apache.poi.hsmf.exceptions.ChunkNotFoundException)1 WordExtractor (org.apache.poi.hwpf.extractor.WordExtractor)1