Search in sources :

Example 61 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class JackcessExtractor method handleOLE.

/**
 * Emits the content of the OLE blob stored in column {@code cName} of {@code row}.
 * <p>
 * Dispatches on the blob's content type: a LINK writes its link path as characters;
 * SIMPLE_PACKAGE and OTHER wrap the content stream in a {@link TikaInputStream} and
 * pass it to {@code handleEmbeddedResource}; COMPOUND_STORAGE is delegated to
 * {@code handleCompoundContent}. A missing blob or missing content is silently skipped.
 * <p>
 * If the embedded stream cannot be opened, the exception is recorded on
 * {@code parentMetadata} and the content is skipped rather than being handed on
 * as a {@code null} stream.
 *
 * @param row   row holding the OLE blob
 * @param cName name of the column to read the blob from
 * @param xhtml handler receiving the extracted output
 * @throws IOException   propagated from reading the blob or handling the embedded resource
 * @throws SAXException  propagated from the content handler
 * @throws TikaException propagated from embedded-resource handling
 */
private void handleOLE(Row row, String cName, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
    OleBlob blob = row.getBlob(cName);
    //lifted shamelessly from Jackcess's OleBlobTest
    if (blob == null) {
        return;
    }
    OleBlob.Content content = blob.getContent();
    if (content == null) {
        return;
    }
    switch(content.getType()) {
        case LINK:
            xhtml.characters(((OleBlob.LinkContent) content).getLinkPath());
            break;
        case SIMPLE_PACKAGE:
            OleBlob.SimplePackageContent spc = (OleBlob.SimplePackageContent) content;
            //TODO: find test file that has this kind of attachment
            //and see if getFilePath or getLocalFilePath is meaningful
            //for TikaCoreProperties.ORIGINAL_RESOURCE_NAME
            TikaInputStream tis = null;
            try {
                tis = TikaInputStream.get(spc.getStream());
            } catch (IOException e) {
                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
                break;
            }
            if (tis != null) {
                try {
                    handleEmbeddedResource(tis,
                            spc.getFileName(), //filename
                            null, //relationshipId
                            spc.getTypeName(), //mediatype
                            xhtml, false);
                } finally {
                    IOUtils.closeQuietly(tis);
                }
            }
            break;
        case OTHER:
            OleBlob.OtherContent oc = (OleBlob.OtherContent) content;
            TikaInputStream ocStream = null;
            try {
                ocStream = TikaInputStream.get(oc.getStream());
            } catch (IOException e) {
                EmbeddedDocumentUtil.recordException(e, parentMetadata);
                //nothing to hand on; skip this content like SIMPLE_PACKAGE does
                break;
            }
            if (ocStream != null) {
                try {
                    handleEmbeddedResource(ocStream,
                            null, //filename
                            null, //relationshipId
                            oc.getTypeName(), //mediatype
                            xhtml, false);
                } finally {
                    IOUtils.closeQuietly(ocStream);
                }
            }
            break;
        case COMPOUND_STORAGE:
            OleBlob.CompoundContent cc = (OleBlob.CompoundContent) content;
            handleCompoundContent(cc, xhtml);
            break;
    }
}
Also used : OleBlob(com.healthmarketscience.jackcess.util.OleBlob) TikaInputStream(org.apache.tika.io.TikaInputStream) IOException(java.io.IOException)

Example 62 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class ISATabUtils method parseStudy.

/**
 * Reads tab-delimited records from {@code stream} and writes them to {@code xhtml}
 * as a table: the first record becomes {@code th} cells inside {@code thead}, every
 * following record becomes a {@code tr} of {@code td} cells inside {@code tbody}.
 * <p>
 * The encoding detector comes from the {@link TikaConfig} in {@code context}, or from
 * the default config when the context has none. The stream is wrapped in a
 * {@link CloseShieldInputStream} before being given to the reader.
 *
 * @param stream   input to read the records from
 * @param xhtml    handler receiving the table markup
 * @param metadata metadata passed to the {@link AutoDetectReader}
 * @param context  parse context consulted for a {@link TikaConfig}
 * @throws IOException   propagated from reading the input
 * @throws TikaException propagated from reader/config setup
 * @throws SAXException  propagated from the content handler
 */
public static void parseStudy(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
    TikaInputStream wrapped = TikaInputStream.get(stream);
    // Fall back to the default configuration when the context supplies none
    TikaConfig contextConfig = context.get(TikaConfig.class);
    TikaConfig config = (contextConfig != null) ? contextConfig : TikaConfig.getDefaultConfig();
    try (AutoDetectReader in = new AutoDetectReader(new CloseShieldInputStream(wrapped), metadata, config.getEncodingDetector());
        CSVParser parser = new CSVParser(in, CSVFormat.TDF)) {
        Iterator<CSVRecord> rows = parser.iterator();
        xhtml.startElement("table");

        // Header: first record, if there is one
        xhtml.startElement("thead");
        if (rows.hasNext()) {
            for (String heading : rows.next()) {
                xhtml.startElement("th");
                xhtml.characters(heading);
                xhtml.endElement("th");
            }
        }
        xhtml.endElement("thead");

        // Body: all remaining records
        xhtml.startElement("tbody");
        while (rows.hasNext()) {
            xhtml.startElement("tr");
            for (String cell : rows.next()) {
                xhtml.startElement("td");
                xhtml.characters(cell);
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");

        xhtml.endElement("table");
    }
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) AutoDetectReader(org.apache.tika.detect.AutoDetectReader) CSVParser(org.apache.commons.csv.CSVParser) TikaInputStream(org.apache.tika.io.TikaInputStream) CSVRecord(org.apache.commons.csv.CSVRecord) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 63 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class ISATabUtils method parseAssay.

/**
 * Reads tab-delimited records from {@code stream} and writes them to {@code xhtml}
 * as a table: the first record is emitted as {@code th} cells inside {@code thead},
 * each later record as a {@code tr} of {@code td} cells inside {@code tbody}.
 * <p>
 * The encoding detector is taken from the {@link TikaConfig} found in {@code context},
 * or from the default config if none is present. The stream is wrapped in a
 * {@link CloseShieldInputStream} before being given to the reader.
 *
 * @param stream   input to read the records from
 * @param xhtml    handler receiving the table markup
 * @param metadata metadata passed to the {@link AutoDetectReader}
 * @param context  parse context consulted for a {@link TikaConfig}
 * @throws IOException   propagated from reading the input
 * @throws TikaException propagated from reader/config setup
 * @throws SAXException  propagated from the content handler
 */
public static void parseAssay(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
    TikaInputStream source = TikaInputStream.get(stream);
    // Use the context's configuration when available, otherwise the default one
    TikaConfig cfg = context.get(TikaConfig.class);
    if (cfg == null) {
        cfg = TikaConfig.getDefaultConfig();
    }
    try (AutoDetectReader decoded = new AutoDetectReader(new CloseShieldInputStream(source), metadata, cfg.getEncodingDetector());
        CSVParser tsv = new CSVParser(decoded, CSVFormat.TDF)) {
        xhtml.startElement("table");
        Iterator<CSVRecord> records = tsv.iterator();

        // The first record, when present, supplies the header cells
        xhtml.startElement("thead");
        if (records.hasNext()) {
            CSVRecord header = records.next();
            for (String title : header) {
                xhtml.startElement("th");
                xhtml.characters(title);
                xhtml.endElement("th");
            }
        }
        xhtml.endElement("thead");

        // Every remaining record becomes one table row
        xhtml.startElement("tbody");
        for (; records.hasNext(); ) {
            CSVRecord line = records.next();
            xhtml.startElement("tr");
            for (String value : line) {
                xhtml.startElement("td");
                xhtml.characters(value);
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");
        xhtml.endElement("table");
    }
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) AutoDetectReader(org.apache.tika.detect.AutoDetectReader) CSVParser(org.apache.commons.csv.CSVParser) TikaInputStream(org.apache.tika.io.TikaInputStream) CSVRecord(org.apache.commons.csv.CSVRecord) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 64 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class ISArchiveParser method parse.

/**
 * Parses a study file together with the investigation and assay material
 * found alongside it.
 * <p>
 * The input is wrapped in a {@link TikaInputStream} so that it can be accessed as a
 * {@link File}. If {@code location} has not been set yet, it is set to the parent
 * directory of that file (with a trailing separator) and stays set on this instance.
 * The file's name is stored in {@code studyFileName}. Files in the location directory
 * whose names match {@code i_.+\.txt} are passed to {@code parseInvestigation}.
 * <p>
 * The study is read through the {@link TikaInputStream} rather than the raw
 * {@code stream}: obtaining the backing file may already have consumed the raw
 * stream (NOTE(review): per TikaInputStream.getFile() spooling behaviour — confirm
 * against the Tika version in use).
 *
 * @param stream   the study file content
 * @param handler  SAX handler receiving the XHTML output
 * @param metadata document metadata
 * @param context  parse context
 * @throws IOException   propagated from file access or reading
 * @throws SAXException  propagated from the content handler
 * @throws TikaException propagated from the delegated parse steps
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Only own (and later dispose) temporary resources when we create the wrapper ourselves
    TemporaryResources tmp = TikaInputStream.isTikaInputStream(stream) ? null : new TemporaryResources();
    TikaInputStream tis = TikaInputStream.get(stream, tmp);
    try {
        File studyFile = tis.getFile();
        if (this.location == null) {
            this.location = studyFile.getParent() + File.separator;
        }
        this.studyFileName = studyFile.getName();
        File locationFile = new File(location);
        String[] investigationList = locationFile.list(new FilenameFilter() {

            @Override
            public boolean accept(File dir, String name) {
                return name.matches("i_.+\\.txt");
            }
        });
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        parseInvestigation(investigationList, xhtml, metadata, context);
        // Read via tis, not the raw stream, which getFile() may have drained
        parseStudy(tis, xhtml, metadata, context);
        parseAssay(xhtml, metadata, context);
        xhtml.endDocument();
    } finally {
        if (tmp != null) {
            tmp.dispose();
        }
    }
}
Also used : FilenameFilter(java.io.FilenameFilter) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) File(java.io.File)

Example 65 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class JDBCTableReader method handleBlob.

/**
 * Handles one BLOB cell: wraps it in a {@code span} element and, if the embedded
 * document util agrees, parses the blob content as an embedded document.
 * <p>
 * A fresh {@link Metadata} is populated with the table name, column name, row number
 * and an "IS_BLOB" marker before the blob is fetched. If no blob is returned, nothing
 * is written to {@code handler}. The resource name is built from the column name, the
 * row number and the extension reported by the embedded document util. The blob is
 * freed (failures of {@code free()} are ignored) and the stream closed quietly before
 * the {@code span} is ended.
 *
 * @param tableName   name of the table being read
 * @param columnName  name of the BLOB column
 * @param rowNum      row number of the cell
 * @param resultSet   result set positioned on the row
 * @param columnIndex index of the BLOB column in the result set
 * @param handler     handler receiving the output
 * @param context     parse context (not used directly in this method)
 * @throws SQLException propagated from fetching the blob
 * @throws IOException  propagated from stream handling or embedded parsing
 * @throws SAXException propagated from the content handler
 */
protected void handleBlob(String tableName, String columnName, int rowNum, ResultSet resultSet, int columnIndex, ContentHandler handler, ParseContext context) throws SQLException, IOException, SAXException {
    final String rowNumber = Integer.toString(rowNum);

    Metadata blobMetadata = new Metadata();
    blobMetadata.set(Database.TABLE_NAME, tableName);
    blobMetadata.set(Database.COLUMN_NAME, columnName);
    blobMetadata.set(Database.PREFIX + "ROW_NUM", rowNumber);
    blobMetadata.set(Database.PREFIX + "IS_BLOB", "true");

    Blob blob = null;
    TikaInputStream blobStream = null;
    try {
        blob = getBlob(resultSet, columnIndex, blobMetadata);
        if (blob == null) {
            return;
        }
        blobStream = TikaInputStream.get(blob, blobMetadata);

        AttributesImpl spanAttrs = new AttributesImpl();
        spanAttrs.addAttribute("", "type", "type", "CDATA", "blob");
        spanAttrs.addAttribute("", "column_name", "column_name", "CDATA", columnName);
        spanAttrs.addAttribute("", "row_number", "row_number", "CDATA", rowNumber);
        handler.startElement("", "span", "span", spanAttrs);

        String extension = embeddedDocumentUtil.getExtension(blobStream, blobMetadata);
        //normalize just in case something screwy is going on with the column name
        String resourceName = FilenameUtils.normalize(
                FilenameUtils.getName(columnName + "_" + rowNum + extension));
        blobMetadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);

        if (embeddedDocumentUtil.shouldParseEmbedded(blobMetadata)) {
            embeddedDocumentUtil.parseEmbedded(blobStream, handler, blobMetadata, true);
        }
    } finally {
        if (blob != null) {
            try {
                blob.free();
            } catch (SQLException | UnsupportedOperationException e) {
                //swallow
            }
        }
        IOUtils.closeQuietly(blobStream);
    }
    handler.endElement("", "span", "span");
}
Also used : Blob(java.sql.Blob) AttributesImpl(org.xml.sax.helpers.AttributesImpl) SQLException(java.sql.SQLException) Metadata(org.apache.tika.metadata.Metadata) Attributes(org.xml.sax.Attributes) TikaInputStream(org.apache.tika.io.TikaInputStream)

Aggregations

TikaInputStream (org.apache.tika.io.TikaInputStream): 100; Metadata (org.apache.tika.metadata.Metadata): 40; TemporaryResources (org.apache.tika.io.TemporaryResources): 28; IOException (java.io.IOException): 27; TikaException (org.apache.tika.exception.TikaException): 24; XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 23; Test (org.junit.Test): 20; InputStream (java.io.InputStream): 19; File (java.io.File): 15; BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 15; ContentHandler (org.xml.sax.ContentHandler): 14; TikaTest (org.apache.tika.TikaTest): 13; MediaType (org.apache.tika.mime.MediaType): 13; SAXException (org.xml.sax.SAXException): 13; ParseContext (org.apache.tika.parser.ParseContext): 12; ParserContainerExtractor (org.apache.tika.extractor.ParserContainerExtractor): 8; CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream): 6; NPOIFSFileSystem (org.apache.poi.poifs.filesystem.NPOIFSFileSystem): 6; EncryptedDocumentException (org.apache.tika.exception.EncryptedDocumentException): 6; AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 6