Search in sources :

Example 11 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class TextExtractor method processControlWord.

/**
 * Handles a control word that takes a numeric parameter.
 *
 * <p>The control word itself is not passed in; it is tested through the
 * no-receiver {@code equals(String)} helper (presumably comparing against the
 * most recently parsed control word -- confirm against the helper).
 *
 * <p>Processing is split in three parts:
 * <ul>
 *   <li>when {@code inHeader} is set: charset/font defaults, document
 *       statistics, date components, font table entries and list table
 *       entries;</li>
 *   <li>otherwise: bold/italic switch-off, font changes and list state;</li>
 *   <li>in both cases: the {@code u}, {@code uc} and {@code bin} control
 *       words.</li>
 * </ul>
 *
 * @param param the numeric parameter that followed the control word
 * @param in    the input stream; only read here for {@code bin}, where
 *              {@code param} bytes are either handed to the embedded object
 *              handler or skipped
 * @throws IOException   declared; {@code IOUtils.skipFully} reads from {@code in}
 * @throws SAXException  declared; presumably raised by the output helpers
 *                       ({@code pushText}/{@code start}/{@code end}) -- confirm
 * @throws TikaException declared; note that a TikaException from
 *                       {@code embObjHandler.writeBytes} is caught and recorded
 *                       in the metadata rather than propagated
 */
private void processControlWord(int param, PushbackInputStream in) throws IOException, SAXException, TikaException {
    // NOTE: the control word is matched with a linear chain of equals(...)
    // comparisons. The original comment here was truncated; it mentioned
    // JFlex-style single-pass FSM matching as an alternative.
    if (inHeader) {
        if (equals("ansicpg")) {
            // ANSI codepage: only override the global charset when the
            // code page number is present in ANSICPG_MAP
            Charset cs = ANSICPG_MAP.get(param);
            if (cs != null) {
                globalCharset = cs;
            }
        } else if (equals("deff")) {
            // Default font
            globalDefaultFont = param;
        } else if (equals("nofpages")) {
            // Document statistics are copied straight into the metadata
            metadata.add(Office.PAGE_COUNT, Integer.toString(param));
        } else if (equals("nofwords")) {
            metadata.add(Office.WORD_COUNT, Integer.toString(param));
        } else if (equals("nofchars")) {
            metadata.add(Office.CHARACTER_COUNT, Integer.toString(param));
        } else if (equals("yr")) {
            // Date components are only stored here; they are consumed elsewhere
            year = param;
        } else if (equals("mo")) {
            month = param;
        } else if (equals("dy")) {
            day = param;
        } else if (equals("hr")) {
            hour = param;
        } else if (equals("min")) {
            minute = param;
        }
        if (fontTableState == 1) {
            // State 1 appears to mean "inside the font table" (TODO confirm):
            // record mappings of font number (fN) to its fcharset
            if (groupState.depth < fontTableDepth) {
                // The group depth dropped below the font table's depth, so
                // move on to state 2
                fontTableState = 2;
            } else {
                if (equals("f")) {
                    // Start new font definition
                    curFontID = param;
                } else if (equals("fcharset")) {
                    // Unknown fcharset values are ignored, leaving the font unmapped
                    Charset cs = FCHARSET_MAP.get(param);
                    if (cs != null) {
                        fontToCharset.put(curFontID, cs);
                    }
                }
            }
        }
        if (currentList != null) {
            // A list definition is currently being populated
            if (equals("listid")) {
                // The list is registered in the table as soon as its id is known
                currentList.id = param;
                currentListTable.put(currentList.id, currentList);
            } else if (equals("listtemplateid")) {
                currentList.templateID = param;
            } else if (equals("levelnfc") || equals("levelnfcn")) {
                //sanity check to make sure list information isn't corrupt
                if (listTableLevel > -1 && listTableLevel < currentList.numberType.length) {
                    currentList.numberType[listTableLevel] = param;
                }
            }
        }
    } else {
        // In document
        if (equals("b")) {
            // b0: bold with a parameter is only expected as "bold off"
            assert param == 0;
            if (groupState.bold) {
                pushText();
                // Close an open italic element before closing bold and reopen
                // it afterwards, so the emitted elements stay properly nested
                if (groupState.italic) {
                    end("i");
                }
                end("b");
                if (groupState.italic) {
                    start("i");
                }
                groupState.bold = false;
            }
        } else if (equals("i")) {
            // i0: italic with a parameter is only expected as "italic off"
            assert param == 0;
            if (groupState.italic) {
                pushText();
                end("i");
                groupState.italic = false;
            }
        } else if (equals("f")) {
            // Change current font
            Charset fontCharset = fontToCharset.get(param);
            // Push any buffered text before changing
            // font:
            pushText();
            if (fontCharset != null) {
                groupState.fontCharset = fontCharset;
            } else {
                // DOC ERROR: font change referenced a
                // non-table'd font number
                // TODO: log a warning?  Throw an exc?
                groupState.fontCharset = null;
            }
        } else if (equals("ls")) {
            groupState.list = param;
        } else if (equals("lslvl")) {
            groupState.listLevel = param;
        }
    }
    // The control words below are handled whether or not we are in the
    // header, because text in the header can be unicode escaped as well:
    if (equals("u")) {
        // Unicode escape: emitted unless the group is ignored and neither
        // the sv nor the sn flag is set
        if (!groupState.ignore || groupState.sv || groupState.sn) {
            // Keep only the low 16 bits of the parameter as the UTF-16 code unit
            final char utf16CodeUnit = (char) (param & 0xffff);
            addOutputChar(utf16CodeUnit);
        }
        // After seeing a unicode escape we must
        // skip the next ucSkip ansi chars (the
        // "unicode shadow")
        ansiSkip = groupState.ucSkip;
    } else if (equals("uc")) {
        // Change unicode shadow length
        groupState.ucSkip = param;
    } else if (equals("bin")) {
        // param is the number of raw bytes that follow in the stream
        if (param >= 0) {
            if (groupState.pictDepth == 1) {
                try {
                    embObjHandler.writeBytes(in, param);
                } catch (IOException | TikaException e) {
                    // Best effort: record the failure in the metadata and
                    // reset the handler instead of aborting the whole parse
                    EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
                    embObjHandler.reset();
                }
            } else {
                // Not at picture depth 1: the binary payload is skipped
                IOUtils.skipFully(in, param);
            }
        } else {
        // log some warning?
        // (a negative byte count is currently ignored silently)
        }
    }
}
Also used : TikaException(org.apache.tika.exception.TikaException) Charset(java.nio.charset.Charset) IOException(java.io.IOException)

Example 12 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class RTFParser method parse.

/**
 * Parses an RTF document: sets the content type on the metadata and runs a
 * {@code TextExtractor} over the stream, emitting output through an
 * {@code XHTMLContentHandler} wrapped around {@code handler}.
 *
 * <p>The extractor reads through a {@code TaggedInputStream} wrapper so that
 * an {@link IOException} raised by the underlying stream can be told apart
 * from one raised during parsing. The former is rethrown as-is by
 * {@code tagged.throwIfCauseOf(e)}; any other {@code IOException} is wrapped
 * in a {@code TikaException}.
 *
 * @param stream   the RTF document to read
 * @param handler  receives the content events
 * @param metadata receives the content type; also passed to the extractor
 *                 and the embedded object handler
 * @param context  parse context handed to the embedded object handler
 * @throws IOException   if the underlying stream fails while being read
 * @throws SAXException  declared; presumably raised by the content handler
 * @throws TikaException if extraction fails with an {@code IOException} that
 *                       did not originate from the underlying stream
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    metadata.set(Metadata.CONTENT_TYPE, "application/rtf");
    TaggedInputStream tagged = new TaggedInputStream(stream);
    try {
        XHTMLContentHandler xhtmlHandler = new XHTMLContentHandler(handler, metadata);
        RTFEmbObjHandler embObjHandler = new RTFEmbObjHandler(xhtmlHandler, metadata, context, getMemoryLimitInKb());
        final TextExtractor ert = new TextExtractor(xhtmlHandler, metadata, embObjHandler);
        // Must read via the tagged wrapper: only exceptions that pass through
        // it are tagged, and throwIfCauseOf() below recognises only those.
        ert.extract(tagged);
    } catch (IOException e) {
        // Rethrows e unchanged when it came from the underlying stream
        tagged.throwIfCauseOf(e);
        throw new TikaException("Error parsing an RTF document", e);
    }
}
Also used : TikaException(org.apache.tika.exception.TikaException) TaggedInputStream(org.apache.commons.io.input.TaggedInputStream) IOException(java.io.IOException) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler)

Example 13 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class ZipContainerDetector method detect.

/**
 * Detects the media type of a stream from its first bytes.
 *
 * <p>Up to 1024 bytes are peeked from the stream and passed to
 * {@code detectArchiveFormat}. If that yields a zip archive type and the
 * caller supplied a {@code TikaInputStream}, the result of
 * {@code detectZipFormat} is returned. Otherwise any archive type other than
 * {@code OCTET_STREAM} is returned directly, and as a last resort the prefix
 * is passed to {@code detectCompressorFormat}.
 *
 * @param input    the stream to inspect; may be {@code null}
 * @param metadata not used by this method
 * @return the detected type, or {@code MediaType.OCTET_STREAM} when
 *         {@code input} is {@code null}
 * @throws IOException declared; presumably raised while peeking at the stream
 */
public MediaType detect(InputStream input, Metadata metadata) throws IOException {
    // Without a stream there is nothing to inspect
    if (input == null) {
        return MediaType.OCTET_STREAM;
    }
    TemporaryResources resources = new TemporaryResources();
    try {
        TikaInputStream wrapped = TikaInputStream.get(input, resources);
        // The prefix buffer is sized to be enough for all known formats
        byte[] header = new byte[1024];
        int headerLength = wrapped.peek(header);
        MediaType archiveType = detectArchiveFormat(header, headerLength);
        // Zip detection is only attempted when the caller's own stream is a
        // TikaInputStream
        boolean zipOnTikaStream = PackageParser.isZipArchive(archiveType) && TikaInputStream.isTikaInputStream(input);
        if (zipOnTikaStream) {
            return detectZipFormat(wrapped);
        }
        if (archiveType.equals(MediaType.OCTET_STREAM)) {
            // No archive format recognised: fall back to compressor detection
            return detectCompressorFormat(header, headerLength);
        }
        return archiveType;
    } finally {
        try {
            resources.dispose();
        } catch (TikaException ignored) {
            // Cleanup is best effort; a failure here must not mask the result
        }
    }
}
Also used : TikaException(org.apache.tika.exception.TikaException) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) MediaType(org.apache.tika.mime.MediaType)

Example 14 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class PRTParser method extractText.

/**
 * Does our best to turn the bytes into text.
 *
 * <p>The bytes are decoded as CP437. The final byte is always dropped, as the
 * text is expected to be null terminated. With {@code trim} set, the text
 * additionally ends at the first null byte, which removes any extra null
 * padding.
 *
 * @param data the raw bytes; an empty array yields an empty string
 * @param trim whether to cut the text at the first null byte
 * @return the decoded text with the known character issues fixed up
 * @throws TikaException if the JVM does not support the CP437 charset; the
 *                       underlying exception is attached as the cause
 */
private String extractText(byte[] data, boolean trim) throws TikaException {
    // An empty buffer has no terminator to strip. Without this guard the
    // length below would be -1 and the String constructor would throw
    // StringIndexOutOfBoundsException.
    if (data.length == 0) {
        return "";
    }
    // The text is always stored null terminated, but sometimes
    //  may have extra null padding too
    int length = data.length - 1;
    if (trim) {
        for (int i = 0; i < data.length; i++) {
            if (data[i] == 0) {
                length = i;
                break;
            }
        }
    }
    // We believe that the text is basically stored as CP437
    // That said, there are a few characters slightly wrong for that...
    String text;
    try {
        text = new String(data, 0, length, "cp437");
    } catch (UnsupportedEncodingException e) {
        // Keep the original exception as the cause so the failure can be diagnosed
        throw new TikaException("JVM Broken, core codepage CP437 missing!", e);
    }
    // Fix up the known character issues
    text = text.replace("φ", "Ø");
    // All done, as best as we can!
    return text;
}
Also used : TikaException(org.apache.tika.exception.TikaException) UnsupportedEncodingException(java.io.UnsupportedEncodingException)

Example 15 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class TikaEncodingDetectorTest method testConfigurabilityOfUserSpecified.

/**
 * Loads the TIKA-2273 test configuration and checks that every encoding
 * detecting parser found under the resulting {@code AutoDetectParser} uses a
 * {@code CompositeEncodingDetector} with exactly two children, none of whose
 * class names contain "cu4j". Then confirms that parsing
 * {@code english.cp500.txt} fails with a "Failed to detect" TikaException.
 */
@Test
public void testConfigurabilityOfUserSpecified() throws Exception {
    TikaConfig tikaConfig = new TikaConfig(getResourceAsStream("/org/apache/tika/config/TIKA-2273-encoding-detector-outside-static-init.xml"));
    AutoDetectParser autoDetectParser = new AutoDetectParser(tikaConfig);
    // All static and non-static parsers must share the same encoding detector setup
    List<Parser> detectingParsers = new ArrayList<>();
    findEncodingDetectionParsers(autoDetectParser, detectingParsers);
    assertEquals(3, detectingParsers.size());
    for (Parser candidate : detectingParsers) {
        AbstractEncodingDetectorParser detectingParser = (AbstractEncodingDetectorParser) candidate;
        EncodingDetector configured = detectingParser.getEncodingDetector();
        assertTrue(configured instanceof CompositeEncodingDetector);
        CompositeEncodingDetector composite = (CompositeEncodingDetector) configured;
        assertEquals(2, composite.getDetectors().size());
        for (EncodingDetector child : composite.getDetectors()) {
            assertNotContained("cu4j", child.getClass().getCanonicalName());
        }
    }
    // With this configuration, detection of the cp500 file is expected to fail
    try {
        Metadata unused = getXML("english.cp500.txt", autoDetectParser).metadata;
        fail("can't detect w/out ICU");
    } catch (TikaException expected) {
        assertContains("Failed to detect", expected.getMessage());
    }
}
Also used : Icu4jEncodingDetector(org.apache.tika.parser.txt.Icu4jEncodingDetector) NonDetectingEncodingDetector(org.apache.tika.detect.NonDetectingEncodingDetector) UniversalEncodingDetector(org.apache.tika.parser.txt.UniversalEncodingDetector) CompositeEncodingDetector(org.apache.tika.detect.CompositeEncodingDetector) EncodingDetector(org.apache.tika.detect.EncodingDetector) HtmlEncodingDetector(org.apache.tika.parser.html.HtmlEncodingDetector) CompositeEncodingDetector(org.apache.tika.detect.CompositeEncodingDetector) TikaException(org.apache.tika.exception.TikaException) ArrayList(java.util.ArrayList) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) AbstractEncodingDetectorParser(org.apache.tika.parser.AbstractEncodingDetectorParser) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TXTParser(org.apache.tika.parser.txt.TXTParser) AbstractEncodingDetectorParser(org.apache.tika.parser.AbstractEncodingDetectorParser) Test(org.junit.Test)

Aggregations

TikaException (org.apache.tika.exception.TikaException)142 IOException (java.io.IOException)54 SAXException (org.xml.sax.SAXException)42 InputStream (java.io.InputStream)37 TikaInputStream (org.apache.tika.io.TikaInputStream)33 Metadata (org.apache.tika.metadata.Metadata)33 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)29 Test (org.junit.Test)19 ParseContext (org.apache.tika.parser.ParseContext)18 ContentHandler (org.xml.sax.ContentHandler)17 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)16 CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream)15 TemporaryResources (org.apache.tika.io.TemporaryResources)15 MediaType (org.apache.tika.mime.MediaType)13 Parser (org.apache.tika.parser.Parser)13 ByteArrayInputStream (java.io.ByteArrayInputStream)12 ArrayList (java.util.ArrayList)11 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)11 File (java.io.File)8 EmbeddedContentHandler (org.apache.tika.sax.EmbeddedContentHandler)8