Search in sources :

Example 36 with ParseContext

Use of org.apache.tika.parser.ParseContext in project tika by apache.

Class TikaResource, method produceOutput.

/**
 * Prepares a parse of the request body and returns a {@link StreamingOutput} that,
 * when written, runs the parse and serializes the SAX events to the response stream.
 * <p>
 * Parser, metadata and parse context are set up (and the request is logged) eagerly;
 * the parse itself happens only when {@code write} is invoked on the returned object.
 *
 * @param is          stream handed to the parser
 * @param httpHeaders request headers passed to {@code fillMetadata} and {@code fillParseContext}
 * @param info        request URI information; its path is passed to {@code parse}
 * @param format      value used for the transformer's {@link OutputKeys#METHOD} property
 * @return a streaming output that writes the serialized parse result as UTF-8
 */
private StreamingOutput produceOutput(final InputStream is, final MultivaluedMap<String, String> httpHeaders, final UriInfo info, final String format) {
    final Parser parser = createParser();
    final Metadata metadata = new Metadata();
    final ParseContext context = new ParseContext();
    fillMetadata(parser, metadata, context, httpHeaders);
    fillParseContext(context, httpHeaders, parser);
    logRequest(LOG, info, metadata);
    return new StreamingOutput() {

        @Override
        public void write(OutputStream outputStream) throws IOException, WebApplicationException {
            final Writer responseWriter = new OutputStreamWriter(outputStream, UTF_8);
            final TransformerHandler serializer;
            try {
                // Only handler creation can raise TransformerConfigurationException.
                SAXTransformerFactory transformerFactory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
                serializer = transformerFactory.newTransformerHandler();
            } catch (TransformerConfigurationException e) {
                throw new WebApplicationException(e);
            }
            // Output method comes from the caller; indentation and UTF-8 are fixed.
            serializer.getTransformer().setOutputProperty(OutputKeys.METHOD, format);
            serializer.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.getTransformer().setOutputProperty(OutputKeys.ENCODING, UTF_8.name());
            serializer.setResult(new StreamResult(responseWriter));
            final ContentHandler content = new ExpandedTitleContentHandler(serializer);
            parse(parser, LOG, info.getPath(), is, content, metadata, context);
        }
    };
}
Also used : TransformerHandler(javax.xml.transform.sax.TransformerHandler) StreamResult(javax.xml.transform.stream.StreamResult) TransformerConfigurationException(javax.xml.transform.TransformerConfigurationException) WebApplicationException(javax.ws.rs.WebApplicationException) OutputStream(java.io.OutputStream) Metadata(org.apache.tika.metadata.Metadata) SAXTransformerFactory(javax.xml.transform.sax.SAXTransformerFactory) StreamingOutput(javax.ws.rs.core.StreamingOutput) BoilerpipeContentHandler(org.apache.tika.parser.html.BoilerpipeContentHandler) ExpandedTitleContentHandler(org.apache.tika.sax.ExpandedTitleContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) RichTextContentHandler(org.apache.tika.sax.RichTextContentHandler) ExpandedTitleContentHandler(org.apache.tika.sax.ExpandedTitleContentHandler) Parser(org.apache.tika.parser.Parser) HtmlParser(org.apache.tika.parser.html.HtmlParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DigestingParser(org.apache.tika.parser.DigestingParser) ParseContext(org.apache.tika.parser.ParseContext) OutputStreamWriter(java.io.OutputStreamWriter) Writer(java.io.Writer) OutputStreamWriter(java.io.OutputStreamWriter)

Example 37 with ParseContext

Use of org.apache.tika.parser.ParseContext in project tika by apache.

Class UnpackerResource, method process.

/**
 * Parses the stream with an embedded-document extractor registered in the parse
 * context and returns the collected files keyed by name.
 * <p>
 * When {@code saveAll} is set, the document text and a CSV rendering of the metadata
 * are added to the result under {@code TEXT_FILENAME} and {@code META_FILENAME}.
 * When it is not set and the counter handed to the extractor is still zero after
 * parsing, the request is answered with {@code 204 No Content}.
 *
 * @param is          stream to parse
 * @param httpHeaders request headers; their request-header map is passed to {@code fillMetadata}
 * @param info        request URI information used for logging and as the parse path
 * @param saveAll     whether to also capture the text and metadata of the container document
 * @return map from file name to file content
 * @throws WebApplicationException with {@code NO_CONTENT} when nothing was counted and
 *                                 {@code saveAll} is {@code false}
 * @throws Exception whatever the helpers called here propagate
 */
private Map<String, byte[]> process(InputStream is, @Context HttpHeaders httpHeaders, @Context UriInfo info, boolean saveAll) throws Exception {
    final Metadata metadata = new Metadata();
    final ParseContext pc = new ParseContext();
    Parser parser = TikaResource.createParser();
    if (parser instanceof DigestingParser) {
        // digesting is unnecessary for unwrapping, so use the wrapped parser directly
        parser = ((DigestingParser) parser).getWrappedParser();
    }
    TikaResource.fillMetadata(parser, metadata, pc, httpHeaders.getRequestHeaders());
    TikaResource.logRequest(LOG, info, metadata);

    // The text buffer is only written to when saveAll is requested.
    final ByteArrayOutputStream text = new ByteArrayOutputStream();
    final ContentHandler ch = saveAll
            ? new BodyContentHandler(new RichTextContentHandler(new OutputStreamWriter(text, UTF_8)))
            : new DefaultHandler();

    final Map<String, byte[]> files = new HashMap<>();
    final MutableInt count = new MutableInt();
    // NOTE(review): the extractor presumably fills `files` and increments `count` — confirm.
    pc.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(count, files));
    TikaResource.parse(parser, LOG, info.getPath(), is, ch, metadata, pc);

    if (!saveAll) {
        if (count.intValue() == 0) {
            throw new WebApplicationException(Response.Status.NO_CONTENT);
        }
        return files;
    }

    files.put(TEXT_FILENAME, text.toByteArray());
    final ByteArrayOutputStream metaStream = new ByteArrayOutputStream();
    metadataToCsv(metadata, metaStream);
    files.put(META_FILENAME, metaStream.toByteArray());
    return files;
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) WebApplicationException(javax.ws.rs.WebApplicationException) HashMap(java.util.HashMap) Metadata(org.apache.tika.metadata.Metadata) DigestingParser(org.apache.tika.parser.DigestingParser) ByteArrayOutputStream(java.io.ByteArrayOutputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) RichTextContentHandler(org.apache.tika.sax.RichTextContentHandler) Parser(org.apache.tika.parser.Parser) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) DigestingParser(org.apache.tika.parser.DigestingParser) DefaultHandler(org.xml.sax.helpers.DefaultHandler) RichTextContentHandler(org.apache.tika.sax.RichTextContentHandler) MutableInt(org.apache.commons.lang.mutable.MutableInt) ParseContext(org.apache.tika.parser.ParseContext) OutputStreamWriter(java.io.OutputStreamWriter)

Example 38 with ParseContext

Use of org.apache.tika.parser.ParseContext in project tika by apache.

Class HtmlParserTest, method testIgnoreCharsetDetectorLanguage.

/**
 * Test case for TIKA-339: the content language already present in the metadata
 * must survive an HTML parse and not be replaced by a language reported by the
 * CharsetDetector.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a>
 */
@Test
public void testIgnoreCharsetDetectorLanguage() throws Exception {
    // Pre-set the language so the assertion can detect an overwrite.
    Metadata metadata = new Metadata();
    metadata.add(Metadata.CONTENT_LANGUAGE, "en");

    byte[] html = "<html><title>Simple Content</title><body></body></html>".getBytes(UTF_8);
    ByteArrayInputStream input = new ByteArrayInputStream(html);
    HtmlParser htmlParser = new HtmlParser();
    htmlParser.parse(input, new BodyContentHandler(), metadata, new ParseContext());

    assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 39 with ParseContext

Use of org.apache.tika.parser.ParseContext in project tika by apache.

Class HtmlParserTest, method testImgUrlExtraction.

/**
 * Test case for TIKA-463: elements that carry URLs must not be skipped.
 * Parses a document with a {@code <base>} element and a relative image source and
 * checks that the serialized output contains the image URL resolved against the base.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
 */
@Test
public void testImgUrlExtraction() throws Exception {
    final String test = "<html><head><title>Title</title>"
            + "<base href=\"http://domain.com\" />"
            + "</head><body><img src=\"image.jpg\" /></body></html>";

    StringWriter output = new StringWriter();
    ByteArrayInputStream input = new ByteArrayInputStream(test.getBytes(UTF_8));
    HtmlParser htmlParser = new HtmlParser();
    htmlParser.parse(input, makeHtmlTransformer(output), new Metadata(), new ParseContext());

    // <img> tag should exist, with fully resolved URL
    String result = output.toString();
    boolean hasResolvedSrc = Pattern.compile("(?s).*src=\"http://domain.com/image.jpg\".*$").matcher(result).matches();
    assertTrue(hasResolvedSrc);
}
Also used : StringWriter(java.io.StringWriter) ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 40 with ParseContext

Use of org.apache.tika.parser.ParseContext in project tika by apache.

Class BatchProcessBuilder, method build.

/**
 * Builds a BatchProcess from runtime arguments and an input stream of a
 * configuration file. With the exception of the QueueBuilder, the builders
 * choose how to adjudicate between runtime arguments and the elements in the
 * configuration file.
 * <p>
 * The stream is parsed into a DOM document and the document element is handed to
 * the node-based {@code build} overload.
 * <p>
 * This does not close the InputStream!
 *
 * @param is                input stream of the configuration file
 * @param runtimeAttributes incoming runtime attributes
 * @return batch process
 * @throws java.io.IOException if reading fails, or wrapping a TikaException or
 *                             SAXException raised while obtaining the builder or parsing
 */
public BatchProcess build(InputStream is, Map<String, String> runtimeAttributes) throws IOException {
    final Document configDocument;
    try {
        configDocument = new ParseContext().getDocumentBuilder().parse(is);
    } catch (TikaException | SAXException e) {
        throw new IOExceptionWithCause(e);
    }
    // Keep the static type as Node so the same build(...) overload is selected.
    final Node rootNode = configDocument.getDocumentElement();
    return build(rootNode, runtimeAttributes);
}
Also used : IOExceptionWithCause(org.apache.tika.io.IOExceptionWithCause) TikaException(org.apache.tika.exception.TikaException) DocumentBuilder(javax.xml.parsers.DocumentBuilder) Node(org.w3c.dom.Node) ParseContext(org.apache.tika.parser.ParseContext) Document(org.w3c.dom.Document) SAXException(org.xml.sax.SAXException)

Aggregations

ParseContext (org.apache.tika.parser.ParseContext): 338, Metadata (org.apache.tika.metadata.Metadata): 283, Test (org.junit.Test): 260, InputStream (java.io.InputStream): 195, BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 195, TikaTest (org.apache.tika.TikaTest): 186, ContentHandler (org.xml.sax.ContentHandler): 164, AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 118, Parser (org.apache.tika.parser.Parser): 109, ByteArrayInputStream (java.io.ByteArrayInputStream): 92, TikaInputStream (org.apache.tika.io.TikaInputStream): 77, DefaultHandler (org.xml.sax.helpers.DefaultHandler): 52, ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest): 31, WordParserTest (org.apache.tika.parser.microsoft.WordParserTest): 31, TikaException (org.apache.tika.exception.TikaException): 30, StringWriter (java.io.StringWriter): 26, IOException (java.io.IOException): 25, SAXException (org.xml.sax.SAXException): 25, CompositeParser (org.apache.tika.parser.CompositeParser): 22, TeeContentHandler (org.apache.tika.sax.TeeContentHandler): 20