Search in sources :

Example 46 with ContentHandler

use of org.xml.sax.ContentHandler in project spring-framework by spring-projects.

the class AbstractStaxXMLReaderTestCase method contentHandlerNoNamespacesPrefixes.

/**
 * Parses the test input once with the standard reader and once with the StAX-based reader,
 * both configured with the {@code namespaces} feature off and the {@code namespace-prefixes}
 * feature on, and then verifies that the mock handler attached to the StAX reader received
 * the same invocations as {@code standardContentHandler}.
 */
@Test
public void contentHandlerNoNamespacesPrefixes() throws Exception {
    final String namespacesFeature = "http://xml.org/sax/features/namespaces";
    final String namespacePrefixesFeature = "http://xml.org/sax/features/namespace-prefixes";

    // Reference run. NOTE(review): standardContentHandler is presumably registered on
    // standardReader during test setup - that wiring is not visible here.
    standardReader.setFeature(namespacesFeature, false);
    standardReader.setFeature(namespacePrefixesFeature, true);
    InputSource referenceInput = new InputSource(createTestInputStream());
    standardReader.parse(referenceInput);

    // Run under test: the StAX reader is built directly over a second copy of the test
    // stream, so parse() is handed an empty InputSource.
    AbstractStaxXMLReader readerUnderTest = createStaxXmlReader(createTestInputStream());
    ContentHandler recordingHandler = mockContentHandler();
    readerUnderTest.setFeature(namespacesFeature, false);
    readerUnderTest.setFeature(namespacePrefixesFeature, true);
    readerUnderTest.setContentHandler(recordingHandler);
    InputSource emptyInput = new InputSource();
    readerUnderTest.parse(emptyInput);

    verifyIdenticalInvocations(standardContentHandler, recordingHandler);
}
Also used : InputSource(org.xml.sax.InputSource) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test)

Example 47 with ContentHandler

use of org.xml.sax.ContentHandler in project spring-framework by spring-projects.

the class AbstractStaxXMLReaderTestCase method mockContentHandler.

/**
 * Builds a Mockito mock of {@link ContentHandler} with custom answers installed for
 * {@code characters}, {@code ignorableWhitespace} and {@code startElement}.
 *
 * <p>For {@code startElement}, the {@link Attributes} argument recorded on the invocation is
 * replaced by an {@link AttributesImpl} copy of itself.
 * NOTE(review): presumably this (and {@code CopyCharsAnswer} for the character callbacks)
 * exists so that later verification compares snapshots rather than parser-owned objects
 * that may be reused between events - confirm against {@code CopyCharsAnswer}.
 *
 * @return the stubbed mock handler
 * @throws Exception declared because the stubbed {@link ContentHandler} methods throw
 *         checked exceptions
 */
protected final ContentHandler mockContentHandler() throws Exception {
    ContentHandler handlerMock = mock(ContentHandler.class);

    willAnswer(new CopyCharsAnswer())
            .given(handlerMock)
            .characters(any(char[].class), anyInt(), anyInt());
    willAnswer(new CopyCharsAnswer())
            .given(handlerMock)
            .ignorableWhitespace(any(char[].class), anyInt(), anyInt());

    Answer<Object> snapshotAttributes = new Answer<Object>() {

        @Override
        public Object answer(InvocationOnMock invocation) throws Throwable {
            // Index 3 is the Attributes parameter of startElement(uri, localName, qName, atts).
            Object[] callArguments = invocation.getArguments();
            callArguments[3] = new AttributesImpl((Attributes) callArguments[3]);
            return null;
        }
    };
    willAnswer(snapshotAttributes)
            .given(handlerMock)
            .startElement(anyString(), anyString(), anyString(), any(Attributes.class));

    return handlerMock;
}
Also used : AttributesImpl(org.xml.sax.helpers.AttributesImpl) InvocationOnMock(org.mockito.invocation.InvocationOnMock) Attributes(org.xml.sax.Attributes) ContentHandler(org.xml.sax.ContentHandler)

Example 48 with ContentHandler

use of org.xml.sax.ContentHandler in project spring-framework by spring-projects.

the class AbstractStaxXMLReaderTestCase method contentHandlerNamespacesPrefixes.

/**
 * Parses the test input once with the standard reader and once with the StAX-based reader,
 * both configured with the {@code namespaces} and {@code namespace-prefixes} features on,
 * and then verifies that the mock handler attached to the StAX reader received the same
 * invocations as {@code standardContentHandler}.
 */
@Test
public void contentHandlerNamespacesPrefixes() throws Exception {
    final String namespacesFeature = "http://xml.org/sax/features/namespaces";
    final String namespacePrefixesFeature = "http://xml.org/sax/features/namespace-prefixes";

    // Reference run. NOTE(review): standardContentHandler is presumably registered on
    // standardReader during test setup - that wiring is not visible here.
    standardReader.setFeature(namespacesFeature, true);
    standardReader.setFeature(namespacePrefixesFeature, true);
    InputSource referenceInput = new InputSource(createTestInputStream());
    standardReader.parse(referenceInput);

    // Run under test: the StAX reader is built directly over a second copy of the test
    // stream, so parse() is handed an empty InputSource.
    AbstractStaxXMLReader readerUnderTest = createStaxXmlReader(createTestInputStream());
    ContentHandler recordingHandler = mockContentHandler();
    readerUnderTest.setFeature(namespacesFeature, true);
    readerUnderTest.setFeature(namespacePrefixesFeature, true);
    readerUnderTest.setContentHandler(recordingHandler);
    InputSource emptyInput = new InputSource();
    readerUnderTest.parse(emptyInput);

    verifyIdenticalInvocations(standardContentHandler, recordingHandler);
}
Also used : InputSource(org.xml.sax.InputSource) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test)

Example 49 with ContentHandler

use of org.xml.sax.ContentHandler in project ddf by codice.

the class TikaInputTransformer method transform.

/**
 * Transforms an input stream into a {@link Metacard} by running it through Tika.
 *
 * <p>The input is first copied into a temporary file-backed buffer so it can be read more
 * than once: once for parsing and, for content types starting with {@code "image"}, once
 * more for thumbnail creation. The XML output of the parse becomes the metacard's metadata
 * text (passed through {@code transformToXml} when {@code templates} is set). When content
 * metadata extractors are registered, plain text is captured as well, the metacard type is
 * extended with the extractors' attributes, and each extractor processes the plain text.
 *
 * @param input the content to transform; must not be {@code null}
 * @param id the id handed to {@code MetacardCreator.createMetacard}
 * @return the populated metacard
 * @throws IOException if re-opening or reading the buffered content fails
 * @throws CatalogTransformerException if {@code input} is {@code null} or the input cannot
 *         be copied into the buffer
 */
@Override
public Metacard transform(InputStream input, String id) throws IOException, CatalogTransformerException {
    LOGGER.debug("Transforming input stream using Tika.");
    if (input == null) {
        throw new CatalogTransformerException("Cannot transform null input.");
    }

    try (TemporaryFileBackedOutputStream bufferedContent = new TemporaryFileBackedOutputStream()) {
        // Buffer the whole input up front; every later read opens a fresh stream on it.
        try {
            IOUtils.copy(input, bufferedContent);
        } catch (IOException copyFailure) {
            throw new CatalogTransformerException("Could not copy bytes of content message.", copyFailure);
        }

        Parser tikaParser = new AutoDetectParser();
        ToXMLContentHandler xmlSink = new ToXMLContentHandler();

        // Plain text is only captured when there are extractors that will consume it;
        // a null textSink below means "no extractors".
        ToTextContentHandler textSink;
        ContentHandler parseTarget;
        if (contentMetadataExtractors.isEmpty()) {
            textSink = null;
            parseTarget = xmlSink;
        } else {
            textSink = new ToTextContentHandler();
            parseTarget = new TeeContentHandler(xmlSink, textSink);
        }

        TikaMetadataExtractor extractor = new TikaMetadataExtractor(tikaParser, parseTarget);
        Metadata metadata;
        try (InputStream contentForParsing = bufferedContent.asByteSource().openStream()) {
            metadata = extractor.parseMetadata(contentForParsing, new ParseContext());
        }

        String rawXml = xmlSink.toString();
        String metadataText = (templates == null) ? rawXml : transformToXml(rawXml);

        String contentType = metadata.get(Metadata.CONTENT_TYPE);
        MetacardType baseType = getMetacardTypeFromMimeType(contentType);
        if (baseType == null) {
            // No specific type for this content type: fall back to the common Tika type.
            baseType = commonTikaMetacardType;
        }

        Metacard result;
        if (textSink == null) {
            result = MetacardCreator.createMetacard(metadata, id, metadataText, baseType);
        } else {
            String plainText = textSink.toString();
            Set<AttributeDescriptor> extractorAttributes = contentMetadataExtractors.values().stream()
                    .map(ContentMetadataExtractor::getMetacardAttributes)
                    .flatMap(Collection::stream)
                    .collect(Collectors.toSet());
            MetacardTypeImpl extendedType =
                    new MetacardTypeImpl(baseType.getName(), baseType, extractorAttributes);
            result = MetacardCreator.createMetacard(metadata, id, metadataText, extendedType);
            for (ContentMetadataExtractor registeredExtractor : contentMetadataExtractors.values()) {
                registeredExtractor.process(plainText, result);
            }
        }

        if (StringUtils.isNotBlank(contentType)) {
            result.setAttribute(new AttributeImpl(Core.DATATYPE, getDatatype(contentType)));
        }

        if (StringUtils.startsWith(contentType, "image")) {
            try (InputStream contentForThumbnail = bufferedContent.asByteSource().openStream()) {
                createThumbnail(contentForThumbnail, result);
            }
        }

        LOGGER.debug("Finished transforming input stream using Tika.");
        return result;
    }
}
Also used : ToXMLContentHandler(org.apache.tika.sax.ToXMLContentHandler) TemporaryFileBackedOutputStream(org.codice.ddf.platform.util.TemporaryFileBackedOutputStream) CloseShieldInputStream(org.apache.tika.io.CloseShieldInputStream) InputStream(java.io.InputStream) AttributeImpl(ddf.catalog.data.impl.AttributeImpl) Metadata(org.apache.tika.metadata.Metadata) AttributeDescriptor(ddf.catalog.data.AttributeDescriptor) CatalogTransformerException(ddf.catalog.transform.CatalogTransformerException) MetacardTypeImpl(ddf.catalog.data.impl.MetacardTypeImpl) IOException(java.io.IOException) ToXMLContentHandler(org.apache.tika.sax.ToXMLContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) ToTextContentHandler(org.apache.tika.sax.ToTextContentHandler) ContentHandler(org.xml.sax.ContentHandler) MetacardType(ddf.catalog.data.MetacardType) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ToTextContentHandler(org.apache.tika.sax.ToTextContentHandler) TikaMetadataExtractor(ddf.catalog.transformer.common.tika.TikaMetadataExtractor) Metacard(ddf.catalog.data.Metacard) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ContentMetadataExtractor(ddf.catalog.content.operation.ContentMetadataExtractor) TeeContentHandler(org.apache.tika.sax.TeeContentHandler)

Example 50 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class ParserPostProcessor method parse.

/**
 * Delegates parsing to the superclass while also capturing the body text, then records
 * metadata derived from that text: the complete text under {@code "fulltext"}, at most its
 * first 500 characters under {@code "summary"}, and one {@code "outlinks"} value for each
 * link returned by {@code RegexUtils.extractLinks}.
 *
 * @param stream the document stream, passed through to the superclass parser
 * @param handler the caller's handler; it still receives every event via a tee
 * @param metadata the metadata object that receives the derived entries
 * @param context the parse context, passed through to the superclass parser
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Tee the events so the caller's handler is unaffected while the body text is collected.
    ContentHandler bodyCapture = new BodyContentHandler();
    super.parse(stream, new TeeContentHandler(handler, bodyCapture), metadata, context);

    String text = bodyCapture.toString();
    metadata.set("fulltext", text);

    // The summary is the leading 500 characters, or the whole text when it is shorter.
    String summary = text.length() > 500 ? text.substring(0, 500) : text;
    metadata.set("summary", summary);

    for (String outlink : RegexUtils.extractLinks(text)) {
        metadata.add("outlinks", outlink);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) ContentHandler(org.xml.sax.ContentHandler)

Aggregations

ContentHandler (org.xml.sax.ContentHandler) 354, Metadata (org.apache.tika.metadata.Metadata) 229, BodyContentHandler (org.apache.tika.sax.BodyContentHandler) 229, InputStream (java.io.InputStream) 210, Test (org.junit.Test) 208, ParseContext (org.apache.tika.parser.ParseContext) 164, Parser (org.apache.tika.parser.Parser) 106, TikaTest (org.apache.tika.TikaTest) 103, AutoDetectParser (org.apache.tika.parser.AutoDetectParser) 102, TikaInputStream (org.apache.tika.io.TikaInputStream) 75, ByteArrayInputStream (java.io.ByteArrayInputStream) 64, SAXException (org.xml.sax.SAXException) 40, IOException (java.io.IOException) 34, TeeContentHandler (org.apache.tika.sax.TeeContentHandler) 28, TikaException (org.apache.tika.exception.TikaException) 24, ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest) 24, WordParserTest (org.apache.tika.parser.microsoft.WordParserTest) 24, XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler) 21, AttributesImpl (org.xml.sax.helpers.AttributesImpl) 21, InputSource (org.xml.sax.InputSource) 20