use of org.xml.sax.ContentHandler in project spring-framework by spring-projects.
the class AbstractStaxXMLReaderTestCase method contentHandlerNoNamespacesPrefixes.
/**
 * Parses the test document once with {@code standardReader} and once with the
 * StAX-backed reader, both configured with the SAX {@code namespaces} feature
 * disabled and the {@code namespace-prefixes} feature enabled, and then checks
 * that {@code standardContentHandler} and the mock handler saw identical
 * invocations.
 *
 * <p>NOTE(review): {@code standardContentHandler} is presumably registered on
 * {@code standardReader} by the fixture setup, which is not visible here.
 */
@Test
public void contentHandlerNoNamespacesPrefixes() throws Exception {
    final String namespacesFeature = "http://xml.org/sax/features/namespaces";
    final String namespacePrefixesFeature = "http://xml.org/sax/features/namespace-prefixes";

    // Reference run through the standard reader.
    standardReader.setFeature(namespacesFeature, false);
    standardReader.setFeature(namespacePrefixesFeature, true);
    standardReader.parse(new InputSource(createTestInputStream()));

    // Same feature settings through the StAX-backed reader, recording into a mock.
    ContentHandler recordingHandler = mockContentHandler();
    AbstractStaxXMLReader staxReader = createStaxXmlReader(createTestInputStream());
    staxReader.setFeature(namespacesFeature, false);
    staxReader.setFeature(namespacePrefixesFeature, true);
    staxReader.setContentHandler(recordingHandler);
    // The StAX reader was built from its own input stream, so an empty InputSource is passed.
    staxReader.parse(new InputSource());

    verifyIdenticalInvocations(standardContentHandler, recordingHandler);
}
use of org.xml.sax.ContentHandler in project spring-framework by spring-projects.
the class AbstractStaxXMLReaderTestCase method mockContentHandler.
/**
 * Creates a Mockito mock of {@link ContentHandler} with custom answers stubbed
 * for {@code characters}, {@code ignorableWhitespace} and {@code startElement}.
 *
 * <p>The two character callbacks each get their own {@code CopyCharsAnswer}
 * instance. The {@code startElement} answer replaces the {@link Attributes}
 * argument in the invocation's argument array with an {@link AttributesImpl}
 * copy of it.
 *
 * <p>NOTE(review): the copies are presumably taken so that recorded invocations
 * are not affected when the reader reuses its buffers/attribute objects - confirm.
 *
 * @return the stubbed mock handler
 * @throws Exception declared because the stubbed handler methods declare checked exceptions
 */
protected final ContentHandler mockContentHandler() throws Exception {
    ContentHandler handlerMock = mock(ContentHandler.class);

    willAnswer(new CopyCharsAnswer())
            .given(handlerMock)
            .characters(any(char[].class), anyInt(), anyInt());
    willAnswer(new CopyCharsAnswer())
            .given(handlerMock)
            .ignorableWhitespace(any(char[].class), anyInt(), anyInt());

    Answer<Object> copyAttributesAnswer = new Answer<Object>() {
        @Override
        public Object answer(InvocationOnMock invocation) throws Throwable {
            // Index 3 is the Attributes parameter of startElement(uri, localName, qName, atts).
            Object[] arguments = invocation.getArguments();
            arguments[3] = new AttributesImpl((Attributes) arguments[3]);
            return null;
        }
    };
    willAnswer(copyAttributesAnswer)
            .given(handlerMock)
            .startElement(anyString(), anyString(), anyString(), any(Attributes.class));

    return handlerMock;
}
use of org.xml.sax.ContentHandler in project spring-framework by spring-projects.
the class AbstractStaxXMLReaderTestCase method contentHandlerNamespacesPrefixes.
/**
 * Parses the test document once with {@code standardReader} and once with the
 * StAX-backed reader, both configured with the SAX {@code namespaces} and
 * {@code namespace-prefixes} features enabled, and then checks that
 * {@code standardContentHandler} and the mock handler saw identical invocations.
 *
 * <p>NOTE(review): {@code standardContentHandler} is presumably registered on
 * {@code standardReader} by the fixture setup, which is not visible here.
 */
@Test
public void contentHandlerNamespacesPrefixes() throws Exception {
    final String namespacesFeature = "http://xml.org/sax/features/namespaces";
    final String namespacePrefixesFeature = "http://xml.org/sax/features/namespace-prefixes";

    // Reference run through the standard reader.
    standardReader.setFeature(namespacesFeature, true);
    standardReader.setFeature(namespacePrefixesFeature, true);
    standardReader.parse(new InputSource(createTestInputStream()));

    // Same feature settings through the StAX-backed reader, recording into a mock.
    ContentHandler recordingHandler = mockContentHandler();
    AbstractStaxXMLReader staxReader = createStaxXmlReader(createTestInputStream());
    staxReader.setFeature(namespacesFeature, true);
    staxReader.setFeature(namespacePrefixesFeature, true);
    staxReader.setContentHandler(recordingHandler);
    // The StAX reader was built from its own input stream, so an empty InputSource is passed.
    staxReader.parse(new InputSource());

    verifyIdenticalInvocations(standardContentHandler, recordingHandler);
}
use of org.xml.sax.ContentHandler in project ddf by codice.
the class TikaInputTransformer method transform.
/**
 * Builds a {@link Metacard} from the given input stream by parsing it with Tika.
 *
 * <p>The input is first copied into a temporary file-backed buffer so it can be
 * read more than once. The buffered content is parsed into XHTML (and also into
 * plain text when any content metadata extractors are registered). The XHTML is
 * passed through {@code transformToXml} when {@code templates} is set. The
 * metacard type is looked up from the detected content type, falling back to
 * {@code commonTikaMetacardType}. When plain text was collected, the metacard
 * type is extended with the extractors' attributes and each extractor processes
 * the text. Finally the datatype attribute is set for a non-blank content type,
 * and a thumbnail is created for content types starting with {@code "image"}.
 *
 * @param input the content to transform; must not be {@code null}
 * @param id the id passed on to {@code MetacardCreator.createMetacard}
 * @return the populated metacard
 * @throws IOException declared for failures while opening or closing the buffered streams
 * @throws CatalogTransformerException if {@code input} is {@code null} or its bytes cannot be
 *     copied into the buffer
 */
@Override
public Metacard transform(InputStream input, String id) throws IOException, CatalogTransformerException {
    LOGGER.debug("Transforming input stream using Tika.");

    if (input == null) {
        throw new CatalogTransformerException("Cannot transform null input.");
    }

    try (TemporaryFileBackedOutputStream bufferedContent = new TemporaryFileBackedOutputStream()) {
        // Buffer the input so it can be re-read: once for parsing, once more for the thumbnail.
        try {
            IOUtils.copy(input, bufferedContent);
        } catch (IOException copyFailure) {
            throw new CatalogTransformerException("Could not copy bytes of content message.", copyFailure);
        }

        Parser tikaParser = new AutoDetectParser();
        ToXMLContentHandler xmlHandler = new ToXMLContentHandler();

        // Plain text is only collected when at least one content metadata extractor needs it.
        ToTextContentHandler textHandler = null;
        ContentHandler parseHandler;
        if (contentMetadataExtractors.isEmpty()) {
            parseHandler = xmlHandler;
        } else {
            textHandler = new ToTextContentHandler();
            parseHandler = new TeeContentHandler(xmlHandler, textHandler);
        }

        TikaMetadataExtractor metadataExtractor = new TikaMetadataExtractor(tikaParser, parseHandler);
        Metadata tikaMetadata;
        try (InputStream parseStream = bufferedContent.asByteSource().openStream()) {
            tikaMetadata = metadataExtractor.parseMetadata(parseStream, new ParseContext());
        }

        String metadataXml = xmlHandler.toString();
        if (templates != null) {
            metadataXml = transformToXml(metadataXml);
        }

        String detectedContentType = tikaMetadata.get(Metadata.CONTENT_TYPE);
        MetacardType baseType = getMetacardTypeFromMimeType(detectedContentType);
        if (baseType == null) {
            baseType = commonTikaMetacardType;
        }

        Metacard result;
        if (textHandler == null) {
            result = MetacardCreator.createMetacard(tikaMetadata, id, metadataXml, baseType);
        } else {
            String bodyText = textHandler.toString();

            // Extend the base type with every attribute the extractors may populate.
            Set<AttributeDescriptor> extractorAttributes =
                    contentMetadataExtractors.values().stream()
                            .map(ContentMetadataExtractor::getMetacardAttributes)
                            .flatMap(Collection::stream)
                            .collect(Collectors.toSet());
            MetacardTypeImpl extendedType =
                    new MetacardTypeImpl(baseType.getName(), baseType, extractorAttributes);

            result = MetacardCreator.createMetacard(tikaMetadata, id, metadataXml, extendedType);
            for (ContentMetadataExtractor extractor : contentMetadataExtractors.values()) {
                extractor.process(bodyText, result);
            }
        }

        if (StringUtils.isNotBlank(detectedContentType)) {
            result.setAttribute(new AttributeImpl(Core.DATATYPE, getDatatype(detectedContentType)));
        }

        if (StringUtils.startsWith(detectedContentType, "image")) {
            try (InputStream thumbnailStream = bufferedContent.asByteSource().openStream()) {
                createThumbnail(thumbnailStream, result);
            }
        }

        LOGGER.debug("Finished transforming input stream using Tika.");
        return result;
    }
}
use of org.xml.sax.ContentHandler in project tika by apache.
the class ParserPostProcessor method parse.
/**
 * Forwards the call to the delegated parser and post-processes the
 * results as described above.
 *
 * <p>The SAX events are sent both to the given handler and to an internal
 * {@link BodyContentHandler}. After the delegated parse returns, the collected
 * body text is stored in the metadata under {@code "fulltext"}, its first 500
 * characters (at most) under {@code "summary"}, and every link returned by
 * {@code RegexUtils.extractLinks} is added under {@code "outlinks"}.
 *
 * <p>The summary is never cut in the middle of a surrogate pair: if the
 * 500-character boundary would separate a high surrogate from its low
 * surrogate, the summary ends one character earlier.
 *
 * @param stream the document stream handed to the delegated parser
 * @param handler the caller's handler; receives the same events as the internal body handler
 * @param metadata the metadata passed to the delegated parser and enriched afterwards
 * @param context the parse context handed to the delegated parser
 * @throws IOException propagated from the delegated parse
 * @throws SAXException propagated from the delegated parse or the handlers
 * @throws TikaException propagated from the delegated parse
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // NOTE(review): Tika documents the no-arg BodyContentHandler as bounded (100k characters)
    // and as throwing a SAXException past that limit; because it is part of the tee, that would
    // also abort the caller's handler - confirm this is the intended behavior.
    ContentHandler body = new BodyContentHandler();
    ContentHandler tee = new TeeContentHandler(handler, body);
    super.parse(stream, tee, metadata, context);

    String content = body.toString();
    metadata.set("fulltext", content);

    int length = Math.min(content.length(), 500);
    // String indices count UTF-16 code units; back off by one rather than split a
    // supplementary character and leave an unpaired high surrogate in the summary.
    if (length > 0 && length < content.length()
            && Character.isSurrogatePair(content.charAt(length - 1), content.charAt(length))) {
        length--;
    }
    metadata.set("summary", content.substring(0, length));

    for (String link : RegexUtils.extractLinks(content)) {
        metadata.add("outlinks", link);
    }
}
Aggregations