use of org.apache.tika.sax.ToXMLContentHandler in project tika by apache.
the class TikaTest method getXML.
/**
 * Parses {@code input} with {@code parser} and returns the XML serialization
 * collected by a {@link ToXMLContentHandler} together with the metadata object
 * that was handed to the parser.
 *
 * <p>The stream is always closed before this method returns, whether parsing
 * succeeds or fails.
 *
 * @param input    stream to parse; closed by this method
 * @param parser   parser used to process the stream
 * @param metadata metadata object passed to the parser and wrapped in the result
 * @param context  parse context to use; a fresh {@link ParseContext} is created
 *                 when this is {@code null}
 * @return the handler's string output paired with {@code metadata}
 * @throws Exception if parsing or closing the stream fails
 */
protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata, ParseContext context) throws Exception {
    ParseContext effectiveContext = (context == null) ? new ParseContext() : context;
    // try-with-resources instead of finally { input.close(); }: if both the parse
    // and the close fail, the parse failure stays the primary exception (the close
    // failure is attached as suppressed) rather than being masked. A null stream
    // is also not dereferenced during cleanup.
    try (InputStream stream = input) {
        ContentHandler handler = new ToXMLContentHandler();
        parser.parse(stream, handler, metadata, effectiveContext);
        return new XMLResult(handler.toString(), metadata);
    }
}
use of org.apache.tika.sax.ToXMLContentHandler in project tika by apache.
the class ContentHandlerExample method parseBodyToHTML.
/**
 * Example: parse the "test.doc" resource and return only its body,
 * serialized as HTML in a string; the head part is left out.
 *
 * @return the string form of the body-only handler after parsing
 */
public String parseBodyToHTML() throws IOException, SAXException, TikaException {
    Metadata docMetadata = new Metadata();
    AutoDetectParser autoParser = new AutoDetectParser();

    // The XML-writing handler is wrapped so that only body events reach it.
    ToXMLContentHandler xmlSink = new ToXMLContentHandler();
    ContentHandler bodyOnly = new BodyContentHandler(xmlSink);

    try (InputStream docStream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {
        autoParser.parse(docStream, bodyOnly, docMetadata);
        return bodyOnly.toString();
    }
}
use of org.apache.tika.sax.ToXMLContentHandler in project tika by apache.
the class EnviHeaderParserTest method testParseGlobalMetadata.
/**
 * Parses the ENVI test header with {@link EnviHeaderParser} and checks that the
 * XML output contains the expected header lines and content type.
 */
@Test
public void testParseGlobalMetadata() throws Exception {
    // The test is a no-op when the reported Java version starts with "1.5".
    String javaVersion = System.getProperty("java.version");
    if (javaVersion.startsWith("1.5")) {
        return;
    }

    Metadata enviMetadata = new Metadata();
    ToXMLContentHandler xmlHandler = new ToXMLContentHandler();
    Parser enviParser = new EnviHeaderParser();

    try (InputStream headerStream = EnviHeaderParser.class.getResourceAsStream("/test-documents/envi_test_header.hdr")) {
        assertNotNull("Test ENVI file not found", headerStream);
        enviParser.parse(headerStream, xmlHandler, enviMetadata, new ParseContext());
    }

    // Fragments the serialized output must contain, checked in this order.
    String[] expectedFragments = {
        "<body><p>ENVI</p>",
        "<p>samples = 2400</p>",
        "<p>lines = 2400</p>",
        "<p>map info = {Sinusoidal, 1.5000, 1.5000, -10007091.3643, 5559289.2856, 4.6331271653e+02, 4.6331271653e+02, , units=Meters}</p>",
        "content=\"application/envi.hdr\"",
        "projection info = {16, 6371007.2, 0.000000, 0.0, 0.0, Sinusoidal, units=Meters}"
    };
    String xml = xmlHandler.toString();
    for (String fragment : expectedFragments) {
        assertContains(fragment, xml);
    }
}
use of org.apache.tika.sax.ToXMLContentHandler in project tika by apache.
the class SQLite3ParserTest method testNotAddingEmbeddedParserToParseContext.
// Checks the output when the user does not want embedded documents handled:
// an EmptyParser is registered in the ParseContext under Parser.class.
@Test
public void testNotAddingEmbeddedParserToParseContext() throws Exception {
    ParseContext contextWithoutEmbedded = new ParseContext();
    contextWithoutEmbedded.set(Parser.class, new EmptyParser());

    Metadata dbMetadata = new Metadata();
    ContentHandler xmlHandler = new ToXMLContentHandler();
    Parser autoParser = new AutoDetectParser();

    try (InputStream dbStream = getResourceAsStream(TEST_FILE1)) {
        dbMetadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
        autoParser.parse(dbStream, xmlHandler, dbMetadata, contextWithoutEmbedded);
    }
    String output = xmlHandler.toString();

    // Only the headers for the embedded documents are expected...
    String[] mustBePresent = {
        "<table name=\"my_table1\"><thead><tr>",
        "<td><span type=\"blob\" column_name=\"BYTES_COL\" row_number=\"0\"><div class=\"package-entry\"><h1>BYTES_COL_0.doc</h1>"
    };
    for (String fragment : mustBePresent) {
        assertContains(fragment, output);
    }

    // ...and none of their content; the last entry is the second embedded
    // document's image tag.
    String[] mustBeAbsent = {
        "dog",
        "alt=\"image1.png\"",
        "alt=\"A description...\""
    };
    for (String fragment : mustBeAbsent) {
        assertNotContained(fragment, output);
    }
}
use of org.apache.tika.sax.ToXMLContentHandler in project ddf by codice.
the class TikaInputTransformer method transform.
/**
 * Builds a {@link Metacard} from the given stream using Tika.
 *
 * <p>The input is first copied into a temporary file-backed buffer so it can be
 * read more than once: once for parsing and, for content types starting with
 * "image", once more for thumbnail creation.
 *
 * @param input content to transform; must not be {@code null}
 * @param id    identifier passed through to metacard creation
 * @return the populated metacard
 * @throws IOException                 if reading the buffered content fails
 * @throws CatalogTransformerException if {@code input} is {@code null} or its
 *                                     bytes cannot be copied into the buffer
 */
@Override
public Metacard transform(InputStream input, String id) throws IOException, CatalogTransformerException {
    LOGGER.debug("Transforming input stream using Tika.");
    if (input == null) {
        throw new CatalogTransformerException("Cannot transform null input.");
    }

    try (TemporaryFileBackedOutputStream bufferedContent = new TemporaryFileBackedOutputStream()) {
        try {
            IOUtils.copy(input, bufferedContent);
        } catch (IOException copyFailure) {
            throw new CatalogTransformerException("Could not copy bytes of content message.", copyFailure);
        }

        Parser tikaParser = new AutoDetectParser();
        ToXMLContentHandler xmlHandler = new ToXMLContentHandler();

        // Plain text is collected only when content metadata extractors are
        // registered; in that case parse events are fanned out to both handlers.
        ToTextContentHandler plainTextHandler = null;
        ContentHandler parseTarget = xmlHandler;
        if (!contentMetadataExtractors.isEmpty()) {
            plainTextHandler = new ToTextContentHandler();
            parseTarget = new TeeContentHandler(xmlHandler, plainTextHandler);
        }

        TikaMetadataExtractor extractor = new TikaMetadataExtractor(tikaParser, parseTarget);
        Metadata tikaMetadata;
        try (InputStream contentStream = bufferedContent.asByteSource().openStream()) {
            tikaMetadata = extractor.parseMetadata(contentStream, new ParseContext());
        }

        // The handler's XML output is post-processed only when templates are set.
        String rawXml = xmlHandler.toString();
        String metadataXml = (templates == null) ? rawXml : transformToXml(rawXml);

        String mimeType = tikaMetadata.get(Metadata.CONTENT_TYPE);
        MetacardType resolvedType = getMetacardTypeFromMimeType(mimeType);
        if (resolvedType == null) {
            // No specific type for this MIME type: fall back to the common one.
            resolvedType = commonTikaMetacardType;
        }

        Metacard result;
        if (plainTextHandler == null) {
            result = MetacardCreator.createMetacard(tikaMetadata, id, metadataXml, resolvedType);
        } else {
            String extractedText = plainTextHandler.toString();

            // Extend the resolved type with every attribute the extractors declare.
            Set<AttributeDescriptor> extractorAttributes =
                contentMetadataExtractors.values().stream()
                    .map(ContentMetadataExtractor::getMetacardAttributes)
                    .flatMap(Collection::stream)
                    .collect(Collectors.toSet());
            MetacardTypeImpl extendedType =
                new MetacardTypeImpl(resolvedType.getName(), resolvedType, extractorAttributes);

            result = MetacardCreator.createMetacard(tikaMetadata, id, metadataXml, extendedType);
            for (ContentMetadataExtractor textExtractor : contentMetadataExtractors.values()) {
                textExtractor.process(extractedText, result);
            }
        }

        if (StringUtils.isNotBlank(mimeType)) {
            result.setAttribute(new AttributeImpl(Core.DATATYPE, getDatatype(mimeType)));
        }

        if (StringUtils.startsWith(mimeType, "image")) {
            // Re-open the buffered content; the first stream was consumed by parsing.
            try (InputStream imageStream = bufferedContent.asByteSource().openStream()) {
                createThumbnail(imageStream, result);
            }
        }

        LOGGER.debug("Finished transforming input stream using Tika.");
        return result;
    }
}
Aggregations