use of org.apache.tika.parser.ParseContext in project tika by apache.
the class BundleIT method testForkParser.
/**
 * Checks that a {@code ForkParser} created from the {@code Activator} class loader and
 * the default parser extracts the body text of a small HTML document.
 *
 * @throws Exception if detection or parsing fails
 */
@Test
public void testForkParser() throws Exception {
    ForkParser parser = new ForkParser(Activator.class.getClassLoader(), defaultParser);
    try {
        String data = "<!DOCTYPE html>\n<html><body><p>test <span>content</span></p></body></html>";
        InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8));
        Writer writer = new StringWriter();
        ContentHandler contentHandler = new BodyContentHandler(writer);
        Metadata metadata = new Metadata();
        MediaType type = contentTypeDetector.detect(stream, metadata);
        // Expected value goes first so a failure message reports the values the right way round.
        assertEquals("text/html", type.toString());
        metadata.add(Metadata.CONTENT_TYPE, type.toString());
        ParseContext parseCtx = new ParseContext();
        parser.parse(stream, contentHandler, metadata, parseCtx);
        writer.flush();
        String content = writer.toString();
        assertTrue(content.length() > 0);
        assertEquals("test content", content.trim());
    } finally {
        // Release the fork parser's resources even when an assertion or the parse fails.
        parser.close();
    }
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class BundleIT method testTikaBundle.
/**
 * Parses {@code src/test/resources/test-documents.zip} with the parser obtained from a
 * default {@code Tika} instance and checks that the extracted text contains each expected
 * entry name and content fragment.
 *
 * @throws Exception if the archive cannot be read or parsed
 */
@Test
public void testTikaBundle() throws Exception {
    // Entry names and text fragments, checked in this order.
    String[] expectedFragments = {
        "testEXCEL.xls",
        "Sample Excel Worksheet",
        "testHTML.html",
        "Test Indexation Html",
        "testOpenOffice2.odt",
        "This is a sample Open Office document",
        "testPDF.pdf",
        "Apache Tika",
        "testPPT.ppt",
        "Sample Powerpoint Slide",
        "testRTF.rtf",
        "indexation Word",
        "testTXT.txt",
        "Test d'indexation de Txt",
        "testWORD.doc",
        "This is a sample Microsoft Word Document",
        "testXML.xml",
        "Rida Benjelloun"
    };

    // Package extraction
    Parser parser = new Tika().getParser();
    ContentHandler handler = new BodyContentHandler();
    ParseContext context = new ParseContext();
    // The parser is registered in the context under Parser.class
    // (presumably so entries inside the archive are parsed too -- confirm).
    context.set(Parser.class, parser);
    try (InputStream archive = new FileInputStream("src/test/resources/test-documents.zip")) {
        parser.parse(archive, handler, new Metadata(), context);
    }

    String extracted = handler.toString();
    for (String fragment : expectedFragments) {
        assertTrue(extracted.contains(fragment));
    }
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class TesseractOCRParser method extractHOCROutput.
/**
 * SAX-parses the given stream and passes its events through an
 * {@code HOCRPassThroughHandler} into {@code xhtml}, wrapped in a
 * {@code <div class="ocr">} element.
 *
 * @param is stream to parse (presumably Tesseract hOCR output, going by the method name)
 * @param parseContext context that supplies the SAX parser; a fresh {@code ParseContext}
 *        is used when this is {@code null}
 * @param xhtml handler that receives the enclosing div and the passed-through content
 * @throws TikaException declared by this method
 * @throws IOException declared by this method
 * @throws SAXException declared by this method
 */
private void extractHOCROutput(InputStream is, ParseContext parseContext, XHTMLContentHandler xhtml) throws TikaException, IOException, SAXException {
    ParseContext effectiveContext = (parseContext != null) ? parseContext : new ParseContext();
    SAXParser saxParser = effectiveContext.getSAXParser();

    xhtml.startElement("div", "class", "ocr");
    saxParser.parse(is, new OfflineContentHandler(new HOCRPassThroughHandler(xhtml)));
    xhtml.endElement("div");
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class TikaConfigSerializer method serialize.
/**
 * Writes the given configuration to {@code writer} as an indented XML document whose
 * root element is {@code properties}.
 * <p>
 * The child sections are appended by the helper methods in this order: mime comment,
 * service loader, executor service, encoding detectors, translator, detectors, parsers.
 *
 * @param config config to serialize
 * @param mode serialization mode, passed to each section helper
 * @param writer destination for the XML output
 * @param charset charset whose name is set as the transformer's output encoding
 * @throws Exception if the document cannot be built or transformed
 */
public static void serialize(TikaConfig config, Mode mode, Writer writer, Charset charset) throws Exception {
    // Build the DOM: a single "properties" root holding every section.
    Document document = new ParseContext().getDocumentBuilder().newDocument();
    Element properties = document.createElement("properties");
    document.appendChild(properties);

    addMimeComment(mode, properties, document);
    addServiceLoader(mode, properties, document, config);
    addExecutorService(mode, properties, document, config);
    addEncodingDetectors(mode, properties, document, config);
    addTranslator(mode, properties, document, config);
    addDetectors(mode, properties, document, config);
    addParsers(mode, properties, document, config);
    // TODO Service Loader section

    // Serialize the DOM with two-space indentation in the requested encoding.
    Transformer serializer = TransformerFactory.newInstance().newTransformer();
    serializer.setOutputProperty(OutputKeys.INDENT, "yes");
    serializer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2");
    serializer.setOutputProperty(OutputKeys.ENCODING, charset.name());
    serializer.transform(new DOMSource(document), new StreamResult(writer));
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class ImageMetadataExtractor method parseRawXMP.
/**
 * Parses the given raw XMP bytes as an XML document and, if that succeeds, runs the
 * {@code JempboxExtractor} Dublin Core and XMPMM extraction against {@code metadata}.
 * <p>
 * An {@code IOException} or {@code SAXException} raised while parsing is swallowed,
 * in which case nothing is extracted.
 *
 * @param xmpData raw XMP packet bytes
 * @throws IOException declared by this method; not thrown by the parse step
 * @throws SAXException declared by this method; not thrown by the parse step
 * @throws TikaException declared by this method (presumably when the document builder
 *         cannot be obtained -- confirm)
 */
public void parseRawXMP(byte[] xmpData) throws IOException, SAXException, TikaException {
    XMPMetadata parsedXmp = null;
    try (InputStream xmpStream = new ByteArrayInputStream(xmpData)) {
        Document xmpDocument = new ParseContext().getDocumentBuilder().parse(xmpStream);
        if (xmpDocument != null) {
            parsedXmp = new XMPMetadata(xmpDocument);
        }
    } catch (IOException | SAXException ignored) {
        // Unparseable XMP is skipped here; parsedXmp stays null and nothing is extracted.
    }

    if (parsedXmp == null) {
        return;
    }
    JempboxExtractor.extractDublinCore(parsedXmp, metadata);
    JempboxExtractor.extractXMPMM(parsedXmp, metadata);
}
Aggregations