Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class TestParsers, method testEXCELExtraction.
/**
 * Parses the Excel test document twice: once through {@code tika.parseToString}
 * to check the extracted text, and once through the parser returned by
 * {@code tika.getParser()} to check the title metadata.
 */
@Test
public void testEXCELExtraction() throws Exception {
    File workbook = getResourceAsFile("/test-documents/testEXCEL.xls");

    // Part 1: the plain-text extraction must contain the expected phrase.
    final String expected = "Numbers and their Squares";
    String extractedText = tika.parseToString(workbook);
    assertTrue("Text does not contain '" + expected + "'", extractedText.contains(expected));

    // Part 2: a direct parse populates the Metadata object; the SAX events
    // are discarded by the no-op DefaultHandler.
    Parser excelParser = tika.getParser();
    Metadata collected = new Metadata();
    try (InputStream in = new FileInputStream(workbook)) {
        excelParser.parse(in, new DefaultHandler(), collected, new ParseContext());
    }
    String title = collected.get(TikaCoreProperties.TITLE);
    assertEquals("Simple Excel document", title);
}
Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class TestParsers, method testWORDxtraction.
/**
 * Parses the Word test document with the parser returned by
 * {@code tika.getParser()} and checks the title stored in the metadata.
 */
@Test
public void testWORDxtraction() throws Exception {
    File wordDoc = getResourceAsFile("/test-documents/testWORD.doc");
    Parser wordParser = tika.getParser();
    Metadata collected = new Metadata();

    // Only the metadata matters here, so the content events go to a no-op handler.
    try (InputStream in = new FileInputStream(wordDoc)) {
        wordParser.parse(in, new DefaultHandler(), collected, new ParseContext());
    }

    String title = collected.get(TikaCoreProperties.TITLE);
    assertEquals("Sample Word Document", title);
}
Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class EmbeddedDocumentUtilTest, method testAutomaticAdditionOfAutoDetectParserIfForgotten.
/**
 * Regression test for TIKA-2096.
 *
 * <p>With an empty {@link ParseContext} the XML output for the test document
 * must contain the needle text; once an {@code EmptyParser} is registered
 * under {@code Parser.class}, the needle must no longer appear.
 * NOTE(review): judging by the needle and the file name, the text presumably
 * lives in an embedded document - confirm against the test resource.
 */
@Test
public void testAutomaticAdditionOfAutoDetectParserIfForgotten() throws Exception {
    final String needle = "When in the Course";

    // No Parser registered in the context: the needle is expected in the output.
    ParseContext emptyContext = new ParseContext();
    TikaTest.XMLResult result = getXML("test_recursive_embedded.doc", emptyContext);
    assertContains(needle, result.xml);

    // An EmptyParser registered explicitly: the needle must be absent.
    ParseContext emptyParserContext = new ParseContext();
    emptyParserContext.set(Parser.class, new EmptyParser());
    result = getXML("test_recursive_embedded.doc", emptyParserContext);
    assertNotContained(needle, result.xml);
}
Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class MyFirstTika, method parseUsingAutoDetect.
/**
 * Parses the given file with an {@link AutoDetectParser} built from the
 * supplied configuration and returns the text collected by a
 * {@link BodyContentHandler}.
 *
 * @param filename   path of the file to parse
 * @param tikaConfig configuration used to construct the parser
 * @param metadata   metadata object handed to both the input stream factory
 *                   and the parser
 * @return the string form of the content handler after parsing
 * @throws Exception any failure raised while opening or parsing the file
 */
public static String parseUsingAutoDetect(String filename, TikaConfig tikaConfig, Metadata metadata) throws Exception {
    System.out.println("Handling using AutoDetectParser: [" + filename + "]");
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    ContentHandler handler = new BodyContentHandler();
    // try-with-resources: the stream was previously never closed, leaking the
    // underlying file handle (also when parse() throws).
    try (TikaInputStream stream = TikaInputStream.get(new File(filename), metadata)) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }
    return handler.toString();
}
Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class ParsingExample, method parseEmbeddedExample.
/**
 * Example: extracting the text of a container document together with the
 * text of every document embedded in it. The important step is registering
 * a {@link Parser} in the {@link ParseContext} that is passed to
 * {@code parse}.
 *
 * @return content, including from embedded documents
 * @throws IOException   declared by the underlying stream and parse calls
 * @throws SAXException  declared by the underlying parse call
 * @throws TikaException declared by the underlying parse call
 */
public String parseEmbeddedExample() throws IOException, SAXException, TikaException {
    AutoDetectParser autoParser = new AutoDetectParser();

    // Register the parser itself in the context that is handed to parse().
    ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, autoParser);

    Metadata docMetadata = new Metadata();
    BodyContentHandler textCollector = new BodyContentHandler();
    try (InputStream in = ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) {
        autoParser.parse(in, textCollector, docMetadata, parseContext);
        String content = textCollector.toString();
        return content;
    }
}
Aggregations