use of org.apache.tika.parser.EmptyParser in project tika by apache.
the class EmbeddedDocumentUtilTest method testAutomaticAdditionOfAutoDetectParserIfForgotten.
/**
 * Parses the same recursive-embedded test document twice and compares the output:
 * once with an untouched {@link ParseContext}, where the needle text must appear,
 * and once with an {@link EmptyParser} registered under {@code Parser.class},
 * where the needle text must be absent.
 *
 * NOTE(review): judging by the test name, the first case presumably works because
 * an AutoDetectParser is added automatically when none was set -- confirm against
 * the code under test.
 */
@Test
public void testAutomaticAdditionOfAutoDetectParserIfForgotten() throws Exception {
    final String needle = "When in the Course";
    final String testFile = "test_recursive_embedded.doc";

    //TIKA-2096
    // Nothing registered in the context: the needle is expected in the XML.
    TikaTest.XMLResult defaultResult = getXML(testFile, new ParseContext());
    assertContains(needle, defaultResult.xml);

    // EmptyParser registered as the Parser: the needle must no longer show up.
    ParseContext emptyParserContext = new ParseContext();
    emptyParserContext.set(Parser.class, new EmptyParser());
    TikaTest.XMLResult emptyParserResult = getXML(testFile, emptyParserContext);
    assertNotContained(needle, emptyParserResult.xml);
}
use of org.apache.tika.parser.EmptyParser in project tika by apache.
the class ODFParserTest method getNonRecursingParseContext.
/**
 * Creates a fresh {@link ParseContext} in which an {@link EmptyParser} is
 * registered under {@code Parser.class}.
 *
 * NOTE(review): per the method name this is meant to stop recursion into
 * embedded documents -- confirm against how the parsers consult the context.
 *
 * @return a new context carrying the {@link EmptyParser}
 */
private ParseContext getNonRecursingParseContext() {
    final ParseContext nonRecursing = new ParseContext();
    nonRecursing.set(Parser.class, new EmptyParser());
    return nonRecursing;
}
use of org.apache.tika.parser.EmptyParser in project tika by apache.
the class SQLite3ParserTest method testNotAddingEmbeddedParserToParseContext.
/**
 * Checks the output when the user does not want embedded documents handled:
 * an {@link EmptyParser} is registered under {@code Parser.class} and the test
 * file is parsed with an {@link AutoDetectParser}. The resulting XML must still
 * contain the table header and the heading of the embedded entry, but none of
 * the text or image markup asserted absent below.
 */
@Test
public void testNotAddingEmbeddedParserToParseContext() throws Exception {
    ParseContext noEmbeddedContext = new ParseContext();
    noEmbeddedContext.set(Parser.class, new EmptyParser());

    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);

    ContentHandler xmlHandler = new ToXMLContentHandler();
    Parser autoDetect = new AutoDetectParser();
    try (InputStream stream = getResourceAsStream(TEST_FILE1)) {
        autoDetect.parse(stream, xmlHandler, metadata, noEmbeddedContext);
    }
    final String xml = xmlHandler.toString();

    //just includes headers for embedded documents
    assertContains("<table name=\"my_table1\"><thead><tr>", xml);
    assertContains(
            "<td><span type=\"blob\" column_name=\"BYTES_COL\" row_number=\"0\">"
                    + "<div class=\"package-entry\"><h1>BYTES_COL_0.doc</h1>",
            xml);

    //but no other content
    assertNotContained("dog", xml);
    assertNotContained("alt=\"image1.png\"", xml);
    //second embedded doc's image tag
    assertNotContained("alt=\"A description...\"", xml);
}
use of org.apache.tika.parser.EmptyParser in project tika by apache.
the class OOXMLParserTest method testBatch.
/**
 * Lightweight benchmark: parses every {@code .docx} file found directly under
 * {@code /test-documents} 100 times with the SAX docx extractor enabled, then
 * prints the elapsed milliseconds and the number of parses that threw.
 *
 * An {@link EmptyParser} is registered under {@code Parser.class} so that only
 * the main docx content is extracted, not embedded documents.
 *
 * @throws IllegalStateException if {@code /test-documents} cannot be listed
 * @throws Exception if the test-documents resource cannot be resolved
 */
//@Test //use this for lightweight benchmarking to compare xwpf options
public void testBatch() throws Exception {
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setUseSAXDocxExtractor(true);

    // List the directory once, before the clock starts: the listing is
    // loop-invariant and should not be counted as parsing time.
    File[] candidates = getResourceAsFile("/test-documents").listFiles();
    if (candidates == null) {
        // listFiles() returns null when the path is not a directory or an
        // I/O error occurs; fail with a clear message instead of an NPE.
        throw new IllegalStateException("/test-documents is not a readable directory");
    }

    // nanoTime is monotonic, so the measurement is immune to wall-clock changes.
    long started = System.nanoTime();
    int ex = 0;
    for (int i = 0; i < 100; i++) {
        for (File f : candidates) {
            if (!f.getName().endsWith(".docx")) {
                continue;
            }
            try (InputStream is = TikaInputStream.get(f)) {
                ParseContext parseContext = new ParseContext();
                parseContext.set(OfficeParserConfig.class, officeParserConfig);
                //test only the extraction of the main docx content, not embedded docs
                parseContext.set(Parser.class, new EmptyParser());
                // The result is intentionally discarded; only the timing matters.
                getXML(is, parser, new Metadata(), parseContext);
            } catch (Exception e) {
                // Deliberately tolerated: failures are counted and reported
                // so one bad file does not abort the benchmark run.
                ex++;
            }
        }
    }
    long elapsedMillis = (System.nanoTime() - started) / 1_000_000L;
    System.out.println("elapsed: " + elapsedMillis + " with " + ex + " exceptions");
}
use of org.apache.tika.parser.EmptyParser in project tika by apache.
the class ParsingExample method parseNoEmbeddedExample.
/**
 * If you don't want content from embedded documents, pass in a
 * {@link org.apache.tika.parser.ParseContext} that contains an
 * {@link EmptyParser} registered under {@code Parser.class}.
 *
 * @return The content of a file.
 */
public String parseNoEmbeddedExample() throws IOException, SAXException, TikaException {
    // Context whose Parser entry is the EmptyParser.
    ParseContext noEmbeddedContext = new ParseContext();
    noEmbeddedContext.set(Parser.class, new EmptyParser());

    Metadata docMetadata = new Metadata();
    BodyContentHandler bodyHandler = new BodyContentHandler();
    AutoDetectParser autoDetect = new AutoDetectParser();

    try (InputStream docStream =
                 ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) {
        autoDetect.parse(docStream, bodyHandler, docMetadata, noEmbeddedContext);
        return bodyHandler.toString();
    }
}
Aggregations