use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class AbstractPkgTest method setUp.
/**
 * Creates the parser and parse-context fixtures used by the tests.
 *
 * <p>Two contexts are prepared: {@code trackingContext}, which has the
 * {@link EmbeddedTrackingParser} registered under {@code Parser.class}, and
 * {@code recursingContext}, which has a fresh {@link AutoDetectParser}
 * registered under {@code Parser.class}.
 *
 * @throws Exception if any fixture cannot be constructed
 */
@Before
public void setUp() throws Exception {
    // Parsers first, then the contexts that hand them out.
    tracker = new EmbeddedTrackingParser();
    autoDetectParser = new AutoDetectParser();

    trackingContext = new ParseContext();
    trackingContext.set(Parser.class, tracker);

    recursingContext = new ParseContext();
    recursingContext.set(Parser.class, autoDetectParser);
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class PDFParserTest method testInlineSelector.
/**
 * Parses {@code testPDF_childAttachments.pdf} recursively with inline-image
 * extraction enabled and checks how many embedded resources are reported as
 * INLINE and as ATTACHMENT, first without a document selector (2 and 2) and
 * then with an {@code AvoidInlineSelector} registered (0 and 2).
 *
 * @throws Exception if parsing fails
 */
@Test
public void testInlineSelector() throws Exception {
    final String inlineType = TikaCoreProperties.EmbeddedResourceType.INLINE.toString();
    final String attachmentType = TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString();

    PDFParserConfig pdfConfig = new PDFParserConfig();
    pdfConfig.setExtractInlineImages(true);
    pdfConfig.setExtractUniqueInlineImagesOnly(false);

    ParseContext parseContext = new ParseContext();
    parseContext.set(org.apache.tika.parser.pdf.PDFParserConfig.class, pdfConfig);
    parseContext.set(org.apache.tika.parser.Parser.class, new AutoDetectParser());

    // Baseline: no selector registered.
    List<Metadata> results = getRecursiveMetadata("testPDF_childAttachments.pdf", parseContext);
    assertEquals(2, countEmbeddedResourceType(results, inlineType));
    assertEquals(2, countEmbeddedResourceType(results, attachmentType));

    //now try turning off inline
    parseContext.set(org.apache.tika.extractor.DocumentSelector.class, new AvoidInlineSelector());
    results = getRecursiveMetadata("testPDF_childAttachments.pdf", parseContext);
    assertEquals(0, countEmbeddedResourceType(results, inlineType));
    assertEquals(2, countEmbeddedResourceType(results, attachmentType));
}

/**
 * Counts the metadata entries whose
 * {@code TikaCoreProperties.EMBEDDED_RESOURCE_TYPE} value equals the given
 * type name. Entries without that property are ignored.
 *
 * @param metadatas metadata objects to inspect
 * @param typeName  expected string form of the embedded resource type
 * @return number of entries whose resource type equals {@code typeName}
 */
private static int countEmbeddedResourceType(List<Metadata> metadatas, String typeName) {
    int matches = 0;
    for (Metadata candidate : metadatas) {
        // typeName is never null, so this comparison also skips missing values.
        if (typeName.equals(candidate.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE))) {
            matches++;
        }
    }
    return matches;
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class PDFParserTest method testLegacyAccessChecking.
//Access checker tests
/**
 * Checks that text can be extracted from PDFs that restrict extraction.
 *
 * <p>The owner-password-empty files are parsed with the default settings.
 * The files that need a user password are parsed through an explicitly
 * constructed {@link AutoDetectParser}, with a {@link PasswordProvider}
 * registered in the parse context that always answers {@code "user"}.
 *
 * @throws Exception if parsing fails
 */
@Test
public void testLegacyAccessChecking() throws Exception {
    //test that default behavior doesn't throw AccessPermissionException
    for (String file : new String[] {
            "testPDF_no_extract_no_accessibility_owner_empty.pdf",
            "testPDF_no_extract_yes_accessibility_owner_empty.pdf" }) {
        assertContains("Hello World", getXML(file).xml);
    }

    //now try with the user password
    PasswordProvider provider = new PasswordProvider() {
        @Override
        public String getPassword(Metadata metadata) {
            return "user";
        }
    };
    ParseContext context = new ParseContext();
    context.set(PasswordProvider.class, provider);
    Parser parser = new AutoDetectParser();
    for (String path : new String[] {
            "testPDF_no_extract_no_accessibility_owner_user.pdf",
            "testPDF_no_extract_yes_accessibility_owner_user.pdf" }) {
        // Hand the parser to getXML explicitly so the AutoDetectParser
        // constructed for this test is the one that actually does the parsing.
        assertContains("Hello World", getXML(path, parser, context).xml);
    }
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class PDFParserTest method testDisableAutoSpace.
/**
 * Checks the effect of {@code PDFParserConfig.setEnableAutoSpace} on the text
 * extracted from {@code testExtraSpaces.pdf}.
 *
 * <p>With autoSpace off the phrase "Here is some formatted text" must appear
 * in the whitespace-normalised output; with autoSpace on it must not. The
 * check is made twice: once by configuring a {@link PDFParser} directly, and
 * once through an {@link AutoDetectParser} with the configuration supplied
 * via the {@link ParseContext}.
 *
 * @throws Exception if parsing fails
 */
@Test
public void testDisableAutoSpace() throws Exception {
    final String file = "testExtraSpaces.pdf";
    final String phrase = "Here is some formatted text";
    // Runs of whitespace. U+00A0 (non-breaking space) is listed explicitly
    // because \s does not match it by default; it is written as an escape so
    // it cannot be lost to an editor or encoding change.
    final String whitespaceRun = "[\\s\u00a0]+";

    PDFParser parser = new PDFParser();
    parser.getPDFParserConfig().setEnableAutoSpace(false);
    XMLResult r = getXML(file, parser);
    String content = r.xml.replaceAll(whitespaceRun, " ");
    // Text is correct when autoSpace is off:
    assertContains(phrase, content);

    parser.getPDFParserConfig().setEnableAutoSpace(true);
    r = getXML(file, parser);
    content = r.xml.replaceAll(whitespaceRun, " ");
    // Text has extra spaces when autoSpace is on
    assertEquals(-1, content.indexOf(phrase));

    //now try with autodetect
    Parser autoParser = new AutoDetectParser();
    ParseContext context = new ParseContext();
    PDFParserConfig config = new PDFParserConfig();
    context.set(PDFParserConfig.class, config);
    //default is true
    r = getXML(file, autoParser, context);
    content = r.xml.replaceAll(whitespaceRun, " ");
    // Text has extra spaces when autoSpace is on
    assertEquals(-1, content.indexOf(phrase));

    config.setEnableAutoSpace(false);
    // Stay on autoParser so the autoSpace-off case is also exercised through
    // auto-detection, with the setting coming from the context's config.
    r = getXML(file, autoParser, context);
    content = r.xml.replaceAll(whitespaceRun, " ");
    // Text is correct when autoSpace is off:
    assertContains(phrase, content);
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class ArParserTest method testArParsing.
/**
 * Parses two {@code ar} archives with an {@link AutoDetectParser} and checks
 * the detected content type and the extracted text.
 *
 * <p>{@code testARofText.ar} must yield the entry name {@code testTXT.txt}
 * and text from that entry; {@code testARofSND.ar} must yield the entry name
 * {@code testAU.au}. Both must be reported as {@code application/x-archive}.
 * Parsing uses the {@code recursingContext} fixture, which is defined outside
 * this method.
 *
 * @throws Exception if parsing fails
 */
@Test
public void testArParsing() throws Exception {
    Parser parser = new AutoDetectParser();

    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try (InputStream stream = ArParserTest.class.getResourceAsStream("/test-documents/testARofText.ar")) {
        parser.parse(stream, handler, metadata, recursingContext);
    }
    assertEquals("application/x-archive", metadata.get(Metadata.CONTENT_TYPE));
    String content = handler.toString();
    assertContains("testTXT.txt", content);
    assertContains("Test d'indexation de Txt", content);
    assertContains("http://www.apache.org", content);

    // Start the second archive from a clean handler and Metadata so its
    // assertions cannot be satisfied by anything left over from the first parse.
    handler = new BodyContentHandler();
    metadata = new Metadata();
    try (InputStream stream = ArParserTest.class.getResourceAsStream("/test-documents/testARofSND.ar")) {
        parser.parse(stream, handler, metadata, recursingContext);
    }
    assertEquals("application/x-archive", metadata.get(Metadata.CONTENT_TYPE));
    content = handler.toString();
    assertContains("testAU.au", content);
}
Aggregations