Search in sources :

Example 51 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class AbstractPkgTest method setUp.

/**
 * Builds the two parse contexts shared by the tests in this class.
 * <p>
 * {@code trackingContext} registers the {@code EmbeddedTrackingParser} as its
 * {@code Parser}; {@code recursingContext} registers a fresh
 * {@code AutoDetectParser} as its {@code Parser}.
 */
@Before
public void setUp() throws Exception {
    // Context whose registered Parser is the tracking parser.
    tracker = new EmbeddedTrackingParser();
    ParseContext forTracking = new ParseContext();
    forTracking.set(Parser.class, tracker);
    trackingContext = forTracking;

    // Context whose registered Parser is an auto-detecting parser.
    autoDetectParser = new AutoDetectParser();
    ParseContext forRecursion = new ParseContext();
    forRecursion.set(Parser.class, autoDetectParser);
    recursingContext = forRecursion;
}
Also used : ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Before(org.junit.Before)

Example 52 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class PDFParserTest method testInlineSelector.

/**
 * Checks that inline images and attachments are both reported for
 * {@code testPDF_childAttachments.pdf} when inline-image extraction is enabled,
 * and that registering an {@code AvoidInlineSelector} as the
 * {@code DocumentSelector} drops the inline entries while keeping the attachments.
 */
@Test
public void testInlineSelector() throws Exception {
    PDFParserConfig config = new PDFParserConfig();
    config.setExtractInlineImages(true);
    config.setExtractUniqueInlineImagesOnly(false);
    ParseContext context = new ParseContext();
    context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config);
    context.set(org.apache.tika.parser.Parser.class, new AutoDetectParser());

    List<Metadata> metadatas = getRecursiveMetadata("testPDF_childAttachments.pdf", context);
    assertEquals(2, countEmbeddedResourceType(metadatas, TikaCoreProperties.EmbeddedResourceType.INLINE));
    assertEquals(2, countEmbeddedResourceType(metadatas, TikaCoreProperties.EmbeddedResourceType.ATTACHMENT));

    //now try turning off inline
    context.set(org.apache.tika.extractor.DocumentSelector.class, new AvoidInlineSelector());
    metadatas = getRecursiveMetadata("testPDF_childAttachments.pdf", context);
    assertEquals(0, countEmbeddedResourceType(metadatas, TikaCoreProperties.EmbeddedResourceType.INLINE));
    assertEquals(2, countEmbeddedResourceType(metadatas, TikaCoreProperties.EmbeddedResourceType.ATTACHMENT));
}

/**
 * Counts the metadata entries whose {@code EMBEDDED_RESOURCE_TYPE} value equals
 * the string form of the given type. Entries without that property are skipped.
 *
 * @param metadatas metadata objects to inspect
 * @param type      embedded resource type to look for
 * @return number of entries tagged with {@code type}
 */
private static int countEmbeddedResourceType(List<Metadata> metadatas,
        TikaCoreProperties.EmbeddedResourceType type) {
    String expected = type.toString();
    int count = 0;
    for (Metadata m : metadatas) {
        // expected.equals(null) is false, so entries lacking the property never match
        if (expected.equals(m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE))) {
            count++;
        }
    }
    return count;
}
Also used : Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 53 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class PDFParserTest method testLegacyAccessChecking.

//Access checker tests
/**
 * Checks that text extraction succeeds on the permission-restricted test PDFs:
 * first with no password supplied, then with a {@code PasswordProvider} that
 * always answers {@code "user"} registered in the parse context.
 */
@Test
public void testLegacyAccessChecking() throws Exception {
    //test that default behavior doesn't throw AccessPermissionException
    for (String file : new String[] { "testPDF_no_extract_no_accessibility_owner_empty.pdf", "testPDF_no_extract_yes_accessibility_owner_empty.pdf" }) {
        String xml = getXML(file).xml;
        assertContains("Hello World", xml);
    }
    //now try with the user password
    PasswordProvider provider = new PasswordProvider() {

        @Override
        public String getPassword(Metadata metadata) {
            return "user";
        }
    };
    ParseContext context = new ParseContext();
    context.set(PasswordProvider.class, provider);
    // getXML(path, context) is not handed a parser here, so no local parser is created.
    for (String path : new String[] { "testPDF_no_extract_no_accessibility_owner_user.pdf", "testPDF_no_extract_yes_accessibility_owner_user.pdf" }) {
        assertContains("Hello World", getXML(path, context).xml);
    }
}
Also used : Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) PasswordProvider(org.apache.tika.parser.PasswordProvider) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) TesseractOCRParser(org.apache.tika.parser.ocr.TesseractOCRParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 54 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class PDFParserTest method testDisableAutoSpace.

/**
 * Checks the effect of the {@code enableAutoSpace} setting on
 * {@code testExtraSpaces.pdf}: with it off the phrase
 * "Here is some formatted text" appears intact, with it on the phrase is absent.
 * <p>
 * The setting is exercised twice: once through the {@code PDFParser}'s own
 * config, and once through a {@code PDFParserConfig} placed in the
 * {@code ParseContext} and parsed via an {@code AutoDetectParser}.
 */
@Test
public void testDisableAutoSpace() throws Exception {
    PDFParser parser = new PDFParser();
    parser.getPDFParserConfig().setEnableAutoSpace(false);
    XMLResult r = getXML("testExtraSpaces.pdf", parser);
    // collapse whitespace runs so the phrase check is insensitive to layout
    String content = r.xml.replaceAll("[\\s ]+", " ");
    // Text is correct when autoSpace is off:
    assertContains("Here is some formatted text", content);
    parser.getPDFParserConfig().setEnableAutoSpace(true);
    r = getXML("testExtraSpaces.pdf", parser);
    content = r.xml.replaceAll("[\\s ]+", " ");
    // Text has extra spaces when autoSpace is on
    assertEquals(-1, content.indexOf("Here is some formatted text"));
    //now try with autodetect
    Parser autoParser = new AutoDetectParser();
    ParseContext context = new ParseContext();
    PDFParserConfig config = new PDFParserConfig();
    context.set(PDFParserConfig.class, config);
    //default is true
    r = getXML("testExtraSpaces.pdf", autoParser, context);
    content = r.xml.replaceAll("[\\s ]+", " ");
    // Text has extra spaces when autoSpace is on
    assertEquals(-1, content.indexOf("Here is some formatted text"));
    config.setEnableAutoSpace(false);
    // Use autoParser (not the standalone PDFParser above) so this section really
    // tests the context-supplied config through auto-detection in both states.
    r = getXML("testExtraSpaces.pdf", autoParser, context);
    content = r.xml.replaceAll("[\\s ]+", " ");
    // Text is correct when autoSpace is off:
    assertContains("Here is some formatted text", content);
}
Also used : ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) TesseractOCRParser(org.apache.tika.parser.ocr.TesseractOCRParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 55 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class ArParserTest method testArParsing.

/**
 * Parses two {@code ar} archives with an {@code AutoDetectParser} and the
 * recursing context, checking the reported content type and that the expected
 * entry names / entry text show up in the extracted body.
 * <p>
 * Each archive is parsed with its own {@code Metadata} and handler so that the
 * second set of assertions cannot be satisfied by state left over from the first.
 */
@Test
public void testArParsing() throws Exception {
    Parser parser = new AutoDetectParser();

    // First archive: text entries.
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try (InputStream stream = ArParserTest.class.getResourceAsStream("/test-documents/testARofText.ar")) {
        parser.parse(stream, handler, metadata, recursingContext);
    }
    assertEquals("application/x-archive", metadata.get(Metadata.CONTENT_TYPE));
    String content = handler.toString();
    assertContains("testTXT.txt", content);
    assertContains("Test d'indexation de Txt", content);
    assertContains("http://www.apache.org", content);

    // Second archive: start from clean metadata and a clean handler; reusing the
    // earlier ones would carry over the previous content type and body text.
    handler = new BodyContentHandler();
    metadata = new Metadata();
    try (InputStream stream = ArParserTest.class.getResourceAsStream("/test-documents/testARofSND.ar")) {
        parser.parse(stream, handler, metadata, recursingContext);
    }
    assertEquals("application/x-archive", metadata.get(Metadata.CONTENT_TYPE));
    content = handler.toString();
    assertContains("testAU.au", content);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) Test(org.junit.Test)

Aggregations

AutoDetectParser (org.apache.tika.parser.AutoDetectParser) 167, Metadata (org.apache.tika.metadata.Metadata) 139, Test (org.junit.Test) 122, InputStream (java.io.InputStream) 117, Parser (org.apache.tika.parser.Parser) 112, ParseContext (org.apache.tika.parser.ParseContext) 104, BodyContentHandler (org.apache.tika.sax.BodyContentHandler) 97, ContentHandler (org.xml.sax.ContentHandler) 91, TikaTest (org.apache.tika.TikaTest) 82, TikaInputStream (org.apache.tika.io.TikaInputStream) 63, ByteArrayInputStream (java.io.ByteArrayInputStream) 34, CompositeParser (org.apache.tika.parser.CompositeParser) 28, TikaConfig (org.apache.tika.config.TikaConfig) 18, ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest) 17, WordParserTest (org.apache.tika.parser.microsoft.WordParserTest) 17, TesseractOCRParser (org.apache.tika.parser.ocr.TesseractOCRParser) 15, TikaException (org.apache.tika.exception.TikaException) 13, EmptyParser (org.apache.tika.parser.EmptyParser) 13, XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler) 13, DefaultHandler (org.xml.sax.helpers.DefaultHandler) 12