Search in sources :

Example 56 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class PDFParserTest method testConfiguringMoreParams.

/**
 * Builds an AutoDetectParser from the inline-image Tika config on the classpath and
 * checks that each PDFParserConfig value asserted below is present on the PDF parser
 * registered for application/pdf.
 */
@Test
public void testConfiguringMoreParams() throws Exception {
    try (InputStream configStream = getClass().getResourceAsStream("/org/apache/tika/parser/pdf/tika-inline-config.xml")) {
        assertNotNull(configStream);
        AutoDetectParser autoDetect = new AutoDetectParser(new TikaConfig(configStream));

        // Sanity check first: the configured parser must handle a real document end to end.
        List<Metadata> metadataList = getRecursiveMetadata("testOCR.pdf", autoDetect);
        assertEquals(2, metadataList.size());

        // The entry registered for application/pdf is cast to CompositeParser;
        // the PDFParser itself is looked up one level below it.
        MediaType pdfType = MediaType.application("pdf");
        Map<MediaType, Parser> topLevelParsers = autoDetect.getParsers();
        CompositeParser pdfComposite = (CompositeParser) topLevelParsers.get(pdfType);
        Parser candidate = pdfComposite.getParsers().get(pdfType);
        assertTrue(candidate instanceof PDFParser);

        PDFParserConfig config = ((PDFParser) candidate).getPDFParserConfig();
        assertEquals(new AccessChecker(true), config.getAccessChecker());
        assertEquals(true, config.getExtractInlineImages());
        assertEquals(false, config.getExtractUniqueInlineImagesOnly());
        assertEquals(314, config.getOcrDPI());
        assertEquals(2.1f, config.getOcrImageQuality(), .01f);
        assertEquals("jpeg", config.getOcrImageFormatName());
        assertEquals(false, config.getCatchIntermediateIOExceptions());
    }
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) MediaType(org.apache.tika.mime.MediaType) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TesseractOCRParser(org.apache.tika.parser.ocr.TesseractOCRParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 57 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class RegexNERecogniserTest method testGetEntityTypes.

/**
 * Selects RegexNERecogniser through the NamedEntityParser.SYS_PROP_NER_IMPL system
 * property, parses a short sentence and checks that exactly the three weekday
 * mentions end up under the "NER_WEEK_DAY" metadata key.
 */
@Test
public void testGetEntityTypes() throws Exception {
    String text = "Hey, Lets meet on this Sunday or MONDAY because i am busy on Saturday";
    // Remember the current value: system properties are JVM-wide, so the override
    // must not leak into whatever runs after this test.
    String previousImpl = System.getProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
    System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, RegexNERecogniser.class.getName());
    // Fully qualified so that this method does not depend on an extra import.
    try (java.io.InputStream configStream = NamedEntityParser.class.getResourceAsStream("tika-config.xml")) {
        Tika tika = new Tika(new TikaConfig(configStream));
        Metadata md = new Metadata();
        tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
        Set<String> days = new HashSet<>(Arrays.asList(md.getValues("NER_WEEK_DAY")));
        assertTrue(days.contains("Sunday"));
        assertTrue(days.contains("MONDAY"));
        assertTrue(days.contains("Saturday"));
        //and nothing else
        assertTrue("Unexpected NER_WEEK_DAY values: " + days, days.size() == 3);
    } finally {
        // Put the property back exactly as it was found (absent stays absent).
        if (previousImpl == null) {
            System.clearProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
        } else {
            System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, previousImpl);
        }
    }
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) Tika(org.apache.tika.Tika) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 58 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class PDFParserTest method testInitializationOfNonPrimitivesViaConfig.

/**
 * Loads tika-config-non-primitives.xml from the classpath and checks that the PDF
 * parser it yields carries the OCR strategy and OCR image type asserted below.
 */
@Test
public void testInitializationOfNonPrimitivesViaConfig() throws Exception {
    // try-with-resources so the config stream is closed even when an assertion fails.
    try (InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/pdf/tika-config-non-primitives.xml")) {
        assertNotNull(is);
        TikaConfig tikaConfig = new TikaConfig(is);
        AutoDetectParser p = new AutoDetectParser(tikaConfig);
        Map<MediaType, Parser> parsers = p.getParsers();
        Parser composite = parsers.get(MediaType.application("pdf"));
        Parser pdfParser = ((CompositeParser) composite).getParsers().get(MediaType.application("pdf"));
        assertEquals("org.apache.tika.parser.pdf.PDFParser", pdfParser.getClass().getName());
        assertEquals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY, ((PDFParser) pdfParser).getPDFParserConfig().getOcrStrategy());
        assertEquals(ImageType.RGB, ((PDFParser) pdfParser).getPDFParserConfig().getOcrImageType());
    }
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) MediaType(org.apache.tika.mime.MediaType) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TesseractOCRParser(org.apache.tika.parser.ocr.TesseractOCRParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 59 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class PDFParserTest method testInitializationViaConfig.

/**
 * Loads tika-config.xml from the classpath, extracts the text of
 * testPDFTwoTextBoxes.pdf with the resulting parser and checks the
 * whitespace-normalised output against the expected interleaved column text.
 */
@Test
public void testInitializationViaConfig() throws Exception {
    // try-with-resources so the config stream is closed even when an assertion fails.
    try (InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/pdf/tika-config.xml")) {
        assertNotNull(is);
        TikaConfig tikaConfig = new TikaConfig(is);
        Parser p = new AutoDetectParser(tikaConfig);
        String text = getText(getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"), p);
        // Collapse every whitespace run to a single space so the comparison
        // does not depend on line breaks in the extracted text.
        text = text.replaceAll("\\s+", " ");
        // Column text is now interleaved:
        assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", text);
    }
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TesseractOCRParser(org.apache.tika.parser.ocr.TesseractOCRParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 60 with TikaConfig

use of org.apache.tika.config.TikaConfig in project nutch by apache.

the class TikaParser method setConf.

/**
 * Stores the configuration and (re)initialises everything derived from it:
 * the Tika configuration, the optional custom HtmlMapper, the HTML parse
 * filters, the DOM utilities, the caching policy and the element-name casing flag.
 *
 * <p>Failures while loading the Tika configuration are logged and leave
 * {@code tikaConfig} as {@code null}; a failure to instantiate the configured
 * HtmlMapper class is logged and rethrown.
 *
 * @param conf the configuration to read settings from
 * @throws RuntimeException if "tika.htmlmapper.classname" is set but the class
 *         cannot be loaded, does not implement HtmlMapper, or cannot be instantiated
 */
public void setConf(Configuration conf) {
    this.conf = conf;
    this.tikaConfig = null;
    // do we want a custom Tika configuration file
    // deprecated since Tika 0.7 which is based on
    // a service provider based configuration
    String customConfFile = conf.get("tika.config.file");
    if (customConfFile != null) {
        try {
            // see if a Tika config file can be found in the job file
            URL customTikaConfig = conf.getResource(customConfFile);
            if (customTikaConfig != null) {
                tikaConfig = new TikaConfig(customTikaConfig, this.getClass().getClassLoader());
            } else {
                // tikaConfig stays null in this case; say so instead of failing silently.
                LOG.warn("Custom Tika configuration " + customConfFile + " could not be found");
            }
        } catch (Exception e1) {
            String message = "Problem loading custom Tika configuration from " + customConfFile;
            LOG.error(message, e1);
        }
    } else {
        try {
            tikaConfig = new TikaConfig(this.getClass().getClassLoader());
        } catch (Exception e2) {
            String message = "Problem loading default Tika configuration";
            LOG.error(message, e2);
        }
    }
    // use a custom htmlmapper
    String htmlmapperClassName = conf.get("tika.htmlmapper.classname");
    if (StringUtils.isNotBlank(htmlmapperClassName)) {
        try {
            Class<?> htmlMapperClass = Class.forName(htmlmapperClassName);
            if (!HtmlMapper.class.isAssignableFrom(htmlMapperClass)) {
                throw new RuntimeException("Class " + htmlmapperClassName + " does not implement HtmlMapper");
            }
            HTMLMapper = (HtmlMapper) htmlMapperClass.getDeclaredConstructor().newInstance();
        } catch (Exception e) {
            // Keep the cause attached: it is the only place the real reason
            // (class not found, wrong type, constructor failure) is recorded.
            String message = "Can't generate instance for class " + htmlmapperClassName;
            LOG.error(message, e);
            throw new RuntimeException(message, e);
        }
    }
    this.htmlParseFilters = new HtmlParseFilters(getConf());
    this.utils = new DOMContentUtils(conf);
    this.cachingPolicy = getConf().get("parser.caching.forbidden.policy", Nutch.CACHING_FORBIDDEN_CONTENT);
    this.upperCaseElementNames = getConf().getBoolean("tika.uppercase.element.names", true);
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) HtmlParseFilters(org.apache.nutch.parse.HtmlParseFilters) URL(java.net.URL) MalformedURLException(java.net.MalformedURLException)

Aggregations

TikaConfig (org.apache.tika.config.TikaConfig): 62; Test (org.junit.Test): 32; Metadata (org.apache.tika.metadata.Metadata): 26; AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 20; TikaTest (org.apache.tika.TikaTest): 16; InputStream (java.io.InputStream): 12; Tika (org.apache.tika.Tika): 12; IOException (java.io.IOException): 10; URL (java.net.URL): 10; TikaException (org.apache.tika.exception.TikaException): 9; TikaInputStream (org.apache.tika.io.TikaInputStream): 9; ParseContext (org.apache.tika.parser.ParseContext): 9; Parser (org.apache.tika.parser.Parser): 9; MediaType (org.apache.tika.mime.MediaType): 8; CompositeParser (org.apache.tika.parser.CompositeParser): 8; ByteArrayInputStream (java.io.ByteArrayInputStream): 7; File (java.io.File): 6; TikaConfigTest (org.apache.tika.config.TikaConfigTest): 6; HashSet (java.util.HashSet): 5; SAXException (org.xml.sax.SAXException): 5