Search in sources :

Example 46 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class NamedEntityParserTest method testNerChain.

/**
 * Configures two recognisers (OpenNLP and regex) through the
 * {@code NamedEntityParser.SYS_PROP_NER_IMPL} system property, parses a short
 * sentence and checks that the resulting metadata names include both
 * {@code NER_WEEK_DAY} and {@code NER_LOCATION}.
 *
 * NOTE(review): the checks use assumeTrue rather than an assertion; presumably
 * this is JUnit's Assume, so a missing key skips the test instead of failing it
 * -- confirm this is intended.
 */
@Test
public void testNerChain() throws Exception {
    // The recogniser chain is passed as a comma separated list of class names.
    String openNlpRecogniser = OpenNLPNERecogniser.class.getName();
    String regexRecogniser = RegexNERecogniser.class.getName();
    System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, openNlpRecogniser + "," + regexRecogniser);

    Tika tika = new Tika(new TikaConfig(getClass().getResourceAsStream(CONFIG_FILE)));

    String sentence = "University of Southern California (USC), is located in Los Angeles ." + " Campus is busy from monday to saturday";
    byte[] sentenceBytes = sentence.getBytes(Charset.defaultCharset());
    Metadata extracted = new Metadata();
    tika.parse(new ByteArrayInputStream(sentenceBytes), extracted);

    // Collect every metadata name that the parse produced.
    HashSet<String> metadataNames = new HashSet<>();
    metadataNames.addAll(Arrays.asList(extracted.names()));

    assumeTrue(metadataNames.contains("NER_WEEK_DAY"));
    assumeTrue(metadataNames.contains("NER_LOCATION"));
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) RegexNERecogniser(org.apache.tika.parser.ner.regex.RegexNERecogniser) ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) Tika(org.apache.tika.Tika) HashSet(java.util.HashSet) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 47 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class NamedEntityParserTest method testParse.

/**
 * Parses a short English paragraph with the Tika instance built from
 * {@code CONFIG_FILE} and checks, one metadata key at a time, that the expected
 * parser name and named entities (person, location, organization, date) are
 * present among the values stored under that key.
 *
 * NOTE(review): every check is an assumeTrue; presumably JUnit's Assume, which
 * would skip rather than fail on a miss -- confirm.
 */
@Test
public void testParse() throws Exception {
    //test config is added to resources directory
    Tika tika = new Tika(new TikaConfig(getClass().getResourceAsStream(CONFIG_FILE)));

    String text = "I am student at University of Southern California (USC)," + " located in Los Angeles . USC's football team is called by name Trojans." + " Mr. John McKay was a head coach of the team from 1960 - 1975";
    byte[] textBytes = text.getBytes(Charset.defaultCharset());
    Metadata md = new Metadata();
    tika.parse(new ByteArrayInputStream(textBytes), md);

    // Each key is looked up only after the previous check has passed.
    HashSet<String> parsedBy = new HashSet<String>(Arrays.asList(md.getValues("X-Parsed-By")));
    assumeTrue(parsedBy.contains(NamedEntityParser.class.getName()));

    HashSet<String> persons = new HashSet<String>(Arrays.asList(md.getValues("NER_PERSON")));
    assumeTrue(persons.contains("John McKay"));

    HashSet<String> locations = new HashSet<String>(Arrays.asList(md.getValues("NER_LOCATION")));
    assumeTrue(locations.contains("Los Angeles"));

    HashSet<String> organizations = new HashSet<String>(Arrays.asList(md.getValues("NER_ORGANIZATION")));
    assumeTrue(organizations.contains("University of Southern California"));

    HashSet<String> dates = new HashSet<String>(Arrays.asList(md.getValues("NER_DATE")));
    assumeTrue(dates.contains("1960 - 1975"));
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) Tika(org.apache.tika.Tika) HashSet(java.util.HashSet) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 48 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class WordParserTest method testMacros.

/**
 * Checks macro extraction for {@code testWORD_macros.doc} in three steps:
 * <ol>
 *   <li>with no extra configuration, no metadata entry may have the content
 *       type {@code text/x-vbasic};</li>
 *   <li>with an {@code OfficeParserConfig} that has {@code setExtractMacros(true)}
 *       placed in the {@code ParseContext}, the expected macro metadata must be
 *       present;</li>
 *   <li>the same expectation must hold when the parser is built from
 *       {@code tika-config-macros.xml}.</li>
 * </ol>
 */
@Test
public void testMacros() throws Exception {
    //test default is "don't extract macros"
    for (Metadata metadata : getRecursiveMetadata("testWORD_macros.doc")) {
        // Constant-first comparison: an entry without a content type is simply
        // "not a macro" rather than a NullPointerException that hides the real check.
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }
    //now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    context.set(OfficeParserConfig.class, officeParserConfig);
    // Minimum set of values that one of the returned metadata entries must contain.
    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.doc", context);
    assertContainsAtLeast(minExpected, metadataList);
    //test configuring via config file
    TikaConfig tikaConfig = new TikaConfig(this.getClass().getResourceAsStream("tika-config-macros.xml"));
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    metadataList = getRecursiveMetadata("testWORD_macros.doc", parser);
    assertContainsAtLeast(minExpected, metadataList);
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 49 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class CompositeParserTest method testMimeTypeAliases.

/**
 * Exercises {@code CompositeParser} lookups for a media type and its alias
 * ({@code image/bmp} and {@code image/x-ms-bmp}).
 *
 * Two dummy parsers are created, one registered for each type, each adding its
 * own marker metadata. The test then verifies that a composite holding only one
 * of them is used for both content types, and that a composite holding both
 * picks the alias parser for the canonical content type.
 */
@Test
public void testMimeTypeAliases() throws Exception {
    // Dummy parser registered for the canonical type; marks metadata with "Canonical".
    MediaType bmpCanonical = MediaType.image("bmp");
    Map<String, String> canonicalMarkers = new HashMap<>();
    canonicalMarkers.put("BMP", "True");
    canonicalMarkers.put("Canonical", "True");
    HashSet<MediaType> canonicalTypes = new HashSet<>(Arrays.asList(bmpCanonical));
    Parser bmpCanonicalParser = new DummyParser(canonicalTypes, canonicalMarkers, null);

    // Dummy parser registered for the alias type; marks metadata with "Alias".
    MediaType bmpAlias = MediaType.image("x-ms-bmp");
    Map<String, String> aliasMarkers = new HashMap<>();
    aliasMarkers.put("BMP", "True");
    aliasMarkers.put("Alias", "True");
    HashSet<MediaType> aliasTypes = new HashSet<>(Arrays.asList(bmpAlias));
    Parser bmpAliasParser = new DummyParser(aliasTypes, aliasMarkers, null);

    TikaConfig config = TikaConfig.getDefaultConfig();
    CompositeParser canonical = new CompositeParser(config.getMediaTypeRegistry(), bmpCanonicalParser);
    CompositeParser alias = new CompositeParser(config.getMediaTypeRegistry(), bmpAliasParser);
    CompositeParser both = new CompositeParser(config.getMediaTypeRegistry(), bmpCanonicalParser, bmpAliasParser);

    ContentHandler handler = new BodyContentHandler();
    // Every scenario parses an empty document; only the declared content type matters.
    byte[] noContent = new byte[0];

    // Canonical and Canonical
    Metadata canonicalOnCanonical = new Metadata();
    canonicalOnCanonical.add(Metadata.CONTENT_TYPE, bmpCanonical.toString());
    canonical.parse(new ByteArrayInputStream(noContent), handler, canonicalOnCanonical, new ParseContext());
    assertEquals("True", canonicalOnCanonical.get("BMP"));
    assertEquals("True", canonicalOnCanonical.get("Canonical"));

    // Alias and Alias
    Metadata aliasOnAlias = new Metadata();
    aliasOnAlias.add(Metadata.CONTENT_TYPE, bmpAlias.toString());
    alias.parse(new ByteArrayInputStream(noContent), handler, aliasOnAlias, new ParseContext());
    assertEquals("True", aliasOnAlias.get("BMP"));
    assertEquals("True", aliasOnAlias.get("Alias"));

    // Alias type and Canonical parser
    Metadata aliasOnCanonical = new Metadata();
    aliasOnCanonical.add(Metadata.CONTENT_TYPE, bmpAlias.toString());
    canonical.parse(new ByteArrayInputStream(noContent), handler, aliasOnCanonical, new ParseContext());
    assertEquals("True", aliasOnCanonical.get("BMP"));
    assertEquals("True", aliasOnCanonical.get("Canonical"));

    // Canonical type and Alias parser
    Metadata canonicalOnAlias = new Metadata();
    canonicalOnAlias.add(Metadata.CONTENT_TYPE, bmpCanonical.toString());
    alias.parse(new ByteArrayInputStream(noContent), handler, canonicalOnAlias, new ParseContext());
    assertEquals("True", canonicalOnAlias.get("BMP"));
    assertEquals("True", canonicalOnAlias.get("Alias"));

    // And when both are there, will go for the last one
    //  to be registered (which is the alias one)
    Metadata canonicalOnBoth = new Metadata();
    canonicalOnBoth.add(Metadata.CONTENT_TYPE, bmpCanonical.toString());
    both.parse(new ByteArrayInputStream(noContent), handler, canonicalOnBoth, new ParseContext());
    assertEquals("True", canonicalOnBoth.get("BMP"));
    assertEquals("True", canonicalOnBoth.get("Alias"));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaConfig(org.apache.tika.config.TikaConfig) HashMap(java.util.HashMap) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) MediaType(org.apache.tika.mime.MediaType) Test(org.junit.Test)

Example 50 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class NamedEntityParser method initialize.

/**
 * Builds the recogniser chain and the secondary {@code Tika} instance on first
 * use; later calls return immediately.
 *
 * The recogniser class names are read from the {@code SYS_PROP_NER_IMPL} system
 * property (falling back to {@code DEFAULT_NER_IMPL}) as a comma separated list.
 * Each class is instantiated through its no-argument constructor and added to
 * the chain only if it reports itself as available. A class that cannot be
 * loaded or instantiated is logged and skipped. The parser is marked available
 * when at least one recogniser ended up in the chain and the secondary
 * {@code Tika} instance could be created.
 *
 * @param context the parse context; not read by this method at present
 *                (see the TODO below)
 */
private synchronized void initialize(ParseContext context) {
    if (initialized) {
        return;
    }
    // Set before doing the work, so later calls return early even if loading fails below.
    initialized = true;
    //TODO: read class name from context or config
    //There can be multiple classes in the form of comma separated class names;
    String classNamesString = System.getProperty(SYS_PROP_NER_IMPL, DEFAULT_NER_IMPL);
    String[] classNames = classNamesString.split(",");
    this.nerChain = new ArrayList<>(classNames.length);
    for (String rawClassName : classNames) {
        String className = rawClassName.trim();
        if (className.isEmpty()) {
            // A doubled comma or whitespace-only entry names no class; skip it
            // instead of logging a spurious load error for "".
            continue;
        }
        LOG.info("going to load, instantiate and bind the instance of {}", className);
        try {
            // Class.newInstance() is deprecated; going through the constructor wraps
            // any constructor failure in an exception handled by the catch below.
            NERecogniser recogniser = (NERecogniser) Class.forName(className).getDeclaredConstructor().newInstance();
            // Evaluate availability once so the logged value is the one acted upon.
            boolean recogniserAvailable = recogniser.isAvailable();
            LOG.info("{} is available ? {}", className, recogniserAvailable);
            if (recogniserAvailable) {
                nerChain.add(recogniser);
            }
        } catch (Exception e) {
            // One broken recogniser must not prevent the others from being loaded.
            LOG.error(e.getMessage(), e);
        }
    }
    try {
        TikaConfig config = new TikaConfig();
        this.secondaryParser = new Tika(config);
        this.available = !nerChain.isEmpty();
        LOG.info("Number of NERecognisers in chain {}", nerChain.size());
    } catch (Exception e) {
        LOG.error(e.getMessage(), e);
        this.available = false;
    }
}
Also used : OpenNLPNERecogniser(org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser) RegexNERecogniser(org.apache.tika.parser.ner.regex.RegexNERecogniser) TikaConfig(org.apache.tika.config.TikaConfig) Tika(org.apache.tika.Tika) TikaException(org.apache.tika.exception.TikaException) IOException(java.io.IOException) SAXException(org.xml.sax.SAXException)

Aggregations

TikaConfig (org.apache.tika.config.TikaConfig)62 Test (org.junit.Test)32 Metadata (org.apache.tika.metadata.Metadata)26 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)20 TikaTest (org.apache.tika.TikaTest)16 InputStream (java.io.InputStream)12 Tika (org.apache.tika.Tika)12 IOException (java.io.IOException)10 URL (java.net.URL)10 TikaException (org.apache.tika.exception.TikaException)9 TikaInputStream (org.apache.tika.io.TikaInputStream)9 ParseContext (org.apache.tika.parser.ParseContext)9 Parser (org.apache.tika.parser.Parser)9 MediaType (org.apache.tika.mime.MediaType)8 CompositeParser (org.apache.tika.parser.CompositeParser)8 ByteArrayInputStream (java.io.ByteArrayInputStream)7 File (java.io.File)6 TikaConfigTest (org.apache.tika.config.TikaConfigTest)6 HashSet (java.util.HashSet)5 SAXException (org.xml.sax.SAXException)5