Search in sources :

Example 41 with Tika

use of org.apache.tika.Tika in project tika by apache.

the class NamedEntityParserTest method testNerChain.

/**
 * Configures a chain of two recognisers (OpenNLP and regex based) through the
 * NER implementation system property, parses a short sentence and expects the
 * resulting metadata to expose both a week-day and a location key.
 *
 * The expectations are expressed as JUnit assumptions, not hard assertions.
 */
@Test
public void testNerChain() throws Exception {
    // The implementation list is passed as comma separated class names.
    String openNlpImpl = OpenNLPNERecogniser.class.getName();
    String regexImpl = RegexNERecogniser.class.getName();
    System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, openNlpImpl + "," + regexImpl);

    Tika tika = new Tika(new TikaConfig(getClass().getResourceAsStream(CONFIG_FILE)));

    String text = "University of Southern California (USC), is located in Los Angeles ."
            + " Campus is busy from monday to saturday";
    byte[] content = text.getBytes(Charset.defaultCharset());

    Metadata md = new Metadata();
    tika.parse(new ByteArrayInputStream(content), md);

    // Collect every metadata name once so both lookups hit the same snapshot.
    HashSet<String> metadataNames = new HashSet<>();
    metadataNames.addAll(Arrays.asList(md.names()));
    assumeTrue(metadataNames.contains("NER_WEEK_DAY"));
    assumeTrue(metadataNames.contains("NER_LOCATION"));
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) RegexNERecogniser(org.apache.tika.parser.ner.regex.RegexNERecogniser) ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) Tika(org.apache.tika.Tika) HashSet(java.util.HashSet) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 42 with Tika

use of org.apache.tika.Tika in project tika by apache.

the class NamedEntityParserTest method testParse.

/**
 * Parses a short paragraph with the test configuration and checks, one
 * metadata key at a time, that the named entity parser ran and that the
 * expected person, location, organization and date values were recorded.
 *
 * The expectations are expressed as JUnit assumptions, not hard assertions.
 */
@Test
public void testParse() throws Exception {
    // The test config is loaded from the resources directory.
    Tika tika = new Tika(new TikaConfig(getClass().getResourceAsStream(CONFIG_FILE)));

    String text = "I am student at University of Southern California (USC),"
            + " located in Los Angeles . USC's football team is called by name Trojans."
            + " Mr. John McKay was a head coach of the team from 1960 - 1975";
    byte[] content = text.getBytes(Charset.defaultCharset());

    Metadata md = new Metadata();
    tika.parse(new ByteArrayInputStream(content), md);

    // Each key is read only after the previous assumption has held.
    HashSet<String> parsedBy = new HashSet<String>(Arrays.asList(md.getValues("X-Parsed-By")));
    assumeTrue(parsedBy.contains(NamedEntityParser.class.getName()));

    HashSet<String> persons = new HashSet<String>(Arrays.asList(md.getValues("NER_PERSON")));
    assumeTrue(persons.contains("John McKay"));

    HashSet<String> locations = new HashSet<String>(Arrays.asList(md.getValues("NER_LOCATION")));
    assumeTrue(locations.contains("Los Angeles"));

    HashSet<String> organizations = new HashSet<String>(Arrays.asList(md.getValues("NER_ORGANIZATION")));
    assumeTrue(organizations.contains("University of Southern California"));

    HashSet<String> dates = new HashSet<String>(Arrays.asList(md.getValues("NER_DATE")));
    assumeTrue(dates.contains("1960 - 1975"));
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) Tika(org.apache.tika.Tika) HashSet(java.util.HashSet) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 43 with Tika

use of org.apache.tika.Tika in project tika by apache.

the class SentimentParserTest method endToEndTest.

/**
 * Feeds one clearly upbeat sentence through a Tika instance built from the
 * OpenNLP sentiment configuration and expects the "Sentiment" metadata entry
 * to be "positive". Does nothing when no Tika instance could be obtained.
 */
@Test
public void endToEndTest() throws Exception {
    Tika tika = getTika("tika-config-sentiment-opennlp.xml");
    if (tika != null) {
        String text = "What a wonderful thought it is that"
                + " some of the best days of our lives haven't happened yet.";
        byte[] content = text.getBytes(Charset.defaultCharset());

        Metadata md = new Metadata();
        tika.parse(new ByteArrayInputStream(content), md);

        String sentiment = md.get("Sentiment");
        assertNotNull(sentiment);
        assertEquals("positive", sentiment);
    }
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) Tika(org.apache.tika.Tika) Test(org.junit.Test)

Example 44 with Tika

use of org.apache.tika.Tika in project tika by apache.

the class ProbabilisticMimeDetectionTestWithTika method setUp.

/**
 * Prepares the fixtures used by the tests: the media type registry, a
 * probabilistic detection selector whose three priors are all 0.5, and a
 * {@link Tika} facade that uses a detector built on that selector.
 */
@Before
public void setUp() {
    MimeTypes types = MimeTypes.getDefaultMimeTypes();
    registry = types.getMediaTypeRegistry();

    // Example of instantiating the selector through its builder.
    Builder priors = new ProbabilisticMimeDetectionSelector.Builder();
    proSelector = new ProbabilisticMimeDetectionSelector(
            types,
            priors.priorMagicFileType(0.5f)
                    .priorExtensionFileType(0.5f)
                    .priorMetaFileType(0.5f));

    // A default Tika, except that detection goes through our own detector.
    ServiceLoader loader = new ServiceLoader();
    tika = new Tika(new DefaultProbDetector(proSelector, loader));
}
Also used : ServiceLoader(org.apache.tika.config.ServiceLoader) Builder(org.apache.tika.mime.ProbabilisticMimeDetectionSelector.Builder) DefaultProbDetector(org.apache.tika.detect.DefaultProbDetector) Tika(org.apache.tika.Tika) Before(org.junit.Before)

Example 45 with Tika

use of org.apache.tika.Tika in project tika by apache.

the class NamedEntityParser method initialize.

/**
 * Builds the recogniser chain and the secondary parser. The body runs at most
 * once: the {@code initialized} flag is set before any work is done, so a
 * failed attempt is not retried on later calls.
 *
 * Recogniser class names are read from the {@code SYS_PROP_NER_IMPL} system
 * property (falling back to {@code DEFAULT_NER_IMPL}) as a comma separated
 * list. Each class is instantiated through its no-argument constructor and
 * added to the chain only when it reports itself available. Failures to load
 * a single recogniser are logged and do not stop the remaining ones.
 *
 * @param context the parse context; not read by this method yet (see TODO)
 */
private synchronized void initialize(ParseContext context) {
    if (initialized) {
        return;
    }
    initialized = true;
    //TODO: read class name from context or config
    //There can be multiple classes in the form of comma separated class names;
    String classNamesString = System.getProperty(SYS_PROP_NER_IMPL, DEFAULT_NER_IMPL);
    String[] classNames = classNamesString.split(",");
    this.nerChain = new ArrayList<>(classNames.length);
    for (String className : classNames) {
        className = className.trim();
        if (className.isEmpty()) {
            // A stray or trailing comma yields a blank entry; there is nothing to load.
            continue;
        }
        LOG.info("going to load, instantiate and bind the instance of {}", className);
        try {
            // Class.newInstance() is deprecated; go through the no-arg constructor instead.
            NERecogniser recogniser =
                    (NERecogniser) Class.forName(className).getDeclaredConstructor().newInstance();
            // Query availability once so the logged value is the one acted upon.
            boolean recogniserAvailable = recogniser.isAvailable();
            LOG.info("{} is available ? {}", className, recogniserAvailable);
            if (recogniserAvailable) {
                nerChain.add(recogniser);
            }
        } catch (java.lang.reflect.InvocationTargetException e) {
            // The constructor itself failed: report the underlying failure rather
            // than the reflective wrapper, whose own message is usually empty.
            Throwable cause = e.getCause() != null ? e.getCause() : e;
            LOG.error(cause.getMessage(), cause);
        } catch (Exception e) {
            LOG.error(e.getMessage(), e);
        }
    }
    try {
        TikaConfig config = new TikaConfig();
        this.secondaryParser = new Tika(config);
        // The parser is usable only when at least one recogniser made it into the chain.
        this.available = !nerChain.isEmpty();
        LOG.info("Number of NERecognisers in chain {}", nerChain.size());
    } catch (Exception e) {
        LOG.error(e.getMessage(), e);
        this.available = false;
    }
}
Also used : OpenNLPNERecogniser(org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser) RegexNERecogniser(org.apache.tika.parser.ner.regex.RegexNERecogniser) TikaConfig(org.apache.tika.config.TikaConfig) Tika(org.apache.tika.Tika) TikaException(org.apache.tika.exception.TikaException) IOException(java.io.IOException) SAXException(org.xml.sax.SAXException)

Aggregations

Tika (org.apache.tika.Tika)54 Test (org.junit.Test)32 Metadata (org.apache.tika.metadata.Metadata)29 ByteArrayInputStream (java.io.ByteArrayInputStream)14 TikaTest (org.apache.tika.TikaTest)12 TikaConfig (org.apache.tika.config.TikaConfig)12 File (java.io.File)8 InputStream (java.io.InputStream)7 URL (java.net.URL)6 TikaInputStream (org.apache.tika.io.TikaInputStream)5 IOException (java.io.IOException)4 HashSet (java.util.HashSet)4 Ignore (org.junit.Ignore)4 FileInputStream (java.io.FileInputStream)3 ArrayList (java.util.ArrayList)3 HashMap (java.util.HashMap)3 Content (org.apache.nutch.protocol.Content)3 Before (org.junit.Before)3 FileOutputStream (java.io.FileOutputStream)2 UnsupportedEncodingException (java.io.UnsupportedEncodingException)2