Use of org.apache.tika.Tika in the Apache Tika project.
From the class NamedEntityParserTest, method testNerChain.
/**
 * Configures a chain of two recognisers (OpenNLP, then regex) through the
 * {@code NamedEntityParser.SYS_PROP_NER_IMPL} system property, parses a short
 * sentence and checks that both the week-day and the location keys show up in
 * the metadata.
 *
 * <p>The checks use {@code assumeTrue}, not an assertion; presumably a missing
 * key is meant to skip the test (e.g. when recogniser models are unavailable)
 * rather than fail it.
 *
 * @throws Exception if the configuration cannot be loaded or parsing fails
 */
@Test
public void testNerChain() throws Exception {
    String classNames = OpenNLPNERecogniser.class.getName() + "," + RegexNERecogniser.class.getName();
    // System properties are JVM-wide: remember the old value so this test does
    // not change which recognisers other tests in the same JVM end up using.
    String previousImpl = System.getProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
    System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, classNames);
    try {
        TikaConfig config = new TikaConfig(getClass().getResourceAsStream(CONFIG_FILE));
        Tika tika = new Tika(config);
        String text = "University of Southern California (USC), is located in Los Angeles ." + " Campus is busy from monday to saturday";
        Metadata md = new Metadata();
        tika.parse(new ByteArrayInputStream(text.getBytes(Charset.defaultCharset())), md);
        HashSet<String> keys = new HashSet<>(Arrays.asList(md.names()));
        assumeTrue(keys.contains("NER_WEEK_DAY"));
        assumeTrue(keys.contains("NER_LOCATION"));
    } finally {
        // Restore the property exactly as it was, including "not set at all".
        if (previousImpl == null) {
            System.clearProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
        } else {
            System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, previousImpl);
        }
    }
}
Use of org.apache.tika.Tika in the Apache Tika project.
From the class NamedEntityParserTest, method testParse.
/**
 * Parses a short English paragraph with the test configuration and checks the
 * resulting metadata: that the NamedEntityParser is listed under
 * "X-Parsed-By", and that the expected person, location, organization and date
 * values were recorded.
 *
 * <p>Every check goes through {@code assumeTrue}, in the order listed above.
 *
 * @throws Exception if the configuration cannot be loaded or parsing fails
 */
@Test
public void testParse() throws Exception {
    // The test configuration lives in the resources directory.
    Tika tika = new Tika(new TikaConfig(getClass().getResourceAsStream(CONFIG_FILE)));
    String text = "I am student at University of Southern California (USC),"
            + " located in Los Angeles . USC's football team is called by name Trojans."
            + " Mr. John McKay was a head coach of the team from 1960 - 1975";
    byte[] textBytes = text.getBytes(Charset.defaultCharset());
    Metadata md = new Metadata();
    tika.parse(new ByteArrayInputStream(textBytes), md);

    // Each metadata field gets its own set instead of recycling a single one.
    HashSet<String> parsedBy = new HashSet<String>(Arrays.asList(md.getValues("X-Parsed-By")));
    assumeTrue(parsedBy.contains(NamedEntityParser.class.getName()));

    HashSet<String> persons = new HashSet<String>(Arrays.asList(md.getValues("NER_PERSON")));
    assumeTrue(persons.contains("John McKay"));

    HashSet<String> locations = new HashSet<String>(Arrays.asList(md.getValues("NER_LOCATION")));
    assumeTrue(locations.contains("Los Angeles"));

    HashSet<String> organizations = new HashSet<String>(Arrays.asList(md.getValues("NER_ORGANIZATION")));
    assumeTrue(organizations.contains("University of Southern California"));

    HashSet<String> dates = new HashSet<String>(Arrays.asList(md.getValues("NER_DATE")));
    assumeTrue(dates.contains("1960 - 1975"));
}
Use of org.apache.tika.Tika in the Apache Tika project.
From the class SentimentParserTest, method endToEndTest.
/**
 * Runs a sentence through a Tika instance built from
 * "tika-config-sentiment-opennlp.xml" and expects the "Sentiment" metadata
 * value to be "positive".
 *
 * <p>When {@code getTika} returns {@code null} the method returns without
 * asserting anything.
 *
 * @throws Exception if parsing fails
 */
@Test
public void endToEndTest() throws Exception {
    Tika sentimentTika = getTika("tika-config-sentiment-opennlp.xml");
    if (sentimentTika != null) {
        String text = "What a wonderful thought it is that"
                + " some of the best days of our lives haven't happened yet.";
        byte[] textBytes = text.getBytes(Charset.defaultCharset());
        Metadata md = new Metadata();
        sentimentTika.parse(new ByteArrayInputStream(textBytes), md);

        String sentiment = md.get("Sentiment");
        assertNotNull(sentiment);
        assertEquals("positive", sentiment);
    }
}
Use of org.apache.tika.Tika in the Apache Tika project.
From the class ProbabilisticMimeDetectionTestWithTika, method setUp.
/**
 * Prepares the fixture before each test: stores the media type registry of the
 * default MIME types, builds a probabilistic selector with its magic,
 * extension and metadata priors all set to 0.5, and creates a Tika instance
 * that uses a DefaultProbDetector around that selector.
 */
@Before
public void setUp() {
    MimeTypes defaultTypes = MimeTypes.getDefaultMimeTypes();
    ServiceLoader serviceLoader = new ServiceLoader();
    registry = defaultTypes.getMediaTypeRegistry();

    // Example of configuring the selector through its builder.
    final float prior = 0.5f;
    Builder selectorBuilder = new ProbabilisticMimeDetectionSelector.Builder()
            .priorMagicFileType(prior)
            .priorExtensionFileType(prior)
            .priorMetaFileType(prior);
    proSelector = new ProbabilisticMimeDetectionSelector(defaultTypes, selectorBuilder);

    // A default Tika in every respect except for the detector.
    tika = new Tika(new DefaultProbDetector(proSelector, serviceLoader));
}
Use of org.apache.tika.Tika in the Apache Tika project.
From the class NamedEntityParser, method initialize.
/**
 * Builds the recogniser chain and the secondary parser the first time it is
 * called; later calls return immediately.
 *
 * <p>The recogniser class names are read from the {@code SYS_PROP_NER_IMPL}
 * system property (falling back to {@code DEFAULT_NER_IMPL}) as a
 * comma-separated list. Each class is instantiated through its no-argument
 * constructor and added to the chain only if it reports itself available.
 * A class that cannot be loaded or instantiated is logged and skipped, so one
 * bad entry does not prevent the remaining recognisers from being used.
 *
 * <p>The parser is marked available when the chain is non-empty and the
 * secondary Tika instance could be created.
 *
 * @param context the parse context; not consulted yet (see the TODO below)
 */
private synchronized void initialize(ParseContext context) {
    if (initialized) {
        return;
    }
    // Set before doing the work, so a failed attempt is not repeated on every call.
    initialized = true;
    //TODO: read class name from context or config
    //There can be multiple classes in the form of comma separated class names;
    String classNamesString = System.getProperty(SYS_PROP_NER_IMPL, DEFAULT_NER_IMPL);
    String[] classNames = classNamesString.split(",");
    this.nerChain = new ArrayList<>(classNames.length);
    for (String rawName : classNames) {
        String className = rawName.trim();
        if (className.isEmpty()) {
            // Stray separators ("a,,b" or "a, ") produce blank entries; there is nothing to load.
            continue;
        }
        LOG.info("going to load, instantiate and bind the instance of {}", className);
        try {
            // Class.newInstance() is deprecated; going through the constructor also wraps
            // anything the constructor throws, so it is handled by the catch below.
            NERecogniser recogniser = (NERecogniser) Class.forName(className)
                    .getDeclaredConstructor().newInstance();
            boolean recogniserAvailable = recogniser.isAvailable();
            LOG.info("{} is available ? {}", className, recogniserAvailable);
            if (recogniserAvailable) {
                nerChain.add(recogniser);
            }
        } catch (Exception e) {
            LOG.error("Failed to load NERecogniser {}", className, e);
        }
    }
    try {
        TikaConfig config = new TikaConfig();
        this.secondaryParser = new Tika(config);
        this.available = !nerChain.isEmpty();
        LOG.info("Number of NERecognisers in chain {}", nerChain.size());
    } catch (Exception e) {
        LOG.error("Failed to set up the secondary Tika parser", e);
        this.available = false;
    }
}
Aggregations