use of org.apache.tika.config.TikaConfig in project tika by apache.
the class NamedEntityParserTest method testNerChain.
/**
 * Configures a chain of two recognisers (OpenNLP and regex) through the
 * {@code NamedEntityParser.SYS_PROP_NER_IMPL} system property, parses a short
 * sentence and checks that metadata keys contributed by both are present.
 *
 * <p>The checks are JUnit assumptions, so a missing key marks the test as
 * skipped rather than failed (presumably because the recognisers' models may
 * not be installed -- confirm with the module's build setup).
 *
 * @throws Exception if the configuration cannot be loaded or parsing fails
 */
@Test
public void testNerChain() throws Exception {
    String classNames = OpenNLPNERecogniser.class.getName() + "," + RegexNERecogniser.class.getName();
    // Remember the current value so the override below cannot leak into
    // other tests running in the same JVM.
    String previousImpl = System.getProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
    System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, classNames);
    // try-with-resources closes the config stream; a null resource is simply skipped.
    try (java.io.InputStream configStream = getClass().getResourceAsStream(CONFIG_FILE)) {
        TikaConfig config = new TikaConfig(configStream);
        Tika tika = new Tika(config);
        String text = "University of Southern California (USC), is located in Los Angeles ." + " Campus is busy from monday to saturday";
        Metadata md = new Metadata();
        tika.parse(new ByteArrayInputStream(text.getBytes(Charset.defaultCharset())), md);
        HashSet<String> keys = new HashSet<>(Arrays.asList(md.names()));
        assumeTrue(keys.contains("NER_WEEK_DAY"));
        assumeTrue(keys.contains("NER_LOCATION"));
    } finally {
        // Put the system property back exactly as it was found.
        if (previousImpl == null) {
            System.clearProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
        } else {
            System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, previousImpl);
        }
    }
}
use of org.apache.tika.config.TikaConfig in project tika by apache.
the class NamedEntityParserTest method testParse.
/**
 * Parses a short English passage with the test configuration and checks the
 * resulting metadata: that {@link NamedEntityParser} is listed under
 * {@code X-Parsed-By}, and that the expected person, location, organization
 * and date values were recorded.
 *
 * <p>Every check is a JUnit assumption, so an unmet expectation marks the
 * test as skipped rather than failed.
 *
 * @throws Exception if the configuration cannot be loaded or parsing fails
 */
@Test
public void testParse() throws Exception {
    // the test config lives in the resources directory
    Tika tika = new Tika(new TikaConfig(getClass().getResourceAsStream(CONFIG_FILE)));
    String text = "I am student at University of Southern California (USC),"
            + " located in Los Angeles . USC's football team is called by name Trojans."
            + " Mr. John McKay was a head coach of the team from 1960 - 1975";
    byte[] rawText = text.getBytes(Charset.defaultCharset());
    Metadata md = new Metadata();
    tika.parse(new ByteArrayInputStream(rawText), md);

    // the NER parser must have taken part in the parse
    assumeTrue(Arrays.asList(md.getValues("X-Parsed-By")).contains(NamedEntityParser.class.getName()));

    // one expected entity per entity type
    assumeTrue(Arrays.asList(md.getValues("NER_PERSON")).contains("John McKay"));
    assumeTrue(Arrays.asList(md.getValues("NER_LOCATION")).contains("Los Angeles"));
    assumeTrue(Arrays.asList(md.getValues("NER_ORGANIZATION")).contains("University of Southern California"));
    assumeTrue(Arrays.asList(md.getValues("NER_DATE")).contains("1960 - 1975"));
}
use of org.apache.tika.config.TikaConfig in project tika by apache.
the class WordParserTest method testMacros.
/**
 * Verifies macro extraction for {@code testWORD_macros.doc} in three steps:
 * <ol>
 *   <li>by default no {@code text/x-vbasic} entry is produced;</li>
 *   <li>with {@code OfficeParserConfig.setExtractMacros(true)} supplied via
 *       the {@link ParseContext}, the macro content is present;</li>
 *   <li>the same holds when the parser is configured through
 *       {@code tika-config-macros.xml}.</li>
 * </ol>
 *
 * @throws Exception if parsing or loading the configuration fails
 */
@Test
public void testMacros() throws Exception {
    //test default is "don't extract macros"
    for (Metadata metadata : getRecursiveMetadata("testWORD_macros.doc")) {
        // Constant-first comparison: an entry without a content type must not
        // blow up with a NullPointerException.
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }

    //now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    context.set(OfficeParserConfig.class, officeParserConfig);

    // Minimum metadata an extracted macro entry is expected to carry.
    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());

    List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.doc", context);
    assertContainsAtLeast(minExpected, metadataList);

    //test configuring via config file
    // try-with-resources so the config stream is closed once the test is done with it.
    try (java.io.InputStream configStream = this.getClass().getResourceAsStream("tika-config-macros.xml")) {
        TikaConfig tikaConfig = new TikaConfig(configStream);
        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
        metadataList = getRecursiveMetadata("testWORD_macros.doc", parser);
        assertContainsAtLeast(minExpected, metadataList);
    }
}
use of org.apache.tika.config.TikaConfig in project tika by apache.
the class CompositeParserTest method testMimeTypeAliases.
/**
 * Exercises {@link CompositeParser} dispatch for a canonical media type
 * ({@code image/bmp}) and the type the test treats as its alias
 * ({@code image/x-ms-bmp}).
 *
 * <p>Two dummy parsers are registered, one per type; each stamps marker
 * metadata ({@code BMP} plus either {@code Canonical} or {@code Alias}) so
 * the assertions can tell which parser handled the document.
 *
 * @throws Exception if any parse call fails
 */
@Test
public void testMimeTypeAliases() throws Exception {
    // Dummy parser registered under the canonical type
    MediaType canonicalType = MediaType.image("bmp");
    Map<String, String> canonicalMarkers = new HashMap<String, String>();
    canonicalMarkers.put("BMP", "True");
    canonicalMarkers.put("Canonical", "True");
    Parser canonicalParser = new DummyParser(
            new HashSet<MediaType>(Arrays.asList(canonicalType)), canonicalMarkers, null);

    // Dummy parser registered under the alias type
    MediaType aliasType = MediaType.image("x-ms-bmp");
    Map<String, String> aliasMarkers = new HashMap<String, String>();
    aliasMarkers.put("BMP", "True");
    aliasMarkers.put("Alias", "True");
    Parser aliasParser = new DummyParser(
            new HashSet<MediaType>(Arrays.asList(aliasType)), aliasMarkers, null);

    TikaConfig config = TikaConfig.getDefaultConfig();
    CompositeParser canonical = new CompositeParser(config.getMediaTypeRegistry(), canonicalParser);
    CompositeParser alias = new CompositeParser(config.getMediaTypeRegistry(), aliasParser);
    CompositeParser both = new CompositeParser(config.getMediaTypeRegistry(), canonicalParser, aliasParser);

    // A single handler instance is shared by every parse call below.
    ContentHandler handler = new BodyContentHandler();
    Metadata result;

    // Canonical and Canonical
    result = parseEmptyStreamAs(canonical, canonicalType, handler);
    assertEquals("True", result.get("BMP"));
    assertEquals("True", result.get("Canonical"));

    // Alias and Alias
    result = parseEmptyStreamAs(alias, aliasType, handler);
    assertEquals("True", result.get("BMP"));
    assertEquals("True", result.get("Alias"));

    // Alias type and Canonical parser
    result = parseEmptyStreamAs(canonical, aliasType, handler);
    assertEquals("True", result.get("BMP"));
    assertEquals("True", result.get("Canonical"));

    // Canonical type and Alias parser
    result = parseEmptyStreamAs(alias, canonicalType, handler);
    assertEquals("True", result.get("BMP"));
    assertEquals("True", result.get("Alias"));

    // And when both are there, will go for the last one
    // to be registered (which is the alias one)
    result = parseEmptyStreamAs(both, canonicalType, handler);
    assertEquals("True", result.get("BMP"));
    assertEquals("True", result.get("Alias"));
}

/**
 * Parses an empty stream with {@code parser}, declaring {@code declaredType}
 * as the content type in a fresh {@link Metadata}, and returns that metadata.
 */
private Metadata parseEmptyStreamAs(CompositeParser parser, MediaType declaredType, ContentHandler handler) throws Exception {
    Metadata metadata = new Metadata();
    metadata.add(Metadata.CONTENT_TYPE, declaredType.toString());
    parser.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
    return metadata;
}
use of org.apache.tika.config.TikaConfig in project tika by apache.
the class NamedEntityParser method initialize.
/**
 * Builds the recogniser chain and the secondary parser the first time it is
 * called; later calls return immediately.
 *
 * <p>The recogniser class names are read from the {@code SYS_PROP_NER_IMPL}
 * system property (falling back to {@code DEFAULT_NER_IMPL}) as a
 * comma-separated list. Each class is instantiated reflectively through its
 * no-argument constructor and added to the chain only if it reports itself
 * available. A class that cannot be loaded is logged and skipped. The parser
 * is marked available when the chain is non-empty and the secondary parser
 * was created.
 *
 * @param context the parse context; currently unused (see TODO below)
 */
private synchronized void initialize(ParseContext context) {
    if (initialized) {
        return;
    }
    // Set before doing the work so a failed attempt is not retried on every call.
    initialized = true;
    //TODO: read class name from context or config
    //There can be multiple classes in the form of comma separated class names;
    String classNamesString = System.getProperty(SYS_PROP_NER_IMPL, DEFAULT_NER_IMPL);
    String[] classNames = classNamesString.split(",");
    this.nerChain = new ArrayList<>(classNames.length);
    for (String rawName : classNames) {
        String className = rawName.trim();
        if (className.isEmpty()) {
            // A stray comma or blank value is not a class name; nothing to load.
            continue;
        }
        LOG.info("going to load, instantiate and bind the instance of {}", className);
        try {
            // Class.newInstance() is deprecated; go through the no-arg constructor instead.
            NERecogniser recogniser = (NERecogniser) Class.forName(className)
                    .getDeclaredConstructor().newInstance();
            // Ask once: the availability check may be expensive.
            boolean usable = recogniser.isAvailable();
            LOG.info("{} is available ? {}", className, usable);
            if (usable) {
                nerChain.add(recogniser);
            }
        } catch (Exception e) {
            // Best effort: one broken recogniser must not prevent the others from loading.
            LOG.error("Failed to load NERecogniser {}", className, e);
        }
    }
    try {
        TikaConfig config = new TikaConfig();
        this.secondaryParser = new Tika(config);
        this.available = !nerChain.isEmpty();
        LOG.info("Number of NERecognisers in chain {}", nerChain.size());
    } catch (Exception e) {
        LOG.error("Failed to create the secondary parser", e);
        this.available = false;
    }
}
Aggregations