use of org.apache.tika.config.TikaConfig in project tika by apache.
the class PDFParserTest method testConfiguringMoreParams.
/**
 * Builds an {@link AutoDetectParser} from {@code tika-inline-config.xml},
 * parses {@code testOCR.pdf} with it, and then inspects the
 * {@link PDFParserConfig} of the PDF parser registered for
 * {@code application/pdf} to confirm the expected parameter values.
 */
@Test
public void testConfiguringMoreParams() throws Exception {
    try (InputStream configStream = getClass()
            .getResourceAsStream("/org/apache/tika/parser/pdf/tika-inline-config.xml")) {
        assertNotNull(configStream);
        AutoDetectParser autoDetect = new AutoDetectParser(new TikaConfig(configStream));

        // Run a real parse first so we know the configured parser actually works.
        List<Metadata> parsed = getRecursiveMetadata("testOCR.pdf", autoDetect);
        assertEquals(2, parsed.size());

        // Drill down: top-level parser map -> composite for PDF -> the PDF parser itself.
        MediaType pdfType = MediaType.application("pdf");
        Map<MediaType, Parser> topLevel = autoDetect.getParsers();
        CompositeParser pdfComposite = (CompositeParser) topLevel.get(pdfType);
        Parser registered = pdfComposite.getParsers().get(pdfType);
        assertTrue(registered instanceof PDFParser);

        PDFParserConfig cfg = ((PDFParser) registered).getPDFParserConfig();
        assertEquals(new AccessChecker(true), cfg.getAccessChecker());
        assertEquals(true, cfg.getExtractInlineImages());
        assertEquals(false, cfg.getExtractUniqueInlineImagesOnly());
        assertEquals(314, cfg.getOcrDPI());
        assertEquals(2.1f, cfg.getOcrImageQuality(), .01f);
        assertEquals("jpeg", cfg.getOcrImageFormatName());
        assertEquals(false, cfg.getCatchIntermediateIOExceptions());
    }
}
use of org.apache.tika.config.TikaConfig in project tika by apache.
the class RegexNERecogniserTest method testGetEntityTypes.
/**
 * Parses a short sentence through a {@link Tika} facade configured from
 * {@code tika-config.xml} with {@link RegexNERecogniser} selected as the NER
 * implementation, and checks that exactly the three week-day names in the
 * text are reported under the {@code NER_WEEK_DAY} metadata key.
 */
@Test
public void testGetEntityTypes() throws Exception {
    String text = "Hey, Lets meet on this Sunday or MONDAY because i am busy on Saturday";

    // The NER implementation is selected through a JVM-wide system property;
    // remember the old value so this test does not leak state into others.
    String previousImpl = System.getProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
    System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, RegexNERecogniser.class.getName());

    // Close the config stream when done instead of leaving it open.
    try (java.io.InputStream configStream =
            NamedEntityParser.class.getResourceAsStream("tika-config.xml")) {
        Tika tika = new Tika(new TikaConfig(configStream));
        Metadata md = new Metadata();
        tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
        Set<String> days = new HashSet<>(Arrays.asList(md.getValues("NER_WEEK_DAY")));
        assertTrue(days.contains("Sunday"));
        assertTrue(days.contains("MONDAY"));
        assertTrue(days.contains("Saturday"));
        //and nothing else
        assertTrue(days.size() == 3);
    } finally {
        if (previousImpl == null) {
            System.clearProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
        } else {
            System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, previousImpl);
        }
    }
}
use of org.apache.tika.config.TikaConfig in project tika by apache.
the class PDFParserTest method testInitializationOfNonPrimitivesViaConfig.
/**
 * Loads {@code tika-config-non-primitives.xml} and checks that the PDF parser
 * registered for {@code application/pdf} ends up with the expected enum-valued
 * settings: OCR strategy {@code OCR_ONLY} and OCR image type {@code RGB}.
 */
@Test
public void testInitializationOfNonPrimitivesViaConfig() throws Exception {
    // try-with-resources so the config stream is closed even if an assertion fails.
    try (InputStream is = getClass()
            .getResourceAsStream("/org/apache/tika/parser/pdf/tika-config-non-primitives.xml")) {
        assertNotNull(is);
        TikaConfig tikaConfig = new TikaConfig(is);
        AutoDetectParser p = new AutoDetectParser(tikaConfig);
        Map<MediaType, Parser> parsers = p.getParsers();
        Parser composite = parsers.get(MediaType.application("pdf"));
        Parser pdfParser = ((CompositeParser) composite).getParsers().get(MediaType.application("pdf"));
        assertEquals("org.apache.tika.parser.pdf.PDFParser", pdfParser.getClass().getName());
        PDFParserConfig pdfParserConfig = ((PDFParser) pdfParser).getPDFParserConfig();
        assertEquals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY, pdfParserConfig.getOcrStrategy());
        assertEquals(ImageType.RGB, pdfParserConfig.getOcrImageType());
    }
}
use of org.apache.tika.config.TikaConfig in project tika by apache.
the class PDFParserTest method testInitializationViaConfig.
/**
 * Builds an {@link AutoDetectParser} from {@code tika-config.xml}, extracts the
 * text of {@code testPDFTwoTextBoxes.pdf}, collapses whitespace, and checks
 * that the left and right column lines appear interleaved in the output.
 */
@Test
public void testInitializationViaConfig() throws Exception {
    // try-with-resources so the config stream is closed even if an assertion fails.
    try (InputStream is = getClass()
            .getResourceAsStream("/org/apache/tika/parser/pdf/tika-config.xml")) {
        assertNotNull(is);
        TikaConfig tikaConfig = new TikaConfig(is);
        Parser p = new AutoDetectParser(tikaConfig);
        String text = getText(getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"), p);
        // Normalise all whitespace runs to single spaces before comparing.
        text = text.replaceAll("\\s+", " ");
        // Column text is now interleaved:
        assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", text);
    }
}
use of org.apache.tika.config.TikaConfig in project nutch by apache.
the class TikaParser method setConf.
/**
 * Stores the configuration and (re)initialises this parser from it.
 *
 * <p>The Tika configuration is loaded from the resource named by
 * {@code tika.config.file} when that property is set, otherwise from the
 * default configuration for this class's class loader. Failures while
 * loading are logged and leave {@code tikaConfig} as {@code null}.
 *
 * <p>If {@code tika.htmlmapper.classname} is non-blank, that class is
 * instantiated and used as the {@link HtmlMapper}.
 *
 * @param conf the configuration to read settings from
 * @throws RuntimeException if the configured HTML mapper class cannot be
 *         loaded, does not implement {@link HtmlMapper}, or cannot be
 *         instantiated; the underlying failure is attached as the cause
 */
public void setConf(Configuration conf) {
    this.conf = conf;
    this.tikaConfig = null;

    // do we want a custom Tika configuration file
    // deprecated since Tika 0.7 which is based on
    // a service provider based configuration
    String customConfFile = conf.get("tika.config.file");
    if (customConfFile != null) {
        try {
            // see if a Tika config file can be found in the job file
            URL customTikaConfig = conf.getResource(customConfFile);
            if (customTikaConfig != null) {
                tikaConfig = new TikaConfig(customTikaConfig, this.getClass().getClassLoader());
            } else {
                // Previously this case was silent; tikaConfig stays null as before.
                LOG.warn("Custom Tika configuration not found: " + customConfFile);
            }
        } catch (Exception e1) {
            String message = "Problem loading custom Tika configuration from " + customConfFile;
            LOG.error(message, e1);
        }
    } else {
        try {
            tikaConfig = new TikaConfig(this.getClass().getClassLoader());
        } catch (Exception e2) {
            String message = "Problem loading default Tika configuration";
            LOG.error(message, e2);
        }
    }

    // use a custom htmlmapper
    String htmlmapperClassName = conf.get("tika.htmlmapper.classname");
    if (StringUtils.isNotBlank(htmlmapperClassName)) {
        try {
            Class<?> htmlMapperClass = Class.forName(htmlmapperClassName);
            if (!HtmlMapper.class.isAssignableFrom(htmlMapperClass)) {
                throw new RuntimeException("Class " + htmlmapperClassName + " does not implement HtmlMapper");
            }
            // Class.newInstance() is deprecated; go through the no-arg constructor instead.
            HTMLMapper = (HtmlMapper) htmlMapperClass.getDeclaredConstructor().newInstance();
        } catch (Exception e) {
            // Keep the original message but attach the cause so the real
            // reason (class not found, wrong type, ctor failure) is not lost.
            String message = "Can't generate instance for class " + htmlmapperClassName;
            LOG.error(message, e);
            throw new RuntimeException(message, e);
        }
    }

    this.htmlParseFilters = new HtmlParseFilters(getConf());
    this.utils = new DOMContentUtils(conf);
    this.cachingPolicy = getConf().get("parser.caching.forbidden.policy", Nutch.CACHING_FORBIDDEN_CONTENT);
    this.upperCaseElementNames = getConf().getBoolean("tika.uppercase.element.names", true);
}
Aggregations