use of org.apache.tika.Tika in project tika by apache.
the class HtmlParserTest method testLineBreak.
/**
 * Checks that element boundaries in HTML content such as
 * {@code <div>foo<br>bar</div>baz} show up as whitespace in the extracted
 * text, so that the output holds the three separate tokens "foo", "bar"
 * and "baz" instead of the single run "foobarbaz".
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-343">TIKA-343</a>
 */
@Test
public void testLineBreak() throws Exception {
    String[] expected = {"foo", "bar", "baz"};
    byte[] html = "<html><body><div>foo<br>bar</div>baz</body></html>".getBytes(US_ASCII);

    String extracted = new Tika().parseToString(new ByteArrayInputStream(html));

    // Tokenise on any run of whitespace; the kind of separator emitted is not under test.
    String[] tokens = extracted.trim().split("\\s+");
    assertEquals(expected.length, tokens.length);
    for (int i = 0; i < expected.length; i++) {
        assertEquals(expected[i], tokens[i]);
    }
}
use of org.apache.tika.Tika in project tika by apache.
the class HtmlParserTest method testCharactersDirectlyUnderBodyElement.
/**
 * Parses a document whose only text sits directly inside the
 * {@code <body>} element (no wrapping block element) and expects exactly
 * that text to be returned.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
 */
@Test
public void testCharactersDirectlyUnderBodyElement() throws Exception {
    byte[] html = "<html><body>test</body></html>".getBytes(UTF_8);
    ByteArrayInputStream stream = new ByteArrayInputStream(html);

    String extracted = new Tika().parseToString(stream);

    // Compared without trimming: the extracted text must match exactly.
    assertEquals("test", extracted);
}
use of org.apache.tika.Tika in project tika by apache.
the class RegexNERecogniserTest method testGetEntityTypes.
/**
 * Parses a sample sentence with {@code RegexNERecogniser} selected as the
 * named-entity recogniser and checks that exactly the three week-day
 * mentions in the text are reported under the {@code NER_WEEK_DAY}
 * metadata key.
 *
 * <p>The recogniser implementation is chosen through a JVM-wide system
 * property, so the previous value is captured up front and restored
 * afterwards to keep the setting from leaking into other tests that run
 * in the same JVM.
 */
@Test
public void testGetEntityTypes() throws Exception {
    String text = "Hey, Lets meet on this Sunday or MONDAY because i am busy on Saturday";
    String previousImpl = System.getProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
    System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, RegexNERecogniser.class.getName());
    try {
        Tika tika = new Tika(new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml")));
        Metadata md = new Metadata();
        tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);

        // Collapse into a set so the size check below counts distinct values only.
        Set<String> days = new HashSet<>(Arrays.asList(md.getValues("NER_WEEK_DAY")));
        assertTrue(days.contains("Sunday"));
        assertTrue(days.contains("MONDAY"));
        assertTrue(days.contains("Saturday"));
        //and nothing else
        assertTrue(days.size() == 3);
    } finally {
        // Put the global property back exactly as it was found.
        if (previousImpl == null) {
            System.clearProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
        } else {
            System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, previousImpl);
        }
    }
}
use of org.apache.tika.Tika in project Lucee by lucee.
the class IOUtil method getMimeType.
/**
 * Determines the MIME type of a resource.
 *
 * <p>Detection is delegated to Tika, which is given the resource's name,
 * its length and its content stream. When the detected type contains the
 * text "tika", the type registered for the resource's file extension in
 * {@code ResourceUtil.EXT_MT} is preferred if there is one; otherwise the
 * detected type is returned as is. When detection fails with any
 * exception, the extension-based type is used, falling back to
 * {@code defaultValue} when the extension is unknown.
 *
 * @param res the resource to inspect
 * @param defaultValue value returned when detection fails and no type is
 *        registered for the resource's extension
 * @return the detected MIME type, the extension-based type, or
 *         {@code defaultValue}
 */
public static String getMimeType(Resource res, String defaultValue) {
    Metadata md = new Metadata();
    md.set(Metadata.RESOURCE_NAME_KEY, res.getName());
    md.set(Metadata.CONTENT_LENGTH, Long.toString(res.length()));
    InputStream is = null;
    try {
        Tika tika = new Tika();
        String result = tika.detect(is = res.getInputStream(), md);
        if (result.contains("tika")) {
            String byExtension = mimeTypeByExtension(res);
            if (!StringUtil.isEmpty(byExtension)) {
                return byExtension;
            }
        }
        return result;
    } catch (Exception e) {
        // Detection is best-effort: any failure falls back to the extension table.
        String byExtension = mimeTypeByExtension(res);
        if (!StringUtil.isEmpty(byExtension)) {
            return byExtension;
        }
        return defaultValue;
    } finally {
        IOUtil.closeEL(is);
    }
}

/**
 * Looks up the MIME type registered in {@code ResourceUtil.EXT_MT} for the
 * resource's file extension.
 *
 * <p>The extension is lower-cased with {@code Locale.ROOT} so the lookup
 * does not depend on the JVM's default locale (for example, the Turkish
 * locale maps "I" to a dotless "ı").
 *
 * @param res the resource whose extension is looked up
 * @return whatever the table holds for the extension; callers treat an
 *         empty result as "no mapping"
 */
private static String mimeTypeByExtension(Resource res) {
    String extension = ResourceUtil.getExtension(res, "");
    return ResourceUtil.EXT_MT.get(extension.toLowerCase(java.util.Locale.ROOT));
}
Aggregations