use of org.apache.tika.parser.ParseContext in project tika by apache.
the class EnviHeaderParserTest method testParseGlobalMetadata.
/**
 * Parses the ENVI header fixture {@code /test-documents/envi_test_header.hdr}
 * with {@link EnviHeaderParser} and checks that the serialized handler output
 * contains the expected header fields and the ENVI content type.
 *
 * @throws Exception if the fixture cannot be read or parsing fails
 */
@Test
public void testParseGlobalMetadata() throws Exception {
    // No runtime-version guard here: try-with-resources already requires
    // Java 7 or later, so a check for a 1.5 VM could never be true.
    Parser parser = new EnviHeaderParser();
    ToXMLContentHandler handler = new ToXMLContentHandler();
    Metadata metadata = new Metadata();

    try (InputStream stream = EnviHeaderParser.class.getResourceAsStream("/test-documents/envi_test_header.hdr")) {
        // Fail with a clear message instead of handing a null stream to the parser.
        assertNotNull("Test ENVI file not found", stream);
        parser.parse(stream, handler, metadata, new ParseContext());
    }

    // Check content of test file
    String content = handler.toString();
    assertContains("<body><p>ENVI</p>", content);
    assertContains("<p>samples = 2400</p>", content);
    assertContains("<p>lines = 2400</p>", content);
    assertContains("<p>map info = {Sinusoidal, 1.5000, 1.5000, -10007091.3643, 5559289.2856, 4.6331271653e+02, 4.6331271653e+02, , units=Meters}</p>", content);
    assertContains("content=\"application/envi.hdr\"", content);
    assertContains("projection info = {16, 6371007.2, 0.000000, 0.0, 0.0, Sinusoidal, units=Meters}", content);
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class SourceCodeParserTest method testSupportTypes.
/**
 * Checks the media types reported by {@code sourceCodeParser}: Java, Groovy
 * and C++ source types must be present, and {@code text/html} must be absent.
 */
@Test
public void testSupportTypes() throws Exception {
    Set<MediaType> reported = sourceCodeParser.getSupportedTypes(new ParseContext());

    MediaType javaSource = new MediaType("text", "x-java-source");
    assertTrue(reported.contains(javaSource));

    MediaType groovySource = new MediaType("text", "x-groovy");
    assertTrue(reported.contains(groovySource));

    MediaType cppSource = new MediaType("text", "x-c++src");
    assertTrue(reported.contains(cppSource));

    // The negative check queries the parser again with a fresh context.
    Set<MediaType> reportedAgain = sourceCodeParser.getSupportedTypes(new ParseContext());
    MediaType html = new MediaType("text", "html");
    assertFalse(reportedAgain.contains(html));
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class HtmlParserTest method testDetectOfCharset.
/**
 * Test case for TIKA-334.
 * <p>
 * Feeds the parser a UTF-8 encoded HTML document that carries no charset
 * declaration and whose title is a single non-ASCII character, then expects
 * that same character back as the title metadata.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-334">TIKA-334</a>
 */
@Test
public void testDetectOfCharset() throws Exception {
    String html = "<html><head><title>Ž</title></head><body></body></html>";
    byte[] encoded = html.getBytes(UTF_8);

    Metadata parsedMetadata = new Metadata();
    HtmlParser htmlParser = new HtmlParser();
    htmlParser.parse(new ByteArrayInputStream(encoded), new BodyContentHandler(), parsedMetadata, new ParseContext());

    String title = parsedMetadata.get(TikaCoreProperties.TITLE);
    assertEquals("Ž", title);
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class HtmlParserTest method testCustomHtmlSchema.
// TIKA-1193
/**
 * Parses the same anchor-wrapped table twice: once with a default
 * {@link ParseContext}, expecting empty anchor text, and once with a
 * {@link Schema} registered in the context that permits any content inside
 * {@code <a>}, expecting the table cell text to appear as anchor text.
 */
@Test
public void testCustomHtmlSchema() throws Exception {
    // Default schema does not allow tables inside anchors
    String html = "<html><body><a><table><tr><td>text</tr></tr></table></a></body></html>";
    byte[] encoded = html.getBytes(ISO_8859_1);
    Metadata metadata = new Metadata();

    LinkContentHandler defaultSchemaLinks = new LinkContentHandler();
    HtmlParser defaultParser = new HtmlParser();
    defaultParser.parse(new ByteArrayInputStream(encoded), defaultSchemaLinks, metadata, new ParseContext());
    // Expect no anchor text
    String defaultAnchorText = defaultSchemaLinks.getLinks().get(0).getText();
    assertEquals("", defaultAnchorText);

    // We'll change the schema to allow tables inside anchors!
    Schema permissiveSchema = new HTMLSchema();
    permissiveSchema.elementType("a", HTMLSchema.M_ANY, 65535, 0);
    ParseContext customContext = new ParseContext();
    customContext.set(Schema.class, permissiveSchema);

    LinkContentHandler customSchemaLinks = new LinkContentHandler();
    HtmlParser customParser = new HtmlParser();
    // The same Metadata instance is passed to both parses.
    customParser.parse(new ByteArrayInputStream(encoded), customSchemaLinks, metadata, customContext);
    // Expect anchor text
    String customAnchorText = customSchemaLinks.getLinks().get(0).getText();
    assertEquals("\ttext\n\n", customAnchorText);
}
use of org.apache.tika.parser.ParseContext in project tika by apache.
the class HtmlParserTest method testParseEmpty.
/**
 * Parses a zero-length input with {@link HtmlParser} and expects the body
 * handler to hold an empty string afterwards.
 */
@Test
public void testParseEmpty() throws Exception {
    ContentHandler handler = new BodyContentHandler();
    ByteArrayInputStream emptyInput = new ByteArrayInputStream(new byte[0]);

    HtmlParser htmlParser = new HtmlParser();
    htmlParser.parse(emptyInput, handler, new Metadata(), new ParseContext());

    String extracted = handler.toString();
    assertEquals("", extracted);
}
Aggregations