Use of org.apache.tika.sax.BodyContentHandler in the Apache Tika project.
Class TXTParserTest, method testUseIncomingCharsetAsHint.
/**
 * Test case for TIKA-335: using the incoming charset as a hint.
 * <p>
 * The same ISO-8859-1 encoded bytes are parsed twice with one shared
 * {@link Metadata} instance. The first pass supplies no charset and expects
 * ISO-8859-1 to be reported. The second pass presets the content type to
 * ISO-8859-15 and expects that charset to be reported instead.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
 */
@Test
public void testUseIncomingCharsetAsHint() throws Exception {
    // Could be ISO 8859-1 or ISO 8859-15 or ...
    // U+00E1 is LATIN SMALL LETTER A WITH ACUTE. It is written as a Unicode
    // escape so the fixture does not depend on the source file encoding
    // used at compile time.
    final String text = "the name is \u00e1ndre";
    final byte[] latin1Bytes = text.getBytes(ISO_8859_1);
    Metadata metadata = new Metadata();

    // First pass: no charset is supplied in the metadata.
    parser.parse(new ByteArrayInputStream(latin1Bytes), new BodyContentHandler(), metadata, new ParseContext());
    assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
    // deprecated
    assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));

    // Second pass: the same bytes, but the metadata now declares ISO-8859-15.
    metadata.set(Metadata.CONTENT_TYPE, "text/plain; charset=ISO-8859-15");
    parser.parse(new ByteArrayInputStream(latin1Bytes), new BodyContentHandler(), metadata, new ParseContext());
    assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
    // deprecated
    assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
}
Use of org.apache.tika.sax.BodyContentHandler in the Apache Tika project.
Class TXTParserTest, method assertExtractText.
/**
 * Parses {@code input} with the test's {@code parser} and asserts that the
 * text collected by the content handler equals {@code expected}.
 * <p>
 * Ignorable whitespace events are dropped by the handler, so they do not
 * appear in the text that is compared.
 *
 * @param msg      failure message handed to the assertion
 * @param expected text the handler should contain after parsing
 * @param input    raw bytes of the document to parse
 * @throws Exception if parsing fails
 */
private void assertExtractText(String msg, String expected, byte[] input) throws Exception {
    ContentHandler handler = new BodyContentHandler() {
        // @Override lets the compiler verify that this really replaces the
        // inherited callback; a signature mismatch would otherwise silently
        // leave the whitespace in the extracted text.
        @Override
        public void ignorableWhitespace(char[] ch, int off, int len) {
            // Ignore the whitespace added by XHTMLContentHandler
        }
    };
    Metadata metadata = new Metadata();
    parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext());
    assertEquals(msg, expected, handler.toString());
}
Use of org.apache.tika.sax.BodyContentHandler in the Apache Tika project.
Class SolidworksParserTest, method testPart2014SP0Parser.
/**
 * Parses a SolidWorks part file saved by version 2014 SP0 and checks the
 * reported content type together with the extracted metadata properties.
 */
@Test
public void testPart2014SP0Parser() throws Exception {
    final String resource = "/test-documents/testsolidworksPart2014SP0.SLDPRT";
    try (InputStream stream = SolidworksParserTest.class.getResourceAsStream(resource)) {
        ContentHandler body = new BodyContentHandler();
        Metadata meta = new Metadata();
        OfficeParser officeParser = new OfficeParser();
        officeParser.parse(stream, body, meta, new ParseContext());

        // Content type
        assertEquals("application/sldworks", meta.get(Metadata.CONTENT_TYPE));

        // Properties
        assertEquals("2012-04-18T10:27:29Z", meta.get(TikaCoreProperties.CREATED));
        assertEquals(null, meta.get(TikaCoreProperties.CONTRIBUTOR));
        assertEquals("2013-11-28T12:38:28Z", meta.get(Metadata.MODIFIED));
        assertEquals("solidworks-dcom_dev", meta.get(TikaCoreProperties.MODIFIER));
        assertEquals(null, meta.get(TikaCoreProperties.RELATION));
        assertEquals(null, meta.get(TikaCoreProperties.RIGHTS));
        assertEquals(null, meta.get(TikaCoreProperties.SOURCE));
        assertEquals("", meta.get(TikaCoreProperties.TITLE));
        assertEquals("", meta.get(TikaCoreProperties.KEYWORDS));
    }
}
Use of org.apache.tika.sax.BodyContentHandler in the Apache Tika project.
Class SolidworksParserTest, method testPart2013SP2Parser.
/**
 * Parses a SolidWorks part file saved by version 2013 SP2 and checks the
 * reported content type together with the extracted metadata properties.
 */
@Test
public void testPart2013SP2Parser() throws Exception {
    final String resource = "/test-documents/testsolidworksPart2013SP2.SLDPRT";
    try (InputStream stream = SolidworksParserTest.class.getResourceAsStream(resource)) {
        ContentHandler body = new BodyContentHandler();
        Metadata meta = new Metadata();
        OfficeParser officeParser = new OfficeParser();
        officeParser.parse(stream, body, meta, new ParseContext());

        // Content type
        assertEquals("application/sldworks", meta.get(Metadata.CONTENT_TYPE));

        // Properties
        assertEquals("2012-04-18T10:27:29Z", meta.get(TikaCoreProperties.CREATED));
        assertEquals(null, meta.get(TikaCoreProperties.CONTRIBUTOR));
        assertEquals("2013-09-06T08:12:12Z", meta.get(Metadata.MODIFIED));
        assertEquals("solidworks-dcom_dev", meta.get(TikaCoreProperties.MODIFIER));
        assertEquals(null, meta.get(TikaCoreProperties.RELATION));
        assertEquals(null, meta.get(TikaCoreProperties.RIGHTS));
        assertEquals(null, meta.get(TikaCoreProperties.SOURCE));
        assertEquals("", meta.get(TikaCoreProperties.TITLE));
        assertEquals("", meta.get(TikaCoreProperties.KEYWORDS));
    }
}
Use of org.apache.tika.sax.BodyContentHandler in the Apache Tika project.
Class SolidworksParserTest, method testDrawing2013SP2Parser.
/**
 * Parses a SolidWorks drawing file saved by version 2013 SP2 and checks the
 * reported content type together with the extracted metadata properties.
 */
@Test
public void testDrawing2013SP2Parser() throws Exception {
    final String resource = "/test-documents/testsolidworksDrawing2013SP2.SLDDRW";
    try (InputStream stream = SolidworksParserTest.class.getResourceAsStream(resource)) {
        ContentHandler body = new BodyContentHandler();
        Metadata meta = new Metadata();
        OfficeParser officeParser = new OfficeParser();
        officeParser.parse(stream, body, meta, new ParseContext());

        // Content type
        assertEquals("application/sldworks", meta.get(Metadata.CONTENT_TYPE));

        // Properties
        assertEquals("2012-07-03T12:05:29Z", meta.get(TikaCoreProperties.CREATED));
        assertEquals(null, meta.get(TikaCoreProperties.CONTRIBUTOR));
        assertEquals("2013-09-06T08:06:57Z", meta.get(Metadata.MODIFIED));
        assertEquals("solidworks-dcom_dev", meta.get(TikaCoreProperties.MODIFIER));
        assertEquals(null, meta.get(TikaCoreProperties.RELATION));
        assertEquals(null, meta.get(TikaCoreProperties.RIGHTS));
        assertEquals(null, meta.get(TikaCoreProperties.SOURCE));
        assertEquals("", meta.get(TikaCoreProperties.TITLE));
        assertEquals("", meta.get(TikaCoreProperties.KEYWORDS));
    }
}
Aggregations