Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class ImageParserTest, method testGIF.
/**
 * Parses the classpath resource {@code /test-documents/testGIF.gif} with the
 * {@code parser} fixture and checks the metadata values recorded for it:
 * dimensions, compression, chroma, descriptor/extension entries, the embedded
 * comment text, and the content type.
 *
 * @throws Exception if parsing fails or the resource stream cannot be closed
 */
@Test
public void testGIF() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "image/gif");
    // try-with-resources releases the resource stream even when parse() throws.
    try (InputStream stream = getClass().getResourceAsStream("/test-documents/testGIF.gif")) {
        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
    }
    assertEquals("75", metadata.get("height"));
    assertEquals("100", metadata.get("width"));
    assertEquals("true", metadata.get("Compression Lossless"));
    assertEquals("Normal", metadata.get("Dimension ImageOrientation"));
    assertEquals("lzw", metadata.get("Compression CompressionTypeName"));
    assertEquals("0", metadata.get("Dimension HorizontalPixelOffset"));
    assertEquals("imageLeftPosition=0, imageTopPosition=0, imageWidth=100, imageHeight=75, interlaceFlag=false", metadata.get("ImageDescriptor"));
    assertEquals("Index", metadata.get("Data SampleFormat"));
    assertEquals("3", metadata.get("Chroma NumChannels"));
    assertEquals("1", metadata.get("Compression NumProgressiveScans"));
    assertEquals("RGB", metadata.get("Chroma ColorSpaceType"));
    assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("CommentExtensions CommentExtension"));
    assertEquals("value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership., encoding=ISO-8859-1, compression=none", metadata.get("Text TextEntry"));
    assertEquals("true", metadata.get("Chroma BlackIsZero"));
    assertEquals("disposalMethod=none, userInputFlag=false, transparentColorFlag=false, delayTime=0, transparentColorIndex=0", metadata.get("GraphicControlExtension"));
    assertEquals("0", metadata.get("Dimension VerticalPixelOffset"));
    assertEquals("image/gif", metadata.get("Content-Type"));
    assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
    assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
    assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(TikaCoreProperties.COMMENTS));
}
Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class IWorkParserTest, method setUp.
/**
 * Creates a fresh {@code IWorkPackageParser} and a {@code ParseContext} that has an
 * {@code AutoDetectParser} registered under {@code Parser.class}.
 */
@Before
public void setUp() {
    iWorkParser = new IWorkPackageParser();

    // Finish configuring the context locally, then publish it to the field.
    ParseContext freshContext = new ParseContext();
    freshContext.set(Parser.class, new AutoDetectParser());
    parseContext = freshContext;
}
Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class HtmlParserTest, method assertScriptLink.
/**
 * Parses {@code html} as {@code text/html} and asserts that exactly one
 * {@code script} element carrying a {@code src} attribute is reported, and that
 * the attribute value equals {@code url}.
 *
 * @param html markup to parse (encoded as UTF-8 before parsing)
 * @param url  expected value of the single collected {@code src} attribute
 * @throws Exception if parsing fails
 */
private void assertScriptLink(String html, String url) throws Exception {
    final List<String> scriptSources = new ArrayList<String>();

    // Records the src attribute of every <script> start tag that has one.
    DefaultHandler scriptCollector = new DefaultHandler() {
        @Override
        public void startElement(String u, String l, String name, Attributes atts) {
            if (!name.equals("script")) {
                return;
            }
            String src = atts.getValue("", "src");
            if (src != null) {
                scriptSources.add(src);
            }
        }
    };

    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "text/html");

    // IdentityHtmlMapper is needed to extract <script> tags
    ParseContext context = new ParseContext();
    context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);

    ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
    new HtmlParser().parse(input, scriptCollector, metadata, context);

    assertEquals(1, scriptSources.size());
    assertEquals(url, scriptSources.get(0));
}
Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class HtmlParserTest, method testHtml5Charset.
/**
 * Test case for TIKA-892: a document declaring {@code <meta charset="ISO-8859-15" />}
 * must end up with {@code ISO-8859-15} as its {@code Metadata.CONTENT_ENCODING}.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-892">TIKA-892</a>
 */
@Test
public void testHtml5Charset() throws Exception {
    String markup = "<html><head><meta charset=\"ISO-8859-15\" />"
            + "<title>the name is ándre</title>"
            + "</head><body></body></html>";

    // The bytes handed to the parser are encoded as ISO-8859-1.
    byte[] encoded = markup.getBytes(ISO_8859_1);
    ByteArrayInputStream input = new ByteArrayInputStream(encoded);

    Metadata metadata = new Metadata();
    HtmlParser htmlParser = new HtmlParser();
    htmlParser.parse(input, new BodyContentHandler(), metadata, new ParseContext());

    assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
}
Use of org.apache.tika.parser.ParseContext in the Apache Tika project.
Class HtmlParserTest, method testFrameSrcExtraction.
/**
 * Test case for TIKA-463: elements that carry URLs must not be skipped. Parses a
 * frameset document with a {@code <base href>} and checks that the serialized
 * output contains the {@code <frame>} tag with its {@code src} resolved against
 * that base.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
 */
@Test
public void testFrameSrcExtraction() throws Exception {
    final String html = "<html><head><title>Title</title>"
            + "<base href=\"http://domain.com\" />"
            + "</head><frameset><frame src=\"frame.html\" /></frameset></html>";

    StringWriter output = new StringWriter();
    ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
    new HtmlParser().parse(input, makeHtmlTransformer(output), new Metadata(), new ParseContext());

    // <frame> tag should exist, with fully resolved URL
    String serialized = output.toString();
    Pattern resolvedFrame = Pattern.compile("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$");
    assertTrue(resolvedFrame.matcher(serialized).matches());
}
Aggregations