Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.
Class TestContent, method testGetContentType.
/**
 * Unit tests for getContentType(String, String, byte[]) method.
 *
 * <p>Each case builds a {@code Content} from a URL, a base URL, a body and a
 * declared content type, then checks the value returned by the public
 * {@code getContentType()} accessor. (NOTE(review): the three-argument method
 * named above is not called directly here; presumably the constructor
 * delegates to it -- confirm against {@code Content}.)
 *
 * @throws Exception if constructing a {@code Content} fails
 */
@Test
public void testGetContentType() throws Exception {
  final String rootUrl = "http://www.foo.com/";
  final String htmlUrl = "http://www.foo.com/foo.html";
  final String pngUrl = "http://www.foo.com/foo.png";

  // Use the charset constant instead of the "UTF8" alias: no runtime lookup
  // by name and no checked UnsupportedEncodingException.
  final byte[] emptyBody = "".getBytes(java.nio.charset.StandardCharsets.UTF_8);
  final byte[] htmlBody = "<html></html>".getBytes(java.nio.charset.StandardCharsets.UTF_8);

  final Metadata p = new Metadata();
  Content c;

  // Declared type carries a charset parameter; the bare type is expected back.
  c = new Content(rootUrl, rootUrl, emptyBody, "text/html; charset=UTF-8", p, conf);
  Assert.assertEquals("text/html", c.getContentType());

  // Empty declared type and empty body, URL ending in ".html".
  c = new Content(htmlUrl, rootUrl, emptyBody, "", p, conf);
  Assert.assertEquals("text/html", c.getContentType());

  // Same as above, but the declared type is null rather than empty.
  c = new Content(htmlUrl, rootUrl, emptyBody, null, p, conf);
  Assert.assertEquals("text/html", c.getContentType());

  // Empty declared type, URL without a file name, HTML markup in the body.
  c = new Content(rootUrl, rootUrl, htmlBody, "", p, conf);
  Assert.assertEquals("text/html", c.getContentType());

  // Declared "text/plain", but both the URL suffix and the body are HTML.
  c = new Content(htmlUrl, rootUrl, htmlBody, "text/plain", p, conf);
  Assert.assertEquals("text/html", c.getContentType());

  // Declared "text/plain" and a ".png" URL, but the body is HTML markup.
  c = new Content(pngUrl, rootUrl, htmlBody, "text/plain", p, conf);
  Assert.assertEquals("text/html", c.getContentType());

  // Nothing to go on: empty declared type, empty body, no file name in the URL.
  c = new Content(rootUrl, rootUrl, emptyBody, "", p, conf);
  Assert.assertEquals(MimeTypes.OCTET_STREAM, c.getContentType());

  // Same inputs with a null declared type: only non-nullness is required.
  c = new Content(rootUrl, rootUrl, emptyBody, null, p, conf);
  Assert.assertNotNull(c.getContentType());
}
Use of org.apache.nutch.metadata.Metadata in the Apache Nutch project.
Class TestEncodingDetector, method testGuessing.
/**
 * Checks which encoding {@code EncodingDetector.guessEncoding} returns for
 * four combinations of Content-Type metadata, explicitly added clues and the
 * {@code MIN_CONFIDENCE_KEY} setting, always passing "windows-1252" as the
 * default encoding.
 *
 * <p>Returned names are lower-cased with {@code Locale.ROOT} before being
 * compared, so the assertions do not depend on the JVM default locale (under
 * a Turkish locale an upper-case 'I' would otherwise become a dotless 'i').
 *
 * <p>Note: this test modifies the {@code conf} field and does not restore it.
 */
@Test
public void testGuessing() {
  // first disable auto detection
  conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, -1);

  Metadata metadata = new Metadata();
  EncodingDetector detector;
  Content content;
  String encoding;

  // Case 1: empty metadata and no extra clues.
  content = new Content("http://www.example.com", "http://www.example.com/",
      contentInOctets, "text/plain", metadata, conf);
  detector = new EncodingDetector(conf);
  detector.autoDetectClues(content, true);
  encoding = detector.guessEncoding(content, "windows-1252");
  // no information is available, so it should return default encoding
  Assert.assertEquals("windows-1252", encoding.toLowerCase(java.util.Locale.ROOT));

  // Case 2: the Content-Type metadata declares charset=UTF-16.
  metadata.clear();
  metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
  content = new Content("http://www.example.com", "http://www.example.com/",
      contentInOctets, "text/plain", metadata, conf);
  detector = new EncodingDetector(conf);
  detector.autoDetectClues(content, true);
  encoding = detector.guessEncoding(content, "windows-1252");
  Assert.assertEquals("utf-16", encoding.toLowerCase(java.util.Locale.ROOT));

  // Case 3: empty metadata, but a "sniffed" clue for windows-1254 is added.
  metadata.clear();
  content = new Content("http://www.example.com", "http://www.example.com/",
      contentInOctets, "text/plain", metadata, conf);
  detector = new EncodingDetector(conf);
  detector.autoDetectClues(content, true);
  detector.addClue("windows-1254", "sniffed");
  encoding = detector.guessEncoding(content, "windows-1252");
  Assert.assertEquals("windows-1254", encoding.toLowerCase(java.util.Locale.ROOT));

  // enable autodetection
  conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50);

  // Case 4: metadata declares UTF-16 and a "sniffed" clue says utf-32, yet
  // utf-8 is expected. NOTE(review): presumably the auto-detected encoding
  // of contentInOctets takes precedence over both -- confirm.
  metadata.clear();
  metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
  content = new Content("http://www.example.com", "http://www.example.com/",
      contentInOctets, "text/plain", metadata, conf);
  detector = new EncodingDetector(conf);
  detector.autoDetectClues(content, true);
  detector.addClue("utf-32", "sniffed");
  encoding = detector.guessEncoding(content, "windows-1252");
  Assert.assertEquals("utf-8", encoding.toLowerCase(java.util.Locale.ROOT));
}
Aggregations