use of org.codelibs.fess.crawler.entity.SitemapSet in project fess-crawler by codelibs.
the class SitemapsHelperTest method test_parseXmlSitemapsGz.
/**
 * Parses the {@code sitemaps/sitemap1.xml.gz} resource and verifies that the
 * result is a URL set (not an index) holding five entries with the expected
 * {@code loc}, {@code lastmod}, {@code changefreq} and {@code priority} values.
 */
public void test_parseXmlSitemapsGz() {
    final InputStream resource = ResourceUtil.getResourceAsStream("sitemaps/sitemap1.xml.gz");
    final SitemapSet parsed = sitemapsHelper.parse(resource);
    final Sitemap[] entries = parsed.getSitemaps();

    // Overall shape of the parsed set.
    assertEquals(5, entries.length);
    assertTrue(parsed.isUrlSet());
    assertFalse(parsed.isIndex());

    // Entry 0: every field is present.
    final Sitemap home = entries[0];
    assertEquals("2005-01-01", home.getLastmod());
    assertEquals("http://www.example.com/", home.getLoc());
    final SitemapUrl homeUrl = (SitemapUrl) home;
    assertEquals("monthly", homeUrl.getChangefreq());
    assertEquals("0.8", homeUrl.getPriority());

    // Entry 1: no lastmod, no priority.
    final Sitemap hawaii = entries[1];
    assertNull(hawaii.getLastmod());
    assertEquals("http://www.example.com/catalog?item=12&desc=vacation_hawaii", hawaii.getLoc());
    final SitemapUrl hawaiiUrl = (SitemapUrl) hawaii;
    assertEquals("weekly", hawaiiUrl.getChangefreq());
    assertNull(hawaiiUrl.getPriority());

    // Entry 2: no priority.
    final Sitemap newZealand = entries[2];
    assertEquals("2004-12-23", newZealand.getLastmod());
    assertEquals("http://www.example.com/catalog?item=73&desc=vacation_new_zealand", newZealand.getLoc());
    final SitemapUrl newZealandUrl = (SitemapUrl) newZealand;
    assertEquals("weekly", newZealandUrl.getChangefreq());
    assertNull(newZealandUrl.getPriority());

    // Entry 3: full timestamp as lastmod, no changefreq.
    final Sitemap newfoundland = entries[3];
    assertEquals("2004-12-23T18:00:15+00:00", newfoundland.getLastmod());
    assertEquals("http://www.example.com/catalog?item=74&desc=vacation_newfoundland", newfoundland.getLoc());
    final SitemapUrl newfoundlandUrl = (SitemapUrl) newfoundland;
    assertNull(newfoundlandUrl.getChangefreq());
    assertEquals("0.3", newfoundlandUrl.getPriority());

    // Entry 4: neither changefreq nor priority.
    final Sitemap usa = entries[4];
    assertEquals("2004-11-23", usa.getLastmod());
    assertEquals("http://www.example.com/catalog?item=83&desc=vacation_usa", usa.getLoc());
    final SitemapUrl usaUrl = (SitemapUrl) usa;
    assertNull(usaUrl.getChangefreq());
    assertNull(usaUrl.getPriority());
}
use of org.codelibs.fess.crawler.entity.SitemapSet in project fess-crawler by codelibs.
the class SitemapsHelperTest method test_parseXmlSitemaps.
/**
 * Parses the {@code sitemaps/sitemap1.xml} resource and verifies that the
 * result is a URL set (not an index) holding five entries with the expected
 * {@code loc}, {@code lastmod}, {@code changefreq} and {@code priority} values.
 */
public void test_parseXmlSitemaps() {
    final InputStream xmlStream = ResourceUtil.getResourceAsStream("sitemaps/sitemap1.xml");
    final SitemapSet result = sitemapsHelper.parse(xmlStream);
    final Sitemap[] items = result.getSitemaps();

    // Set-level expectations.
    assertEquals(5, items.length);
    assertTrue(result.isUrlSet());
    assertFalse(result.isIndex());

    // items[0]: lastmod, changefreq and priority are all set.
    final Sitemap first = items[0];
    assertEquals("2005-01-01", first.getLastmod());
    assertEquals("http://www.example.com/", first.getLoc());
    final SitemapUrl firstUrl = (SitemapUrl) first;
    assertEquals("monthly", firstUrl.getChangefreq());
    assertEquals("0.8", firstUrl.getPriority());

    // items[1]: only loc and changefreq are set.
    final Sitemap second = items[1];
    assertNull(second.getLastmod());
    assertEquals("http://www.example.com/catalog?item=12&desc=vacation_hawaii", second.getLoc());
    final SitemapUrl secondUrl = (SitemapUrl) second;
    assertEquals("weekly", secondUrl.getChangefreq());
    assertNull(secondUrl.getPriority());

    // items[2]: priority is absent.
    final Sitemap third = items[2];
    assertEquals("2004-12-23", third.getLastmod());
    assertEquals("http://www.example.com/catalog?item=73&desc=vacation_new_zealand", third.getLoc());
    final SitemapUrl thirdUrl = (SitemapUrl) third;
    assertEquals("weekly", thirdUrl.getChangefreq());
    assertNull(thirdUrl.getPriority());

    // items[3]: changefreq is absent; lastmod carries a time and offset.
    final Sitemap fourth = items[3];
    assertEquals("2004-12-23T18:00:15+00:00", fourth.getLastmod());
    assertEquals("http://www.example.com/catalog?item=74&desc=vacation_newfoundland", fourth.getLoc());
    final SitemapUrl fourthUrl = (SitemapUrl) fourth;
    assertNull(fourthUrl.getChangefreq());
    assertEquals("0.3", fourthUrl.getPriority());

    // items[4]: changefreq and priority are both absent.
    final Sitemap fifth = items[4];
    assertEquals("2004-11-23", fifth.getLastmod());
    assertEquals("http://www.example.com/catalog?item=83&desc=vacation_usa", fifth.getLoc());
    final SitemapUrl fifthUrl = (SitemapUrl) fifth;
    assertNull(fifthUrl.getChangefreq());
    assertNull(fifthUrl.getPriority());
}
use of org.codelibs.fess.crawler.entity.SitemapSet in project fess-crawler by codelibs.
the class SitemapsHelper method parseTextSitemaps.
/**
 * Parses a plain-text sitemap, i.e. a stream holding one URL per line.
 *
 * <p>Each line is trimmed; lines that start with {@code http://} or
 * {@code https://} become a {@link SitemapUrl} whose {@code loc} is the
 * trimmed line. All other lines (blank lines, other schemes, free text) are
 * ignored. The returned set is always typed as {@link SitemapSet#URLSET}.
 *
 * <p>A leading Unicode byte-order mark (U+FEFF) on the first line is removed
 * before matching. The JDK UTF-8 decoder passes the BOM through as a regular
 * character and {@link String#trim()} does not strip it, so without this step
 * the first URL of a BOM-prefixed file would be silently dropped.
 *
 * @param in the stream to read, decoded with {@code Constants.UTF_8}
 * @return the parsed URL set; it has no entries when no line qualifies
 * @throws SitemapsException if anything goes wrong while reading or decoding
 *         the stream (the original exception is kept as the cause)
 */
protected SitemapSet parseTextSitemaps(final InputStream in) {
    final SitemapSet sitemapSet = new SitemapSet();
    sitemapSet.setType(SitemapSet.URLSET);
    try {
        // NOTE(review): the reader is not closed here; closing it would also close
        // `in`, which presumably belongs to the caller - confirm before changing.
        final BufferedReader br = new BufferedReader(new InputStreamReader(in, Constants.UTF_8));
        boolean firstLine = true;
        String line;
        while ((line = br.readLine()) != null) {
            if (firstLine) {
                firstLine = false;
                // Drop a UTF-8 byte-order mark so the first URL is not rejected.
                if (!line.isEmpty() && line.charAt(0) == '\uFEFF') {
                    line = line.substring(1);
                }
            }
            final String url = line.trim();
            // A string starting with either scheme is never blank, so no separate blank check is needed.
            if (url.startsWith("http://") || url.startsWith("https://")) {
                final SitemapUrl sitemapUrl = new SitemapUrl();
                sitemapUrl.setLoc(url);
                sitemapSet.addSitemap(sitemapUrl);
            }
        }
        return sitemapSet;
    } catch (final Exception e) {
        throw new SitemapsException("Could not parse Text Sitemaps.", e);
    }
}
use of org.codelibs.fess.crawler.entity.SitemapSet in project fess-crawler by codelibs.
the class SitemapsResponseProcessor method process.
/**
 * Parses the response body as a sitemap and hands every listed location back
 * to the crawler as a child request.
 *
 * <p>The hand-off is done by throwing: once the body has been parsed, this
 * method always ends with a {@link ChildUrlsException} carrying one GET
 * request per non-null sitemap entry, in the order the entries were returned.
 *
 * @param responseData the response whose body is parsed as a sitemap
 * @throws ChildUrlsException always, when parsing succeeds
 * @throws IORuntimeException if an {@link IOException} is raised while
 *         obtaining or handling the response body
 */
@Override
public void process(final ResponseData responseData) {
    final SitemapsHelper helper = crawlerContainer.getComponent("sitemapsHelper");
    try (final InputStream body = responseData.getResponseBody()) {
        final SitemapSet parsed = helper.parse(body);
        // LinkedHashSet: de-duplicates requests while keeping sitemap order.
        final Set<RequestData> childRequests = new LinkedHashSet<>();
        for (final Sitemap entry : parsed.getSitemaps()) {
            if (entry == null) {
                continue;
            }
            final RequestData child = RequestDataBuilder.newRequestData().get().url(entry.getLoc()).build();
            childRequests.add(child);
        }
        final String origin = getClass().getName() + "#process";
        // Thrown inside the try block so the body is closed on the way out.
        throw new ChildUrlsException(childRequests, origin);
    } catch (final IOException e) {
        throw new IORuntimeException(e);
    }
}
use of org.codelibs.fess.crawler.entity.SitemapSet in project fess-crawler by codelibs.
the class SitemapsHelperTest method test_parseXmlSitemapsIndex.
/**
 * Parses the {@code sitemaps/sitemap2.xml} resource and verifies that the
 * result is a sitemap index (not a URL set) with two entries carrying the
 * expected {@code lastmod} and {@code loc} values.
 */
public void test_parseXmlSitemapsIndex() {
    final InputStream indexStream = ResourceUtil.getResourceAsStream("sitemaps/sitemap2.xml");
    final SitemapSet index = sitemapsHelper.parse(indexStream);
    final Sitemap[] children = index.getSitemaps();

    // The set must be recognized as an index with exactly two children.
    assertEquals(2, children.length);
    assertFalse(index.isUrlSet());
    assertTrue(index.isIndex());

    final Sitemap firstChild = children[0];
    assertEquals("2004-10-01T18:23:17+00:00", firstChild.getLastmod());
    assertEquals("http://www.example.com/sitemap1.xml.gz", firstChild.getLoc());

    final Sitemap secondChild = children[1];
    assertEquals("2005-01-01", secondChild.getLastmod());
    assertEquals("http://www.example.com/sitemap2.xml.gz", secondChild.getLoc());
}
Aggregations