Use of org.jsoup.parser.Parser in project flow by vaadin:
the class TemplateParser, method parse.
/**
 * Parses the template from the given input stream to a tree of template
 * nodes.
 *
 * @param templateStream
 *            the input stream containing the template to parse, not
 *            <code>null</code>
 * @param templateResolver
 *            the resolver to use to look up included files
 * @return the template node at the root of the parsed template tree
 */
public static TemplateNode parse(InputStream templateStream, TemplateResolver templateResolver) {
    assert templateStream != null;

    String templateContents = convertStreamToString(templateStream);
    // Preserve the original case of tag and attribute names rather than
    // applying the HTML default of lower-casing everything.
    Parser caseSensitiveParser = Parser.htmlParser()
            .settings(new ParseSettings(true, true));
    Document parsedDocument = caseSensitiveParser.parseInput(templateContents, "");
    return parse(parsedDocument, templateResolver);
}
Use of org.jsoup.parser.Parser in project jsoup by jhy:
the class ParseTest, method testXwikiExpanded.
@Test
public void testXwikiExpanded() throws IOException {
    // https://github.com/jhy/jsoup/issues/1324
    // Verifies that a huge illegal character reference can span the buffer-up
    // point and be rewound, that it is still correctly judged as an invalid
    // reference (or not), and that the resulting parse tree is intact.
    File input = getFile("/htmltests/xwiki-edit.html.gz");
    Parser parser = Parser.htmlParser();
    Document doc = Jsoup.parse(new GZIPInputStream(new FileInputStream(input)), "UTF-8",
        "https://localhost/", parser.setTrackErrors(100));

    assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text());
    // The reference did not look legitimate, so no parse error is recorded.
    ParseErrorList errors = parser.getErrors();
    assertEquals(0, errors.size());

    // Parsing used to break at "=userdirectory": reaching the buffer-up point
    // discarded the mark. The reader was updated to preserve the mark.
    String expectedPrefix = "<a class=\"list-group-item\" data-id=\"userdirectory\" href=\"/xwiki/bin/admin/XWiki/XWikiPreferences?editor=globaladmin&RIGHTHERERIGHTHERERIGHTHERERIGHTHERE";
    assertTrue(doc.select("[data-id=userdirectory]").outerHtml().startsWith(expectedPrefix));
}
Use of org.jsoup.parser.Parser in project jsoup by jhy:
the class DocumentTest, method accessorsAreCaseInsensitive.
@Test
public void accessorsAreCaseInsensitive() {
    // Parse with case preservation so tagName() keeps the original (upper)
    // case while normalName() still reports the lower-cased form.
    Parser parser = Parser.htmlParser().settings(ParseSettings.preserveCase);
    Document doc = parser.parseInput("<!DOCTYPE html><HTML><HEAD><TITLE>SHOUTY</TITLE></HEAD><BODY>HELLO</BODY></HTML>", "");
    Element body = doc.body();
    assertEquals("BODY", body.tagName());
    assertEquals("body", body.normalName());
    Element head = doc.head();
    assertEquals("HEAD", head.tagName());
    // Fixed: this previously re-asserted body.normalName() ("body"),
    // duplicating the check above; the intent is to verify the head element.
    assertEquals("head", head.normalName());
    Element root = doc.selectFirst("html");
    assertEquals("HTML", root.tagName());
    assertEquals("html", root.normalName());
    assertEquals("SHOUTY", doc.title());
}
Use of org.jsoup.parser.Parser in project LibreraReader by foobnix:
the class EpubExtractor, method getFooterNotes.
@Override
public Map<String, String> getFooterNotes(String inputPath) {
    // Maps footnote link text -> footnote body text, extracted from the EPUB
    // (a zip archive) at inputPath. Returns an empty map on any failure.
    Map<String, String> notes = new HashMap<String, String>();
    try {
        InputStream in = new FileInputStream(new File(inputPath));
        ZipInputStream zipInputStream = new ZipInputStream(in);
        ZipEntry nextEntry = null;
        // href ("file#id") -> link text, collected during the first pass
        Map<String, String> textLink = new HashMap<String, String>();
        // files that footnote links point into; parsed in the second pass
        Set<String> files = new HashSet<String>();
        try {
            CacheZipUtils.removeFiles(CacheZipUtils.ATTACHMENTS_CACHE_DIR.listFiles());
            // First pass: scan every (x)html/xml entry for footnote-style links.
            while ((nextEntry = zipInputStream.getNextEntry()) != null) {
                if (TempHolder.get().loadingCancelled) {
                    break;
                }
                String name = nextEntry.getName();
                String nameLow = name.toLowerCase();
                if (nameLow.endsWith("html") || nameLow.endsWith("htm") || nameLow.endsWith("xml")) {
                    Document parse = Jsoup.parse(zipInputStream, null, "", Parser.xmlParser());
                    Elements select = parse.select("a[href]");
                    for (int i = 0; i < select.size(); i++) {
                        Element item = select.get(i);
                        String text = item.text();
                        if (item.attr("href").contains("#")) {
                            String attr = item.attr("href");
                            String file = attr.substring(0, attr.indexOf("#"));
                            // A bare "#anchor" points into the current entry.
                            if (attr.startsWith("#")) {
                                attr = name + attr;
                            }
                            if (!TxtUtils.isFooterNote(text)) {
                                LOG.d("Skip text", text);
                                continue;
                            }
                            textLink.put(attr, text);
                            LOG.d("Extract file", file);
                            if (TxtUtils.isEmpty(file)) {
                                file = name;
                            }
                            // Fixed: the xml check previously tested nameLow (the
                            // entry name) instead of file (the link target), so
                            // the target's own extension was never checked.
                            if (file.endsWith("html") || file.endsWith("htm") || file.endsWith("xml")) {
                                files.add(file);
                            }
                        }
                    }
                }
                zipInputStream.closeEntry();
            }
            // Fixed: close the first-pass streams before reopening; they were
            // previously leaked when the variables were reassigned below.
            zipInputStream.close();
            in.close();
            // Second pass: re-read the archive and pull the footnote text for
            // every id referenced by the links collected above.
            in = new FileInputStream(new File(inputPath));
            zipInputStream = new ZipInputStream(in);
            while ((nextEntry = zipInputStream.getNextEntry()) != null) {
                if (TempHolder.get().loadingCancelled) {
                    break;
                }
                String name = nextEntry.getName();
                for (String fileName : files) {
                    if (name.endsWith(fileName)) {
                        LOG.d("PARSE FILE NAME", name);
                        Parser xmlParser = Parser.xmlParser();
                        Document parse = Jsoup.parse(zipInputStream, null, "", xmlParser);
                        Elements ids = parse.select("[id]");
                        for (int i = 0; i < ids.size(); i++) {
                            Element item = ids.get(i);
                            String id = item.attr("id");
                            String value = item.text();
                            // Very short values are usually just the footnote
                            // marker; widen to following siblings for the body.
                            if (value.trim().length() < 4) {
                                value = value + " " + parse.select("[id=" + id + "]+*").text();
                            }
                            if (value.trim().length() < 4) {
                                value = value + " " + parse.select("[id=" + id + "]+*+*").text();
                            }
                            try {
                                // Last resort: take the text of the parent node.
                                if (value.trim().length() < 4) {
                                    value = value + " " + parse.select("[id=" + id + "]").parents().get(0).text();
                                }
                            } catch (Exception e) {
                                LOG.e(e);
                            }
                            String fileKey = fileName + "#" + id;
                            String textKey = textLink.get(fileKey);
                            LOG.d(textKey + " " + value);
                            notes.put(textKey, value);
                        }
                    }
                }
                zipInputStream.closeEntry();
            }
            zipInputStream.close();
            in.close();
        } catch (Exception e) {
            LOG.e(e);
        }
        return notes;
    } catch (Throwable e) {
        LOG.e(e);
        return notes;
    }
}
Aggregations