Use of primal.primitive.adt.IntRange in project suite by stupidsing.
The class ScrapeHtml, method parse.
/**
 * Parses HTML text into a tree of {@link HtmlNode}s.
 *
 * Works in two passes. The first pass records the position range of every tag
 * in {@code in}. The second pass walks those ranges in order while keeping a
 * stack of open elements: text between tags is decoded, trimmed and (when not
 * empty) attached to the innermost open element, opening tags create child
 * nodes, and closing tags unwind the stack to the matching open element.
 *
 * Malformed input is tolerated: an unterminated CDATA section or raw-text
 * element (script, style, textarea, title) is scanned as ordinary markup, and
 * a closing tag with no matching open element is ignored.
 *
 * @param in the HTML source text.
 * @return the synthetic root node (null name, empty tag) holding the top-level
 *         nodes as children, even when some elements are never closed.
 */
public HtmlNode parse(String in) {
	var pairs = new ArrayList<IntRange>();
	int pos0, posx = 0;

	// pass 1: collect the [start, end) range of every tag
	nextTag: while (0 <= (pos0 = in.indexOf("<", posx))) {
		posx = pos0 + 1;

		// a '<' at the end of input or followed by whitespace is not a tag
		if (in.length() <= posx || Is.whitespace(in.charAt(posx)))
			continue;

		posx = in.indexOf(">", posx);
		if (posx < 0)
			break;

		pairs.add(IntRange.of(pos0, ++posx));

		if (in.startsWith("<![CDATA[", pos0)) {
			// resume at the CDATA terminator; indexOf() gives -1 when it is missing, and using
			// that as the next search position would restart the scan from index 0 forever
			var end = in.indexOf("]]>", pos0 + 9);
			if (0 <= end)
				posx = end;
			continue nextTag;
		}

		for (var rawTextTag : List.of("script", "style", "textarea", "title")) {
			// the '>' found above lies at or after nameEnd, so charAt(nameEnd) is in range
			var nameEnd = pos0 + 1 + rawTextTag.length();

			// require the whole tag name to match, so that e.g. "<styled>" is not taken as "<style>"
			if (in.startsWith(rawTextTag, pos0 + 1) //
					&& (Is.whitespace(in.charAt(nameEnd)) || in.charAt(nameEnd) == '>' || in.charAt(nameEnd) == '/')) {
				// skip the raw content up to its closing tag; keep scanning after the opening
				// tag when there is none (same -1 hazard as for CDATA)
				var close = in.indexOf("</" + rawTextTag, posx);
				if (0 <= close)
					posx = close;
				continue nextTag;
			}
		}
	}

	// classifies a tag as closing (-1), self-contained (0) or opening (1), and extracts its name;
	// tags starting with "<!" are self-contained and have a null name
	Fun<String, IntObjPair<String>> getNameFun = tag -> {
		int p1 = 1, px = tag.length() - 1;
		var first = tag.charAt(p1);
		var last = tag.charAt(px - 1);
		int d;

		if (first == '!')
			return IntObjPair.of(0, null);
		else {
			if (first == '/') {
				p1++;
				d = -1;
			} else if (last == '/') {
				px--;
				d = 0;
			} else
				d = 1;

			// the name ends at the first whitespace, or at the end of the tag
			var ps = 0;
			while (ps < px && !Is.whitespace(tag.charAt(ps)))
				ps++;

			var name = tag.substring(p1, ps);
			return IntObjPair.of(d, name);
		}
	};

	// stack of open elements; the synthetic root stays at the bottom for the whole parse
	var root = new HtmlNode(null, "", 0, 0);
	var deque = new ArrayDeque<>(List.of(root));

	// attaches the decoded, trimmed text between two positions to the innermost open element
	IntIntSink addTextFun = (prevp, p0) -> {
		if (prevp != p0) {
			var s = htmlUtil.decode(in.substring(prevp, p0)).trim();
			if (!s.isEmpty())
				deque.element().children.add(new HtmlNode(null, s, prevp, p0));
		}
	};

	// pass 2: build the tree
	var prevp = 0;

	for (var pair : pairs) {
		var htmlNode = deque.element();
		var p0 = pair.s;
		var px = pair.e;

		addTextFun.sink2(prevp, p0);

		var tag = in.substring(p0, px);

		prevp = getNameFun.apply(tag).map((d, name) -> {
			if (d == -1) {
				// closing tag: find the nearest open element with the same name. The root is
				// excluded: its empty tag cannot be classified, and popping it would leave
				// nothing to attach later nodes to.
				HtmlNode match = null;

				for (var open : deque)
					if (open != root && Equals.string(getNameFun.apply(open.tag).v, name)) {
						match = open;
						break;
					}

				// a closing tag without a matching open element is ignored
				if (match != null) {
					// elements opened after the match are discarded from the stack unclosed
					while (deque.pop() != match)
						;
					match.p2 = p0;
					match.px = px;
				}
			} else {
				// opening tag
				var htmlNode1 = new HtmlNode(name, tag, p0, px);
				htmlNode.children.add(htmlNode1);
				if (d == 1)
					deque.push(htmlNode1);
			}
			return px;
		});
	}

	addTextFun.sink2(prevp, in.length());

	// return the root rather than the stack top, which would be an inner element if any were left open
	return root;
}
Aggregations