Search in sources :

Example 1 with IntRange

Use of primal.primitive.adt.IntRange in the project "suite" by stupidsing.

The example is the method parse of the class ScrapeHtml.

/**
 * Parses HTML text into a tree of {@link HtmlNode}s.
 *
 * <p>The parser is lenient and works in two passes. Pass 1 records the
 * [start, end) range of every tag in the input. Pass 2 walks those ranges,
 * keeps a stack of currently open elements, and attaches tags and the
 * (trimmed, non-empty) text between them to the element on top of the stack.
 *
 * <p>Malformed input is tolerated rather than rejected:
 * <ul>
 * <li>a CDATA section or raw-text element (script, style, textarea, title)
 * without a terminator swallows the rest of the input as text;</li>
 * <li>a closing tag implicitly closes any elements opened after its matching
 * opening tag;</li>
 * <li>a closing tag with no matching open element is ignored;</li>
 * <li>elements still open at end of input are left unclosed.</li>
 * </ul>
 *
 * @param in the HTML text to parse; must not be null
 * @return the synthetic root node (null name, empty tag) whose children are
 *         the top-level nodes of the document
 */
public HtmlNode parse(String in) {
    // pass 1: collect the [start, end) range of every tag
    var pairs = new ArrayList<IntRange>();
    int pos0, posx = 0;

    while (0 <= (pos0 = in.indexOf("<", posx))) {
        posx = pos0 + 1;

        // a '<' at end of input or followed by whitespace is plain text
        if (in.length() <= posx || Is.whitespace(in.charAt(posx)))
            continue;

        posx = in.indexOf(">", posx);
        if (posx < 0)
            break; // unterminated tag; the remainder is treated as text

        pairs.add(IntRange.of(pos0, ++posx));

        // CDATA and raw-text elements: skip their content so that any '<'
        // inside is not mistaken for a tag. String.indexOf treats a negative
        // start index as 0, so a missing terminator must not be fed back into
        // the scan; jump to the end of input instead, which ends the loop.
        // NOTE(review): the raw-text check is a case-sensitive prefix match,
        // so e.g. "<styled>" is also treated as raw text - confirm intended.
        if (in.startsWith("<![CDATA[", pos0)) {
            var end = in.indexOf("]]>", pos0 + 9);
            posx = 0 <= end ? end : in.length();
        } else
            for (var rawTextTag : List.of("script", "style", "textarea", "title"))
                if (in.startsWith(rawTextTag, pos0 + 1)) {
                    var end = in.indexOf("</" + rawTextTag, posx);
                    posx = 0 <= end ? end : in.length();
                    break;
                }
    }

    // classifies a tag "<...>" as (d, name), where d is
    // -1 for a closing tag, 0 for a tag that opens nothing ("<x/>", "<!...>")
    // and 1 for an opening tag; name is null for "<!...>"
    Fun<String, IntObjPair<String>> getNameFun = tag -> {
        int p1 = 1, px = tag.length() - 1;
        var first = tag.charAt(p1);
        var last = tag.charAt(px - 1);
        int d;

        if (first == '!')
            return IntObjPair.of(0, null);

        if (first == '/') {
            p1++;
            d = -1;
        } else if (last == '/') {
            px--;
            d = 0;
        } else
            d = 1;

        // the name runs from p1 up to the first whitespace or the tag end
        var ps = p1;
        while (ps < px && !Is.whitespace(tag.charAt(ps)))
            ps++;

        return IntObjPair.of(d, tag.substring(p1, ps));
    };

    // stack of open elements; the synthetic root always stays at the bottom
    var root = new HtmlNode(null, "", 0, 0);
    var deque = new ArrayDeque<>(List.of(root));

    // adds the text in [prevp, p0) as a child of the innermost open element,
    // unless it is empty after decoding and trimming
    IntIntSink addTextFun = (prevp, p0) -> {
        if (prevp != p0) {
            var s = htmlUtil.decode(in.substring(prevp, p0)).trim();
            if (!s.isEmpty())
                deque.element().children.add(new HtmlNode(null, s, prevp, p0));
        }
    };

    // pass 2: build the tree
    var prevp = 0;

    for (var pair : pairs) {
        var htmlNode = deque.element();
        var p0 = pair.s;
        var px = pair.e;
        addTextFun.sink2(prevp, p0);
        var tag = in.substring(p0, px);

        prevp = getNameFun.apply(tag).map((d, name) -> {
            if (d == -1) {
                // closing tag: find the nearest open element of that name.
                // The root is excluded; it has no tag and must never be popped.
                HtmlNode match = null;

                for (var open : deque)
                    if (open != root && Equals.string(getNameFun.apply(open.tag).v, name)) {
                        match = open;
                        break;
                    }

                // a stray closing tag (no match) is ignored
                if (match != null) {
                    // elements opened after the match are implicitly closed
                    while (deque.pop() != match)
                        ;
                    match.p2 = p0;
                    match.px = px;
                }
            } else {
                // opening tag
                var htmlNode1 = new HtmlNode(name, tag, p0, px);
                htmlNode.children.add(htmlNode1);
                if (d == 1)
                    deque.push(htmlNode1);
            }
            return px;
        });
    }

    addTextFun.sink2(prevp, in.length());

    // always return the document root, even if some elements were left open
    return root;
}
Also used : Pair(primal.adt.Pair) PerMap(primal.persistent.PerMap) IntRange(primal.primitive.adt.IntRange) Build(primal.Verbs.Build) IntIntSink(primal.primitive.IntIntSink) Predicate(java.util.function.Predicate) Fun(primal.fp.Funs.Fun) Set(java.util.Set) Assoc(primal.parser.Operator.Assoc) Read(primal.MoreVerbs.Read) Is(primal.Verbs.Is) ArrayList(java.util.ArrayList) Streamlet(primal.streamlet.Streamlet) List(java.util.List) Split(primal.MoreVerbs.Split) Substring(primal.Verbs.Substring) ArrayDeque(java.util.ArrayDeque) Equals(primal.Verbs.Equals) IntObjPair(primal.primitive.adt.pair.IntObjPair) IntIntSink(primal.primitive.IntIntSink) IntObjPair(primal.primitive.adt.pair.IntObjPair) ArrayList(java.util.ArrayList) ArrayDeque(java.util.ArrayDeque)

Aggregations

ArrayDeque (java.util.ArrayDeque)1 ArrayList (java.util.ArrayList)1 List (java.util.List)1 Set (java.util.Set)1 Predicate (java.util.function.Predicate)1 Read (primal.MoreVerbs.Read)1 Split (primal.MoreVerbs.Split)1 Build (primal.Verbs.Build)1 Equals (primal.Verbs.Equals)1 Is (primal.Verbs.Is)1 Substring (primal.Verbs.Substring)1 Pair (primal.adt.Pair)1 Fun (primal.fp.Funs.Fun)1 Assoc (primal.parser.Operator.Assoc)1 PerMap (primal.persistent.PerMap)1 IntIntSink (primal.primitive.IntIntSink)1 IntRange (primal.primitive.adt.IntRange)1 IntObjPair (primal.primitive.adt.pair.IntObjPair)1 Streamlet (primal.streamlet.Streamlet)1