Search in sources :

Example 16 with Node

Use of org.jsoup.nodes.Node in project Gargoyle by callakrsos.

The class ArticleExtractorComposite, method updateNewRealContent.

/**
 * Loads every link, extracts the article text of each page and refreshes the
 * {@code cbSmmy} items and the {@code txtTfIdf} text with the result.
 *
 * <p>The pipeline has two stages: (1) request the page through
 * {@code RequestUtil.req200} and decode it into a {@link URLModel}; (2) run the
 * decoded HTML through a boilerpipe extractor. A link whose model is empty after
 * either stage (request failed, body undecodable, extraction threw) is logged and
 * dropped, so one bad link never aborts the whole refresh.
 *
 * <p>NOTE(review): the links are processed with {@code parallelStream()}, i.e. on
 * the common fork-join pool, although the work is blocking network I/O — confirm
 * this is intended. The caller is presumably responsible for invoking this method
 * on the thread that is allowed to touch {@code cbSmmy} / {@code txtTfIdf}.
 *
 * @param links URLs of the pages to load
 */
private void updateNewRealContent(Collection<String> links) {
    URLModel[] array = links.parallelStream().map(link -> {
        try {
            BiFunction<InputStream, Charset, URLModel> response = (is, charset) -> readUrlModel(link, is, charset);
            return RequestUtil.req200(new URL(link), response, false);
        } catch (Exception e) {
            // Best effort: a link that cannot be requested is skipped, but no longer silently.
            LOGGER.error(ValueUtil.toString(e));
            return URLModel.empty();
        }
    }).filter(v -> !v.isEmpty()).map(v -> {
        URLModel model = URLModel.empty();
        String url = v.getUrl();
        // twitter.com pages go through KeepEverythingExtractor, everything else through ArticleExtractor.
        ExtractorBase instance = url.contains("twitter.com") ? KeepEverythingExtractor.INSTANCE : ArticleExtractor.getInstance();
        InputSource source = new InputSource(new StringReader(v.getContent()));
        source.setEncoding("UTF-8");
        try {
            String content = ValueUtil.HTML.getNewsContent(instance, source);
            model.setUrl(url);
            model.setContent(content);
            model.setTitle(v.getTitle());
        } catch (Exception e) {
            LOGGER.error(ValueUtil.toString(e));
            model = URLModel.empty();
        }
        return model;
    }).filter(v -> !v.isEmpty()).toArray(URLModel[]::new);
    cbSmmy.getItems().clear();
    cbSmmy.getItems().addAll(array);
    ValueUtil.toTF_IDF(array).stream().map(mapper).reduce(accumulator).ifPresent(txtTfIdf::setText);
    cbSmmy.getSelectionModel().select(0);
}

/**
 * Reads one response body and converts it into a {@link URLModel} holding the
 * link, the decoded HTML and the page title.
 *
 * <p>The bytes are first decoded with {@code charset}. If a {@code <meta>} element
 * in the document head declares a different, supported charset, the bytes are
 * decoded again with that charset and the document is re-parsed, so that the
 * title is taken from the correctly decoded text as well.
 *
 * @param link    URL the response belongs to
 * @param is      response body; always closed before this method returns
 * @param charset charset the body is decoded with initially
 * @return the populated model, or an empty model when the body could not be read or decoded
 */
private URLModel readUrlModel(String link, InputStream is, Charset charset) {
    URLModel urlModel = URLModel.empty();
    try {
        byte[] byteArray = IOUtils.toByteArray(is);
        String content = ValueUtil.toString(byteArray, charset);
        if (content == null) {
            return URLModel.empty();
        }
        Document document = Jsoup.parse(content, "http");
        for (Node n : document.head().childNodes()) {
            if (!"meta".equals(n.nodeName())) {
                continue;
            }
            Charset declared = declaredCharset(n);
            // Re-decode only when the page really announces something else than what was used.
            if (declared != null && !declared.equals(charset)) {
                content = new String(byteArray, declared);
                document = Jsoup.parse(content, "http");
                break;
            }
        }
        urlModel = new URLModel(link, content);
        urlModel.setTitle(document.head().getElementsByTag("title").text());
    } catch (IOException e) {
        LOGGER.error(ValueUtil.toString(e));
    } finally {
        try {
            is.close();
        } catch (Exception e) {
            LOGGER.error(ValueUtil.toString(e));
        }
    }
    return urlModel;
}

/**
 * Returns the charset declared by a single {@code <meta>} node.
 *
 * <p>Both {@code <meta charset="utf-8">} and
 * {@code <meta http-equiv="Content-Type" content="text/html; charset=utf-8">} are
 * recognised. Only a parameter whose key is literally {@code charset} is
 * considered, so unrelated metas such as
 * {@code content="width=device-width; initial-scale=1"} are ignored instead of
 * being passed to {@link Charset#forName(String)}.
 *
 * @param meta a node whose name is {@code meta}
 * @return the declared charset, or {@code null} when none is declared or the
 *         declared name is illegal or unsupported on this JVM
 */
private Charset declaredCharset(Node meta) {
    String name = meta.attr("charset").trim();
    if (name.isEmpty()) {
        String attr = meta.attr("content");
        if (ValueUtil.isNotEmpty(attr)) {
            for (String part : attr.split(";")) {
                String[] keyValue = part.split("=", 2);
                if (keyValue.length == 2 && "charset".equalsIgnoreCase(keyValue[0].trim())) {
                    name = keyValue[1].trim();
                    break;
                }
            }
        }
    }
    if (name.isEmpty()) {
        return null;
    }
    try {
        return Charset.forName(name);
    } catch (IllegalArgumentException e) {
        // IllegalCharsetNameException and UnsupportedCharsetException both end up here;
        // keep the initially decoded content rather than dropping the page.
        LOGGER.warn("Ignoring unusable charset declared in meta tag: " + name);
        return null;
    }
}
Also used : URL(java.net.URL) ListCell(javafx.scene.control.ListCell) BiFunction(java.util.function.BiFunction) LoggerFactory(org.slf4j.LoggerFactory) BoilerpipeSAXInput(com.kohlschutter.boilerpipe.sax.BoilerpipeSAXInput) KeyValue(com.kyj.fx.voeditor.visual.framework.KeyValue) ExtractorBase(com.kohlschutter.boilerpipe.extractors.ExtractorBase) RealtimeSearchItemVO(com.kyj.fx.voeditor.visual.framework.RealtimeSearchItemVO) ByteArrayInputStream(java.io.ByteArrayInputStream) Map(java.util.Map) Element(org.jsoup.nodes.Element) IOUtil(com.kyj.fx.voeditor.visual.util.IOUtil) JFXComboBox(com.jfoenix.controls.JFXComboBox) TextField(javafx.scene.control.TextField) ResponseHandler(com.kyj.fx.voeditor.visual.util.ResponseHandler) Collection(java.util.Collection) SingleSelectionModel(javafx.scene.control.SingleSelectionModel) FXMLController(com.kyj.fx.voeditor.visual.framework.annotation.FXMLController) Set(java.util.Set) Collectors(java.util.stream.Collectors) BinaryOperator(java.util.function.BinaryOperator) Platform(javafx.application.Platform) FXML(javafx.fxml.FXML) IOUtils(org.apache.commons.io.IOUtils) Node(org.jsoup.nodes.Node) FxUtil(com.kyj.fx.voeditor.visual.util.FxUtil) List(java.util.List) Document(org.jsoup.nodes.Document) Optional(java.util.Optional) Jsoup(org.jsoup.Jsoup) Elements(org.jsoup.select.Elements) BorderPane(javafx.scene.layout.BorderPane) ListView(javafx.scene.control.ListView) TextFieldListCell(javafx.scene.control.cell.TextFieldListCell) RequestUtil(com.kyj.fx.voeditor.visual.util.RequestUtil) Function(java.util.function.Function) ArrayList(java.util.ArrayList) URLModel(com.kyj.fx.voeditor.visual.framework.URLModel) Charset(java.nio.charset.Charset) Callback(javafx.util.Callback) JFXTextArea(com.jfoenix.controls.JFXTextArea) InputSource(org.xml.sax.InputSource) WebView(javafx.scene.web.WebView) ObjectProperty(javafx.beans.property.ObjectProperty) Logger(org.slf4j.Logger) MalformedURLException(java.net.MalformedURLException) 
IOException(java.io.IOException) ValueUtil(com.kyj.fx.voeditor.visual.util.ValueUtil) StringConverter(javafx.util.StringConverter) KeepEverythingExtractor(com.kohlschutter.boilerpipe.extractors.KeepEverythingExtractor) StringReader(java.io.StringReader) SimpleObjectProperty(javafx.beans.property.SimpleObjectProperty) Collections(java.util.Collections) TextDocument(com.kohlschutter.boilerpipe.document.TextDocument) InputStream(java.io.InputStream) ArticleExtractor(com.kohlschutter.boilerpipe.extractors.ArticleExtractor) InputSource(org.xml.sax.InputSource) ExtractorBase(com.kohlschutter.boilerpipe.extractors.ExtractorBase) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Element(org.jsoup.nodes.Element) Node(org.jsoup.nodes.Node) Charset(java.nio.charset.Charset) IOException(java.io.IOException) Document(org.jsoup.nodes.Document) TextDocument(com.kohlschutter.boilerpipe.document.TextDocument) Elements(org.jsoup.select.Elements) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) URL(java.net.URL) BiFunction(java.util.function.BiFunction) StringReader(java.io.StringReader) URLModel(com.kyj.fx.voeditor.visual.framework.URLModel)

Aggregations

Node (org.jsoup.nodes.Node)16 TextNode (org.jsoup.nodes.TextNode)10 Element (org.jsoup.nodes.Element)9 Document (org.jsoup.nodes.Document)4 ArrayList (java.util.ArrayList)3 JsonObject (com.google.gson.JsonObject)2 LinkedList (java.util.LinkedList)2 Elements (org.jsoup.select.Elements)2 JsonElement (com.google.gson.JsonElement)1 JFXComboBox (com.jfoenix.controls.JFXComboBox)1 JFXTextArea (com.jfoenix.controls.JFXTextArea)1 TextDocument (com.kohlschutter.boilerpipe.document.TextDocument)1 ArticleExtractor (com.kohlschutter.boilerpipe.extractors.ArticleExtractor)1 ExtractorBase (com.kohlschutter.boilerpipe.extractors.ExtractorBase)1 KeepEverythingExtractor (com.kohlschutter.boilerpipe.extractors.KeepEverythingExtractor)1 BoilerpipeSAXInput (com.kohlschutter.boilerpipe.sax.BoilerpipeSAXInput)1 KeyValue (com.kyj.fx.voeditor.visual.framework.KeyValue)1 RealtimeSearchItemVO (com.kyj.fx.voeditor.visual.framework.RealtimeSearchItemVO)1 URLModel (com.kyj.fx.voeditor.visual.framework.URLModel)1 FXMLController (com.kyj.fx.voeditor.visual.framework.annotation.FXMLController)1