Examples with ExtractorBase - com.kohlschutter.boilerpipe.extractors.ExtractorBase

Example 1 with ExtractorBase

use of com.kohlschutter.boilerpipe.extractors.ExtractorBase in project Gargoyle by callakrsos.

In class ArticleExtractorComposite, method request.

/**
 * Starts a request for {@code url} using the extractor class currently
 * selected in {@code cbAlgorisms}.
 *
 * <p>The URL is wrapped in a new {@link RealtimeSearchItemVO} and handed to the
 * two-argument {@code request} overload. Nothing happens when no extractor is
 * selected.
 *
 * @param url the link to store on the request item
 */
public void request(String url) {
    Class<? extends ExtractorBase> selectAlgorism = cbAlgorisms.getSelectionModel().getSelectedItem();
    if (selectAlgorism == null) {
        // No algorithm chosen: there is nothing to run the request with.
        return;
    }

    RealtimeSearchItemVO item = new RealtimeSearchItemVO();
    item.setLink(url);
    request(selectAlgorism, item);
}

Also used : ExtractorBase(com.kohlschutter.boilerpipe.extractors.ExtractorBase) RealtimeSearchItemVO(com.kyj.fx.voeditor.visual.framework.RealtimeSearchItemVO)

Example 2 with ExtractorBase

use of com.kohlschutter.boilerpipe.extractors.ExtractorBase in project Gargoyle by callakrsos.

In class ArticleExtractorComposite, method initialize.

/**
 * FXML initialization hook: configures both combo boxes and schedules the
 * initial request.
 *
 * <ul>
 * <li>{@code cbAlgorisms} is filled from
 * {@code ValueUtil.HTML.getAvaliablesExtractorBase()}, preselects
 * {@link ArticleExtractor} and renders each entry by its simple class name.
 * Changing its value re-runs the extraction for the URL in {@code txtUrl} and
 * writes the result to {@code txtResult}.</li>
 * <li>{@code cbSmmy} renders entries as {@code "[title] - url"}; changing its
 * value copies the entry's URL and content into {@code txtSummary}.</li>
 * </ul>
 *
 * <p>Both listeners and both converters tolerate a {@code null} value, which a
 * combo box reports when its selection is cleared.
 */
@FXML
public void initialize() {
    cbAlgorisms.getItems().addAll(ValueUtil.HTML.getAvaliablesExtractorBase());
    cbAlgorisms.getSelectionModel().select(ArticleExtractor.class);

    // A single converter is shared by the drop-down cells and the combo box
    // itself, mirroring how cbSmmy is configured below.
    StringConverter<Class<? extends ExtractorBase>> algorismConverter = new StringConverter<Class<? extends ExtractorBase>>() {

        @Override
        public String toString(Class<? extends ExtractorBase> object) {
            // Render "no value" as empty text instead of throwing.
            return object == null ? "" : object.getSimpleName();
        }

        @Override
        public Class<? extends ExtractorBase> fromString(String string) {
            // Converting text back to an extractor class is not implemented.
            return null;
        }
    };
    cbAlgorisms.setCellFactory(param -> new TextFieldListCell<Class<? extends ExtractorBase>>(algorismConverter));
    cbAlgorisms.setConverter(algorismConverter);

    cbAlgorisms.valueProperty().addListener((oba, o, n) -> {
        String baseURI = txtUrl.getText();
        // Without an algorithm or a URL there is nothing to extract.
        if (n == null || ValueUtil.isEmpty(baseURI)) {
            return;
        }
        RealtimeSearchItemVO vo = new RealtimeSearchItemVO();
        vo.setLink(baseURI);
        try {
            URLModel htmlContent = getHTMLContent(vo);
            if (!htmlContent.isEmpty()) {
                String boilderPipe = boilderPipe(n, htmlContent.getContent());
                txtResult.setText(boilderPipe);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    });

    cbSmmy.valueProperty().addListener((oba, o, n) -> {
        // The value becomes null when the selection is cleared; keep the
        // current summary text rather than failing inside the listener.
        if (n == null) {
            return;
        }
        txtSummary.setText(n.getUrl());
        txtSummary.nextWord();
        txtSummary.appendText(n.getContent());
    });

    StringConverter<URLModel> converter = new StringConverter<URLModel>() {

        @Override
        public String toString(URLModel object) {
            if (object == null) {
                return "";
            }
            return String.format("[%s] - %s", object.getTitle(), object.getUrl());
        }

        @Override
        public URLModel fromString(String string) {
            // Converting text back to a URLModel is not implemented.
            return null;
        }
    };

    // Size the cells of the combo-box drop-down list.
    cbSmmy.setCellFactory(param -> {
        TextFieldListCell<URLModel> textFieldListCell = new TextFieldListCell<>(converter);
        textFieldListCell.setMaxWidth(600d);
        textFieldListCell.setPrefWidth(600d);
        return textFieldListCell;
    });
    cbSmmy.setConverter(converter);

    // Deferred to a later pulse of the FX thread; presumably so that userData
    // has been assigned by the time the request runs - TODO confirm.
    Platform.runLater(() -> request(userData));
}

Also used : TextFieldListCell(javafx.scene.control.cell.TextFieldListCell) ExtractorBase(com.kohlschutter.boilerpipe.extractors.ExtractorBase) ListCell(javafx.scene.control.ListCell) TextFieldListCell(javafx.scene.control.cell.TextFieldListCell) RealtimeSearchItemVO(com.kyj.fx.voeditor.visual.framework.RealtimeSearchItemVO) StringConverter(javafx.util.StringConverter) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) ListView(javafx.scene.control.ListView) URLModel(com.kyj.fx.voeditor.visual.framework.URLModel) FXML(javafx.fxml.FXML)

Example 3 with ExtractorBase

use of com.kohlschutter.boilerpipe.extractors.ExtractorBase in project Gargoyle by callakrsos.

In class ArticleExtractorComposite, method updateNewRealContent.

/**
 * Downloads every link, reduces each page to its main text and refreshes the
 * summary combo box and the TF-IDF text field with the result.
 *
 * <p>Links that cannot be downloaded, decoded or extracted are logged and left
 * out. The links are processed with a parallel stream, so the per-link helpers
 * must not touch shared mutable state.
 *
 * @param links the URLs to download
 */
private void updateNewRealContent(Collection<String> links) {
    URLModel[] array = links.parallelStream()
            .map(this::downloadUrlModel)
            .filter(v -> !v.isEmpty())
            .map(this::toArticleModel)
            .filter(v -> !v.isEmpty())
            .toArray(URLModel[]::new);
    cbSmmy.getItems().clear();
    cbSmmy.getItems().addAll(array);
    ValueUtil.toTF_IDF(array).stream().map(mapper).reduce(accumulator).ifPresent(txtTfIdf::setText);
    cbSmmy.getSelectionModel().select(0);
}

/** Downloads {@code link} and decodes the response; returns an empty model on any failure. */
private URLModel downloadUrlModel(String link) {
    try {
        BiFunction<InputStream, Charset, URLModel> response = (is, charset) -> decodeUrlModel(link, is, charset);
        URLModel model = RequestUtil.req200(new URL(link), response, false);
        return model == null ? URLModel.empty() : model;
    } catch (Exception e) {
        // Best effort: one bad link must not abort the whole batch, but leave a trace.
        LOGGER.error(ValueUtil.toString(e));
        return URLModel.empty();
    }
}

/**
 * Reads the response body into a model holding the raw HTML and the page title.
 * The body is first decoded with {@code charset}; if the document declares a
 * different, valid charset in a meta tag, it is decoded and parsed again so that
 * both content and title use the declared encoding.
 */
private URLModel decodeUrlModel(String link, InputStream is, Charset charset) {
    try {
        byte[] byteArray = IOUtils.toByteArray(is);
        String content = ValueUtil.toString(byteArray, charset);
        if (content == null) {
            return URLModel.empty();
        }
        Document parsed = Jsoup.parse(content, "http");
        Charset declared = findDeclaredCharset(parsed.head(), charset);
        if (declared != null) {
            content = new String(byteArray, declared);
            // Re-parse so the title is not taken from the mis-decoded text.
            parsed = Jsoup.parse(content, "http");
        }
        URLModel urlModel = new URLModel(link, content);
        urlModel.setTitle(parsed.head().getElementsByTag("title").text());
        return urlModel;
    } catch (IOException e) {
        LOGGER.error(ValueUtil.toString(e));
        return URLModel.empty();
    } finally {
        try {
            is.close();
        } catch (Exception e) {
            LOGGER.error(ValueUtil.toString(e));
        }
    }
}

/**
 * Looks through the direct {@code meta} children of {@code head} for a
 * {@code content="...; charset=NAME"} declaration naming a charset other than
 * {@code current}.
 *
 * @return the first such charset, or {@code null} when none is declared
 */
private Charset findDeclaredCharset(Element head, Charset current) {
    for (Node n : head.childNodes()) {
        if (!"meta".equals(n.nodeName())) {
            continue;
        }
        String attr = n.attr("content");
        if (!ValueUtil.isNotEmpty(attr)) {
            continue;
        }
        String[] parts = attr.split(";");
        if (parts.length != 2) {
            continue;
        }
        String[] keyValue = parts[1].split("=");
        // Only "charset=..." counts; other two-part values such as
        // "width=device-width; initial-scale=1" are not encodings.
        if (keyValue.length != 2 || !"charset".equalsIgnoreCase(keyValue[0].trim())) {
            continue;
        }
        String name = keyValue[1].trim();
        try {
            Charset declared = Charset.forName(name);
            if (!declared.equals(current)) {
                return declared;
            }
        } catch (IllegalArgumentException e) {
            // Illegal or unsupported charset name: keep the current decoding.
            LOGGER.debug("Ignoring unusable charset declaration: {}", name);
        }
    }
    return null;
}

/**
 * Runs the boilerpipe extraction on a downloaded page and returns a new model
 * with the same URL and title but the extracted text as content. twitter.com
 * URLs keep all text; every other URL uses the article extractor. Returns an
 * empty model when extraction fails.
 */
private URLModel toArticleModel(URLModel v) {
    String url = v.getUrl();
    ExtractorBase instance;
    if (url.contains("twitter.com")) {
        instance = KeepEverythingExtractor.INSTANCE;
    } else {
        instance = ArticleExtractor.getInstance();
    }
    InputSource source = new InputSource(new StringReader(v.getContent()));
    source.setEncoding("UTF-8");
    try {
        String content = ValueUtil.HTML.getNewsContent(instance, source);
        URLModel model = URLModel.empty();
        model.setUrl(url);
        model.setContent(content);
        model.setTitle(v.getTitle());
        return model;
    } catch (Exception e) {
        LOGGER.error(ValueUtil.toString(e));
        return URLModel.empty();
    }
}

Also used : URL(java.net.URL) ListCell(javafx.scene.control.ListCell) BiFunction(java.util.function.BiFunction) LoggerFactory(org.slf4j.LoggerFactory) BoilerpipeSAXInput(com.kohlschutter.boilerpipe.sax.BoilerpipeSAXInput) KeyValue(com.kyj.fx.voeditor.visual.framework.KeyValue) ExtractorBase(com.kohlschutter.boilerpipe.extractors.ExtractorBase) RealtimeSearchItemVO(com.kyj.fx.voeditor.visual.framework.RealtimeSearchItemVO) ByteArrayInputStream(java.io.ByteArrayInputStream) Map(java.util.Map) Element(org.jsoup.nodes.Element) IOUtil(com.kyj.fx.voeditor.visual.util.IOUtil) JFXComboBox(com.jfoenix.controls.JFXComboBox) TextField(javafx.scene.control.TextField) ResponseHandler(com.kyj.fx.voeditor.visual.util.ResponseHandler) Collection(java.util.Collection) SingleSelectionModel(javafx.scene.control.SingleSelectionModel) FXMLController(com.kyj.fx.voeditor.visual.framework.annotation.FXMLController) Set(java.util.Set) Collectors(java.util.stream.Collectors) BinaryOperator(java.util.function.BinaryOperator) Platform(javafx.application.Platform) FXML(javafx.fxml.FXML) IOUtils(org.apache.commons.io.IOUtils) Node(org.jsoup.nodes.Node) FxUtil(com.kyj.fx.voeditor.visual.util.FxUtil) List(java.util.List) Document(org.jsoup.nodes.Document) Optional(java.util.Optional) Jsoup(org.jsoup.Jsoup) Elements(org.jsoup.select.Elements) BorderPane(javafx.scene.layout.BorderPane) ListView(javafx.scene.control.ListView) TextFieldListCell(javafx.scene.control.cell.TextFieldListCell) RequestUtil(com.kyj.fx.voeditor.visual.util.RequestUtil) Function(java.util.function.Function) ArrayList(java.util.ArrayList) URLModel(com.kyj.fx.voeditor.visual.framework.URLModel) Charset(java.nio.charset.Charset) Callback(javafx.util.Callback) JFXTextArea(com.jfoenix.controls.JFXTextArea) InputSource(org.xml.sax.InputSource) WebView(javafx.scene.web.WebView) ObjectProperty(javafx.beans.property.ObjectProperty) Logger(org.slf4j.Logger) MalformedURLException(java.net.MalformedURLException) 
IOException(java.io.IOException) ValueUtil(com.kyj.fx.voeditor.visual.util.ValueUtil) StringConverter(javafx.util.StringConverter) KeepEverythingExtractor(com.kohlschutter.boilerpipe.extractors.KeepEverythingExtractor) StringReader(java.io.StringReader) SimpleObjectProperty(javafx.beans.property.SimpleObjectProperty) Collections(java.util.Collections) TextDocument(com.kohlschutter.boilerpipe.document.TextDocument) InputStream(java.io.InputStream) ArticleExtractor(com.kohlschutter.boilerpipe.extractors.ArticleExtractor) InputSource(org.xml.sax.InputSource) ExtractorBase(com.kohlschutter.boilerpipe.extractors.ExtractorBase) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Element(org.jsoup.nodes.Element) Node(org.jsoup.nodes.Node) Charset(java.nio.charset.Charset) IOException(java.io.IOException) Document(org.jsoup.nodes.Document) TextDocument(com.kohlschutter.boilerpipe.document.TextDocument) Elements(org.jsoup.select.Elements) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) URL(java.net.URL) BiFunction(java.util.function.BiFunction) StringReader(java.io.StringReader) URLModel(com.kyj.fx.voeditor.visual.framework.URLModel)

Example 4 with ExtractorBase

use of com.kohlschutter.boilerpipe.extractors.ExtractorBase in project Gargoyle by callakrsos.

In class TF_IDF, method getString.

/**
 * Downloads each link, extracts the article text with {@link ArticleExtractor},
 * computes TF-IDF over the surviving pages via {@code ValueUtil.toTF_IDF} and
 * prints every resulting entry to standard output.
 *
 * <p>Links that fail to download (any exception, or a status code other than
 * 200) or whose extraction fails are dropped.
 *
 * @param links the URLs to download
 */
public void getString(Collection<String> links) {
    URLModel[] array = links.parallelStream().map(link -> {
        try {
            ResponseHandler<URLModel> handler = new ResponseHandler<URLModel>() {

                @Override
                public URLModel apply(InputStream is, Integer code) {
                    // Only a 200 response yields a non-empty model.
                    if (code != 200) {
                        return URLModel.empty();
                    }
                    return new URLModel(link, ValueUtil.toString(is));
                }
            };
            URL target = new URL(link);
            if (link.startsWith("https")) {
                return RequestUtil.requestSSL(target, handler);
            }
            return RequestUtil.request(target, handler);
        } catch (Exception e) {
            return URLModel.empty();
        }
    }).filter(page -> !page.isEmpty()).map(page -> {
        ExtractorBase extractor = ArticleExtractor.getInstance();
        InputSource html = new InputSource(new StringReader(page.getContent()));
        html.setEncoding("UTF-8");
        try {
            // Replace the raw HTML with the extracted text on the same model.
            page.setContent(ValueUtil.HTML.getNewsContent(extractor, html));
            return page;
        } catch (Exception e) {
            e.printStackTrace();
            return URLModel.empty();
        }
    }).filter(page -> !page.isEmpty()).toArray(URLModel[]::new);

    for (KeyValue entry : ValueUtil.toTF_IDF(array)) {
        System.out.println(entry.toString());
    }
}

Also used : URL(java.net.URL) RequestUtil(com.kyj.fx.voeditor.visual.util.RequestUtil) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) BoilerpipeSAXInput(com.kohlschutter.boilerpipe.sax.BoilerpipeSAXInput) KeyValue(com.kyj.fx.voeditor.visual.framework.KeyValue) ExtractorBase(com.kohlschutter.boilerpipe.extractors.ExtractorBase) URLModel(com.kyj.fx.voeditor.visual.framework.URLModel) Before(org.junit.Before) InputSource(org.xml.sax.InputSource) ProxyInitializable(com.kyj.fx.voeditor.visual.main.initalize.ProxyInitializable) Logger(org.slf4j.Logger) ResponseHandler(com.kyj.fx.voeditor.visual.util.ResponseHandler) MalformedURLException(java.net.MalformedURLException) Collection(java.util.Collection) Set(java.util.Set) IOException(java.io.IOException) Test(org.junit.Test) ValueUtil(com.kyj.fx.voeditor.visual.util.ValueUtil) ArticleSentencesExtractor(com.kohlschutter.boilerpipe.extractors.ArticleSentencesExtractor) Collectors(java.util.stream.Collectors) List(java.util.List) KeepEverythingExtractor(com.kohlschutter.boilerpipe.extractors.KeepEverythingExtractor) StringReader(java.io.StringReader) Document(org.jsoup.nodes.Document) Jsoup(org.jsoup.Jsoup) Elements(org.jsoup.select.Elements) Collections(java.util.Collections) TextDocument(com.kohlschutter.boilerpipe.document.TextDocument) InputStream(java.io.InputStream) ArticleExtractor(com.kohlschutter.boilerpipe.extractors.ArticleExtractor) InputSource(org.xml.sax.InputSource) KeyValue(com.kyj.fx.voeditor.visual.framework.KeyValue) ResponseHandler(com.kyj.fx.voeditor.visual.util.ResponseHandler) ExtractorBase(com.kohlschutter.boilerpipe.extractors.ExtractorBase) InputStream(java.io.InputStream) URL(java.net.URL) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) StringReader(java.io.StringReader) URLModel(com.kyj.fx.voeditor.visual.framework.URLModel)

Aggregations

ExtractorBase (com.kohlschutter.boilerpipe.extractors.ExtractorBase)4 RealtimeSearchItemVO (com.kyj.fx.voeditor.visual.framework.RealtimeSearchItemVO)3 URLModel (com.kyj.fx.voeditor.visual.framework.URLModel)3 IOException (java.io.IOException)3 MalformedURLException (java.net.MalformedURLException)3 TextDocument (com.kohlschutter.boilerpipe.document.TextDocument)2 ArticleExtractor (com.kohlschutter.boilerpipe.extractors.ArticleExtractor)2 KeepEverythingExtractor (com.kohlschutter.boilerpipe.extractors.KeepEverythingExtractor)2 BoilerpipeSAXInput (com.kohlschutter.boilerpipe.sax.BoilerpipeSAXInput)2 KeyValue (com.kyj.fx.voeditor.visual.framework.KeyValue)2 RequestUtil (com.kyj.fx.voeditor.visual.util.RequestUtil)2 ResponseHandler (com.kyj.fx.voeditor.visual.util.ResponseHandler)2 ValueUtil (com.kyj.fx.voeditor.visual.util.ValueUtil)2 InputStream (java.io.InputStream)2 StringReader (java.io.StringReader)2 URL (java.net.URL)2 Collection (java.util.Collection)2 Collections (java.util.Collections)2 List (java.util.List)2 Set (java.util.Set)2