use of com.kohlschutter.boilerpipe.extractors.ExtractorBase in project Gargoyle by callakrsos.
the class ArticleExtractorComposite method request.
public void request(String url) {
    SingleSelectionModel<Class<? extends ExtractorBase>> selectionModel = cbAlgorisms.getSelectionModel();
    Class<? extends ExtractorBase> selectAlgorism = selectionModel.getSelectedItem();
    if (selectAlgorism != null) {
        RealtimeSearchItemVO vo = new RealtimeSearchItemVO();
        vo.setLink(url);
        request(selectAlgorism, vo);
    }
}
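For context, a minimal, self-contained sketch of what an ExtractorBase subclass does once it receives markup: it strips the boilerplate and returns the main text. The class name and HTML string below are illustrative, not taken from the Gargoyle sources.
import com.kohlschutter.boilerpipe.extractors.ArticleExtractor;
import com.kohlschutter.boilerpipe.extractors.ExtractorBase;

public class ExtractorBaseDemo {
    public static void main(String[] args) throws Exception {
        String html = "<html><head><title>Demo</title></head>"
                + "<body><p>Main article text that should survive extraction.</p></body></html>";
        // Any ExtractorBase subclass works here; ArticleExtractor targets news-style pages.
        ExtractorBase extractor = ArticleExtractor.getInstance();
        String text = extractor.getText(html);
        System.out.println(text);
    }
}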
use of com.kohlschutter.boilerpipe.extractors.ExtractorBase in project Gargoyle by callakrsos.
the class ArticleExtractorComposite method initialize.
@FXML
public void initialize() {
    cbAlgorisms.getItems().addAll(ValueUtil.HTML.getAvaliablesExtractorBase());
    cbAlgorisms.getSelectionModel().select(ArticleExtractor.class);
    cbAlgorisms.setCellFactory(new Callback<ListView<Class<? extends ExtractorBase>>, ListCell<Class<? extends ExtractorBase>>>() {
        @Override
        public ListCell<Class<? extends ExtractorBase>> call(ListView<Class<? extends ExtractorBase>> param) {
            return new TextFieldListCell<>(new StringConverter<Class<? extends ExtractorBase>>() {
                @Override
                public String toString(Class<? extends ExtractorBase> object) {
                    return object.getSimpleName();
                }
                @Override
                public Class<? extends ExtractorBase> fromString(String string) {
                    // TODO Auto-generated method stub
                    return null;
                }
            });
        }
    });
    cbAlgorisms.setConverter(new StringConverter<Class<? extends ExtractorBase>>() {
        @Override
        public String toString(Class<? extends ExtractorBase> object) {
            return object.getSimpleName();
        }
        @Override
        public Class<? extends ExtractorBase> fromString(String string) {
            // TODO Auto-generated method stub
            return null;
        }
    });
    cbAlgorisms.valueProperty().addListener((oba, o, n) -> {
        Class<? extends ExtractorBase> algorism = n;
        // webPreview.getEngine().getDocument().getBaseURI();
        String baseURI = txtUrl.getText();
        if (ValueUtil.isEmpty(baseURI))
            return;
        RealtimeSearchItemVO vo = new RealtimeSearchItemVO();
        vo.setLink(baseURI);
        try {
            URLModel htmlContent = getHTMLContent(vo);
            if (!htmlContent.isEmpty()) {
                String boilderPipe = boilderPipe(algorism, htmlContent.getContent());
                txtResult.setText(boilderPipe);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    });
    cbSmmy.valueProperty().addListener((oba, o, n) -> {
        txtSummary.setText(n.getUrl());
        txtSummary.nextWord();
        txtSummary.appendText(n.getContent());
    });
    StringConverter<URLModel> converter = new StringConverter<URLModel>() {
        @Override
        public String toString(URLModel object) {
            return String.format("[%s] - %s", object.getTitle(), object.getUrl());
        }
        @Override
        public URLModel fromString(String string) {
            return null;
        }
    };
    cbSmmy.setCellFactory(param -> {
        TextFieldListCell<URLModel> textFieldListCell = new TextFieldListCell<>(converter);
        textFieldListCell.setMaxWidth(600d);
        textFieldListCell.setPrefWidth(600d);
        return textFieldListCell;
    });
    /** Size the combo-box drop down list. */
    cbSmmy.setConverter(converter);
    Platform.runLater(() -> {
        request(userData);
        // Platform.runLater(() -> {
        // WebEngine engine = webPreview.getEngine();
        // engine.getLoadWorker().stateProperty().addListener((ChangeListener<State>)
        // (ov, oldState, newState) -> {
        // LOGGER.debug("{} - {}", newState.name(), engine.getLocation());
        //
        // if (newState == Worker.State.RUNNING) {
        // String location = engine.getLocation();
        // if (ValueUtil.isNotEmpty(location)) {
        //
        // Class<? extends ExtractorBase> algorism = cbAlgorisms.getValue();
        // RealtimeSearchItemVO vo = new RealtimeSearchItemVO();
        // vo.setLink(location);
        // try {
        // updateMainContent(algorism, getHTMLContent(vo));
        // } catch (Exception e) {
        // e.printStackTrace();
        // }
        // }
        //
        // }
        // });
        // txtUrl.textProperty().addListener((oba, o, n) -> {
        //
        // if (ValueUtil.isNotEmpty(n)) {
        // RealtimeSearchItemVO realtimeSearchItemVO = new
        // RealtimeSearchItemVO();
        // realtimeSearchItemVO.setLink(n);
        // request(userData);
        // }
        //
        // });
    });
    // engine.load(url);
    // engine.getLoadWorker().messageProperty().addListener((oba, o, n) -> {
    // LOGGER.debug("Browser Message : {}", n);
    // });
    // engine.setJavaScriptEnabled(true);
    // Handling is needed to obtain the HTML source from the engine.
    // org.w3c.dom.Document doc = engine.getDocument();
    // try {
    // Transformer transformer =
    // TransformerFactory.newInstance().newTransformer();
    // transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no");
    // transformer.setOutputProperty(OutputKeys.METHOD, "xml");
    // transformer.setOutputProperty(OutputKeys.INDENT, "yes");
    // transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
    // transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount",
    // "4");
    //
    // try (ByteArrayOutputStream outputStream = new
    // ByteArrayOutputStream()) {
    //
    // try (OutputStreamWriter writer = new OutputStreamWriter(outputStream,
    // "UTF-8")) {
    // transformer.transform(new DOMSource(doc), new StreamResult(writer));
    // Class<? extends ExtractorBase> algorism = cbAlgorisms.getValue();
    // String boilderPipe = boilderPipe(algorism,
    // outputStream.toString("UTF-8"));
    // txtResult.setText(boilderPipe);
    // }
    // }
    //
    // } catch (Exception ex) {
    // txtResult.setText(
    // String.format("[%s] Some problems occurred. \n\nStackTrace : {}",
    // newState.name(), ValueUtil.toString(ex)));
    // }
    // } else {
    // txtResult.setText("Waiting.... " + newState.name());
    // }
    // });
}
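The initialize method above populates cbAlgorisms with the available ExtractorBase subclasses and renders each entry by its simple class name. A stripped-down sketch of that setup, assuming only the public JavaFX and boilerpipe APIs; the extractor list and control names here are illustrative, not the Gargoyle ones.
import com.kohlschutter.boilerpipe.extractors.ArticleExtractor;
import com.kohlschutter.boilerpipe.extractors.DefaultExtractor;
import com.kohlschutter.boilerpipe.extractors.ExtractorBase;
import com.kohlschutter.boilerpipe.extractors.KeepEverythingExtractor;
import javafx.application.Application;
import javafx.scene.Scene;
import javafx.scene.control.ComboBox;
import javafx.scene.layout.StackPane;
import javafx.stage.Stage;
import javafx.util.StringConverter;

public class ExtractorComboDemo extends Application {
    @Override
    public void start(Stage stage) {
        ComboBox<Class<? extends ExtractorBase>> cbAlgorithms = new ComboBox<>();
        // Any ExtractorBase subclasses can be offered here.
        cbAlgorithms.getItems().addAll(ArticleExtractor.class, DefaultExtractor.class, KeepEverythingExtractor.class);
        // Render each class by its simple name; fromString is unused because the box is not editable.
        cbAlgorithms.setConverter(new StringConverter<Class<? extends ExtractorBase>>() {
            @Override
            public String toString(Class<? extends ExtractorBase> object) {
                return object == null ? "" : object.getSimpleName();
            }
            @Override
            public Class<? extends ExtractorBase> fromString(String string) {
                return null;
            }
        });
        cbAlgorithms.getSelectionModel().select(ArticleExtractor.class);
        stage.setScene(new Scene(new StackPane(cbAlgorithms), 320, 80));
        stage.show();
    }

    public static void main(String[] args) {
        launch(args);
    }
}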
use of com.kohlschutter.boilerpipe.extractors.ExtractorBase in project Gargoyle by callakrsos.
the class ArticleExtractorComposite method updateNewRealContent.
private void updateNewRealContent(Collection<String> links) {
    URLModel[] array = links.parallelStream().map(link -> {
        URLModel model = URLModel.empty();
        try {
            BiFunction<InputStream, Charset, URLModel> response = new BiFunction<InputStream, Charset, URLModel>() {
                @Override
                public URLModel apply(InputStream is, Charset charset) {
                    URLModel urlModel = URLModel.empty();
                    try {
                        byte[] byteArray = IOUtils.toByteArray(is);
                        String content = ValueUtil.toString(byteArray, charset);
                        if (content == null)
                            return URLModel.empty();
                        Document parse = Jsoup.parse(content, "http");
                        Element head = parse.head();
                        Elements title = head.getElementsByTag("title");
                        List<Node> childNodes = head.childNodes();
                        for (Node n : childNodes) {
                            if ("meta".equals(n.nodeName())) {
                                String attr = n.attr("content");
                                if (ValueUtil.isNotEmpty(attr)) {
                                    String[] split = attr.split(";");
                                    if (split != null && split.length == 2) {
                                        String[] split2 = split[1].split("=");
                                        if (split2 != null && split2.length == 2) {
                                            Charset forName = Charset.forName(split2[1]);
                                            if (!charset.equals(forName)) {
                                                content = new String(byteArray, forName);
                                                break;
                                            }
                                        }
                                    }
                                }
                            }
                        }
                        urlModel = new URLModel(link, content);
                        urlModel.setTitle(title.text());
                    } catch (IOException e) {
                        e.printStackTrace();
                    } finally {
                        try {
                            is.close();
                        } catch (Exception e) {
                            LOGGER.error(ValueUtil.toString(e));
                        }
                    }
                    return urlModel;
                }
            };
            model = RequestUtil.req200(new URL(link), response, false);
        } catch (Exception e) {
            return URLModel.empty();
        }
        return model;
    }).filter(v -> !v.isEmpty()).map(v -> {
        URLModel model = URLModel.empty();
        String url = v.getUrl();
        String content = v.getContent();
        ExtractorBase instance = null;
        if (url.contains("twitter.com")) {
            instance = KeepEverythingExtractor.INSTANCE;
        } else {
            instance = ArticleExtractor.getInstance();
        }
        InputSource source = new InputSource(new StringReader(content));
        source.setEncoding("UTF-8");
        try {
            content = ValueUtil.HTML.getNewsContent(instance, source);
            model.setUrl(v.getUrl());
            model.setContent(content);
            model.setTitle(v.getTitle());
        } catch (Exception e) {
            model = URLModel.empty();
            e.printStackTrace();
        }
        return model;
    }).filter(v -> !v.isEmpty()).toArray(URLModel[]::new);
    cbSmmy.getItems().clear();
    cbSmmy.getItems().addAll(array);
    ValueUtil.toTF_IDF(array).stream().map(mapper).reduce(accumulator).ifPresent(txtTfIdf::setText);
    cbSmmy.getSelectionModel().select(0);
}
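The per-link work above boils down to: fetch the page, recover its title and charset, then run a boilerpipe extractor over the markup (KeepEverythingExtractor for twitter.com, ArticleExtractor otherwise). A minimal sketch of that flow, assuming Jsoup for fetching in place of RequestUtil.req200 and the plain ExtractorBase.getText(InputSource) call in place of the project's ValueUtil.HTML.getNewsContent helper; the URL is illustrative.
import java.io.StringReader;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.xml.sax.InputSource;
import com.kohlschutter.boilerpipe.extractors.ArticleExtractor;
import com.kohlschutter.boilerpipe.extractors.ExtractorBase;
import com.kohlschutter.boilerpipe.extractors.KeepEverythingExtractor;

public class LinkSummarySketch {
    public static void main(String[] args) throws Exception {
        String link = "https://example.com/article";   // illustrative URL
        Document page = Jsoup.connect(link).get();      // fetch and parse the page
        String title = page.title();
        // Same choice as the composite: keep everything for Twitter pages,
        // otherwise extract only the main article body.
        ExtractorBase extractor = link.contains("twitter.com")
                ? KeepEverythingExtractor.INSTANCE
                : ArticleExtractor.getInstance();
        InputSource source = new InputSource(new StringReader(page.outerHtml()));
        source.setEncoding("UTF-8");
        String body = extractor.getText(source);         // boilerpipe strips the boilerplate
        System.out.println("[" + title + "] " + link);
        System.out.println(body);
    }
}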
use of com.kohlschutter.boilerpipe.extractors.ExtractorBase in project Gargoyle by callakrsos.
the class TF_IDF method getString.
public void getString(Collection<String> links) {
    URLModel[] array = links.parallelStream().map(link -> {
        URLModel model = URLModel.empty();
        try {
            ResponseHandler<URLModel> responseHandler = new ResponseHandler<URLModel>() {
                @Override
                public URLModel apply(InputStream is, Integer code) {
                    if (code == 200) {
                        return new URLModel(link, ValueUtil.toString(is));
                    }
                    return URLModel.empty();
                }
            };
            if (link.startsWith("https")) {
                model = RequestUtil.requestSSL(new URL(link), responseHandler);
            } else {
                model = RequestUtil.request(new URL(link), responseHandler);
            }
        } catch (Exception e) {
            return URLModel.empty();
        }
        return model;
    }).filter(v -> !v.isEmpty()).map(v -> {
        String content = v.getContent();
        ExtractorBase instance = ArticleExtractor.getInstance();
        InputSource source = new InputSource(new StringReader(content));
        source.setEncoding("UTF-8");
        try {
            content = ValueUtil.HTML.getNewsContent(instance, source);
            v.setContent(content);
        } catch (Exception e) {
            v = URLModel.empty();
            e.printStackTrace();
        }
        return v;
    }).filter(v -> !v.isEmpty()).toArray(URLModel[]::new);
    List<KeyValue> tf_IDF = ValueUtil.toTF_IDF(array);
    tf_IDF.forEach(v -> {
        System.out.println(v.toString());
    });
}
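ValueUtil.toTF_IDF is a Gargoyle helper whose output format is not shown in this listing. As a rough, assumption-labeled illustration of what a TF-IDF pass over the extracted contents involves, here is a self-contained sketch; the sample documents are hypothetical stand-ins for the boilerpipe-extracted article bodies.
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class TfIdfSketch {
    public static void main(String[] args) {
        // Hypothetical extracted article bodies.
        List<String> docs = List.of(
                "stock market rises on trade hopes",
                "market falls as trade talks stall");
        // Term frequencies per document and document frequency per term.
        List<Map<String, Integer>> tfs = new ArrayList<>();
        Map<String, Integer> df = new HashMap<>();
        for (String doc : docs) {
            Map<String, Integer> tf = new HashMap<>();
            for (String term : doc.toLowerCase().split("\\s+")) {
                tf.merge(term, 1, Integer::sum);
            }
            tf.keySet().forEach(term -> df.merge(term, 1, Integer::sum));
            tfs.add(tf);
        }
        // TF-IDF = tf * ln(N / df); terms shared by every document score 0.
        int n = docs.size();
        for (int i = 0; i < tfs.size(); i++) {
            final int docIndex = i;
            tfs.get(i).forEach((term, count) -> System.out.printf(
                    "doc %d: %-8s %.3f%n", docIndex, term, count * Math.log((double) n / df.get(term))));
        }
    }
}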