Use of org.jsoup.nodes.Node in project Gargoyle by callakrsos.
From the class ArticleExtractorComposite, method updateNewRealContent.
/**
 * Downloads every link, extracts the article text of each successfully fetched page and
 * publishes the results to the UI controls.
 *
 * <p>Links are processed through {@code parallelStream()}. Pages that fail to download, decode
 * or extract are dropped (they become empty {@code URLModel}s and are filtered out). Afterwards
 * the items of {@code cbSmmy} are replaced with the extracted models, the TF-IDF result of the
 * models is reduced into {@code txtTfIdf}, and the first combo-box entry is selected.
 *
 * @param links URLs to download and analyse
 */
private void updateNewRealContent(Collection<String> links) {
    URLModel[] array = links.parallelStream()
            .map(this::downloadUrlModel)
            .filter(v -> !v.isEmpty())
            .map(this::extractRealContent)
            .filter(v -> !v.isEmpty())
            .toArray(URLModel[]::new);

    cbSmmy.getItems().clear();
    cbSmmy.getItems().addAll(array);
    ValueUtil.toTF_IDF(array).stream().map(mapper).reduce(accumulator).ifPresent(txtTfIdf::setText);
    cbSmmy.getSelectionModel().select(0);
}

/**
 * Requests a single link and converts the response body into a {@code URLModel}.
 *
 * @param link URL to request
 * @return the populated model, or an empty model when the request or decoding failed
 */
private URLModel downloadUrlModel(String link) {
    try {
        BiFunction<InputStream, Charset, URLModel> response = (is, charset) -> readUrlModel(link, is, charset);
        URLModel model = RequestUtil.req200(new URL(link), response, false);
        // Guard against a null result so the following isEmpty() filter cannot throw.
        return model != null ? model : URLModel.empty();
    } catch (Exception e) {
        // Best effort: one bad link must not abort the whole batch, but leave a trace of it.
        LOGGER.error(ValueUtil.toString(e));
        return URLModel.empty();
    }
}

/**
 * Reads the response stream, decodes it and builds a model holding the raw HTML and its title.
 *
 * <p>The body is first decoded with {@code charset}. If a {@code <meta>} element in the head
 * declares a different charset, the bytes are decoded again with the declared charset and the
 * document is re-parsed, so that the title is taken from the correctly decoded text.
 * The stream is always closed before returning.
 *
 * @param link    URL the stream was obtained from
 * @param is      response body stream
 * @param charset charset supplied by the caller for the initial decoding
 * @return the populated model, or an empty model when reading failed or nothing could be decoded
 */
private URLModel readUrlModel(String link, InputStream is, Charset charset) {
    URLModel urlModel = URLModel.empty();
    try {
        byte[] byteArray = IOUtils.toByteArray(is);
        String content = ValueUtil.toString(byteArray, charset);
        if (content == null) {
            return URLModel.empty();
        }

        Document parse = Jsoup.parse(content, "http");
        Charset declared = findOverridingCharset(parse.head(), charset);
        if (declared != null) {
            content = new String(byteArray, declared);
            // Re-parse: the first document was built from wrongly decoded text.
            parse = Jsoup.parse(content, "http");
        }

        urlModel = new URLModel(link, content);
        urlModel.setTitle(parse.head().getElementsByTag("title").text());
    } catch (IOException e) {
        LOGGER.error(ValueUtil.toString(e));
    } finally {
        try {
            is.close();
        } catch (Exception e) {
            LOGGER.error(ValueUtil.toString(e));
        }
    }
    return urlModel;
}

/**
 * Scans the direct {@code <meta>} children of the head for a {@code content} attribute of the
 * form {@code "<type>; charset=<name>"} and returns the first declared charset that differs
 * from {@code charset}.
 *
 * <p>Values that are not a charset declaration, or that name an illegal or unsupported charset,
 * are skipped instead of raising an exception.
 *
 * @param head    head element of the parsed document
 * @param charset charset already used for decoding
 * @return the first differing declared charset, or {@code null} when there is none
 */
private Charset findOverridingCharset(Element head, Charset charset) {
    for (Node n : head.childNodes()) {
        if (!"meta".equals(n.nodeName())) {
            continue;
        }
        String attr = n.attr("content");
        if (!ValueUtil.isNotEmpty(attr)) {
            continue;
        }
        String[] split = attr.split(";");
        if (split.length != 2) {
            continue;
        }
        String[] split2 = split[1].split("=");
        if (split2.length != 2 || !"charset".equalsIgnoreCase(split2[0].trim())) {
            continue;
        }
        try {
            Charset declared = Charset.forName(split2[1].trim());
            if (!declared.equals(charset)) {
                return declared;
            }
        } catch (IllegalArgumentException e) {
            // Illegal or unsupported charset name: ignore this declaration and keep looking.
            LOGGER.error(ValueUtil.toString(e));
        }
    }
    return null;
}

/**
 * Runs the article extractor over the raw HTML of a downloaded page.
 *
 * <p>URLs containing {@code "twitter.com"} use {@code KeepEverythingExtractor}; every other URL
 * uses {@code ArticleExtractor}.
 *
 * @param v model holding the URL, raw HTML content and title of a downloaded page
 * @return a model with the same URL and title and the extracted content, or an empty model
 *         when extraction failed
 */
private URLModel extractRealContent(URLModel v) {
    String url = v.getUrl();
    ExtractorBase instance;
    if (url.contains("twitter.com")) {
        instance = KeepEverythingExtractor.INSTANCE;
    } else {
        instance = ArticleExtractor.getInstance();
    }

    InputSource source = new InputSource(new StringReader(v.getContent()));
    source.setEncoding("UTF-8");
    try {
        String extracted = ValueUtil.HTML.getNewsContent(instance, source);
        URLModel model = URLModel.empty();
        model.setUrl(url);
        model.setContent(extracted);
        model.setTitle(v.getTitle());
        return model;
    } catch (Exception e) {
        LOGGER.error(ValueUtil.toString(e));
        return URLModel.empty();
    }
}
Aggregations