use of org.edamontology.pubfetcher.Webpage in project edammap by edamontology.
the class Processor method getProcessedQuery.
/**
 * Builds the processed (tokenized) form of a query.
 *
 * <p>The name, keywords, description, webpages, docs and publications of {@code query}
 * are handled in that order. Text is tokenized with {@code pp}; when {@code queryIdf}
 * is non-null, IDF values are additionally looked up for every non-empty token list.
 *
 * <p>Keywords, webpages, docs and publications are stored positionally: an entry that
 * produced no tokens (or a publication that could not be obtained) is recorded as
 * {@code null} rather than skipped. The one exception is a query of type
 * {@link QueryType#Bioconductor}: there, a webpage or doc link that yields no tokens is
 * removed from the query's own link collection and nothing is recorded for it.
 *
 * <p>Webpages, docs and publications are obtained through {@code FetcherCommon} using the
 * {@code database} and {@code fetcher} members visible from the enclosing scope.
 *
 * @param query the query to process; its link collections may be modified (see above)
 * @param type the query type; only {@code Bioconductor} triggers removal of unusable links
 * @param pp the pre-processor turning text into tokens
 * @param queryIdf source of IDF values, or {@code null} to skip all IDF lookups
 * @param fetcherArgs fetching parameters, also used to judge whether a page is usable
 * @return the newly created processed query
 */
public QueryProcessed getProcessedQuery(Query query, QueryType type, PreProcessor pp, Idf queryIdf, FetcherArgs fetcherArgs) {
    final QueryProcessed result = new QueryProcessed();
    final boolean hasIdf = (queryIdf != null);
    // Unusable links are pruned from the query itself only for Bioconductor queries.
    final boolean dropUnusable = (type == QueryType.Bioconductor);

    // --- name ---
    final String name = query.getName();
    if (name != null) {
        List<String> tokens = pp.process(name);
        if (!tokens.isEmpty()) {
            result.setNameTokens(tokens);
            if (hasIdf) {
                result.setNameIdfs(queryIdf.getIdf(tokens));
            }
        }
    }

    // --- keywords: one (possibly null) entry per keyword ---
    if (query.getKeywords() != null) {
        for (Keyword keyword : query.getKeywords()) {
            List<String> tokens = null;
            List<Double> idfs = null;
            String value = keyword.getValue();
            if (value != null) {
                List<String> processed = pp.process(value);
                if (!processed.isEmpty()) {
                    tokens = processed;
                    if (hasIdf) {
                        idfs = queryIdf.getIdf(processed);
                    }
                }
            }
            result.addKeywordTokens(tokens);
            result.addKeywordIdfs(idfs);
        }
    }

    // --- description ---
    final String description = query.getDescription();
    if (description != null) {
        List<String> tokens = pp.process(description);
        if (!tokens.isEmpty()) {
            result.setDescriptionTokens(tokens);
            if (hasIdf) {
                result.setDescriptionIdfs(queryIdf.getIdf(tokens));
            }
        }
    }

    // --- webpages ---
    if (query.getWebpageUrls() != null) {
        Iterator<Link> links = query.getWebpageUrls().iterator();
        while (links.hasNext()) {
            String url = links.next().getUrl();
            Webpage webpage = FetcherCommon.getWebpage(url, database, fetcher, fetcherArgs);
            List<String> tokens = null;
            List<Double> idfs = null;
            if (webpage != null && webpage.isUsable(fetcherArgs)) {
                List<String> processed = pp.process(webpage.getTitle() + " " + webpage.getContent());
                if (!processed.isEmpty()) {
                    tokens = processed;
                    if (hasIdf) {
                        idfs = queryIdf.getIdf(processed);
                    }
                }
            }
            if (dropUnusable && tokens == null) {
                // Nothing usable behind this link: drop it from the query and record nothing.
                links.remove();
                continue;
            }
            result.addWebpage(webpage);
            result.addWebpageTokens(tokens);
            result.addWebpageIdfs(idfs);
        }
    }

    // --- docs (same treatment as webpages, fetched via getDoc) ---
    if (query.getDocUrls() != null) {
        Iterator<Link> links = query.getDocUrls().iterator();
        while (links.hasNext()) {
            String url = links.next().getUrl();
            Webpage doc = FetcherCommon.getDoc(url, database, fetcher, fetcherArgs);
            List<String> tokens = null;
            List<Double> idfs = null;
            if (doc != null && doc.isUsable(fetcherArgs)) {
                List<String> processed = pp.process(doc.getTitle() + " " + doc.getContent());
                if (!processed.isEmpty()) {
                    tokens = processed;
                    if (hasIdf) {
                        idfs = queryIdf.getIdf(processed);
                    }
                }
            }
            if (dropUnusable && tokens == null) {
                links.remove();
                continue;
            }
            result.addDoc(doc);
            result.addDocTokens(tokens);
            result.addDocIdfs(idfs);
        }
    }

    // --- publications: a missing publication is recorded as a pair of nulls ---
    if (query.getPublicationIds() != null) {
        for (PublicationIdsQuery ids : query.getPublicationIds()) {
            Publication publication = FetcherCommon.getPublication(ids, database, fetcher, null, fetcherArgs);
            if (publication == null) {
                result.addPublication(null);
                result.addProcessedPublication(null);
                continue;
            }
            result.addPublication(publication);
            result.addProcessedPublication(processPublication(publication, pp, queryIdf, fetcherArgs));
        }
    }

    return result;
}
use of org.edamontology.pubfetcher.Webpage in project edammap by edamontology.
the class Report method writeLinks.
/**
 * Writes one HTML {@code with-meta} div per link to {@code writer}.
 *
 * <p>The link at index {@code i} of {@code webpageUrls} is paired with the webpage at
 * index {@code i} of {@code webpages}. Links that are {@code null} or have a
 * {@code null}/empty URL produce no output. Each remaining link is labelled with a status:
 * {@code "broken"} (webpage is {@code null} or reports broken), {@code "empty"},
 * {@code "non-final"}, or no status at all. A non-empty status is used both as a CSS class
 * and as a visible suffix, and switches the marker/box kind from {@code info} to
 * {@code warning}. The marker span and the detail box with the webpage's meta HTML are only
 * written when the webpage is non-null.
 *
 * @param fetcherArgs fetching parameters used to decide whether a webpage is final
 * @param writer destination of the generated HTML
 * @param webpageUrls links to output
 * @param webpages webpages corresponding position-wise to {@code webpageUrls}
 * @throws IOException if writing to {@code writer} fails
 */
private static void writeLinks(FetcherArgs fetcherArgs, Writer writer, List<Link> webpageUrls, List<Webpage> webpages) throws IOException {
    int index = 0;
    for (Link link : webpageUrls) {
        // Fetched before the blank-link check so both lists are always read in lockstep.
        final Webpage page = webpages.get(index++);
        if (link == null) {
            continue;
        }
        final String url = link.getUrl();
        if (url == null || url.isEmpty()) {
            continue;
        }

        final String status;
        if (page == null || page.isBroken()) {
            status = "broken";
        } else if (page.isEmpty()) {
            status = "empty";
        } else if (!page.isFinal(fetcherArgs)) {
            status = "non-final";
        } else {
            status = "";
        }

        // Fragments that depend only on the status.
        final boolean ok = status.isEmpty();
        final String spanOpen = ok ? "<span>" : "<span class=\"" + status + "\">";
        final String boxKind = ok ? "info" : "warning";

        // Summary line: link, optional type, optional status.
        writer.write("\t\t\t\t\t<div class=\"with-meta\">" + spanOpen);
        writer.write(FetcherCommon.getLinkHtml(url));
        final String summaryType = link.getType();
        if (summaryType != null && !summaryType.isEmpty()) {
            writer.write(" <span class=\"link-type\">(" + FetcherCommon.escapeHtml(summaryType) + ")</span>");
        }
        if (!ok) {
            writer.write(" (" + status + ")");
        }
        writer.write("</span><span class=\"spacer\"></span>");

        if (page == null) {
            writer.write("\n");
        } else {
            writer.write("<span class=\"" + boxKind + "\" tabindex=\"0\"></span>");
            writer.write("\n");

            // Detail box: repeats the link heading, then the webpage's meta information.
            writer.write("\t\t\t\t\t\t<div class=\"" + boxKind + "-box\" tabindex=\"0\">\n");
            writer.write("\t\t\t\t\t\t\t<h4>" + spanOpen);
            writer.write(FetcherCommon.getLinkHtml(url));
            final String headingType = link.getType();
            if (headingType != null && !headingType.isEmpty()) {
                writer.write(" <span class=\"link-type\">(" + FetcherCommon.escapeHtml(headingType) + ")</span>");
            }
            if (!ok) {
                writer.write(" (" + status + ")");
            }
            writer.write("</span></h4>\n");
            writer.write(page.toStringMetaHtml("\t\t\t\t\t\t\t"));
            writer.write("\n");
            writer.write("\t\t\t\t\t\t</div>\n");
        }

        writer.write("\t\t\t\t\t</div>\n");
    }
}
Aggregations