Examples with Elements - org.jsoup.select.Elements

Example 46 with Elements

use of org.jsoup.select.Elements in project SpiderForUESTC by ClaudiusGitHub.

the class FindComent method findCommentByUid.

/**
 * Fetches one page of comments and outputs every comment block that mentions {@code uid}.
 *
 * <p>The request is retried up to 10 times while the status code is not 200. The body of
 * the last response is processed regardless of its final status.
 *
 * @param commentId identifier inserted into the comment URL
 * @param tid       thread id; appended to {@code url} in each output line
 * @param page      which page of comments to fetch
 * @return {@code true} if the response contains the "下一页" (next page) marker, meaning
 *         there is a further page of comments; {@code false} otherwise, including when
 *         the request fails with an {@link IOException}
 */
public boolean findCommentByUid(String commentId, int tid, int page) {
    // Number of characters stripped from the start and the end of the body before parsing.
    // NOTE(review): presumably a fixed wrapper around the HTML fragment - confirm.
    final int prefixLength = 55;
    final int suffixLength = 10;

    boolean result = false;
    String commentUrl = commentUrlPre + commentId + commentUrlMid + page + commentUrlSuf;
    HttpGet httpGet = new HttpGet(commentUrl);
    httpGet.setConfig(requestConfig);
    try {
        CloseableHttpResponse httpResponse = httpClient.execute(httpGet);
        int times = 10;
        while (httpResponse.getStatusLine().getStatusCode() != 200 && times > 0) {
            // Close the failed response before retrying so its connection is not leaked.
            httpResponse.close();
            httpResponse = httpClient.execute(httpGet);
            times--;
        }
        try {
            HttpEntity httpEntity = httpResponse.getEntity();
            if (httpEntity == null) {
                // No body at all: nothing to scan, and EntityUtils rejects a null entity.
                return result;
            }
            String response = EntityUtils.toString(httpEntity);
            if (response.contains("下一页")) {
                result = true;
            }
            String uidText = "" + uid;
            if (!response.contains(uidText)) {
                return result;
            }
            int length = response.length();
            if (length < prefixLength + suffixLength) {
                // Too short to strip the wrapper; substring() would throw here.
                return result;
            }
            response = response.substring(prefixLength, length - suffixLength);
            Document document = Jsoup.parse(response);
            Elements elements = document.getElementsByClass("psti");
            for (Element element : elements) {
                if (element.toString().contains(uidText)) {
                    System.out.print(element.text());
                    output(url + tid + "   " + element.text());
                }
            }
        } finally {
            httpResponse.close();
        }
    } catch (IOException e) {
        httpGet.releaseConnection();
        e.printStackTrace();
    }
    return result;
}

Also used : HttpEntity(org.apache.http.HttpEntity) HttpGet(org.apache.http.client.methods.HttpGet) Element(org.jsoup.nodes.Element) CloseableHttpResponse(org.apache.http.client.methods.CloseableHttpResponse) IOException(java.io.IOException) Document(org.jsoup.nodes.Document) Elements(org.jsoup.select.Elements)

Example 47 with Elements

use of org.jsoup.select.Elements in project jabref by JabRef.

the class GoogleScholar method findFullText.

/**
 * Searches for a full-text link by querying {@code SEARCH_IN_TITLE_URL} with the entry's
 * title as an exact phrase restricted to titles.
 *
 * @param entry the entry to look up; must not be {@code null}
 * @return the first non-empty link found among the first {@code NUM_RESULTS} result
 *         slots, or an empty Optional if the entry has no title or no link is found
 * @throws IOException      if fetching the result page fails or the link is not a valid URL
 * @throws FetcherException if the search URI cannot be built
 */
@Override
public Optional<URL> findFullText(BibEntry entry) throws IOException, FetcherException {
    Objects.requireNonNull(entry);
    // Search in title
    if (!entry.hasField(FieldName.TITLE)) {
        return Optional.empty();
    }

    String searchUrl;
    try {
        URIBuilder uriBuilder = new URIBuilder(SEARCH_IN_TITLE_URL);
        uriBuilder.addParameter("as_q", "");
        uriBuilder.addParameter("as_epq", entry.getField(FieldName.TITLE).orElse(null));
        uriBuilder.addParameter("as_occt", "title");
        searchUrl = uriBuilder.toString();
    } catch (URISyntaxException e) {
        throw new FetcherException("Building URI failed.", e);
    }

    Document doc = Jsoup.connect(searchUrl).userAgent(URLDownload.USER_AGENT).get();
    // TODO: link always on first result or none?
    for (int i = 0; i < NUM_RESULTS; i++) {
        Elements link = doc.select("#gs_ggsW" + i + " a");
        if (link.isEmpty()) {
            continue;
        }
        String href = link.first().attr("href");
        // link present?
        if (href.isEmpty()) {
            continue;
        }
        // TODO: check title inside pdf + length?
        // TODO: report error function needed?! query -> result
        LOGGER.info("Fulltext PDF found @ Google: " + href);
        return Optional.of(new URL(href));
    }
    return Optional.empty();
}

Also used : FetcherException(org.jabref.logic.importer.FetcherException) URISyntaxException(java.net.URISyntaxException) Document(org.jsoup.nodes.Document) Elements(org.jsoup.select.Elements) URL(java.net.URL) URIBuilder(org.apache.http.client.utils.URIBuilder)

Example 48 with Elements

use of org.jsoup.select.Elements in project Gargoyle by callakrsos.

the class ArticleExtractorComposite method findAllLink.

/**
 * Collects the distinct link targets of all {@code <a>} tags in the given HTML.
 *
 * <p>Only anchors with an {@code href} attribute are considered. Each value is trimmed,
 * must start with "http" (which also covers "https"), and must pass the predicate
 * supplied by {@code filterRepository.getFilter()}.
 *
 * <p>Author: KYJ. Date: 2016-12-05.
 *
 * @param content HTML text to scan
 * @return the set of matching link targets
 */
private Collection<String> findAllLink(String content) {
    // Parse the String directly. Encoding it with the platform default charset and then
    // decoding the bytes as UTF-8 corrupts non-ASCII text on non-UTF-8 platforms.
    Document parse = Jsoup.parse(content, "https");
    // These links should eventually be stored and managed through a repository.
    /* Extract only <a> tags. */
    Elements elementsByTag = parse.getElementsByTag("a");
    return elementsByTag.stream()
            .filter(e -> e.hasAttr("href"))
            .map(e -> e.attr("href").trim())
            .filter(e -> e.startsWith("http"))
            .filter(filterRepository.getFilter())
            .collect(Collectors.toSet());
}

Also used : URL(java.net.URL) ListCell(javafx.scene.control.ListCell) BiFunction(java.util.function.BiFunction) LoggerFactory(org.slf4j.LoggerFactory) BoilerpipeSAXInput(com.kohlschutter.boilerpipe.sax.BoilerpipeSAXInput) KeyValue(com.kyj.fx.voeditor.visual.framework.KeyValue) ExtractorBase(com.kohlschutter.boilerpipe.extractors.ExtractorBase) RealtimeSearchItemVO(com.kyj.fx.voeditor.visual.framework.RealtimeSearchItemVO) ByteArrayInputStream(java.io.ByteArrayInputStream) Map(java.util.Map) Element(org.jsoup.nodes.Element) IOUtil(com.kyj.fx.voeditor.visual.util.IOUtil) JFXComboBox(com.jfoenix.controls.JFXComboBox) TextField(javafx.scene.control.TextField) ResponseHandler(com.kyj.fx.voeditor.visual.util.ResponseHandler) Collection(java.util.Collection) SingleSelectionModel(javafx.scene.control.SingleSelectionModel) FXMLController(com.kyj.fx.voeditor.visual.framework.annotation.FXMLController) Set(java.util.Set) Collectors(java.util.stream.Collectors) BinaryOperator(java.util.function.BinaryOperator) Platform(javafx.application.Platform) FXML(javafx.fxml.FXML) IOUtils(org.apache.commons.io.IOUtils) Node(org.jsoup.nodes.Node) FxUtil(com.kyj.fx.voeditor.visual.util.FxUtil) List(java.util.List) Document(org.jsoup.nodes.Document) Optional(java.util.Optional) Jsoup(org.jsoup.Jsoup) Elements(org.jsoup.select.Elements) BorderPane(javafx.scene.layout.BorderPane) ListView(javafx.scene.control.ListView) TextFieldListCell(javafx.scene.control.cell.TextFieldListCell) RequestUtil(com.kyj.fx.voeditor.visual.util.RequestUtil) Function(java.util.function.Function) ArrayList(java.util.ArrayList) URLModel(com.kyj.fx.voeditor.visual.framework.URLModel) Charset(java.nio.charset.Charset) Callback(javafx.util.Callback) JFXTextArea(com.jfoenix.controls.JFXTextArea) InputSource(org.xml.sax.InputSource) WebView(javafx.scene.web.WebView) ObjectProperty(javafx.beans.property.ObjectProperty) Logger(org.slf4j.Logger) MalformedURLException(java.net.MalformedURLException) 
IOException(java.io.IOException) ValueUtil(com.kyj.fx.voeditor.visual.util.ValueUtil) StringConverter(javafx.util.StringConverter) KeepEverythingExtractor(com.kohlschutter.boilerpipe.extractors.KeepEverythingExtractor) StringReader(java.io.StringReader) SimpleObjectProperty(javafx.beans.property.SimpleObjectProperty) Collections(java.util.Collections) TextDocument(com.kohlschutter.boilerpipe.document.TextDocument) InputStream(java.io.InputStream) ArticleExtractor(com.kohlschutter.boilerpipe.extractors.ArticleExtractor) ByteArrayInputStream(java.io.ByteArrayInputStream) IOException(java.io.IOException) Document(org.jsoup.nodes.Document) TextDocument(com.kohlschutter.boilerpipe.document.TextDocument) Elements(org.jsoup.select.Elements)

Example 49 with Elements

use of org.jsoup.select.Elements in project Gargoyle by callakrsos.

the class NaverRealtimeSearchFactory method parse.

/**
 * Extracts the real-time search rankings from an HTML page.
 *
 * <p>Every {@code .lst_realtime_srch} element found under an element whose class contains
 * {@code realtime_srch} becomes one {@link RealtimeSearchVO}. Lists without a preceding
 * sibling element are dropped.
 *
 * @param htmlBody HTML text to parse
 * @return one entry per ranking list that has a preceding sibling element
 */
public List<RealtimeSearchVO> parse(String htmlBody) {
    Elements rankingLists = Jsoup.parse(htmlBody)
            .select("[class*='realtime_srch']")
            .select(".lst_realtime_srch");
    return rankingLists.stream()
            .map(this::toRealtimeSearchVO)
            .filter(vo -> vo != null)
            .collect(Collectors.toList());
}

/** Builds the VO for one ranking list, or returns null when the list has no preceding sibling. */
private RealtimeSearchVO toRealtimeSearchVO(Element rankingList) {
    Element heading = rankingList.previousElementSibling();
    if (heading == null) {
        return null;
    }
    RealtimeSearchVO vo = new RealtimeSearchVO();
    String headingText = heading.text();
    // Titles of 15 or more characters are cut to 15 characters and suffixed with "...".
    vo.setTitle(headingText.length() >= 15 ? headingText.substring(0, 15) + "..." : headingText);
    List<RealtimeSearchItemVO> entries = rankingList.getElementsByTag("li").stream()
            .map(this::toRealtimeSearchItem)
            .collect(Collectors.toList());
    vo.setItems(entries);
    return vo;
}

/** Reads link, rank ("num" class) and keyword ("tit" class) from a single li element. */
private RealtimeSearchItemVO toRealtimeSearchItem(Element li) {
    RealtimeSearchItemVO entry = new RealtimeSearchItemVO();
    Element anchor = li.getElementsByTag("a").first();
    String link = anchor.getElementsByAttribute("href").attr("href");
    String rankText = li.getElementsByClass("num").first().text();
    String keyword = li.getElementsByClass("tit").first().text();
    entry.setRank(Integer.parseInt(rankText, 10));
    entry.setKeyword(keyword);
    entry.setLink(link);
    LOGGER.debug("title [{}] num [{}]  url : [{}] , toString : {}", keyword, rankText, link, li.toString());
    return entry;
}

Also used : Logger(org.slf4j.Logger) RealtimeSearchVO(com.kyj.fx.voeditor.visual.framework.RealtimeSearchVO) ResponseHandler(com.kyj.fx.voeditor.visual.util.ResponseHandler) MalformedURLException(java.net.MalformedURLException) URL(java.net.URL) RequestUtil(com.kyj.fx.voeditor.visual.util.RequestUtil) LoggerFactory(org.slf4j.LoggerFactory) ValueUtil(com.kyj.fx.voeditor.visual.util.ValueUtil) Supplier(java.util.function.Supplier) Collectors(java.util.stream.Collectors) JSONArray(org.json.simple.JSONArray) RealtimeSearchItemVO(com.kyj.fx.voeditor.visual.framework.RealtimeSearchItemVO) List(java.util.List) Document(org.jsoup.nodes.Document) Element(org.jsoup.nodes.Element) Jsoup(org.jsoup.Jsoup) Elements(org.jsoup.select.Elements) Collections(java.util.Collections) InputStream(java.io.InputStream) RealtimeSearchVO(com.kyj.fx.voeditor.visual.framework.RealtimeSearchVO) Element(org.jsoup.nodes.Element) RealtimeSearchItemVO(com.kyj.fx.voeditor.visual.framework.RealtimeSearchItemVO) Document(org.jsoup.nodes.Document) Elements(org.jsoup.select.Elements)

Example 50 with Elements

use of org.jsoup.select.Elements in project CodeUtils by boredream.

the class Main method getAllAppLinks.

/**
 * Downloads the page at {@code hostUrl} and collects the {@code href} value of every
 * {@code <a>} tag inside the element with id {@code apps-dropdown}.
 *
 * @return the collected href values, in the order the anchors are returned by jsoup;
 *         an empty list when the page has no {@code apps-dropdown} element
 * @throws Exception if {@code HttpUtils.getString} fails
 */
public static List<String> getAllAppLinks() throws Exception {
    List<String> links = new ArrayList<>();
    String response = HttpUtils.getString(hostUrl);
    Element appsElement = Jsoup.parse(response).getElementById("apps-dropdown");
    if (appsElement == null) {
        // getElementById returns null when the id is absent; report "no links" instead of an NPE.
        return links;
    }
    for (Element anchor : appsElement.getElementsByTag("a")) {
        links.add(anchor.attr("href"));
    }
    return links;
}

Also used : Element(org.jsoup.nodes.Element) ArrayList(java.util.ArrayList) Document(org.jsoup.nodes.Document) Elements(org.jsoup.select.Elements)

Aggregations

Elements (org.jsoup.select.Elements)709 Element (org.jsoup.nodes.Element)490 Document (org.jsoup.nodes.Document)362 ArrayList (java.util.ArrayList)213 IOException (java.io.IOException)151 Test (org.junit.Test)110 URL (java.net.URL)58 List (java.util.List)47 Matcher (java.util.regex.Matcher)42 Pattern (java.util.regex.Pattern)34 HashMap (java.util.HashMap)30 InputStream (java.io.InputStream)29 Jsoup (org.jsoup.Jsoup)28 Configuration (com.vaadin.addon.charts.model.Configuration)27 File (java.io.File)26 JSONObject (org.json.JSONObject)26 JSONException (org.json.JSONException)25 Collectors (java.util.stream.Collectors)23 URISyntaxException (java.net.URISyntaxException)22 BootstrapContext (com.vaadin.flow.server.BootstrapHandler.BootstrapContext)20