Search in sources :

Example 1 with IPEntity

use of com.example.li.springboot_crawler_demo.utils.img.entity.IPEntity in project springboot by LiJinHongPassion.

the class CrawlerImageUtil method defaultGetIPs.

/**
 * Crawls proxy "ip + port" pairs from the first three listing pages.
 *
 * <p>Pages {@code ipurl + 1} through {@code ipurl + 3} are fetched, every
 * {@code <td>ip</td><td>port</td>} cell pair found in a page is appended to the shared
 * {@code ips} list, and, if that list is non-empty afterwards, all of its entries are
 * concatenated and handed to {@code FileUtil.getFile} with the directory
 * {@code ./env/ip/} and the file name {@code ips--<currentTimeMillis>.txt}.
 */
public static void defaultGetIPs() {
    Map<String, String> headers = new HashMap<>();
    headers.put("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
    // "accept-encoding" is intentionally not sent; the original note says this avoids garbled response text.
    // The user-agent value is supplied by rl.getStringOfFile().
    headers.put("user-agent", rl.getStringOfFile());

    // Compiled once for all pages; group 1 is the IPv4 address, group 2 the port.
    Pattern proxyCell = Pattern.compile("<td>([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3})</td><td>([0-9]{2,6})</td>");

    for (int page = 1; page < 4; page++) {
        HttpRequest httpRequest = HttpRequest.get(ipurl + page).headers(headers);
        // Strip spaces, line breaks, tabs and backslashes so adjacent table cells become contiguous text.
        String html = httpRequest.body().replaceAll(" ", "").replaceAll("\r\n", "").replaceAll("\n", "").replaceAll("\t", "").replaceAll("\\\\", "");
        Matcher m = proxyCell.matcher(html);
        while (m.find()) {
            try {
                ips.add(new IPEntity(m.group(1), m.group(2)));
            } catch (Exception e) {
                // Best effort: one bad entry must not abort the crawl, but it should not vanish silently either.
                System.out.println("Skipping proxy entry " + m.group() + ": " + e);
            }
        }
    }

    if (!ips.isEmpty()) {
        StringBuilder re = new StringBuilder();
        for (IPEntity entity : ips) {
            re.append(entity.toString());
        }
        // The timestamp in the file name is what getRandomIPEntity compares to pick the newest file.
        FileUtil.getFile(re.toString().getBytes(), "./env/ip/", "ips--" + System.currentTimeMillis() + ".txt");
    }
}
Also used : HttpRequest(com.github.kevinsawicki.http.HttpRequest) IPEntity(com.example.li.springboot_crawler_demo.utils.img.entity.IPEntity) Matcher(java.util.regex.Matcher)

Example 2 with IPEntity

use of com.example.li.springboot_crawler_demo.utils.img.entity.IPEntity in project springboot by LiJinHongPassion.

the class CrawlerImageUtil method getRandomIPEntity.

/**
 * Returns a randomly selected proxy entry.
 *
 * <p>When the shared {@code ips} list is empty, the files in {@code ./env/ip/} are scanned
 * and the one whose name carries the largest {@code --<number>.txt} timestamp is loaded
 * through {@code ipsRead}; each loaded line of the form {@code ip|port} becomes one
 * {@code IPEntity}. Files whose names do not follow that pattern and lines without a
 * {@code |} separator are skipped instead of raising an exception.
 *
 * @return an {@code Optional} that is always present: a random element of {@code ips},
 *         or {@code 127.0.0.1:80} when no proxy is available
 */
public static Optional<IPEntity> getRandomIPEntity() {
    if (ips.isEmpty()) {
        File[] candidates = new File("./env/ip/").listFiles();
        String newestName = null;
        long newestStamp = -1L;
        if (candidates != null) {
            for (File candidate : candidates) {
                if (!candidate.isFile()) {
                    continue;
                }
                long stamp = parseSnapshotTimestamp(candidate.getName());
                // Strict comparison keeps the first file seen when two timestamps are equal.
                if (stamp > newestStamp) {
                    newestStamp = stamp;
                    newestName = candidate.getName();
                }
            }
        }
        if (newestName != null) {
            ipsRead.initList("./env/ip/" + newestName);
            for (String line : ipsRead.getList()) {
                int separator = line.indexOf("|");
                if (separator < 0) {
                    // Not an "ip|port" line (e.g. blank); ignore it.
                    continue;
                }
                ips.add(new IPEntity(line.substring(0, separator), line.substring(separator + 1)));
            }
        }
    }
    if (ips.size() > 0) {
        return Optional.of(ips.get((int) (Math.random() * ips.size())));
    }
    // Fallback when nothing was crawled or loaded.
    return Optional.of(new IPEntity("127.0.0.1", "80"));
}

/**
 * Extracts the numeric timestamp between the last {@code --} and the last {@code .txt}
 * of a file name, or returns -1 when the name does not have that shape.
 */
private static long parseSnapshotTimestamp(String fileName) {
    int marker = fileName.lastIndexOf("--");
    int extension = fileName.lastIndexOf(".txt");
    if (marker < 0 || extension < marker + 2) {
        return -1L;
    }
    try {
        long stamp = Long.parseLong(fileName.substring(marker + 2, extension));
        return stamp < 0 ? -1L : stamp;
    } catch (NumberFormatException notATimestamp) {
        // Name merely resembles the pattern; treat it as a non-candidate.
        return -1L;
    }
}
Also used : IPEntity(com.example.li.springboot_crawler_demo.utils.img.entity.IPEntity) File(java.io.File) FileUtil.getFile(com.example.li.springboot_crawler_demo.utils.img.fileMsg.FileUtil.getFile)

Example 3 with IPEntity

use of com.example.li.springboot_crawler_demo.utils.img.entity.IPEntity in project springboot by LiJinHongPassion.

the class CrawlerImageUtil method getAllImgUrl.

/**
 * Crawler helper: collects every image link found in a page.
 *
 * <p>The page body is fetched, stripped of spaces, CRLF sequences, tabs and backslashes,
 * and then searched with {@code regx}. In every match, each {@code //} (together with any
 * directly preceding {@code http:}/{@code https:} prefix) is rewritten to {@code https://}.
 *
 * @param url     address of the page to scan (not an image link), for example
 *                http://www.baidu.com/artical=424
 * @param regx    regular expression that matches an image link in the stripped body
 * @param headers request headers to send with the page request
 * @return the distinct rewritten links, or {@code null} when fetching the body fails
 * @author LJH-1755497577 2019/11/8 15:57
 */
public static Set<String> getAllImgUrl(String url, String regx, Map<String, String> headers) {
    // A proxy is still looked up here, but routing the request through it is currently disabled.
    Optional<IPEntity> proxy = getRandomIPEntity();
    HttpRequest request = HttpRequest.get(url);
    request.headers(headers);

    String page;
    try {
        page = request.body();
    } catch (HttpRequest.HttpRequestException e) {
        System.out.println("获取图片链接失败   =====>  " + url);
        return null;
    }

    String compact = page.replaceAll(" ", "")
            .replaceAll("\r\n", "")
            .replaceAll("\t", "")
            .replaceAll("\\\\", "");

    // Same expression the links are normalised with, compiled once instead of per match.
    Pattern scheme = Pattern.compile("((http|https|HTTP|HTTPS):)*//");
    Set<String> links = new HashSet<>();
    Matcher hit = Pattern.compile(regx).matcher(compact);
    while (hit.find()) {
        links.add(scheme.matcher(hit.group()).replaceAll("https://"));
    }
    return links;
}
Also used : HttpRequest(com.github.kevinsawicki.http.HttpRequest) IPEntity(com.example.li.springboot_crawler_demo.utils.img.entity.IPEntity) Pattern(java.util.regex.Pattern) Matcher(java.util.regex.Matcher)

Aggregations

IPEntity (com.example.li.springboot_crawler_demo.utils.img.entity.IPEntity): 3, HttpRequest (com.github.kevinsawicki.http.HttpRequest): 2, Matcher (java.util.regex.Matcher): 2, FileUtil.getFile (com.example.li.springboot_crawler_demo.utils.img.fileMsg.FileUtil.getFile): 1, File (java.io.File): 1, Pattern (java.util.regex.Pattern): 1