Use of com.github.kevinsawicki.http.HttpRequest in project springboot by LiJinHongPassion:
the class CrawlerImageUtil, method getAllImgUrl.
/**
 * Crawler helper -- collects all image links found in the HTML body of the given page.
 *
 * @param url     page URL (not an image link), e.g. http://www.baidu.com/artical=424
 * @param regx    regular expression used to match image links within the fetched body
 * @param headers HTTP request headers to send with the GET request
 * @return set of matched image URLs with the scheme normalized to {@code https://};
 *         an empty set if the page could not be fetched or nothing matched
 *         (never {@code null}, so callers can iterate without a null check)
 * @author LJH-1755497577 2019/11/8 15:57
 */
public static Set<String> getAllImgUrl(String url, String regx, Map<String, String> headers) {
    Optional<IPEntity> randomIPEntity = getRandomIPEntity();
    HttpRequest httpRequest = HttpRequest.get(url);
    // httpRequest.useProxy(randomIPEntity.get().getIp(), randomIPEntity.get().getPort());
    httpRequest.headers(headers);
    // httpRequest.trustAllCerts().trustAllHosts().ok();
    String body;
    try {
        body = httpRequest.body();
    } catch (HttpRequest.HttpRequestException e) {
        System.out.println("获取图片链接失败 =====> " + url);
        // Fetch failed: return an empty set rather than null so callers iterate safely.
        return new HashSet<>();
    }
    // Strip whitespace, line breaks and backslash escapes so URLs split across
    // lines in the HTML still match as one contiguous string.
    body = body.replaceAll(" ", "").replaceAll("\r\n", "").replaceAll("\t", "").replaceAll("\\\\", "");
    Pattern r = Pattern.compile(regx);
    Matcher m = r.matcher(body);
    Set<String> re = new HashSet<>();
    while (m.find()) {
        // Normalize protocol-relative (//host/...) and http(s):// links to https://.
        // group() after a successful find() and replaceAll() with a constant,
        // valid pattern cannot throw, so no try/catch is needed here.
        re.add(m.group().replaceAll("((http|https|HTTP|HTTPS):)*//", "https://"));
    }
    return re;
}
Aggregations