Use of com.example.li.springboot_crawler_demo.utils.img.entity.IPEntity in project springboot by LiJinHongPassion.
The class CrawlerImageUtil, method defaultGetIPs.
/**
 * Crawl proxy IP + port pairs from the first three listing pages.
 */
public static void defaultGetIPs() {
    Map<String, String> headers = new HashMap<>();
    headers.put("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
    // Fix garbled responses (compressed encodings are not requested)
    // headers.put("accept-encoding", "gzip, deflate, br");
    // headers.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36");
    headers.put("user-agent", rl.getStringOfFile());
    for (int i = 1; i < 4; i++) {
        HttpRequest httpRequest = HttpRequest.get(ipurl + i).headers(headers);
        // httpRequest.useProxy("171.35.162.12", 9999);
        String s = httpRequest.body().replaceAll(" ", "").replaceAll("\r\n", "").replaceAll("\n", "").replaceAll("\t", "").replaceAll("\\\\", "");
        // Now create the matcher object: match "<td>ip</td><td>port</td>" rows in the stripped HTML
        Matcher m = Pattern.compile("<td>[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}</td><td>[0-9]{2,6}</td>").matcher(s);
        while (m.find()) {
            try {
                String ip = m.group();
                ip = ip.replaceAll("</td><td>", "|").replaceAll("<[/]*td>", "");
                ips.add(new IPEntity(ip.substring(0, ip.indexOf("|")), ip.substring(ip.indexOf("|") + 1)));
            } catch (Exception e) {
                // Skip rows that fail to parse
            }
        }
    }
    if (ips.size() > 0) {
        StringBuilder re = new StringBuilder();
        for (int j = 0; j < ips.size(); j++) {
            re.append(ips.get(j).toString());
        }
        // Persist the harvested proxies to a timestamped file
        FileUtil.getFile(re.toString().getBytes(), "./env/ip/", "ips--" + System.currentTimeMillis() + ".txt");
    }
}
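To make the parsing step above concrete, the standalone sketch below runs the same regular expression and <td>-stripping logic against a hypothetical, already-whitespace-stripped table row; the sample HTML string and class name are illustrative only and not taken from the project.

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class IpRowParseSketch {
    public static void main(String[] args) {
        // Hypothetical table fragment, already stripped of whitespace as in defaultGetIPs
        String s = "<tr><td>171.35.162.12</td><td>9999</td><td>anonymous</td></tr>";
        Pattern p = Pattern.compile(
                "<td>[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}</td><td>[0-9]{2,6}</td>");
        Matcher m = p.matcher(s);
        List<String[]> pairs = new ArrayList<>();
        while (m.find()) {
            // Strip the <td> tags and split into ip / port, mirroring the method above
            String ip = m.group().replaceAll("</td><td>", "|").replaceAll("<[/]*td>", "");
            pairs.add(new String[] { ip.substring(0, ip.indexOf("|")), ip.substring(ip.indexOf("|") + 1) });
        }
        pairs.forEach(pair -> System.out.println(pair[0] + ":" + pair[1])); // prints 171.35.162.12:9999
    }
}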
Use of com.example.li.springboot_crawler_demo.utils.img.entity.IPEntity in project springboot by LiJinHongPassion.
The class CrawlerImageUtil, method getRandomIPEntity.
/**
 * Get a random proxy IP from the in-memory pool, loading the most recently
 * saved ip file first if the pool is empty.
 *
 * @return a random IPEntity, or 127.0.0.1:80 when no proxies are available
 */
public static Optional<IPEntity> getRandomIPEntity() {
    if (ips.size() <= 0) {
        File file = new File("./env/ip/");
        File[] tempList = file.listFiles();
        if (tempList != null && tempList.length > 0) {
            // Pick the file with the largest timestamp in its "ips--<millis>.txt" name
            String filename = tempList[0].getName();
            String nowfilename = "";
            for (File value : tempList) {
                nowfilename = value.getName();
                if (Long.parseLong(filename.substring(filename.lastIndexOf("--") + 2, filename.lastIndexOf(".txt"))) < Long.parseLong(nowfilename.substring(nowfilename.lastIndexOf("--") + 2, nowfilename.lastIndexOf(".txt")))) {
                    filename = nowfilename;
                }
            }
            ipsRead.initList("./env/ip/" + filename);
            // Each line has the form "ip|port"
            ipsRead.getList().stream().forEach(str -> {
                ips.add(new IPEntity(str.substring(0, str.indexOf("|")), str.substring(str.indexOf("|") + 1)));
            });
        }
    }
    if (ips.size() > 0) {
        return Optional.of(ips.get((int) (Math.random() * ips.size())));
    }
    // Fall back to a local placeholder when no proxies have been crawled
    return Optional.of(new IPEntity("127.0.0.1", "80"));
}
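A minimal usage sketch for getRandomIPEntity is shown below: it picks a random proxy and applies it to an http-request call. The import paths, the target URL, and the assumption that IPEntity.getPort() returns the port as a String (suggested by the two-String constructor used above) are all assumptions, not taken from the project.

// The CrawlerImageUtil package below is an assumption; adjust to the real location.
import com.example.li.springboot_crawler_demo.utils.img.CrawlerImageUtil;
import com.example.li.springboot_crawler_demo.utils.img.entity.IPEntity;
import com.github.kevinsawicki.http.HttpRequest;

import java.util.Optional;

public class RandomProxySketch {
    public static void main(String[] args) {
        Optional<IPEntity> proxy = CrawlerImageUtil.getRandomIPEntity();
        HttpRequest request = HttpRequest.get("https://example.com/list?page=1"); // hypothetical URL
        // Assumption: getPort() returns a String, matching the IPEntity("127.0.0.1", "80") constructor above
        proxy.ifPresent(p -> request.useProxy(p.getIp(), Integer.parseInt(p.getPort())));
        System.out.println(request.code()); // HTTP status code returned through the proxy
    }
}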
Use of com.example.li.springboot_crawler_demo.utils.img.entity.IPEntity in project springboot by LiJinHongPassion.
The class CrawlerImageUtil, method getAllImgUrl.
/**
 * Description: crawler -- collect all image links found in a page.
 *
 * @param url     a page (non-image) URL, e.g. http://www.baidu.com/artical=424
 * @param regx    regular expression used to match the image links
 * @param headers request headers to send
 * @return java.util.Set<java.lang.String>
 * @author LJH-1755497577 2019/11/8 15:57
 */
public static Set<String> getAllImgUrl(String url, String regx, Map<String, String> headers) {
    Optional<IPEntity> randomIPEntity = getRandomIPEntity();
    HttpRequest httpRequest = HttpRequest.get(url);
    // httpRequest.useProxy(randomIPEntity.get().getIp(), randomIPEntity.get().getPort());
    httpRequest.headers(headers);
    // httpRequest.trustAllCerts().trustAllHosts().ok();
    String body = "";
    try {
        body = httpRequest.body();
    } catch (HttpRequest.HttpRequestException e) {
        System.out.println("Failed to fetch image links =====> " + url);
        return null;
    }
    body = body.replaceAll(" ", "").replaceAll("\r\n", "").replaceAll("\t", "").replaceAll("\\\\", "");
    // Create the Pattern object
    Pattern r = Pattern.compile(regx);
    // Now create the matcher object
    Matcher m = r.matcher(body);
    // Collect matches into a Set to deduplicate links
    Set<String> re = new HashSet<>();
    while (m.find()) {
        try {
            // Normalize protocol-relative and http links to https
            re.add(m.group().replaceAll("((http|https|HTTP|HTTPS):)*//", "https://"));
        } catch (Exception e) {
            // Ignore malformed matches
        }
    }
    return re;
}
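The sketch below shows one way getAllImgUrl might be called. The CrawlerImageUtil import path, the image-link regular expression, and the user-agent value are illustrative assumptions; the page URL reuses the example from the javadoc, and the null check mirrors the method's behavior of returning null when the request fails.

// The CrawlerImageUtil package below is an assumption; adjust to the real location.
import com.example.li.springboot_crawler_demo.utils.img.CrawlerImageUtil;

import java.util.HashMap;
import java.util.Map;
import java.util.Set;

public class ImgUrlCrawlSketch {
    public static void main(String[] args) {
        Map<String, String> headers = new HashMap<>();
        headers.put("user-agent", "Mozilla/5.0"); // placeholder user-agent
        // Hypothetical regex: grab .jpg/.png links, including protocol-relative ones
        Set<String> urls = CrawlerImageUtil.getAllImgUrl(
                "http://www.baidu.com/artical=424",
                "(https?:)?//[^\"'<>]+?\\.(jpg|png)",
                headers);
        if (urls != null) { // getAllImgUrl returns null when the page request fails
            urls.forEach(System.out::println);
        }
    }
}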