Crawler basics
maven pom
<dependencies>
    <!-- jsoup: HTML parser with jQuery-style CSS selectors -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.16.1</version>
    </dependency>
    <!-- HTTP client libraries -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpcore</artifactId>
        <version>4.4.16</version>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.14</version>
    </dependency>
    <!-- file utilities used to save downloaded content -->
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>2.13.0</version>
    </dependency>
</dependencies>
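A quick way to check that these dependencies resolve: a minimal sketch (the class name and URL below are placeholders, not part of the original notes) that fetches a page with jsoup and runs a jQuery-style selector.

package com.xiaocao;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class JsoupSmokeTest {
    public static void main(String[] args) throws Exception {
        // fetch any reachable page and apply a CSS selector;
        // the URL here is only a placeholder
        Document doc = Jsoup.connect("https://example.com").get();
        System.out.println(doc.title());
        System.out.println(doc.select("a[href]").size() + " links found");
    }
}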
====================================
Traverse a site's pages and collect its URLs
package com.xiaocao;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class UrlPool {

    public static void main(String[] args) {
        /* home page of the target site */
        getUrl("https://www.nipic.com/");
    }

    private static void getUrl(String baseUrl) {
        Map<String, Boolean> oldMap = new LinkedHashMap<>();
        /* host part of the URL, used to resolve relative paths */
        String oldLinkHost = "";
        Pattern p = Pattern.compile("https?://[^/\\s]+");
        Matcher m = p.matcher(baseUrl);
        if (m.find()) {
            oldLinkHost = m.group();
        }
        oldMap.put(baseUrl, false);
        oldMap = crawlLinks(oldLinkHost, oldMap);
        for (Map.Entry<String, Boolean> mapping : oldMap.entrySet()) {
            System.out.println("Link: " + mapping.getKey());
        }
    }

    private static Map<String, Boolean> crawlLinks(String oldLinkHost, Map<String, Boolean> oldMap) {
        LinkedHashMap<String, Boolean> newMap = new LinkedHashMap<>();
        String oldLink = "";
        for (Map.Entry<String, Boolean> mapping : oldMap.entrySet()) {
            /* only visit links that have not been crawled yet */
            if (!mapping.getValue()) {
                oldLink = mapping.getKey();
                System.out.println("Crawling link: " + oldLink);
                try {
                    URL url = new URL(oldLink);
                    HttpURLConnection connection = (HttpURLConnection) url.openConnection();
                    connection.setRequestMethod("GET");
                    if (connection.getResponseCode() == 200) {
                        BufferedReader reader = new BufferedReader(
                                new InputStreamReader(connection.getInputStream()));
                        Pattern p = Pattern.compile("<a\\b[^>]+\\bhref=\"([^\"]*)\"[^>]*>([\\s\\S]*?)</a>");
                        Matcher matcher;
                        String line;
                        while ((line = reader.readLine()) != null) {
                            matcher = p.matcher(line);
                            while (matcher.find()) {
                                String newLink = matcher.group(1);
                                if (!newLink.startsWith("http")) {
                                    /* resolve relative paths against the host */
                                    if (newLink.startsWith("/")) {
                                        newLink = oldLinkHost + newLink;
                                    } else {
                                        newLink = oldLinkHost + "/" + newLink;
                                    }
                                }
                                /* drop a trailing slash so duplicates are detected */
                                if (newLink.endsWith("/")) {
                                    newLink = newLink.substring(0, newLink.length() - 1);
                                }
                                /* only keep unseen links that stay on the same host */
                                if (!oldMap.containsKey(newLink) && !newMap.containsKey(newLink)
                                        && newLink.startsWith(oldLinkHost)) {
                                    newMap.put(newLink, false);
                                }
                            }
                        }
                        reader.close();
                    }
                } catch (Exception e) {
                    /* ignore failures on individual links and keep crawling */
                } finally {
                    /* mark this link as visited */
                    oldMap.replace(oldLink, true);
                }
            }
        }
        if (!newMap.isEmpty()) {
            oldMap.putAll(newMap);
            /* recurse until no new links are discovered */
            oldMap.putAll(crawlLinks(oldLinkHost, oldMap));
        }
        return oldMap;
    }
}
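Since jsoup is already in the POM, the regex-based link extraction above can also be written with jsoup's selector API, which resolves relative paths via abs:href. A rough sketch; the class and method names are illustrative only, not part of the original code.

package com.xiaocao;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.util.LinkedHashMap;
import java.util.Map;

public class JsoupLinkExtractor {

    /* collect same-host links from one page; abs:href resolves relative paths automatically */
    public static Map<String, Boolean> extractLinks(String pageUrl, String host) throws Exception {
        Map<String, Boolean> links = new LinkedHashMap<>();
        Document doc = Jsoup.connect(pageUrl).get();
        for (Element a : doc.select("a[href]")) {
            String link = a.attr("abs:href");
            if (link.endsWith("/")) {
                link = link.substring(0, link.length() - 1);
            }
            if (link.startsWith(host)) {
                links.put(link, false);
            }
        }
        return links;
    }

    public static void main(String[] args) throws Exception {
        extractLinks("https://www.nipic.com/", "https://www.nipic.com")
                .keySet().forEach(System.out::println);
    }
}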
==============
Download website content
package com.xiaocao;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;

public class ImageCraw {

    private static String url = "https://xxx";

    public static void main(String[] args) {
        // apacheHttpClient();
        try {
            /* parse the detail page and select the image element */
            Document document = Jsoup.connect(url).get();
            Elements select = document.select(".newdetail-skin #J_worksImg");
            try {
                /* the src attribute is protocol-relative, so prepend https: */
                Connection.Response src = Jsoup.connect("https:" + select.attr("src"))
                        .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0")
                        .ignoreContentType(true)
                        .execute();
                String name = select.attr("alt");
                System.out.println(name);
                /* write the response bytes to a local file */
                ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(src.bodyAsBytes());
                FileUtils.copyInputStreamToFile(byteArrayInputStream, new File("F:\\filetest\\" + name + ".jpg"));
            } catch (Exception e) {
                e.printStackTrace();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    private static void apacheHttpClient() {
        CloseableHttpClient client = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(url);
        /* spoof a browser User-Agent */
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0");
        try {
            CloseableHttpResponse execute = client.execute(httpGet);
            HttpEntity entity = execute.getEntity();
            String s = EntityUtils.toString(entity);
            System.out.println(s);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
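The apacheHttpClient() helper above only prints the page HTML. If the image download should also go through HttpClient instead of jsoup, a sketch like the following would work; the image URL and output path are placeholders.

package com.xiaocao;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

import java.io.File;

public class HttpClientImageDownload {

    public static void main(String[] args) throws Exception {
        // placeholder image URL; substitute the real src value scraped above
        String imageUrl = "https://xxx/image.jpg";
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            HttpGet httpGet = new HttpGet(imageUrl);
            /* same browser User-Agent spoofing as above */
            httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0");
            try (CloseableHttpResponse response = client.execute(httpGet)) {
                HttpEntity entity = response.getEntity();
                /* stream the response bytes straight into a file */
                FileUtils.copyInputStreamToFile(entity.getContent(), new File("F:\\filetest\\download.jpg"));
            }
        }
    }
}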