当前位置: 首页 > news >正文

爬虫基础

maven pom

<dependencies><!-- jsoup: HTML parser queried with jQuery-like CSS selectors --><dependency><groupId>org.jsoup</groupId><artifactId>jsoup</artifactId><version>1.16.1</version></dependency><!-- HTTP utilities (Apache HttpComponents) --><dependency><groupId>org.apache.httpcomponents</groupId><artifactId>httpcore</artifactId><version>4.4.16</version></dependency><dependency><groupId>org.apache.httpcomponents</groupId><artifactId>httpclient</artifactId><version>4.5.14</version></dependency><dependency><groupId>commons-io</groupId><artifactId>commons-io</artifactId><version>2.13.0</version></dependency>
</dependencies>

====================================

遍历网站内容爬取网站网址

package com.xiaocao;import com.sun.org.apache.bcel.internal.generic.NEW;
import com.sun.org.apache.regexp.internal.RE;import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal same-site link crawler.
 *
 * <p>Starting from a seed page it downloads every URL that has not been visited yet, extracts
 * the {@code href} of each {@code <a>} tag, keeps the links that start with the seed's
 * scheme-and-host prefix, and repeats until no new link turns up. Every URL collected is then
 * printed to standard output.
 */
public class UrlPool {

    /**
     * Matches the optional scheme followed by the host part of a URL. The character class
     * excludes '/' so the match stops before the path; otherwise relative links would be
     * appended to a prefix that already ends in a path or a trailing slash.
     */
    private static final Pattern HOST_PATTERN = Pattern.compile("(https?://)?[^/\\s]*");

    /** Matches an anchor tag; group 1 is the double-quoted href value, group 2 the inner markup. */
    private static final Pattern LINK_PATTERN =
            Pattern.compile("<a\\b[^>]+\\bhref=\"([^\"]*)\"[^>]*>([\\s\\S]*?)</a>");

    /** Connect and read timeout, so one unresponsive server cannot stall the whole crawl. */
    private static final int TIMEOUT_MILLIS = 10_000;

    public static void main(String[] args) {
        /* Home page address */
        getUrl("https://www.nipic.com/");
    }

    /**
     * Crawls the site reachable from {@code baseUrl} and prints every URL that was collected.
     *
     * @param baseUrl seed page to start from
     */
    private static void getUrl(String baseUrl) {
        Map<String, Boolean> oldMap = new LinkedHashMap<>();
        /* Prefix used to resolve relative paths */
        String oldLinkHost = extractHost(baseUrl);
        oldMap.put(baseUrl, false);
        oldMap = crawlLinks(oldLinkHost, oldMap);
        for (Map.Entry<String, Boolean> mapping : oldMap.entrySet()) {
            System.out.println("连接:" + mapping.getKey());
        }
    }

    /**
     * Returns the scheme-and-host prefix of {@code url}, without a trailing slash.
     * Package-private so it can be exercised without network access.
     *
     * @param url absolute URL
     * @return for example {@code https://www.nipic.com} for {@code https://www.nipic.com/a/b}
     */
    static String extractHost(String url) {
        Matcher m = HOST_PATTERN.matcher(url);
        return m.find() ? m.group() : "";
    }

    /**
     * Visits every entry of {@code oldMap} whose value is {@code false}, marks it visited and
     * adds the same-host links found on it; repeats until a pass discovers nothing new.
     *
     * <p>Implemented as a loop rather than one recursive call per pass, so the stack depth
     * does not grow with the link depth of the site.
     *
     * @param oldLinkHost scheme-and-host prefix that collected links must start with
     * @param oldMap      URL to visited-flag map; modified in place
     * @return the same {@code oldMap} instance, with every entry flagged as visited
     */
    private static Map<String, Boolean> crawlLinks(String oldLinkHost, Map<String, Boolean> oldMap) {
        while (true) {
            Map<String, Boolean> newMap = new LinkedHashMap<>();
            for (Map.Entry<String, Boolean> mapping : oldMap.entrySet()) {
                if (mapping.getValue()) {
                    continue;
                }
                String oldLink = mapping.getKey();
                System.out.println(oldLink + "连接有参数:" + oldLink);
                try {
                    fetchLinks(oldLink, oldLinkHost, oldMap, newMap);
                } catch (Exception e) {
                    // Best effort: a page that cannot be fetched must not abort the crawl,
                    // but the failure is reported instead of being silently dropped.
                    System.err.println("Failed to crawl " + oldLink + ": " + e);
                } finally {
                    // Updating the value of an existing entry is not a structural change,
                    // so it is safe while iterating over the map.
                    mapping.setValue(true);
                }
            }
            if (newMap.isEmpty()) {
                return oldMap;
            }
            oldMap.putAll(newMap);
        }
    }

    /**
     * Downloads {@code pageUrl} and feeds each line of a 200 response to
     * {@link #collectLinks}. The reader and the connection are always released.
     */
    private static void fetchLinks(String pageUrl, String host,
                                   Map<String, Boolean> known, Map<String, Boolean> found)
            throws java.io.IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(pageUrl).openConnection();
        try {
            connection.setRequestMethod("GET");
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);
            if (connection.getResponseCode() != 200) {
                return;
            }
            try (BufferedReader reader =
                         new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    collectLinks(line, host, known, found);
                }
            }
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Extracts every anchor href on {@code line}, resolves relative ones against {@code host},
     * strips one trailing slash, and records links that start with {@code host} and appear in
     * neither map. Package-private so it can be exercised without network access.
     *
     * @param line  one line of HTML
     * @param host  scheme-and-host prefix without trailing slash
     * @param known URLs already collected; only read
     * @param found receives newly discovered URLs, mapped to {@code false}
     */
    static void collectLinks(String line, String host,
                             Map<String, Boolean> known, Map<String, Boolean> found) {
        Matcher matcher = LINK_PATTERN.matcher(line);
        // A single line can hold many anchors (minified pages hold all of them on one line),
        // so keep matching until the line is exhausted.
        while (matcher.find()) {
            String newLink = matcher.group(1);
            if (!newLink.startsWith("http")) {
                /* Relative path */
                newLink = newLink.startsWith("/") ? host + newLink : host + "/" + newLink;
            }
            if (newLink.endsWith("/")) {
                newLink = newLink.substring(0, newLink.length() - 1);
            }
            if (!known.containsKey(newLink) && !found.containsKey(newLink)
                    && newLink.startsWith(host)) {
                found.put(newLink, false);
            }
        }
    }
}

==============

下载网站内容

package com.xiaocao;import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import sun.net.www.http.HttpClient;import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpCookie;public class ImageCraw {private static String url = "https://xxx";public static void main(String[] args) {
//        apacheHttpClient();try {Document document = Jsoup.connect(url).get();Elements select = document.select(".newdetail-skin #J_worksImg");try {Connection.Response src = Jsoup.connect("https:"+select.attr("src")).userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0").ignoreContentType(true).execute();String name = select.attr("alt");System.out.println(name);ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(src.bodyAsBytes());FileUtils.copyInputStreamToFile(byteArrayInputStream,new File("F:\\filetest\\"+name+".jpg"));}catch (Exception e){e.printStackTrace();}//            for (int i = 0; i < select.size(); i++) {
//                Elements img = select.get(i).select(".newdetail-skin #J_worksImg");
//
//                try {
//                    Connection.Response src = Jsoup.connect("https:"+img.attr("src"))
//                            .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0")
//                            .ignoreContentType(true)
//                            .execute();
//
//                    String name = img.attr("alt");
//                    System.out.println(name);
//                    ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(src.bodyAsBytes());
//                    FileUtils.copyInputStreamToFile(byteArrayInputStream,new File("F:\\filetest\\"+name+".jpg"));
//                }catch (Exception e){
//                    e.printStackTrace();
//                }
//
//            }} catch (IOException e) {e.printStackTrace();}}private static void apacheHttpClient() {CloseableHttpClient client = HttpClients.createDefault();HttpGet httpGet = new HttpGet(url);/*伪装浏览器*/httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0");try {CloseableHttpResponse execute = client.execute(httpGet);HttpEntity entity = execute.getEntity();String s = EntityUtils.toString(entity);System.out.println(s);} catch (IOException e) {e.printStackTrace();}}
}

http://www.lryc.cn/news/467636.html

相关文章:

  • HTML3D旋转相册
  • [linux]快速入门
  • 域3:安全工程 第6章 密码学与对称密钥算法
  • MySQL注入load_file常用路径
  • ubuntu20.04版本 快速安装 python3.11(宝宝级攻略)
  • DeepSeek AI 推出 Janus 自回归框架,统一视觉、文本理解与生成的创新解决方案
  • NORDIC nPM1100 是一款集成式电源管理
  • 深入RAG:知识密集型NLP任务的解决方案
  • vue-element-admin顶部导航栏的修改
  • 微信小程序 setData数据量过大的解决与分页加载的实现
  • 体育动画直播嵌入方式以及作用
  • 腾讯云轻量服务器Lighthouse的前世今生
  • java实现redis的消息发送和消费,类似kafka功能
  • 【软件设计】常用设计模式--代理模式
  • 生命与自由,抑郁的来源
  • CTFHUB技能树之文件上传——双写后缀
  • SpringBoot整合HTTPS
  • LVGL-从入门到熟练使用
  • 【MySQL数据库】MySQL读写分离
  • 深度学习:简单计算图的反向传播传递导数计算
  • 学习AJAX请求(初步)24.10.21-10.23
  • 初识算法——二分查找
  • 深入剖析 Java Spring 中的 @Autowired、@Resource、@Qualifier、@Inject 注解:使用详解与注意事项
  • ThingsBoard规则链节点:Delete Attributes节点详解
  • 关于作为面试官以及如何准备面试的一些心得
  • Bean对象 和 普通对象 的区别
  • lego-loam featureAssociation 源码注释(二)
  • Claude 3.5 的六大应用场景
  • 进程线程知识总结
  • Rsync数据复制/备份服务应用