当前位置: 首页 > news >正文

python学习 - 爬虫案例 - 爬取链接房产信息入数据库代码实例

#coding=utf-8
#!/usr/bin/python
# 导入requests库
import requests
# 导入文件操作库
import os
import re
import bs4
from bs4 import BeautifulSoup
import sys
from util.mysql_DBUtils import mysql# 写入数据库
def write_db(param):try:sql = "insert into house (url,housing_estate,position,square_metre,unit_price,total_price,follow,take_look,pub_date) "sql = sql + "VALUES(%(url)s,%(housing_estate)s, %(position)s,%(square_metre)s,"sql = sql + "%(unit_price)s,%(total_price)s,%(follow)s,%(take_look)s,%(pub_date)s)"mysql.insert(sql, param)except Exception as e:print(e)# 主方法
def main():# 给请求指定一个请求头来模拟chrome浏览器headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}page_max = 100# 爬图地址for i in range(1, int(page_max) + 1):print("第几页:" + str(i))if i == 1:house = 'https://qd.lianjia.com/ershoufang/shibei/'else:house = 'https://qd.lianjia.com/ershoufang/shibei/pg'+str(i)res = requests.get(house, headers=headers)soup = BeautifulSoup(res.text, 'html.parser')li_max = soup.find('ul', class_='sellListContent').find_all('li')for li in li_max:try:house_param = {}#  格式 荣馨苑  | 3室2厅 | 115.91平米 | 南 北 | 毛坯 | 无电梯content = li.find('div', class_='houseInfo').textcontent = content.split("|")house_param['housing_estate'] = content[0]house_param['square_metre'] = re.findall(r'-?\d+\.?\d*e?-?\d*?', content[2])[0]# --------------------------------------------------------##  位置 水清沟position = li.find('div', class_='positionInfo').find('a').texthouse_param['position'] = position# --------------------------------------------------------#totalprice = li.find('div', class_='totalPrice').texthouse_param['total_price'] = re.sub("\D", "", totalprice)unitprice = li.find('div', class_='unitPrice').texthouse_param['unit_price'] = re.sub("\D", "", unitprice)# --------------------------------------------------------## 57人关注 / 共13次带看 / 6个月以前发布follow = li.find('div', class_='followInfo').textfollow = follow.split("/")house_param['follow'] = re.sub("\D", "", follow[0])house_param['take_look'] = re.sub("\D", "", follow[1])# --------------------------------------------------------## 二手房地址title_src = li.find('div', class_='title').find('a').attrs['href']house_param['url'] = re.sub("\D", "", title_src)res = requests.get(title_src, headers=headers)soup = BeautifulSoup(res.text, 'html.parser')# --------------------------------------------------------## 挂牌时间(重要数据)pub_date = soup.find('div', class_='transaction').find_all('li')[0].find_all('span')[1].texthouse_param['pub_date'] = pub_datewrite_db(house_param)except Exception as e:print(e)mysql.end("commit")mysql.dispose()if __name__ == '__main__':main()
#!/usr/bin/python3
# -*- coding:utf-8 -*-
import pymysql, os, configparser
from pymysql.cursors import DictCursor
from DBUtils.PooledDB import PooledDBclass Config(object):"""# Config().get_content("user_information")配置文件里面的参数[dbMysql]host = 192.168.1.80port = 3306user = rootpassword = 123456"""def __init__(self, config_filename="dbMysqlConfig.cnf"):file_path = os.path.join(os.path.dirname(__file__), config_filename)self.cf = configparser.ConfigParser()self.cf.read(file_path)def get_sections(self):return self.cf.sections()def get_options(self, section):return self.cf.options(section)def get_content(self, section):result = {}for option in self.get_options(section):value = self.cf.get(section, option)result[option] = int(value) if value.isdigit() else valuereturn resultclass BasePymysqlPool(object):def __init__(self, host, port, user, password, db_name):self.db_host = hostself.db_port = int(port)self.user = userself.password = str(password)self.db = db_nameself.conn = Noneself.cursor = Noneclass MyPymysqlPool(BasePymysqlPool):"""MYSQL数据库对象,负责产生数据库连接 , 此类中的连接采用连接池实现获取连接对象:conn = Mysql.getConn()释放连接对象;conn.close()或del conn"""# 连接池对象__pool = Nonedef __init__(self, conf_name=None):self.conf = Config().get_content(conf_name)super(MyPymysqlPool, self).__init__(**self.conf)# 数据库构造函数,从连接池中取出连接,并生成操作游标self._conn = self.__getConn()self._cursor = self._conn.cursor()def __getConn(self):"""@summary: 静态方法,从连接池中取出连接@return MySQLdb.connection"""if MyPymysqlPool.__pool is None:__pool = PooledDB(creator=pymysql,mincached=1,maxcached=20,host=self.db_host,port=self.db_port,user=self.user,passwd=self.password,db=self.db,use_unicode=True,charset="utf8",cursorclass=DictCursor)return __pool.connection()def getAll(self, sql, param=None):"""@summary: 执行查询,并取出所有结果集@param sql:查询SQL,如果有查询条件,请只指定条件列表,并将条件值使用参数[param]传递进来@param param: 可选参数,条件列表值(元组/列表)@return: result list(字典对象)/boolean 查询到的结果集"""if param is None:count = self._cursor.execute(sql)else:count = self._cursor.execute(sql, param)if count > 0:result = self._cursor.fetchall()else:result = Falsereturn resultdef getOne(self, sql, param=None):"""@summary: 执行查询,并取出第一条@param sql:查询SQL,如果有查询条件,请只指定条件列表,并将条件值使用参数[param]传递进来@param param: 可选参数,条件列表值(元组/列表)@return: result list/boolean 查询到的结果集"""if param is None:count = self._cursor.execute(sql)else:count = self._cursor.execute(sql, param)if count > 0:result = self._cursor.fetchone()else:result = Falsereturn resultdef getMany(self, sql, num, param=None):"""@summary: 执行查询,并取出num条结果@param sql:查询SQL,如果有查询条件,请只指定条件列表,并将条件值使用参数[param]传递进来@param num:取得的结果条数@param param: 可选参数,条件列表值(元组/列表)@return: result list/boolean 查询到的结果集"""if param is None:count = self._cursor.execute(sql)else:count = self._cursor.execute(sql, param)if count > 0:result = self._cursor.fetchmany(num)else:result = Falsereturn resultdef insertMany(self, sql, values):"""@summary: 向数据表插入多条记录@param sql:要插入的SQL格式@param values:要插入的记录数据tuple(tuple)/list[list]@return: count 受影响的行数"""count = self._cursor.executemany(sql, values)return countdef __query(self, sql, param=None):if param is None:count = self._cursor.execute(sql)else:count = self._cursor.execute(sql, param)return countdef update(self, sql, param=None):"""@summary: 更新数据表记录@param sql: SQL格式及条件,使用(%s,%s)@param param: 要更新的  值 tuple/list@return: count 受影响的行数"""return self.__query(sql, param)def insert(self, sql, param=None):"""@summary: 更新数据表记录@param sql: SQL格式及条件,使用(%s,%s)@param param: 要更新的  值 tuple/list@return: count 受影响的行数"""return self.__query(sql, param)def delete(self, sql, param=None):"""@summary: 删除数据表记录@param sql: SQL格式及条件,使用(%s,%s)@param param: 要删除的条件 值 tuple/list@return: count 受影响的行数"""return self.__query(sql, param)def begin(self):"""@summary: 开启事务"""self._conn.autocommit(0)def end(self, option='commit'):"""@summary: 结束事务"""if option == 'commit':self._conn.commit()else:self._conn.rollback()def dispose(self, isEnd=1):"""@summary: 释放连接池资源"""if isEnd == 1:self.end('commit')else:self.end('rollback')self._cursor.close()self._conn.close()mysql = MyPymysqlPool("dbMysql")if __name__ == '__main__':sqlAll = "select id, title from novel limit 2;"result = mysql.getAll(sqlAll)print(result)# 释放资源mysql.dispose()

[dbMysql]
host = localhost
port = 3306
user = root
password = 123456
db_name = house

http://www.lryc.cn/news/364690.html

相关文章:

  • Git 完整操作之记录
  • mediaPlayer的内存泄露解决方法
  • delphi3层 delphi 3层
  • Python编程学习第一篇——制作一个小游戏休闲一下
  • 03--nginx架构实战
  • 【力扣第 400 场周赛】Leetcode 删除星号以后字典序最小的字符串
  • Unity DOTS技术(九) BufferElement动态缓冲区组件
  • hnust 湖南科技大学 2022 软件测试报告+代码
  • 【面试笔记】单片机软件工程师,工业控制方向(储能)
  • 基于springboot实现小区团购管理系统项目【项目源码+论文说明】计算机毕业设计
  • 基于django | 创建数据库,实现增、删、查的功能
  • 数据结构与算法07-图
  • springboot项目部署需要redis集群问题
  • JVMの内存泄漏内存溢出案例分析
  • v31支架固定方式
  • Jenkins从入门到精通面试题及参考答案(3万字长文)
  • 如何使用电阻器?创建任何电阻的简单过程
  • 学Python,看一篇就够
  • 数据仓库核心:维度表设计的艺术与实践
  • SQL实验 连接查询和嵌套查询
  • 【JAVA WEB实用技巧与优化方案】Maven自动化构建与Maven 打包技巧
  • 详细分析Mysql中的SQL_MODE基本知识(附Demo讲解)
  • vue3+uniapp
  • 组织病理学结合人工智能之后,如何实际应用于临床?|顶刊精析·24-06-06
  • VCAST创建单元测试工程
  • 数据结构之归并排序算法【图文详解】
  • 设计模式基础
  • Glide支持通过url加载本地图标
  • 网络安全形势与WAF技术分享
  • 【实战JVM】-实战篇-06-GC调优