python学习 - 爬虫案例 - 爬取链接房产信息入数据库代码实例
#coding=utf-8
#!/usr/bin/python
# 导入requests库
import requests
# 导入文件操作库
import os
import re
import bs4
from bs4 import BeautifulSoup
import sys
from util.mysql_DBUtils import mysql# 写入数据库
def write_db(param):try:sql = "insert into house (url,housing_estate,position,square_metre,unit_price,total_price,follow,take_look,pub_date) "sql = sql + "VALUES(%(url)s,%(housing_estate)s, %(position)s,%(square_metre)s,"sql = sql + "%(unit_price)s,%(total_price)s,%(follow)s,%(take_look)s,%(pub_date)s)"mysql.insert(sql, param)except Exception as e:print(e)# 主方法
def main():# 给请求指定一个请求头来模拟chrome浏览器headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}page_max = 100# 爬图地址for i in range(1, int(page_max) + 1):print("第几页:" + str(i))if i == 1:house = 'https://qd.lianjia.com/ershoufang/shibei/'else:house = 'https://qd.lianjia.com/ershoufang/shibei/pg'+str(i)res = requests.get(house, headers=headers)soup = BeautifulSoup(res.text, 'html.parser')li_max = soup.find('ul', class_='sellListContent').find_all('li')for li in li_max:try:house_param = {}# 格式 荣馨苑 | 3室2厅 | 115.91平米 | 南 北 | 毛坯 | 无电梯content = li.find('div', class_='houseInfo').textcontent = content.split("|")house_param['housing_estate'] = content[0]house_param['square_metre'] = re.findall(r'-?\d+\.?\d*e?-?\d*?', content[2])[0]# --------------------------------------------------------## 位置 水清沟position = li.find('div', class_='positionInfo').find('a').texthouse_param['position'] = position# --------------------------------------------------------#totalprice = li.find('div', class_='totalPrice').texthouse_param['total_price'] = re.sub("\D", "", totalprice)unitprice = li.find('div', class_='unitPrice').texthouse_param['unit_price'] = re.sub("\D", "", unitprice)# --------------------------------------------------------## 57人关注 / 共13次带看 / 6个月以前发布follow = li.find('div', class_='followInfo').textfollow = follow.split("/")house_param['follow'] = re.sub("\D", "", follow[0])house_param['take_look'] = re.sub("\D", "", follow[1])# --------------------------------------------------------## 二手房地址title_src = li.find('div', class_='title').find('a').attrs['href']house_param['url'] = re.sub("\D", "", title_src)res = requests.get(title_src, headers=headers)soup = BeautifulSoup(res.text, 'html.parser')# --------------------------------------------------------## 挂牌时间(重要数据)pub_date = soup.find('div', class_='transaction').find_all('li')[0].find_all('span')[1].texthouse_param['pub_date'] = pub_datewrite_db(house_param)except Exception as e:print(e)mysql.end("commit")mysql.dispose()if __name__ == '__main__':main()
#!/usr/bin/python3
# -*- coding:utf-8 -*-
import pymysql, os, configparser
from pymysql.cursors import DictCursor
from DBUtils.PooledDB import PooledDBclass Config(object):"""# Config().get_content("user_information")配置文件里面的参数[dbMysql]host = 192.168.1.80port = 3306user = rootpassword = 123456"""def __init__(self, config_filename="dbMysqlConfig.cnf"):file_path = os.path.join(os.path.dirname(__file__), config_filename)self.cf = configparser.ConfigParser()self.cf.read(file_path)def get_sections(self):return self.cf.sections()def get_options(self, section):return self.cf.options(section)def get_content(self, section):result = {}for option in self.get_options(section):value = self.cf.get(section, option)result[option] = int(value) if value.isdigit() else valuereturn resultclass BasePymysqlPool(object):def __init__(self, host, port, user, password, db_name):self.db_host = hostself.db_port = int(port)self.user = userself.password = str(password)self.db = db_nameself.conn = Noneself.cursor = Noneclass MyPymysqlPool(BasePymysqlPool):"""MYSQL数据库对象,负责产生数据库连接 , 此类中的连接采用连接池实现获取连接对象:conn = Mysql.getConn()释放连接对象;conn.close()或del conn"""# 连接池对象__pool = Nonedef __init__(self, conf_name=None):self.conf = Config().get_content(conf_name)super(MyPymysqlPool, self).__init__(**self.conf)# 数据库构造函数,从连接池中取出连接,并生成操作游标self._conn = self.__getConn()self._cursor = self._conn.cursor()def __getConn(self):"""@summary: 静态方法,从连接池中取出连接@return MySQLdb.connection"""if MyPymysqlPool.__pool is None:__pool = PooledDB(creator=pymysql,mincached=1,maxcached=20,host=self.db_host,port=self.db_port,user=self.user,passwd=self.password,db=self.db,use_unicode=True,charset="utf8",cursorclass=DictCursor)return __pool.connection()def getAll(self, sql, param=None):"""@summary: 执行查询,并取出所有结果集@param sql:查询SQL,如果有查询条件,请只指定条件列表,并将条件值使用参数[param]传递进来@param param: 可选参数,条件列表值(元组/列表)@return: result list(字典对象)/boolean 查询到的结果集"""if param is None:count = self._cursor.execute(sql)else:count = self._cursor.execute(sql, param)if count > 0:result = self._cursor.fetchall()else:result = Falsereturn resultdef getOne(self, sql, param=None):"""@summary: 执行查询,并取出第一条@param sql:查询SQL,如果有查询条件,请只指定条件列表,并将条件值使用参数[param]传递进来@param param: 可选参数,条件列表值(元组/列表)@return: result list/boolean 查询到的结果集"""if param is None:count = self._cursor.execute(sql)else:count = self._cursor.execute(sql, param)if count > 0:result = self._cursor.fetchone()else:result = Falsereturn resultdef getMany(self, sql, num, param=None):"""@summary: 执行查询,并取出num条结果@param sql:查询SQL,如果有查询条件,请只指定条件列表,并将条件值使用参数[param]传递进来@param num:取得的结果条数@param param: 可选参数,条件列表值(元组/列表)@return: result list/boolean 查询到的结果集"""if param is None:count = self._cursor.execute(sql)else:count = self._cursor.execute(sql, param)if count > 0:result = self._cursor.fetchmany(num)else:result = Falsereturn resultdef insertMany(self, sql, values):"""@summary: 向数据表插入多条记录@param sql:要插入的SQL格式@param values:要插入的记录数据tuple(tuple)/list[list]@return: count 受影响的行数"""count = self._cursor.executemany(sql, values)return countdef __query(self, sql, param=None):if param is None:count = self._cursor.execute(sql)else:count = self._cursor.execute(sql, param)return countdef update(self, sql, param=None):"""@summary: 更新数据表记录@param sql: SQL格式及条件,使用(%s,%s)@param param: 要更新的 值 tuple/list@return: count 受影响的行数"""return self.__query(sql, param)def insert(self, sql, param=None):"""@summary: 更新数据表记录@param sql: SQL格式及条件,使用(%s,%s)@param param: 要更新的 值 tuple/list@return: count 受影响的行数"""return self.__query(sql, param)def delete(self, sql, param=None):"""@summary: 删除数据表记录@param sql: SQL格式及条件,使用(%s,%s)@param param: 要删除的条件 值 tuple/list@return: count 受影响的行数"""return self.__query(sql, param)def begin(self):"""@summary: 开启事务"""self._conn.autocommit(0)def end(self, option='commit'):"""@summary: 结束事务"""if option == 'commit':self._conn.commit()else:self._conn.rollback()def dispose(self, isEnd=1):"""@summary: 释放连接池资源"""if isEnd == 1:self.end('commit')else:self.end('rollback')self._cursor.close()self._conn.close()mysql = MyPymysqlPool("dbMysql")if __name__ == '__main__':sqlAll = "select id, title from novel limit 2;"result = mysql.getAll(sqlAll)print(result)# 释放资源mysql.dispose()
[dbMysql]
host = localhost
port = 3306
user = root
password = 123456
db_name = house