I'm 大力芝麻, a blogger at 靠谱客. This article, put together during recent development, is a Python learning note on scraping all the listings for a specified district from Lianjia (链家网) and saving them to a CSV file. I found it quite useful and am sharing it here as a reference.

Overview

Below is the code.

import requests, csv, random, re, time
from lxml import etree

# Pool of free HTTP proxies; a random one is used for each listing-page request.
proxy_list = [
    'http://113.128.28.170:9999',
    'http://47.107.93.105:8000',
    'http://123.163.122.189:9999',
    'http://182.61.109.24:8888',
    'http://123.54.44.220:9999',
    'http://116.208.102.146:61234',
    'http://122.4.49.199:9999',
    'http://47.107.93.105:8000',
    'http://123.163.122.189:9999',
    'http://182.61.109.24:8888',
    'http://123.54.44.220:9999',
    'http://116.208.102.146:61234',
    'http://122.4.49.199:9999',
    'http://218.27.84.219:80',
    'http://112.12.91.78:8888',
    'http://175.20.199.80:8080',
    'http://183.146.29.47:8888',
    'http://183.146.29.211:8888',
    'http://121.40.162.239:808',
    'http://123.139.56.238:9999',
    'http://112.12.91.34:8888',
    'http://121.10.139.191:3128',
    'http://112.12.91.26:8888',
    'http://117.127.16.206:8080',
    'http://183.146.29.215:8888',
    'http://27.203.209.148:8060',
    'http://183.146.29.29:8888',
    'http://121.227.80.115:8118',
    'http://115.208.18.139:61234',
    'http://183.6.183.35:3128',
    'http://117.28.96.23:9999',
    'http://121.40.64.214:80',
    'http://121.10.139.204:3128',
    'http://106.14.184.255:80',
    'http://112.12.91.210:8888',
    'http://220.133.218.213:60241',
    'http://122.4.41.161:9999',
    'http://1.194.118.51:9999',
    'http://101.132.193.192:8118',
    'http://112.12.91.43:8888',
    'http://221.6.201.18:9999',
    'http://118.78.196.7:8118',
    'http://180.160.54.117:8118',
    'http://112.85.129.171:9999',
    'http://39.137.69.6:80',
    'http://116.62.205.9:3128',
    'http://175.16.9.218:80',
    'http://121.236.73.101:61234',
    'http://171.11.32.17:9999',
    'http://42.238.81.226:9999',
    'http://39.137.69.7:8080',
    'http://183.146.29.220:8888',
    'http://183.146.29.216:8888',
    'http://125.111.137.36:8088',
    'http://183.146.29.34:8888',
    'http://42.229.189.66:8060',
    'http://183.146.29.240:8888',
    'http://112.12.91.74:8888',
    'http://1.198.72.196:9999',
    'http://112.85.129.183:9999',
    'http://123.54.224.95:9999',
    'http://182.116.229.53:9999',
    'http://1.198.73.16:9999',
    'http://61.128.208.94:3128',
    'http://112.87.69.179:9999',
    'http://113.128.25.245:61234',
    'http://1.198.72.207:9999',
    'http://123.54.47.81:9999',
    'http://171.11.29.187:9999',
    'http://171.12.113.153:9999',
    'http://112.12.91.241:8888',
    'http://183.146.29.208:8888',
    'http://112.12.91.209:8888',
    'http://118.31.64.170:3128',
    'http://113.110.47.37:61234',
    'http://39.135.24.11:80',
    'http://183.146.29.36:8888',
    'http://183.146.29.59:8888',
    'http://123.54.251.67:9999',
    'http://106.52.181.184:80',
    'http://110.86.139.23:9999',
    'http://218.60.8.99:3129',
]
user_agent = [
"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52"
]
headers={"User-Agent": random.choice(user_agent)}
class lianjia(object):
    headers_list = ['名称', '地址', '社区名称', '楼层', '标签', '关注人数', '发布时间', '总价/万', '平方价格/元']

    def __init__(self, url):
        self.url = url
        self.list = []

    def get_adders_many_html(self, url):
        # Fetch the HTML of the district overview page
        r = requests.get(url, headers=headers)
        return r.text

    def get_adders_html_url(self, html):
        # Extract the URL and name of every sub-district on the overview page
        xml = etree.HTML(html)
        urls = xml.xpath('//div[@data-role="ershoufang"]/div/a/@href')
        title = xml.xpath('//div[@data-role="ershoufang"]/div/a/text()')
        return zip(urls, title)

    def get_url_html(self, url):
        # Fetch one listing page through a randomly chosen proxy
        try:
            r = requests.get(url, headers=headers,
                             proxies={'http': random.choice(proxy_list)})
            if r.status_code == 200:
                html = r.text
                return html
            else:
                print(r.status_code)
        except Exception as e:
            print('错误信息:', e)

    def save_csv_headers(self, title):
        # Write the CSV header row before any data rows
        with open('链家_{}.csv'.format(title), 'a+', encoding='gbk', newline='') as f:
            write = csv.DictWriter(f, self.headers_list)
            write.writeheader()
    # Parse one page of listings and append each row to the CSV file
    def get_url_data(self, html, title):
        xml = etree.HTML(html)
        title_list = xml.xpath('//ul[@log-mod="list"]/li/div[@class="info clear"]/div[@class="title"]/a/text()')
        # listing title
        tog_1_list = xml.xpath(
            '//ul[@log-mod="list"]/li/div[@class="info clear"]/div[@class="address"]/div/text()')
        # tag 1: house information text
        tog_2_list = xml.xpath(
            '//ul[@log-mod="list"]/li/div[@class="info clear"]/div[@class="address"]/div/a/text()')
        # tag 2: community name
        floot_list = xml.xpath('//ul[@log-mod="list"]/li/div[@class="info clear"]/div[@class="flood"]/div/text()')
        # floor
        address_list = xml.xpath(
            '//ul[@log-mod="list"]/li/div[@class="info clear"]/div[@class="flood"]/div/a/text()')
        # address
        peopre_list = xml.xpath(
            '//ul[@log-mod="list"]/li/div[@class="info clear"]/div[@class="followInfo"]/text()')
        # follower count / publish time
        The_total_price_list = xml.xpath(
            '//ul[@log-mod="list"]/li/div[@class="info clear"]/div[@class="priceInfo"]/div[1]/span/text()')
        # total price (in 万)
        square = xml.xpath(
            '//ul[@log-mod="list"]/li/div[@class="info clear"]/div[@class="priceInfo"]/div[2]/span/text()')
        # price per square metre (in 元)
        for i in zip(title_list, tog_1_list, tog_2_list, floot_list, address_list, peopre_list,
                     The_total_price_list, square):
            data = {
                '名称': i[0],
                '社区名称': i[2],
                '标签': i[1],
                '楼层': i[3].replace(' ', '').replace('-', ''),
                '地址': i[4],
                '关注人数': re.search(r'(\d+).*?', i[5].split('/')[0]).group(1),
                '发布时间': re.search(r'(.*?)发布', i[5].split('/')[-1]).group(1),
                '总价/万': i[6],
                '平方价格/元': re.search(r'.*?(\d+).*?', i[7]).group(1)
            }
            with open('链家_{}.csv'.format(title), 'a+', encoding='gbk', newline='') as f:
                write = csv.DictWriter(f, self.headers_list)
                write.writerow(data)
    # Main driver: loop over every sub-district and every result page
    def run(self):
        html = self.get_adders_many_html(self.url)
        zip_list = self.get_adders_html_url(html)
        for b in zip_list:
            self.save_csv_headers(b[1])
            print('当前爬取地址为:', b[1])
            for i in range(0, 100 + 1):
                try:
                    urls = 'https://zz.lianjia.com{}pg{}/'.format(b[0], i)
                    html = self.get_url_html(urls)
                    print('当前爬取网页:', urls)
                    data = self.get_url_data(html, b[1])
                    time.sleep(random.randint(2, 6))
                except Exception:
                    pass
def main():
    url = 'https://zz.lianjia.com/ershoufang/erqi/'
    lianjia_data = lianjia(url)
    lianjia_data.run()

if __name__ == '__main__':
    main()
'''
Error encountered: io.UnsupportedOperation: not writable
(the file had been opened without write permission)
'''
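For reference, this error appears whenever you write to a file object that was opened read-only. A minimal sketch that reproduces it (the file name demo.csv is purely illustrative):

import csv

open('demo.csv', 'w').close()                    # make sure the file exists first
with open('demo.csv', 'r', newline='') as f:     # 'r' = read-only, no write permission
    writer = csv.DictWriter(f, ['名称', '地址'])
    writer.writeheader()                         # raises io.UnsupportedOperation: not writable

The fix is simply to open the file in a writable mode such as 'w', 'a' or 'a+', as the scraper code above does.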

Note that when writing rows as dicts with the csv module, you have to write the header row first, and every key in the dict must correspond one-to-one with a field in that header.
Also, when opening the file for writing, pass newline='' to open(), so the program does not automatically add an extra newline after each row it writes.
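A minimal sketch of that pattern (the file name demo.csv and the sample row are only illustrative):

import csv

fieldnames = ['名称', '地址']                                   # header row fields
with open('demo.csv', 'w', encoding='gbk', newline='') as f:   # newline='' avoids extra blank rows
    writer = csv.DictWriter(f, fieldnames)
    writer.writeheader()                                       # header row goes in first
    writer.writerow({'名称': '某小区 2室1厅', '地址': '二七'})    # dict keys must match fieldnames

If a dict contains a key that is not listed in fieldnames, DictWriter raises a ValueError by default.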

Finally

That's everything in this note on scraping all the listings for a specified Lianjia district and saving them to a CSV file. I hope it helps you with similar development problems.
