Preface:
Install the required libraries beforehand (besides the standard library, the script uses beautifulsoup4, lxml, xlsxwriter and xlrd).
The only input the script needs is the rental-listing homepage of one city on Anjuke; every other URL is derived from it automatically.
Create the required directories before running, or add the corresponding os.makedirs() calls to the code (a sketch follows these notes).
The crawl can be resumed after an interruption: pages that have already been saved to disk are skipped, so simply rerun the script.
The request headers (Cookie, User-Agent and so on) depend on your environment and may need to be adjusted.
The script uses an AMap (Gaode) API key to obtain geographic coordinates; the key has been removed for publication. To use that part, register as an AMap developer and insert your own key.
This is original content; please credit the source when quoting. Note: http://www.cnblogs.com/shadrach; author: shadrach@yeah.net.
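If you would rather let the script create the folders itself, a minimal sketch could be placed before Step 1. It assumes the same E:/20180123安居客_南昌租房 base directory and the links / split_data sub-folders used in the code below; adjust the path for your own machine.

import os

base = "E:/20180123安居客_南昌租房"   # assumed base path, same as in the script below
for sub in ("links", "split_data"):
    # exist_ok=True keeps a rerun (断点重爬) from failing when the folders already exist
    os.makedirs(base + "/" + sub, exist_ok=True)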
# author: shadrach@yeah.net
# blog: http://www.cnblogs.com/shadrach
# NOTE: original article, indicate the source if reprint. Thanks.
# Update: 2018/1/24

import urllib.request
import urllib.parse   # for urllib.parse.quote used in the AMap request below
from bs4 import BeautifulSoup
import xlsxwriter
import xlrd
import os
import math
import time
import glob

# coordinate convert: from gcj(amap) to wgs(gps)
def GCJ2WGS(location):
    # location格式如下:locations[1] = "113.923745,22.530824"
    lon = float(location[0:location.find(",")])
    lat = float(location[location.find(",") + 1:len(location)])
    a = 6378245.0   # 克拉索夫斯基椭球参数长半轴a
    ee = 0.00669342162296594323   # 克拉索夫斯基椭球参数第一偏心率平方
    PI = 3.14159265358979324   # 圆周率
    # 以下为转换公式
    x = lon - 105.0
    y = lat - 35.0
    # 经度
    dLon = 300.0 + x + 2.0 * y + 0.1 * x * x + 0.1 * x * y + 0.1 * math.sqrt(abs(x))
    dLon += (20.0 * math.sin(6.0 * x * PI) + 20.0 * math.sin(2.0 * x * PI)) * 2.0 / 3.0
    dLon += (20.0 * math.sin(x * PI) + 40.0 * math.sin(x / 3.0 * PI)) * 2.0 / 3.0
    dLon += (150.0 * math.sin(x / 12.0 * PI) + 300.0 * math.sin(x / 30.0 * PI)) * 2.0 / 3.0
    # 纬度
    dLat = -100.0 + 2.0 * x + 3.0 * y + 0.2 * y * y + 0.1 * x * y + 0.2 * math.sqrt(abs(x))
    dLat += (20.0 * math.sin(6.0 * x * PI) + 20.0 * math.sin(2.0 * x * PI)) * 2.0 / 3.0
    dLat += (20.0 * math.sin(y * PI) + 40.0 * math.sin(y / 3.0 * PI)) * 2.0 / 3.0
    dLat += (160.0 * math.sin(y / 12.0 * PI) + 320 * math.sin(y * PI / 30.0)) * 2.0 / 3.0
    radLat = lat / 180.0 * PI
    magic = math.sin(radLat)
    magic = 1 - ee * magic * magic
    sqrtMagic = math.sqrt(magic)
    dLat = (dLat * 180.0) / ((a * (1 - ee)) / (magic * sqrtMagic) * PI)
    dLon = (dLon * 180.0) / (a / sqrtMagic * math.cos(radLat) * PI)
    wgsLon = lon - dLon
    wgsLat = lat - dLat
    return wgsLon, wgsLat

# xlsx files merge
def xlsx_merge(fileLocation, header, filename):
    fileList = []
    for fileName in glob.glob(fileLocation + "*.xlsx"):
        fileList.append(fileName)
    fileNum = len(fileList)
    matrix = [None] * fileNum
    for i in range(fileNum):
        fileName = fileList[i]
        workBook = xlrd.open_workbook(fileName)
        try:
            sheet = workBook.sheet_by_index(0)
        except Exception as e:
            print(e)
        nRows = sheet.nrows
        matrix[i] = [0] * (nRows - 1)
        nCols = sheet.ncols
        for m in range(nRows - 1):
            matrix[i][m] = ["0"] * nCols
        for j in range(1, nRows):
            for k in range(nCols):
                matrix[i][j - 1][k] = sheet.cell(j, k).value
    fileName = xlsxwriter.Workbook(fileLocation + filename + ".xlsx")
    sheet = fileName.add_worksheet("merged")
    for i in range(len(header)):
        sheet.write(0, i, header[i])
    rowIndex = 1
    for fileIndex in range(fileNum):
        for j in range(len(matrix[fileIndex])):
            for colIndex in range(len(matrix[fileIndex][j])):
                sheet.write(rowIndex, colIndex, matrix[fileIndex][j][colIndex])
            rowIndex += 1
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + ": " + "已完成%d个文件的合并" % fileNum)
    fileName.close()

# uniform request
def soup_form(url, referer):
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        # "Accept-Encoding":"gzip, deflate, sdch, br",  # 解码错误,注释
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Cookie": "lps=http%3A%2F%2Fwww.anjuke.com%2F%3Fpi%3DPZ-baidu-pc-all-biaoti%7Chttp%3A%2F%2Fbzclk.baidu.com%2Fadrc.php%3Ft%3D06KL00c00fDgzw60mUFU00PpAs0Mhyup00000PkqW-b00000uN71Vj.THvs_oeHEtY0UWdBmy-bIfK15yNBnHfkrjfLnj0sn1bdmWD0IHYLfbcsnYuKwj-7f1KKfHT4nj0sPYRvwj0dPDFanYFKfsK95gTqFhdWpyfqn103nWfLP1ndniusThqbpyfqnHm0uHdCIZwsT1CEQLILIz49UhGdpvR8mvqVQ1qspHdfyBdBmy-bIidsmzd9UAsVmh-9ULwG0APzm1YkrH6dP0%26tpl%3Dtpl_10085_16624_12226%26l%3D1502510556%26attach%3Dlocation%253D%2526linkName%253D%2525E6%2525A0%252587%2525E9%2525A2%252598%2526linkText%253D%2525E5%2525AE%252589%2525E5%2525B1%252585%2525E5%2525AE%2525A2-%2525E5%25259B%2525BD%2525E5%252586%252585%2525E9%2525A2%252586%2525E5%252585%252588%2525E6%252589%2525BE%2525E6%252588%2525BF%2525E5%2525B9%2525B3%2525E5%25258F%2525B0%2525EF%2525BC%25258C%2525E5%2525AE%252589%2525E5%2525BF%252583%2525E6%25258C%252591%2526xp%253Did%28%252522m4ce5ae35%252522%29%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FH2%25255B1%25255D%25252FA%25255B1%25255D%2526linkType%253D%2526checksum%253D54%26ie%3DUTF-8%26f%3D8%26tn%3Dbaidu%26wd%3D%25E5%25AE%2589%25E5%25B1%2585%25E5%25AE%25A2%26oq%3D%25E5%25AE%2589%25E5%25B1%2585%25E5%25AE%25A2%26rqlang%3Dcn; sessid=CE9A95AF-043B-90B5-A2E4-5F5D39B41EC4; als=0; ctid=41; ANJUKE_BUCKET=pc-home%3AErshou_Web_Home_Home-a; _ga=GA1.2.113488767.1516673325; _gid=GA1.2.255451285.1516673325; __xsptplusUT_8=1; __xsptplus8=8.2.1516678573.1516678593.4%232%7Cbzclk.baidu.com%7C%7C%7C%25E5%25AE%2589%25E5%25B1%2585%25E5%25AE%25A2%7C%23%23249u729XL4J3ZAGKQyEZUyuV4myBLtSZ%23; 58tj_uuid=8a65130f-1085-403a-9e02-5c07dba15641; new_session=0; init_refer=https%253A%252F%252Fnc.zu.anjuke.com%252F%253Ffrom%253Dnavigation; new_uv=2; aQQ_ajkguid=BC9AF129-431B-1C4F-BB91-A27203DE8341; twe=2; Hm_lvt_ed38609fc79dd16e428d5a06610cfeb9=1516673382; Hm_lpvt_ed38609fc79dd16e428d5a06610cfeb9=1516678594",
        "Referer": referer,
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
    }
    request = urllib.request.Request(url = url, headers = headers)
    soup = BeautifulSoup(urllib.request.urlopen(request, timeout=60).read().decode("utf-8"), "lxml")
    return soup

header = ["名称","房型","面积","层数","中介人","小区","高德地址","高德坐标","wgs坐标","wgs经度","wgs纬度","路段","地址","特点一","特点二","特点三","价格","房源链接"]

# Step 1: get and save or read level_1 and level_2 links
links_file = "E:/20180123安居客_南昌租房/links/links.xlsx"
if os.path.exists(links_file):
    workbook_links = xlrd.open_workbook(links_file)
    sheet_links = workbook_links.sheet_by_index(0)
    level2_link = sheet_links.col_values(0)
    level2 = sheet_links.col_values(1)
else:
    # get sub_level1
    url_level_0 = "https://nc.zu.anjuke.com/fangyuan/p1/"   # 这个地址是需要根据你所爬取的城市进行修改的
    level1_link = []
    level1 = []
    for a in soup_form(url_level_0, "https://nc.zu.anjuke.com/").find("div", class_ = "sub-items sub-level1").find_all("a"):
        level1_link.append(a.get("href"))
        level1.append(a.text)
    # get sub_level2
    level2_link = []
    level2 = []
    for i in range(1, len(level1_link)):
        for a in soup_form(level1_link[i], level1_link[i-1]).find("div", class_ = "sub-items sub-level2").find_all("a"):
            if a.text == "全部":
                pass
            else:
                level2_link.append(a.get("href"))
                level2.append(a.text)
    workbook_links = xlsxwriter.Workbook(links_file)
    sheet_links = workbook_links.add_worksheet("level_2")
    sheet_links.write_column(0, 0, level2_link)
    sheet_links.write_column(0, 1, level2)
    workbook_links.close()
print("Step 1 Done!\nStep 2 Start!")

# Step 2: get every level 2 links' rent information
# NOTE: the separator character inside several split("...") calls below was lost when this
# post was copied; "\xa0" (non-breaking space) is assumed here and may need adjusting.
for j in range(len(level2_link)):   # at every level 2 page
    page_index = 1
    for k in range(1, 51):   # max loop
        # at every page, get the max page, and compare to the current page. if more than current page, continue
        rent_info_file = "E:/20180123安居客_南昌租房/split_data/" + level2[j] + "_info_page" + str(page_index) + ".xlsx"
        if os.path.exists(rent_info_file):
            print(level2[j] + "_info_page" + str(page_index) + ".xlsx already exists")
            page_index += 1
        else:
            pages = []
            url = level2_link[j] + "p" + str(page_index)
            soup = soup_form(url, level2_link[j])
            try:
                for a in soup.find("div", class_ = "multi-page").find_all("a"):
                    if a.text == "下一页 >" or a.text == "上一页":
                        pass
                    else:
                        pages.append(int(a.text))
            except Exception:
                break
            try:
                max_page = pages[len(pages) - 1]
            except Exception:
                max_page = 1
            if page_index < max_page + 2:
                workbook_page = xlsxwriter.Workbook(rent_info_file)
                sheet = workbook_page.add_worksheet("page" + str(page_index))
                for header_index in range(len(header)):
                    sheet.write(0, header_index, header[header_index])
                row_index = 1
                for div in soup.find_all("div", class_ = "zu-itemmod"):
                    try:
                        sheet.write(row_index, 0, div.find("a").get("title"))   # 名称
                    except Exception:
                        pass
                    try:
                        sheet.write(row_index, 1, div.find("p").text.split("\xa0")[0].split("|")[0].replace(" ", "").replace("\n", ""))   # 房型
                    except Exception:
                        pass
                    try:
                        sheet.write(row_index, 2, div.find("p").text.split("\xa0")[0].split("|")[1])   # 面积
                    except Exception:
                        pass
                    try:
                        sheet.write(row_index, 3, div.find("p").text.split("\xa0")[0].split("|")[2])   # 层数
                    except Exception:
                        pass
                    try:
                        sheet.write(row_index, 4, div.find("p").text.split("\xa0")[1])   # 中介人
                    except Exception:
                        pass
                    try:
                        xiaoqu = div.find("address").text.split()[0]
                        sheet.write(row_index, 5, xiaoqu)   # 小区
                        url_amap = "http://restapi.amap.com/v3/geocode/geo?address=" + urllib.parse.quote(xiaoqu) + "&output=xml&city=0791&key=【你的key】"
                        soup_amap = BeautifulSoup(urllib.request.urlopen(url_amap).read(), "xml")
                        sheet.write(row_index, 6, soup_amap.find("formatted_address").get_text())   # 高德地址
                        location_amap = soup_amap.find("location").get_text()
                        sheet.write(row_index, 7, location_amap)   # 高德坐标
                        location_wgs = GCJ2WGS(location_amap)
                        longitude = location_wgs[0]
                        latitude = location_wgs[1]
                        sheet.write(row_index, 8, str(longitude) + "," + str(latitude))   # wgs坐标
                        sheet.write(row_index, 9, longitude)    # wgs经度
                        sheet.write(row_index, 10, latitude)    # wgs纬度
                    except Exception:
                        pass
                    try:
                        sheet.write(row_index, 11, div.find("address").text.split()[1])   # 路段
                    except Exception:
                        pass
                    try:
                        sheet.write(row_index, 12, div.find("address").text.split()[2])   # 地址
                    except Exception:
                        pass
                    try:
                        sheet.write(row_index, 13, div.find("span", class_ = "cls-1").text)   # 特点一
                    except Exception:
                        pass
                    try:
                        sheet.write(row_index, 14, div.find("span", class_ = "cls-2").text)   # 特点二
                    except Exception:
                        pass
                    try:
                        sheet.write(row_index, 15, div.find("span", class_ = "cls-3").text)   # 特点三
                    except Exception:
                        pass
                    try:
                        sheet.write(row_index, 16, div.find("strong").text)   # 价格
                    except Exception:
                        pass
                    try:
                        sheet.write(row_index, 17, div.find("a").get("href"))   # 房源链接
                    except Exception:
                        pass
                    row_index += 1
                workbook_page.close()
                print(level2[j] + " page" + str(page_index) + " finished")
                page_index += 1
            else:
                break
    print(level2[j] + " finished")
print("Step 2 Done!\nStep 3 Start!")

# Step 3: merge all xlsx files
xlsx_merge("E:/20180123安居客_南昌租房/split_data/", header, "nanchang_rent_info")
print("All work done")
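To sanity-check the GCJ-02 to WGS-84 conversion without running the whole crawl, a quick standalone test can be run once the GCJ2WGS function above has been defined. It uses the sample coordinate from the function's own comment; the converted point should differ from the input by no more than a few hundred metres, which is the typical GCJ-02 offset in mainland China.

# Quick check of GCJ2WGS (run after defining the function above).
gcj = "113.923745,22.530824"          # sample GCJ-02 coordinate from the function comment
wgs_lon, wgs_lat = GCJ2WGS(gcj)
print("GCJ-02:", gcj)
print("WGS-84: %.6f,%.6f" % (wgs_lon, wgs_lat))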