Preface:
Install the required libraries beforehand (besides the standard library, the script uses beautifulsoup4, lxml, xlsxwriter and xlrd).
The only input the script needs is the rental-listing homepage of one city on Anjuke; every other URL is derived from it automatically.
Create the required directories before running, or add the corresponding os.makedirs() calls to the code (a sketch follows these notes).
The crawl can be resumed after an interruption: pages that have already been saved to disk are skipped, so simply rerun the script.
The request headers (Cookie, User-Agent and so on) depend on your environment and may need to be adjusted.
The script uses an AMap (Gaode) API key to obtain geographic coordinates; the key has been removed for publication. To use that part, register as an AMap developer and insert your own key.
This is original content; please credit the source when quoting. Note: http://www.cnblogs.com/shadrach; author: shadrach@yeah.net.
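If you would rather let the script create the folders itself, a minimal sketch could be placed before Step 1. It assumes the same E:/20180123安居客_南昌租房 base directory and the links / split_data sub-folders used in the code below; adjust the path for your own machine.

import os

base = "E:/20180123安居客_南昌租房"   # assumed base path, same as in the script below
for sub in ("links", "split_data"):
    # exist_ok=True keeps a rerun (断点重爬) from failing when the folders already exist
    os.makedirs(base + "/" + sub, exist_ok=True)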
# author: shadrach@yeah.net
# blog: http://www.cnblogs.com/shadrach
# NOTE: original article, indicate the source if reprint. Thanks.
# Update: 2018/1/24

import urllib.request
import urllib.parse   # for urllib.parse.quote used in the AMap request below
from bs4 import BeautifulSoup
import xlsxwriter
import xlrd
import os
import math
import time
import glob

# coordinate convert: from gcj(amap) to wgs(gps)
def GCJ2WGS(location):
    # location格式如下:locations[1] = "113.923745,22.530824"
    lon = float(location[0:location.find(",")])
    lat = float(location[location.find(",") + 1:len(location)])
    a = 6378245.0   # 克拉索夫斯基椭球参数长半轴a
    ee = 0.00669342162296594323   # 克拉索夫斯基椭球参数第一偏心率平方
    PI = 3.14159265358979324   # 圆周率
    # 以下为转换公式
    x = lon - 105.0
    y = lat - 35.0
    # 经度
    dLon = 300.0 + x + 2.0 * y + 0.1 * x * x + 0.1 * x * y + 0.1 * math.sqrt(abs(x))
    dLon += (20.0 * math.sin(6.0 * x * PI) + 20.0 * math.sin(2.0 * x * PI)) * 2.0 / 3.0
    dLon += (20.0 * math.sin(x * PI) + 40.0 * math.sin(x / 3.0 * PI)) * 2.0 / 3.0
    dLon += (150.0 * math.sin(x / 12.0 * PI) + 300.0 * math.sin(x / 30.0 * PI)) * 2.0 / 3.0
    # 纬度
    dLat = -100.0 + 2.0 * x + 3.0 * y + 0.2 * y * y + 0.1 * x * y + 0.2 * math.sqrt(abs(x))
    dLat += (20.0 * math.sin(6.0 * x * PI) + 20.0 * math.sin(2.0 * x * PI)) * 2.0 / 3.0
    dLat += (20.0 * math.sin(y * PI) + 40.0 * math.sin(y / 3.0 * PI)) * 2.0 / 3.0
    dLat += (160.0 * math.sin(y / 12.0 * PI) + 320 * math.sin(y * PI / 30.0)) * 2.0 / 3.0
    radLat = lat / 180.0 * PI
    magic = math.sin(radLat)
    magic = 1 - ee * magic * magic
    sqrtMagic = math.sqrt(magic)
    dLat = (dLat * 180.0) / ((a * (1 - ee)) / (magic * sqrtMagic) * PI)
    dLon = (dLon * 180.0) / (a / sqrtMagic * math.cos(radLat) * PI)
    wgsLon = lon - dLon
    wgsLat = lat - dLat
    return wgsLon, wgsLat

# xlsx files merge
def xlsx_merge(fileLocation, header, filename):
    fileList = []
    for fileName in glob.glob(fileLocation + "*.xlsx"):
        fileList.append(fileName)
    fileNum = len(fileList)
    matrix = [None] * fileNum
    for i in range(fileNum):
        fileName = fileList[i]
        workBook = xlrd.open_workbook(fileName)
        try:
            sheet = workBook.sheet_by_index(0)
        except Exception as e:
            print(e)
        nRows = sheet.nrows
        matrix[i] = [0] * (nRows - 1)
        nCols = sheet.ncols
        for m in range(nRows - 1):
            matrix[i][m] = ["0"] * nCols
        for j in range(1, nRows):
            for k in range(nCols):
                matrix[i][j - 1][k] = sheet.cell(j, k).value
    fileName = xlsxwriter.Workbook(fileLocation + filename + ".xlsx")
    sheet = fileName.add_worksheet("merged")
    for i in range(len(header)):
        sheet.write(0, i, header[i])
    rowIndex = 1
    for fileIndex in range(fileNum):
        for j in range(len(matrix[fileIndex])):
            for colIndex in range(len(matrix[fileIndex][j])):
                sheet.write(rowIndex, colIndex, matrix[fileIndex][j][colIndex])
            rowIndex += 1
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + ": " + "已完成%d个文件的合并" % fileNum)
    fileName.close()

# uniform request
def soup_form(url, referer):
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        # "Accept-Encoding":"gzip, deflate, sdch, br",  # 解码错误,注释
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Cookie": "lps=http%3A%2F%2Fwww.anjuke.com%2F%3Fpi%3DPZ-baidu-pc-all-biaoti%7Chttp%3A%2F%2Fbzclk.baidu.com%2Fadrc.php%3Ft%3D06KL00c00fDgzw60mUFU00PpAs0Mhyup00000PkqW-b00000uN71Vj.THvs_oeHEtY0UWdBmy-bIfK15yNBnHfkrjfLnj0sn1bdmWD0IHYLfbcsnYuKwj-7f1KKfHT4nj0sPYRvwj0dPDFanYFKfsK95gTqFhdWpyfqn103nWfLP1ndniusThqbpyfqnHm0uHdCIZwsT1CEQLILIz49UhGdpvR8mvqVQ1qspHdfyBdBmy-bIidsmzd9UAsVmh-9ULwG0APzm1YkrH6dP0%26tpl%3Dtpl_10085_16624_12226%26l%3D1502510556%26attach%3Dlocation%253D%2526linkName%253D%2525E6%2525A0%252587%2525E9%2525A2%252598%2526linkText%253D%2525E5%2525AE%252589%2525E5%2525B1%252585%2525E5%2525AE%2525A2-%2525E5%25259B%2525BD%2525E5%252586%252585%2525E9%2525A2%252586%2525E5%252585%252588%2525E6%252589%2525BE%2525E6%252588%2525BF%2525E5%2525B9%2525B3%2525E5%25258F%2525B0%2525EF%2525BC%25258C%2525E5%2525AE%252589%2525E5%2525BF%252583%2525E6%25258C%252591%2526xp%253Did%28%252522m4ce5ae35%252522%29%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FH2%25255B1%25255D%25252FA%25255B1%25255D%2526linkType%253D%2526checksum%253D54%26ie%3DUTF-8%26f%3D8%26tn%3Dbaidu%26wd%3D%25E5%25AE%2589%25E5%25B1%2585%25E5%25AE%25A2%26oq%3D%25E5%25AE%2589%25E5%25B1%2585%25E5%25AE%25A2%26rqlang%3Dcn; sessid=CE9A95AF-043B-90B5-A2E4-5F5D39B41EC4; als=0; ctid=41; ANJUKE_BUCKET=pc-home%3AErshou_Web_Home_Home-a; _ga=GA1.2.113488767.1516673325; _gid=GA1.2.255451285.1516673325; __xsptplusUT_8=1; __xsptplus8=8.2.1516678573.1516678593.4%232%7Cbzclk.baidu.com%7C%7C%7C%25E5%25AE%2589%25E5%25B1%2585%25E5%25AE%25A2%7C%23%23249u729XL4J3ZAGKQyEZUyuV4myBLtSZ%23; 58tj_uuid=8a65130f-1085-403a-9e02-5c07dba15641; new_session=0; init_refer=https%253A%252F%252Fnc.zu.anjuke.com%252F%253Ffrom%253Dnavigation; new_uv=2; aQQ_ajkguid=BC9AF129-431B-1C4F-BB91-A27203DE8341; twe=2; Hm_lvt_ed38609fc79dd16e428d5a06610cfeb9=1516673382; Hm_lpvt_ed38609fc79dd16e428d5a06610cfeb9=1516678594",
        "Referer": referer,
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
    }
    request = urllib.request.Request(url = url, headers = headers)
    soup = BeautifulSoup(urllib.request.urlopen(request, timeout=60).read().decode("utf-8"), "lxml")
    return soup

header = ["名称","房型","面积","层数","中介人","小区","高德地址","高德坐标","wgs坐标","wgs经度","wgs纬度","路段","地址","特点一","特点二","特点三","价格","房源链接"]

# Step 1: get and save or read level_1 and level_2 links
links_file = "E:/20180123安居客_南昌租房/links/links.xlsx"
if os.path.exists(links_file):
    workbook_links = xlrd.open_workbook(links_file)
    sheet_links = workbook_links.sheet_by_index(0)
    level2_link = sheet_links.col_values(0)
    level2 = sheet_links.col_values(1)
else:
    # get sub_level1
    url_level_0 = "https://nc.zu.anjuke.com/fangyuan/p1/"   # 这个地址是需要根据你所爬取的城市进行修改的
    level1_link = []
    level1 = []
    for a in soup_form(url_level_0, "https://nc.zu.anjuke.com/").find("div", class_ = "sub-items sub-level1").find_all("a"):
        level1_link.append(a.get("href"))
        level1.append(a.text)
    # get sub_level2
    level2_link = []
    level2 = []
    for i in range(1, len(level1_link)):
        for a in soup_form(level1_link[i], level1_link[i-1]).find("div", class_ = "sub-items sub-level2").find_all("a"):
            if a.text == "全部":
                pass
            else:
                level2_link.append(a.get("href"))
                level2.append(a.text)
    workbook_links = xlsxwriter.Workbook(links_file)
    sheet_links = workbook_links.add_worksheet("level_2")
    sheet_links.write_column(0, 0, level2_link)
    sheet_links.write_column(0, 1, level2)
    workbook_links.close()
print("Step 1 Done!\nStep 2 Start!")

# Step 2: get every level 2 links' rent information
# NOTE: the separator character inside several split("...") calls below was lost when this
# post was copied; "\xa0" (non-breaking space) is assumed here and may need adjusting.
for j in range(len(level2_link)):   # at every level 2 page
    page_index = 1
    for k in range(1, 51):   # max loop
        # at every page, get the max page, and compare to the current page. if more than current page, continue
        rent_info_file = "E:/20180123安居客_南昌租房/split_data/" + level2[j] + "_info_page" + str(page_index) + ".xlsx"
        if os.path.exists(rent_info_file):
            print(level2[j] + "_info_page" + str(page_index) + ".xlsx already exists")
            page_index += 1
        else:
            pages = []
            url = level2_link[j] + "p" + str(page_index)
            soup = soup_form(url, level2_link[j])
            try:
                for a in soup.find("div", class_ = "multi-page").find_all("a"):
                    if a.text == "下一页 >" or a.text == "上一页":
                        pass
                    else:
                        pages.append(int(a.text))
            except Exception:
                break
            try:
                max_page = pages[len(pages) - 1]
            except Exception:
                max_page = 1
            if page_index < max_page + 2:
                workbook_page = xlsxwriter.Workbook(rent_info_file)
                sheet = workbook_page.add_worksheet("page" + str(page_index))
                for header_index in range(len(header)):
                    sheet.write(0, header_index, header[header_index])
                row_index = 1
                for div in soup.find_all("div", class_ = "zu-itemmod"):
                    try:
                        sheet.write(row_index, 0, div.find("a").get("title"))   # 名称
                    except Exception:
                        pass
                    try:
                        sheet.write(row_index, 1, div.find("p").text.split("\xa0")[0].split("|")[0].replace(" ", "").replace("\n", ""))   # 房型
                    except Exception:
                        pass
                    try:
                        sheet.write(row_index, 2, div.find("p").text.split("\xa0")[0].split("|")[1])   # 面积
                    except Exception:
                        pass
                    try:
                        sheet.write(row_index, 3, div.find("p").text.split("\xa0")[0].split("|")[2])   # 层数
                    except Exception:
                        pass
                    try:
                        sheet.write(row_index, 4, div.find("p").text.split("\xa0")[1])   # 中介人
                    except Exception:
                        pass
                    try:
                        xiaoqu = div.find("address").text.split()[0]
                        sheet.write(row_index, 5, xiaoqu)   # 小区
                        url_amap = "http://restapi.amap.com/v3/geocode/geo?address=" + urllib.parse.quote(xiaoqu) + "&output=xml&city=0791&key=【你的key】"
                        soup_amap = BeautifulSoup(urllib.request.urlopen(url_amap).read(), "xml")
                        sheet.write(row_index, 6, soup_amap.find("formatted_address").get_text())   # 高德地址
                        location_amap = soup_amap.find("location").get_text()
                        sheet.write(row_index, 7, location_amap)   # 高德坐标
                        location_wgs = GCJ2WGS(location_amap)
                        longitude = location_wgs[0]
                        latitude = location_wgs[1]
                        sheet.write(row_index, 8, str(longitude) + "," + str(latitude))   # wgs坐标
                        sheet.write(row_index, 9, longitude)    # wgs经度
                        sheet.write(row_index, 10, latitude)    # wgs纬度
                    except Exception:
                        pass
                    try:
                        sheet.write(row_index, 11, div.find("address").text.split()[1])   # 路段
                    except Exception:
                        pass
                    try:
                        sheet.write(row_index, 12, div.find("address").text.split()[2])   # 地址
                    except Exception:
                        pass
                    try:
                        sheet.write(row_index, 13, div.find("span", class_ = "cls-1").text)   # 特点一
                    except Exception:
                        pass
                    try:
                        sheet.write(row_index, 14, div.find("span", class_ = "cls-2").text)   # 特点二
                    except Exception:
                        pass
                    try:
                        sheet.write(row_index, 15, div.find("span", class_ = "cls-3").text)   # 特点三
                    except Exception:
                        pass
                    try:
                        sheet.write(row_index, 16, div.find("strong").text)   # 价格
                    except Exception:
                        pass
                    try:
                        sheet.write(row_index, 17, div.find("a").get("href"))   # 房源链接
                    except Exception:
                        pass
                    row_index += 1
                workbook_page.close()
                print(level2[j] + " page" + str(page_index) + " finished")
                page_index += 1
            else:
                break
    print(level2[j] + " finished")
print("Step 2 Done!\nStep 3 Start!")

# Step 3: merge all xlsx files
xlsx_merge("E:/20180123安居客_南昌租房/split_data/", header, "nanchang_rent_info")
print("All work done")
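To sanity-check the GCJ-02 to WGS-84 conversion without running the whole crawl, a quick standalone test can be run once the GCJ2WGS function above has been defined. It uses the sample coordinate from the function's own comment; the converted point should differ from the input by no more than a few hundred metres, which is the typical GCJ-02 offset in mainland China.

# Quick check of GCJ2WGS (run after defining the function above).
gcj = "113.923745,22.530824"          # sample GCJ-02 coordinate from the function comment
wgs_lon, wgs_lat = GCJ2WGS(gcj)
print("GCJ-02:", gcj)
print("WGS-84: %.6f,%.6f" % (wgs_lon, wgs_lat))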