前言
本文使用Python爬取豆瓣电影Top250,含完整源代码,并实现两种方式保存数据(excel和数据库。
复制代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164from bs4 import BeautifulSoup #页面解析,获取数据 import re #正则表达式 import urllib.request,urllib.error #指定URL,获取页面数据 import xlwt #进行excel操作 import sqlite3 #进行sql操作 def main(): baseUrl = "https://movie.douban.com/top250?start=" #1.爬取网页,并解析数据 dataList = getData(baseUrl) savePath=".\豆瓣电影Top250.xls" #savePath = "movies.db" #2.保存数据 saveData(dataList,savePath) #savedb(dataList,savePath) #---正则表达式--- #链接 findLink = re.compile(r'<a href="(.*?)">',re.S) #电影名字 findName = re.compile(r'<span class="title">(.*?)</span>',re.S) #评分 findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>') #标题 findInq = re.compile(r'<span class="inq">(.*?)</span>',re.S) #评分人数 findCount = re.compile(r'<span>(.*?)人评价</span>') #电影信息 findInf = re.compile(r'<p class="">(.*?)</p>',re.S) #1.爬取网页 def getData(baseUrl): dataList = [] for i in range(10): html = askUrl(baseUrl + str(i * 25)) # 2.逐一解析数据 bs = BeautifulSoup(html,"html.parser") for item in bs.find_all('div',class_="item"): data = [] item = str(item) #链接 link = re.findall(findLink,item)[0] #名字 name = re.findall(findName,item) if len(name) == 1: cName = name[0] fName = " " else: name[1] = name[1].replace(" / ","") cName = name[0] fName = name[1] #评分 rating = re.findall(findRating,item)[0] #标题 inq = re.findall(findInq,item) if len(inq) < 1: inq = " " else: inq= inq[0] #评分人数 racount = re.findall(findCount,item)[0] #电影信息 inf = re.findall(findInf,item)[0] inf = re.sub("...<br(s+)?/>(s?)"," ",inf) inf = re.sub("/"," ",inf) inf = inf.strip() #添加一部电影的信息进data data.append(link) data.append(cName) data.append(fName) data.append(rating) data.append(inq) data.append(racount) data.append(inf) dataList.append(data) return dataList #爬取指定url def askUrl(url): head = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3776.400 QQBrowser/10.6.4212.400"} request = urllib.request.Request(url = url,headers=head) http = "" try: response = urllib.request.urlopen(request) http = response.read().decode("utf-8") except urllib.error.URLError as e: if hasattr(e,"code"): print(e.code) if hasattr(e,"reson"): print(e.reson) return http # 3.保存数据 def saveData(dataList,savePath): woke = xlwt.Workbook("utf-8",style_compression=0)#样式的压缩效果 sheet = woke.add_sheet("豆瓣电影Top250",cell_overwrite_ok=True)#覆盖原单元格信息 col = ("链接","中文名字","英文名字","评分","标题","评分人数","概况") for i in range (7): sheet.write(0,i,col[i]) for i in range (0,250): for j in range (7): sheet.write(i+1,j,dataList[i][j]) print("第%d条数据"%(i+1)) woke.save(savePath) print("保存完毕") #3.保存到数据库 def savedb(dataList,dataPath): initdb(dataPath) conn = sqlite3.connect(dataPath) cur = conn.cursor() #开始保存数据 for data in dataList: for index in range(len(data)): data[index] = str('"'+data[index]+'"') newstr = ",".join(data) sql ="insert into movie(info_link,cname,fname,rating,inq,racount,inf)values(%s)"%(newstr) print(sql) cur.execute(sql) conn.commit() cur.close() conn.close() print("保存完毕") #3-1新建表 def initdb(dataPath): conn = sqlite3.connect(dataPath) cur = conn.cursor() sql = ''' create table movie( id Integer primary key autoincrement, info_link text, cname varchar , fname varchar , rating varchar , inq text, racount varchar , inf text ) ''' cur.execute(sql) conn.commit() cur.close() conn.close() if __name__ == "__main__": #调用函数 main() 一、获取页面源代码 伪装成浏览器;2.进一步包装请求;3.网页请求获取数据;4.解析并保存;5.返回数据。 import urllib.request,urllib.error #指定URL,获取页面数据 #爬取指定url def askUrl(url): #请求头伪装成浏览器(字典) head = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3776.400 QQBrowser/10.6.4212.400"} #进一步包装请求 request = urllib.request.Request(url = url,headers=head) #存储页面源代码 html = "" try: #页面请求,获取内容 response = urllib.request.urlopen(request) #读取返回的内容,用"utf-8"编码解析 html = response.read().decode("utf-8") except urllib.error.URLError as e: if hasattr(e,"code"): print(e.code) if hasattr(e,"reson"): print(e.reson) #返回页面源代码 return html
最后
以上就是激情花卷最近收集整理的关于Python爬取豆瓣电影前言的全部内容,更多相关Python爬取豆瓣电影前言内容请搜索靠谱客的其他文章。
本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
发表评论 取消回复